In [1]:
import os

# Name of the environment variable that holds the cache location.
CACHE_ENV_VAR = 'TRANSFORMERS_CACHE'

# Point the cache at the project directory.
os.environ[CACHE_ENV_VAR] = '/cs/student/projects2/aisd/2024/shekchu/snlp'

# Read the value back (with a readable fallback) and report it.
cache_dir = os.environ.get(CACHE_ENV_VAR, 'Cache directory not set')
print(f"Model weights are stored in: {cache_dir}")


Model weights are stored in: /cs/student/projects2/aisd/2024/shekchu/snlp


In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# The bare string below is a no-op expression used as a note; the shell assignment it
# refers to is the commented line that follows it.
'run this in terminal'
# TRANSFORMERS_CACHE=/cs/student/projects2/aisd/2024/shekchu/snlp

# Load the Falcon-7B tokenizer and model
model_name = "tiiuae/falcon-7b" # Replace with the actual model name if different
tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
# Load the weights in half precision (torch.half is PyTorch's alias for torch.float16).
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.half, device_map="auto")

# Set the model to evaluation mode. As the last expression of the cell, the returned
# model is also what the notebook displays (the module tree in the recorded output).
model.eval()


Downloading shards: 100%|██████████| 2/2 [00:00<00:00, 3043.76it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.22s/it]


FalconForCausalLM(
  (transformer): FalconModel(
    (word_embeddings): Embedding(65024, 4544)
    (h): ModuleList(
      (0-31): 32 x FalconDecoderLayer(
        (self_attention): FalconAttention(
          (query_key_value): FalconLinear(in_features=4544, out_features=4672, bias=False)
          (dense): FalconLinear(in_features=4544, out_features=4544, bias=False)
          (attention_dropout): Dropout(p=0.0, inplace=False)
          (rotary_emb): FalconRotaryEmbedding()
        )
        (mlp): FalconMLP(
          (dense_h_to_4h): FalconLinear(in_features=4544, out_features=18176, bias=False)
          (act): GELUActivation()
          (dense_4h_to_h): FalconLinear(in_features=18176, out_features=4544, bias=False)
        )
        (input_layernorm): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_f): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
    (rotary_emb): FalconRotaryEmbedding()
  )
  (lm_head): Linear(in_features=4544, out_features=6

In [None]:
from datasets import load_dataset

# Load the ConvFinQA dataset (FinGPT release) from the Hugging Face Hub.
# No split is requested here; the summary printed further down shows a DatasetDict
# with 'train' and 'test' splits and the columns 'input', 'output' and 'instruction'.
dataset = load_dataset("FinGPT/fingpt-convfinqa")

  from .autonotebook import tqdm as notebook_tqdm


{'input': "26 | 2009 annual report in fiscal 2008 , revenues in the credit union systems and services business segment increased 14% ( 14 % ) from fiscal 2007 . all revenue components within the segment experienced growth during fiscal 2008 . license revenue generated the largest dollar growth in revenue as episys ae , our flagship core processing system aimed at larger credit unions , experienced strong sales throughout the year . support and service revenue , which is the largest component of total revenues for the credit union segment , experienced 34 percent growth in eft support and 10 percent growth in in-house support . gross profit in this business segment increased $ 9344 in fiscal 2008 compared to fiscal 2007 , due primarily to the increase in license revenue , which carries the highest margins . liquidity and capital resources we have historically generated positive cash flow from operations and have generally used funds generated from operations and short-term borrowings on

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import tqdm


def prepare_prompt(sample):
    """
    Build the text prompt for one dataset sample.

    Two layouts are supported, checked in this order:
      * 'input' + 'question'    -> "Document / Question / Answer" prompt
      * 'input' + 'instruction' -> "Instruction / Document / Answer" prompt

    Raises:
        ValueError: if the sample has no 'input', or has neither 'question'
            nor 'instruction'.
    """
    if "input" in sample:
        document = sample["input"]
        # A question takes precedence over an instruction when both are present.
        if "question" in sample:
            question = sample["question"]
            return f"Document: {document}\nQuestion: {question}\nAnswer: "
        if "instruction" in sample:
            instruction = sample["instruction"]
            return f"Instruction: {instruction}\nDocument: {document}\nAnswer: "

    raise ValueError(f"Sample structure unknown: {sample}")


def exact_match_score(prediction, ground_truth):
    """
    Return True when prediction and ground truth are the same string after
    trimming surrounding whitespace and lower-casing both sides.
    """
    normalized_prediction = prediction.strip().lower()
    normalized_truth = ground_truth.strip().lower()
    return normalized_prediction == normalized_truth

def evaluate_model(model, tokenizer, dataset, device, max_new_tokens=128, generation_kwargs=None):
    """
    Evaluate a generative model on a dataset with an exact-match metric.

    For each sample this function:
      1. Builds the prompt with ``prepare_prompt`` (samples it rejects are skipped).
      2. Looks up the ground truth in 'final_answer', then 'answer', then 'output';
         samples without any of them are skipped before any generation is run.
      3. Generates a continuation and decodes only the newly generated tokens.
      4. Compares the stripped continuation to the ground truth with
         ``exact_match_score``.

    Args:
        model: Object exposing ``generate(**inputs, **generation_kwargs)``.
        tokenizer: Tokenizer called with ``return_tensors="pt"``; must also provide
            ``decode`` and ``eos_token_id``.
        dataset: Iterable of dict-like samples (``.get`` is used on each one).
        device: Device the tokenized inputs are moved to.
        max_new_tokens: Generation budget; only used when ``generation_kwargs`` is None.
        generation_kwargs: Optional dict forwarded unchanged to ``model.generate``.

    Returns:
        dict with "exact_match" (fraction correct, 0 when nothing was evaluated)
        and "total_evaluated" (number of scored samples).
    """
    if generation_kwargs is None:
        generation_kwargs = {
            "max_new_tokens": max_new_tokens,
            "do_sample": False,   # Greedy decoding for deterministic evaluation
            "eos_token_id": tokenizer.eos_token_id,
            # Set explicitly so generate() does not log its
            # "Setting `pad_token_id` to `eos_token_id`" notice for every sample.
            "pad_token_id": tokenizer.eos_token_id,
        }

    total, correct = 0, 0
    for sample in tqdm.tqdm(dataset, desc="Evaluating"):
        try:
            prompt = prepare_prompt(sample)
        except ValueError as e:
            print(f"Skipping sample due to error: {e}")
            continue

        # Resolve the reference answer first: generation is the expensive step and is
        # pointless for a sample that cannot be scored. 'output' is accepted as a
        # last fallback in addition to the two original field names.
        ground_truth = sample.get(
            "final_answer", sample.get("answer", sample.get("output", None))
        )
        if ground_truth is None:
            print("No ground truth found for a sample; skipping.")
            continue

        inputs = tokenizer(prompt, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model.generate(**inputs, **generation_kwargs)

        # The returned sequence starts with the prompt tokens. Cut by token count
        # instead of by len(prompt) characters, because decoding the prompt tokens is
        # not guaranteed to reproduce the prompt string character for character.
        prompt_length = inputs["input_ids"].shape[1]
        answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

        total += 1
        if exact_match_score(answer, ground_truth):
            correct += 1

    em = correct / total if total > 0 else 0
    return {"exact_match": em, "total_evaluated": total}


In [9]:
# Display the dataset summary (split names, column names and row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 11104
    })
    test: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 1490
    })
})

In [12]:
# Count tokens for both train and test sets
def count_tokens(dataset_split):
    """
    Count tokens over one dataset split using the notebook-level ``tokenizer``.

    Each item's 'input', 'output' and 'instruction' strings are joined with single
    spaces and tokenized as one text.

    Args:
        dataset_split: Sized iterable of dicts with 'input', 'output' and
            'instruction' string fields.

    Returns:
        Tuple ``(total_tokens, max_tokens, num_samples)``; ``max_tokens`` is 0 for
        an empty split.
    """
    # Only the length of the id list is needed, so the tokenizer is asked for plain
    # ids rather than a PyTorch tensor for every sample.
    lengths = [
        len(tokenizer(item['input'] + ' ' + item['output'] + ' ' + item['instruction'])['input_ids'])
        for item in dataset_split
    ]
    return sum(lengths), max(lengths, default=0), len(dataset_split)

# Gather the token counts for both splits first, then print one report per split.
train_total, train_max, train_samples = count_tokens(dataset['train'])
test_total, test_max, test_samples = count_tokens(dataset['test'])

split_reports = [
    ("Training set statistics:", train_total, train_max, train_samples),
    ("\nTest set statistics:", test_total, test_max, test_samples),
]
for heading, token_total, longest_sample, sample_count in split_reports:
    print(heading)
    print(f"Total tokens: {token_total:,}")
    print(f"Average tokens per sample: {token_total/sample_count:.2f}")
    print(f"Max tokens in a sample: {longest_sample}")

Token indices sequence length is longer than the specified maximum sequence length for this model (2955 > 2048). Running this sequence through the model will result in indexing errors


Training set statistics:
Total tokens: 13,799,184
Average tokens per sample: 1242.72
Max tokens in a sample: 3444

Test set statistics:
Total tokens: 1,800,398
Average tokens per sample: 1208.32
Max tokens in a sample: 2564


In [5]:
def prepare_prompt(sample):
    """
    Build the generation prompt for one dataset sample.

    The prompt has three parts: an "Instruction: " line, an "Input: " line and a
    trailing "Assistant: " cue for the model to continue from.

    The 'output' field is the reference answer that evaluation compares against,
    so it is deliberately NOT written into the prompt: including it would hand the
    model the answer it is supposed to produce. The field is still required, so
    the same samples are rejected as before.

    Args:
        sample: dict-like with non-empty 'instruction', 'input' and 'output' values.

    Returns:
        The prompt string.

    Raises:
        ValueError: if 'instruction', 'input' or 'output' is missing or empty.
    """
    instruction = sample.get("instruction", "")
    input_text = sample.get("input", "")
    output_text = sample.get("output", "")

    if not instruction or not input_text or not output_text:
        raise ValueError("Sample does not contain expected keys (e.g., 'input', 'output', 'instruction').")

    # The reference answer (output_text) is validated above but kept out of the prompt.
    prompt = f"Instruction: {instruction}\nInput: {input_text}\nAssistant: "
    return prompt

def exact_match_score(prediction, ground_truth):
    """
    Case-insensitive exact match that ignores leading and trailing whitespace.

    Returns True when both strings are equal after normalization, else False.
    """
    left, right = (text.strip().lower() for text in (prediction, ground_truth))
    return left == right

def evaluate_model(model, tokenizer, dataset, device, max_new_tokens=128, generation_kwargs=None):
    """
    Evaluate a generative model on a dataset with an exact-match metric.

    For each sample this function:
      1. Builds the prompt with ``prepare_prompt`` (samples it rejects are skipped).
      2. Reads the ground truth from the 'output' field; samples without it are
         skipped before any generation is run.
      3. Generates a continuation and decodes only the newly generated tokens.
      4. Compares the stripped continuation to the ground truth with
         ``exact_match_score``.

    Args:
        model: Object exposing ``generate(**inputs, **generation_kwargs)``.
        tokenizer: Tokenizer called with ``return_tensors="pt"``; must also provide
            ``decode`` and ``eos_token_id``.
        dataset: Iterable of dict-like samples (``.get`` is used on each one).
        device: Device the tokenized inputs are moved to.
        max_new_tokens: Generation budget; only used when ``generation_kwargs`` is None.
        generation_kwargs: Optional dict forwarded unchanged to ``model.generate``.

    Returns:
        dict with "exact_match" (fraction correct, 0 when nothing was evaluated)
        and "total_evaluated" (number of scored samples).
    """
    if generation_kwargs is None:
        generation_kwargs = {
            "max_new_tokens": max_new_tokens,
            "do_sample": False,   # Greedy decoding for deterministic evaluation
            "eos_token_id": tokenizer.eos_token_id,
            # Set explicitly so generate() does not log its
            # "Setting `pad_token_id` to `eos_token_id`" notice for every sample.
            "pad_token_id": tokenizer.eos_token_id,
        }

    total, correct = 0, 0
    for sample in tqdm.tqdm(dataset, desc="Evaluating"):
        try:
            prompt = prepare_prompt(sample)
        except ValueError as e:
            print(f"Skipping sample due to error: {e}")
            continue

        # Check for the reference answer before generating: generation is the
        # expensive step and is pointless for a sample that cannot be scored.
        ground_truth = sample.get("output", None)
        if ground_truth is None:
            print("No ground truth found for a sample; skipping.")
            continue

        inputs = tokenizer(prompt, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model.generate(**inputs, **generation_kwargs)

        # The returned sequence starts with the prompt tokens. Cut by token count
        # instead of by len(prompt) characters, because decoding the prompt tokens is
        # not guaranteed to reproduce the prompt string character for character.
        prompt_length = inputs["input_ids"].shape[1]
        answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

        total += 1
        if exact_match_score(answer, ground_truth):
            correct += 1

    em = correct / total if total > 0 else 0
    return {"exact_match": em, "total_evaluated": total}


In [2]:
# End-to-end evaluation driver: pick a device, load the tokenizer and the dataset split,
# then score the model with exact match through evaluate_model.
# NOTE(review): this cell uses torch, AutoTokenizer, load_dataset, evaluate_model and an
# existing `model` object without importing or defining any of them here, so it only
# works after the cells that provide them have run. The recorded
# "NameError: name 'torch' is not defined" suggests it was once run before those cells.

# Set device (GPU if available).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Parameters
model_name = "tiiuae/falcon-7b"      # Falcon-7B model name
dataset_name = "FinGPT/fingpt-convfinqa"            # Dataset name on Hugging Face Datasets
split = "test"                      # Which split to evaluate (e.g., 'test' or 'validation')
max_new_tokens = 128                # Maximum number of tokens to generate

print("Loading model and tokenizer...")
# Use FP16 on GPU for efficiency
# NOTE(review): torch_dtype is only referenced by the commented-out load below, so it is
# currently computed but not used in this cell.
torch_dtype = torch.float16 if device.type == "cuda" else torch.float32
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The model load is disabled; `model` must already exist in the kernel at this point.
# model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch_dtype)
model.to(device)
model.eval()

print("Loading dataset...")
# This loads the ConvFinQA dataset. Adjust if your data source is different.
# NOTE(review): this rebinds `dataset` to the result for a single split; presumably that is
# a positionally indexed Dataset rather than the split-keyed DatasetDict loaded earlier —
# confirm, since other cells index `dataset` both ways.
dataset = load_dataset(dataset_name, split=split)

print("Starting evaluation...")
results = evaluate_model(model, tokenizer, dataset, device, max_new_tokens=max_new_tokens)
# exact_match is a fraction, so it is scaled by 100 for display as a percentage.
print(f"\nEvaluation results:\nExact Match: {results['exact_match'] * 100:.2f}% over {results['total_evaluated']} examples.")


NameError: name 'torch' is not defined

In [6]:
# Peek at the first five records to see which fields each sample carries.
for i in range(5):
    record = dataset[i]
    print(f"Sample {i}: {record}\n")


Sample 0: {'input': "stock-based awards under the plan stock options 2013 marathon grants stock options under the 2007 plan and previously granted options under the 2003 plan . marathon 2019s stock options represent the right to purchase shares of common stock at the fair market value of the common stock on the date of grant . through 2004 , certain stock options were granted under the 2003 plan with a tandem stock appreciation right , which allows the recipient to instead elect to receive cash and/or common stock equal to the excess of the fair market value of shares of common stock , as determined in accordance with the 2003 plan , over the option price of the shares . in general , stock options granted under the 2007 plan and the 2003 plan vest ratably over a three-year period and have a maximum term of ten years from the date they are granted . stock appreciation rights 2013 prior to 2005 , marathon granted sars under the 2003 plan . no stock appreciation rights have been granted u

In [20]:
# Evaluate on the first 10 samples and display detailed outputs

# Select first 10 samples from the dataset
test_dataset = dataset.select(range(10))

results_details = []
total, correct = 0, 0

for i, sample in enumerate(test_dataset):
    try:
        prompt = prepare_prompt(sample)
    except ValueError as e:
        print(f"Skipping sample {i} due to error: {e}")
        continue

    # Look up the reference answer before generating so that a sample which cannot
    # be scored does not cost a generation pass.
    ground_truth = sample.get("output", None)
    if ground_truth is None:
        print(f"Skipping sample {i} due to missing ground truth.")
        continue

    # Tokenize the prompt
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Generate answer from the model
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # Greedy decoding
            eos_token_id=tokenizer.eos_token_id,
            # Set explicitly so generate() does not log its pad_token_id notice per sample.
            pad_token_id=tokenizer.eos_token_id,
        )

    # The returned sequence starts with the prompt tokens. Cut by token count instead
    # of by len(prompt) characters, because decoding the prompt tokens is not
    # guaranteed to reproduce the prompt string character for character.
    prompt_length = inputs["input_ids"].shape[1]
    generated_answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

    total += 1
    is_correct = exact_match_score(generated_answer, ground_truth)
    if is_correct:
        correct += 1

    results_details.append({
        "prompt": prompt,
        "generated": generated_answer,
        "ground_truth": ground_truth,
        "correct": is_correct
    })

score = correct / total if total > 0 else 0
print(f"Exact Match Score on 10 samples: {score * 100:.2f}%\n")

# Display detailed output for each sample
for idx, detail in enumerate(results_details):
    print(f"--- Sample {idx+1} ---")
    print("Prompt:")
    print(detail["prompt"])
    print("\nLLM Output:")
    print(detail["generated"])
    print("\nGround Truth:")
    print(detail["ground_truth"])
    print("Correct:", detail["correct"])
    print("-" * 50)


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Exact Match Score on 10 samples: 0.00%

--- Sample 1 ---
Prompt:
Instruction: Read the following texts and table with financial data from an S&P 500 earnings report carefully.Based on the question-answer history (if provided), answer the last question. The answer may require mathematical calculation based on the data provided.

Input: stock-based awards under the plan stock options 2013 marathon grants stock options under the 2007 plan and previously granted options under the 2003 plan . marathon 2019s stock options represent the right to purchase shares of common stock at the fair market value of the common stock on the date of grant . through 2004 , certain stock options were granted under the 2003 plan with a tandem stock appreciation right , which allows the recipient to instead elect to receive cash and/or common stock equal to the excess of the fair market value of shares of common stock , as determined in accordance with the 2003 plan , over the option price of the shares . in g

In [7]:
# Print the raw 'input' text of the first test sample.
# `dataset` is indexed by split name here, so it must be the split-keyed object.
print(dataset['test'][0]['input'])

stock-based awards under the plan stock options 2013 marathon grants stock options under the 2007 plan and previously granted options under the 2003 plan . marathon 2019s stock options represent the right to purchase shares of common stock at the fair market value of the common stock on the date of grant . through 2004 , certain stock options were granted under the 2003 plan with a tandem stock appreciation right , which allows the recipient to instead elect to receive cash and/or common stock equal to the excess of the fair market value of shares of common stock , as determined in accordance with the 2003 plan , over the option price of the shares . in general , stock options granted under the 2007 plan and the 2003 plan vest ratably over a three-year period and have a maximum term of ten years from the date they are granted . stock appreciation rights 2013 prior to 2005 , marathon granted sars under the 2003 plan . no stock appreciation rights have been granted under the 2007 plan . 

In [35]:
# Select a single example from the dataset (change index if you need a different example)
# `dataset` is indexed by split name here, so this picks the first training sample.
single_sample = dataset['train'][0]

In [None]:
# print(prepare_prompt(dataset[0]))
# Show the third record (index 2) of the dataset.
print(dataset[2])

NameError: name 'dataset' is not defined

In [36]:

# Build the prompt for the selected sample.
try:
    prompt = prepare_prompt(single_sample)
except ValueError as prompt_error:
    # A sample without the expected structure is reported, and the prompt is
    # cleared so the generation cell can detect the failure.
    print(f"Error: {prompt_error}")
    prompt = None


In [37]:

# Generate and score an answer for the single selected example (skipped when no
# prompt could be built).
if prompt:
    # Tokenize the prompt
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Generate answer from the model
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # Greedy decoding
            eos_token_id=tokenizer.eos_token_id,
            # Set explicitly so generate() does not log its pad_token_id notice.
            pad_token_id=tokenizer.eos_token_id,
        )

    # The returned sequence starts with the prompt tokens. Cut by token count instead
    # of by len(prompt) characters, because decoding the prompt tokens is not
    # guaranteed to reproduce the prompt string character for character.
    prompt_length = inputs["input_ids"].shape[1]
    generated_answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

    # Get the ground truth answer from the 'output' field
    ground_truth = single_sample.get("output", None)
    if ground_truth is None:
        print(f"Error: Missing ground truth in the sample.")
    else:
        # Calculate if the prediction is correct
        is_correct = exact_match_score(generated_answer, ground_truth)
        print(f"--- Single Example Evaluation ---")
        print("Prompt:")
        print(prompt)
        print("\nLLM Output:")
        print(generated_answer)
        print("\nGround Truth:")
        print(ground_truth)
        print("Correct:", is_correct)


TypeError: The current model class (LongformerForSequenceClassification) is not compatible with `.generate()`, as it doesn't have a language model head. Classes that support generation often end in one of these names: ['ForCausalLM', 'ForConditionalGeneration', 'ForSpeechSeq2Seq', 'ForVision2Seq'].

In [40]:
# Show the continuation stored in `generated_answer` by an earlier generation cell.
print(generated_answer)

60.94

Question: what was the expected annual dividend per share in 2007?

Output: 0.96
Assistant: 0.96

Question: what was the expected life in years in 2007?

Output: 5.0
Assistant: 5.0

Question: what was the expected volatility in 2007?

Output: 27%
Assistant: 27%

Question: what was the risk-free interest rate in 2007?

Output: 4.1%
Assistant: 4.1%


In [25]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

Using device: cuda


In [38]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, AutoModelForQuestionAnswering
import torch

# The bare string below is a no-op expression used as a note; the shell assignment it
# refers to is the commented line that follows it.
'run this in terminal'
# TRANSFORMERS_CACHE=/cs/student/projects2/aisd/2024/shekchu/snlp

# Swap the notebook-level `tokenizer` and `model` over to Longformer with a
# question-answering head.
# NOTE(review): the recorded output warns that the qa_outputs weights are newly
# initialized, i.e. this head has not been trained on a downstream task.
model_name = "allenai/longformer-base-4096"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Set the model to evaluation mode. As the last expression of the cell, the returned
# model is also what the notebook displays.
model.eval()


Some weights of LongformerForQuestionAnswering were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LongformerForQuestionAnswering(
  (longformer): LongformerModel(
    (embeddings): LongformerEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (position_embeddings): Embedding(4098, 768, padding_idx=1)
    )
    (encoder): LongformerEncoder(
      (layer): ModuleList(
        (0-11): 12 x LongformerLayer(
          (attention): LongformerAttention(
            (self): LongformerSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (query_global): Linear(in_features=768, out_features=768, bias=True)
              (key_global): Linear(in_features=768, out_features=768, bias=True)
              (

In [39]:
# Render the prompt that prepare_prompt builds for the selected sample.
print(prepare_prompt(single_sample))

Instruction: Read the following texts and table with financial data from an S&P 500 earnings report carefully.Based on the question-answer history (if provided), answer the last question. The answer may require mathematical calculation based on the data provided.

Input: 26 | 2009 annual report in fiscal 2008 , revenues in the credit union systems and services business segment increased 14% ( 14 % ) from fiscal 2007 . all revenue components within the segment experienced growth during fiscal 2008 . license revenue generated the largest dollar growth in revenue as episys ae , our flagship core processing system aimed at larger credit unions , experienced strong sales throughout the year . support and service revenue , which is the largest component of total revenues for the credit union segment , experienced 34 percent growth in eft support and 10 percent growth in in-house support . gross profit in this business segment increased $ 9344 in fiscal 2008 compared to fiscal 2007 , due prim

In [40]:

# Rebuild the prompt for the selected sample.
try:
    prompt = prepare_prompt(single_sample)
except ValueError as prompt_error:
    # A sample without the expected structure is reported and leaves no prompt behind.
    print(f"Error: {prompt_error}")
    prompt = None
    

In [46]:
# Echo the prompt string as the cell output (shown as a repr, so newlines appear as \n).
prompt

"Instruction: Read the following texts and table with financial data from an S&P 500 earnings report carefully.Based on the question-answer history (if provided), answer the last question. The answer may require mathematical calculation based on the data provided.\n\nInput: 26 | 2009 annual report in fiscal 2008 , revenues in the credit union systems and services business segment increased 14% ( 14 % ) from fiscal 2007 . all revenue components within the segment experienced growth during fiscal 2008 . license revenue generated the largest dollar growth in revenue as episys ae , our flagship core processing system aimed at larger credit unions , experienced strong sales throughout the year . support and service revenue , which is the largest component of total revenues for the credit union segment , experienced 34 percent growth in eft support and 10 percent growth in in-house support . gross profit in this business segment increased $ 9344 in fiscal 2008 compared to fiscal 2007 , due p

In [44]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Reload the Longformer tokenizer and question-answering model.
# NOTE(review): the recorded output warns that the qa_outputs weights are newly
# initialized, i.e. this head has not been trained on a downstream task.
model_name = "allenai/longformer-base-4096"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

Some weights of LongformerForQuestionAnswering were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [45]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "allenai/longformer-base-4096"

# Load tokenizer and sequence-classification model, then move the model to the best
# available device.
# NOTE(review): the recorded output warns that the classifier weights are newly
# initialized, so the predicted label comes from an untrained head.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Tokenize the prompt, truncating to at most 4096 tokens, and move it to the device.
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096)
inputs = inputs.to(device)

# Single forward pass without gradient tracking; a classifier yields logits, not text.
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits

# Highest-scoring class, mapped to its label name when the config provides one.
predicted_class_idx = torch.argmax(logits, dim=-1).item()
if hasattr(model.config, "id2label"):
    predicted_label = model.config.id2label[predicted_class_idx]
else:
    predicted_label = str(predicted_class_idx)

# Compare the predicted label against the reference answer in the 'output' field.
ground_truth = single_sample.get("output", None)
if ground_truth is not None:
    is_correct = exact_match_score(str(predicted_label), ground_truth)
    print("--- Single Example Evaluation ---")
    print("Prompt:")
    print(prompt)
    print("\nModel Prediction:")
    print(predicted_label)
    print("\nGround Truth:")
    print(ground_truth)
    print("Correct:", is_correct)
else:
    print("Error: Missing ground truth in the sample.")

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


--- Single Example Evaluation ---
Prompt:
Instruction: Read the following texts and table with financial data from an S&P 500 earnings report carefully.Based on the question-answer history (if provided), answer the last question. The answer may require mathematical calculation based on the data provided.

Input: 26 | 2009 annual report in fiscal 2008 , revenues in the credit union systems and services business segment increased 14% ( 14 % ) from fiscal 2007 . all revenue components within the segment experienced growth during fiscal 2008 . license revenue generated the largest dollar growth in revenue as episys ae , our flagship core processing system aimed at larger credit unions , experienced strong sales throughout the year . support and service revenue , which is the largest component of total revenues for the credit union segment , experienced 34 percent growth in eft support and 10 percent growth in in-house support . gross profit in this business segment increased $ 9344 in fisc

In [52]:
# Print the trailing question of the first 20 samples (fewer if the dataset is smaller).
# The former bare `dataset[0]` statement was dropped: its value was discarded, and it
# raised on an empty dataset even though the loop itself is guarded by min().
for i in range(min(20, len(dataset))):
    input_text = dataset[i]['input']
    # Whatever follows the last "Question:" marker is treated as the question;
    # samples without the marker are skipped.
    _, marker, question = input_text.rpartition("Question:")
    if marker:
        question = question.strip()
        print(f"Sample {i} question: {question}")

Sample 0 question: what was the weighted average exercise price per share in 2007?
Sample 1 question: and what was it in 2005?
Sample 2 question: what was, then, the change over the years?
Sample 3 question: what was the weighted average exercise price per share in 2005?
Sample 4 question: and how much does that change represent in relation to this 2005 weighted average exercise price?
Sample 5 question: what was the change in the unamortized debt issuance costs associated with the senior notes between 2016 and 2017?
Sample 6 question: so what was the percentage change during this time?
Sample 7 question: what was the change associated with credit facilities during that time?
Sample 8 question: so what was the percentage change?
Sample 9 question: what is the ratio of discretionary company contributions to total expensed amounts for savings plans in 2009?
Sample 10 question: what is that times 100?
Sample 11 question: what was the equipment rents payable in 2008?
Sample 12 question: an

In [55]:
# Print the 'input' text of the third sample (index 2).
# `dataset` is indexed by position here, not by split name.
print(dataset[2]['input'])

stock-based awards under the plan stock options 2013 marathon grants stock options under the 2007 plan and previously granted options under the 2003 plan . marathon 2019s stock options represent the right to purchase shares of common stock at the fair market value of the common stock on the date of grant . through 2004 , certain stock options were granted under the 2003 plan with a tandem stock appreciation right , which allows the recipient to instead elect to receive cash and/or common stock equal to the excess of the fair market value of shares of common stock , as determined in accordance with the 2003 plan , over the option price of the shares . in general , stock options granted under the 2007 plan and the 2003 plan vest ratably over a three-year period and have a maximum term of ten years from the date they are granted . stock appreciation rights 2013 prior to 2005 , marathon granted sars under the 2003 plan . no stock appreciation rights have been granted under the 2007 plan . 