In [1]:
# Colab-only setup: mount Google Drive at /content/drive so files stored there
# are reachable through the local filesystem for the rest of the session.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Load required libraries
from transformers import AutoModelForCausalLM, AutoTokenizer
import pandas as pd
import numpy as np
import time
import re

In [None]:
# Load the tokenizer and the causal language model from the same checkpoint directory
load_directory = "PATH_TO_YOUR_MODEL"  # Update this path
tokenizer = AutoTokenizer.from_pretrained(load_directory)
model = AutoModelForCausalLM.from_pretrained(load_directory)
model = model.to("cuda")  # move the weights to the GPU

# Some tokenizers define no padding token. In that case register the
# end-of-sequence token as the pad token and resize the model's embedding
# matrix to the tokenizer's current vocabulary size.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})
    model.resize_token_embeddings(len(tokenizer))

In [None]:
def generate_math_prompt(question):
    """Wrap a formatted word problem in the step-by-step solving prompt.

    Args:
        question: The problem text to embed; it is interpolated verbatim
            between the instruction header and the numbered guidance.

    Returns:
        str: The complete prompt, which asks the model to finish with its
        result written as ``[ANSWER]<number>[/ANSWER]``.
    """
    prompt_lines = (
        "Solve this math word problem step by step:",
        "",
        f"{question}",
        "",
        "Let's solve this step by step:",
        "1) First, understand what is being asked",
        "2) Then, identify the important information",
        "3) Solve step by step",
        "4) Finally, state the answer in the format: [ANSWER]<number>[/ANSWER]",
        "",
        "Show your work:",
    )
    return "\n".join(prompt_lines)

# A number as it may appear in model output:
#   * an optional minus sign, accepted only when it is not glued to a preceding
#     word character or ")" (so "10-5" or "step-2" are not read as negatives),
#   * digits, optionally grouped with thousands separators ("1,250"),
#   * an optional decimal part (a bare trailing "." is tolerated, as in "5.").
_NUMBER = r'(?:(?<![\w)])-)?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d*)?'

# Fallback phrases, tried in priority order. Each pattern has exactly one
# capturing group (the number) so re.findall returns the number strings.
_PHRASE_PATTERNS = tuple(
    prefix + '(' + _NUMBER + ')'
    for prefix in (
        r'answer is[^\d]*?',
        r'answer:[^\d]*?',
        r'equals[^\d]*?',
        r'=\s*',
    )
)


def _to_float(text):
    """Convert a matched number string (may contain ',') to float, or None."""
    try:
        return float(text.replace(',', ''))
    except ValueError:
        return None


def extract_answer(response):
    """Extract the numerical answer from a model response.

    Strategies are tried in order:
      1. A number between ``[ANSWER]`` and ``[/ANSWER]`` tags (surrounding
         whitespace inside the tags is allowed).
      2. The last number following "answer is", "answer:", "equals" or "="
         (case-insensitive; phrases are tried in that priority order).
      3. The last number anywhere in the response.

    Negative numbers and thousands separators are understood.

    Args:
        response: Decoded model output. Anything that is not a ``str``
            yields ``None``.

    Returns:
        float | None: The extracted value, or ``None`` when the response
        contains no number.
    """
    if not isinstance(response, str):
        return None

    # 1) Explicitly tagged answer
    tagged = re.search(r'\[ANSWER\]\s*(' + _NUMBER + r')\s*\[/ANSWER\]', response)
    if tagged:
        return _to_float(tagged.group(1))

    # 2) Last number after a known answer phrase. Taking the last occurrence
    #    matters for chains such as "3 + 4 = 7, 7 * 2 = 14", where the first
    #    match is only an intermediate step.
    lowered = response.lower()
    for pattern in _PHRASE_PATTERNS:
        candidates = re.findall(pattern, lowered)
        if candidates:
            return _to_float(candidates[-1])

    # 3) Final fallback: last number in the response
    numbers = re.findall(_NUMBER, response)
    if numbers:
        return _to_float(numbers[-1])

    return None

In [None]:
# Load evaluation dataset
# The CSV must provide the columns read elsewhere in this notebook:
# 'Numbers', 'Body' and 'Ques' (used to build the question text) and 'Answer'
# (the reference value each prediction is compared against).
dataset_path = "PATH_TO_YOUR_EVAL_DATASET"  # Update this path
df = pd.read_csv(dataset_path)

# Format questions
def format_question(row):
    """Fill the ``numberN`` placeholders of one dataset row with its values.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with the keys
            'Numbers' (whitespace-separated values), 'Body' and 'Ques'.
            ``numberN`` in 'Body'/'Ques' refers to the N-th value (0-based)
            of 'Numbers'.

    Returns:
        str: ``"Problem: <body>\\nQuestion: <question>"`` with the
        placeholders substituted. Placeholders without a corresponding value
        are left untouched.
    """
    # str() guards against pandas having parsed the column as a numeric value
    # (a bare number has no .split()).
    numbers = str(row['Numbers']).split()
    substitutions = {f'number{i}': num for i, num in enumerate(numbers)}

    def fill(text):
        # Match the whole placeholder including all of its digits, so that
        # "number10" is never mistaken for "number1" followed by "0".
        return re.sub(
            r'number\d+',
            lambda m: substitutions.get(m.group(0), m.group(0)),
            text,
        )

    body = fill(row['Body'])
    question = fill(row['Ques'])

    return f"Problem: {body}\nQuestion: {question}"

# Build the full problem text for every row (axis=1 passes each row to
# format_question) and store it in a new 'formatted_question' column.
df['formatted_question'] = df.apply(format_question, axis=1)
print(f"Total evaluation examples: {len(df)}")

In [None]:
# Run evaluation
# For every dataset row: build the prompt, generate a response, extract the
# numeric answer and compare it with the reference 'Answer' column.
# `predictions` / `actual` hold only the rows where a number could be
# extracted; rows without an extractable answer are recorded in
# `unparsed_questions` and reported, instead of being skipped silently.
predictions = []
actual = []
unparsed_questions = []  # dataframe indices whose response yielded no number
start_time = time.time()

# ANSI color codes for output formatting
BLUE = '\033[94m'
GREEN = '\033[92m'
RED = '\033[91m'
YELLOW = '\033[93m'
CYAN = '\033[96m'
PURPLE = '\033[95m'
ENDC = '\033[0m'

total_questions = len(df)

# questions_done counts every processed row (parsed or not) so that the
# progress and the time estimate stay accurate.
for questions_done, (idx, row) in enumerate(df.iterrows(), start=1):
    question = row['formatted_question']
    prompt = generate_math_prompt(question)

    # Generate response. Inputs are placed on whatever device the model is on.
    # NOTE: do_sample=True makes generation stochastic, so results can vary
    # between runs unless a random seed is fixed beforehand.
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(model.device)
    output = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=512,  # counts prompt tokens as well as generated tokens
        temperature=0.1,
        do_sample=True,
        num_beams=3,
        top_p=0.9,
        repetition_penalty=1.2
    )
    # For a causal LM the returned sequence starts with the prompt tokens.
    # Decode only the newly generated part so answer extraction cannot pick
    # up numbers from the prompt or the question itself.
    prompt_length = inputs.input_ids.shape[1]
    response = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

    # Extract and evaluate answer
    predicted = extract_answer(response)
    if predicted is not None:
        expected = float(row['Answer'])
        predictions.append(predicted)
        actual.append(expected)
        is_correct = abs(predicted - expected) < 0.01
    else:
        unparsed_questions.append(idx)
        is_correct = False
    correct_color = GREEN if is_correct else RED

    # Print progress
    questions_left = total_questions - questions_done
    elapsed_time = time.time() - start_time
    avg_time_per_question = elapsed_time / questions_done
    estimated_time_left = questions_left * avg_time_per_question

    print(f"\n{CYAN}Progress: {questions_done}/{total_questions} questions{ENDC}")
    print(f"{CYAN}Average time per question: {avg_time_per_question:.1f} seconds{ENDC}")
    print(f"{CYAN}Estimated time remaining: {estimated_time_left/60:.1f} minutes{ENDC}\n")

    # Print results
    print(f"\n{YELLOW}Question {idx}:{ENDC}")
    print(f"{BLUE}Prompt: {question}{ENDC}")
    print(f"{YELLOW}Model response:{ENDC}")
    print(f"{PURPLE}{response}{ENDC}")
    if predicted is None:
        print(f"{RED}No numeric answer could be extracted from the response{ENDC}")
    else:
        print(f"{correct_color}Correct: {is_correct}{ENDC}")

In [None]:
# Calculate and display metrics
# `predictions` / `actual` contain only the examples for which a numeric
# answer was extracted; every other example in `df` has no scored prediction.
predictions = np.array(predictions)
actual = np.array(actual)
errors = np.abs(predictions - actual)

scored_count = len(predictions)
exact_count = int((errors < 0.01).sum())
dataset_size = len(df)
unscored_count = dataset_size - scored_count

print("=== Evaluation Metrics ===")
print(f"Total examples evaluated: {scored_count}")
if scored_count:
    print(f"Mean absolute error: {errors.mean():.2f}")
    print(f"Median absolute error: {np.median(errors):.2f}")
    print(f"Exact matches: {exact_count}/{scored_count} ({(errors < 0.01).mean()*100:.1f}%)")
else:
    # Mean/median of an empty array are NaN (with a RuntimeWarning); say so explicitly instead.
    print("No numeric answers were extracted; error metrics are undefined.")

# The figures above ignore examples without an extracted answer, which
# overstates accuracy. Report them and an accuracy over the full dataset.
print(f"Examples without a scored prediction: {unscored_count}/{dataset_size}")
if dataset_size:
    print(
        f"Accuracy over all examples (unscored counted as wrong): "
        f"{exact_count}/{dataset_size} ({exact_count / dataset_size * 100:.1f}%)"
    )