In [1]:
!pip install torch transformers datasets evaluate matplotlib seaborn nltk rouge-score

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-p

In [2]:
import os

# Debugging aid: request blocking CUDA launches for this process.
# The variable is set here, before any cell imports torch.
os.environ.update(CUDA_LAUNCH_BLOCKING="1")

In [3]:
import os
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset
from typing import Dict, List, Tuple
import time
import gc

In [4]:
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: an access token used to be hardcoded in this cell. Treat that token as
# compromised and revoke it in the Hugging Face account settings.
# The token is now taken from the HF_TOKEN environment variable, falling back to an
# interactive prompt, so it never appears in the notebook source or its history.
api_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(api_token)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import pandas as pd
import re
import torch

# Check for GPU availability
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Single definition of the checkpoint so tokenizer and model cannot drift apart
MODEL_NAME = "meta-llama/Llama-3.2-3B"

# Load model and tokenizer, then move the model to the selected device
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device)

# Report the device actually used; the model may be on the CPU when no GPU is available
print(f"Model loaded successfully on {device}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
import re
from datasets import load_dataset

# Tunable settings for this run
N_SAMPLES = 100          # number of test examples to evaluate
MAX_NEW_TOKENS = 10      # generation budget per answer
OUTPUT_CSV = 'Llama3.2-3B_gsmplus_inference.csv'

# Optionally signed number. The first alternative accepts thousands separators
# ("1,234.5") or plain digits ("42", "3.14"); the second accepts a bare fraction (".5").
NUMBER_PATTERN = re.compile(r'[-+]?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?|[-+]?\.\d+')


def extract_number(text):
    """Return the first number found in *text* as a string, or "NA" if there is none.

    Thousands separators are removed so that "1,000" yields "1000" rather than
    being cut off at the comma.
    """
    match = NUMBER_PATTERN.search(text)
    return match.group().replace(',', '') if match else "NA"


# Load dataset and select the first N_SAMPLES entries (fewer if the split is smaller)
dataset = load_dataset("qintongli/GSM-Plus")
test_split = dataset['test']
data = test_split.select(range(min(N_SAMPLES, len(test_split))))

results = []

for example in data:
    # Construct explicit prompt
    question = example['question']
    prompt = f"Answer this question with only the numerical result and no other text: {question}\nAnswer:"

    # Tokenize and move to the model's device
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_length = inputs["input_ids"].shape[1]

    # Greedy decoding keeps the evaluation reproducible across runs.
    # Passing **inputs forwards the attention mask together with the input ids.
    # early_stopping is omitted: it only applies to beam search.
    outputs = model.generate(
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only the newly generated tokens (everything after the prompt)
    full_answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Store results; the generated answer is reduced to its first number (or "NA")
    results.append({
        'question': question,
        'generated_answer': extract_number(full_answer),
        'reference_answer': example['answer']
    })

# Create DataFrame and save to CSV
df = pd.DataFrame(results)
df.to_csv(OUTPUT_CSV, index=False)

print(f"Inference complete. Results saved to {OUTPUT_CSV}")

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

# Load results. Every column is read as text with pandas' default NA parsing disabled:
# the inference step writes the literal sentinel "NA" when no number was found, and the
# default parser would turn it into NaN, which str() renders as "nan" and float() accepts.
df = pd.read_csv("Llama3.2-3B_gsmplus_inference.csv", dtype=str, keep_default_na=False)

# Custom numerical evaluation functions
def is_numeric(value):
    """Return True if *value* parses as a finite real number, else False.

    "nan" and "inf" are accepted by float() but are not usable answers, so they
    are rejected. Values float() cannot convert at all (including None) also
    return False instead of raising.
    """
    import math  # local import keeps this helper self-contained

    try:
        return math.isfinite(float(value))
    except (TypeError, ValueError, OverflowError):
        return False

def normalize_number(num_str):
    """Return a canonical string form of a numeric string.

    Whole numbers lose their fractional part ("18.0" -> "18"); other numbers are
    rendered through float ("1.50" -> "1.5"). Input that is_numeric() rejects is
    returned unchanged.
    """
    if is_numeric(num_str):
        as_float = float(num_str)
        return str(int(as_float)) if as_float.is_integer() else str(as_float)
    return num_str

def calculate_num_match(pred, ref):
    """Return 1 when both arguments convert to equal floats, otherwise 0.

    Arguments that cannot be converted (ValueError or TypeError) yield 0.
    """
    try:
        predicted, expected = float(pred), float(ref)
    except (ValueError, TypeError):
        return 0
    return 1 if predicted == expected else 0

def calculate_num_close(pred, ref, rel_tol=1e-3):
    """Return 1 if pred is within rel_tol * |ref| of ref, otherwise 0.

    The tolerance is relative to the reference value, so a reference of 0 only
    matches a prediction of exactly 0. Unconvertible arguments yield 0.
    """
    try:
        predicted, expected = float(pred), float(ref)
        tolerance = rel_tol * abs(expected)
        within_tolerance = abs(predicted - expected) <= tolerance
    except (ValueError, TypeError):
        return 0
    return int(within_tolerance)

# Score every prediction against its reference answer
results = []
for _, row in df.iterrows():
    # Both values are compared as stripped strings
    pred, ref = (str(row[column]).strip() for column in ('generated_answer', 'reference_answer'))

    results.append(dict(
        exact_match=calculate_num_match(pred, ref),
        close_match=calculate_num_close(pred, ref),
        is_numeric=is_numeric(pred),  # whether the model produced a parseable number
        pred_normalized=normalize_number(pred),
        ref_normalized=normalize_number(ref),
    ))

# Per-row metrics, placed side by side with the original columns
metrics_df = pd.DataFrame(results)
final_df = pd.concat([df, metrics_df], axis=1)

# Persist the row-level evaluation
final_df.to_csv("Llama3.2-3B_gsmplus_evaluated.csv", index=False)

# Aggregate metrics: the mean of each per-row indicator column
aggregate_metrics = {
    label: np.mean(metrics_df[column])
    for label, column in (
        ('Exact Match', 'exact_match'),
        ('Close Match (±0.1%)', 'close_match'),
        ('Valid Numerical Output', 'is_numeric'),
    )
}
# NOTE(review): 'Accuracy' compares the normalized *strings*, so a prediction and a
# reference that are the same non-numeric text count as correct - confirm this is intended.
aggregate_metrics['Accuracy'] = accuracy_score(
    metrics_df['ref_normalized'],
    metrics_df['pred_normalized'],
)

print("\nNumerical Answer Evaluation Metrics:")
for metric, value in aggregate_metrics.items():
    print("{}: {:.4f}".format(metric, value))

# Persist the aggregate metrics as a one-row table
pd.DataFrame([aggregate_metrics]).to_csv("Llama3.2-3B_aggregate_metrics.csv", index=False)