<a href="https://colab.research.google.com/github/Akhilvallala2023/Thesis/blob/main/Light_eval_verification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installation

In [None]:
# Install tiktoken first (presumably a tokenizer dependency of the evaluation stack — TODO confirm)
!pip install tiktoken
# Step 1: Install LightEval with its Accelerate extra
# NOTE(review): no version is pinned; confirm the CLI flags used in later cells match the installed release
!pip install lighteval[accelerate]

# Testing on SmolLM-135M

In [None]:
# Step 2: Set up model and output directory
# MODEL is passed to lighteval as `pretrained=...`; OUTPUT_DIR is passed as `--output_dir`
MODEL = "HuggingFaceTB/SmolLM-135M"
OUTPUT_DIR = "/content/output"

# Step 3: Run the HellaSwag evaluation for SmolLM-135M
# IPython substitutes $MODEL and $OUTPUT_DIR from the Python namespace before the shell runs the line.
# The task string looks like suite|task|few-shot count|truncation flag — TODO confirm against the lighteval docs.
!lighteval accelerate \
    --model_args "pretrained=$MODEL" \
    --tasks "leaderboard|hellaswag|0|0" \
    --override_batch_size 16 \
    --output_dir $OUTPUT_DIR

# Testing both models at the same time

In [None]:
# Set up the models and output directories
MODELS = ["gpt2", "HuggingFaceTB/SmolLM-135M"]
OUTPUT_DIR = "/content/output"

for model in MODELS:
    !lighteval accelerate \
        --model_args "pretrained=$model" \
        --tasks "leaderboard|hellaswag|0|0" \
        --override_batch_size 16 \
        --output_dir ${OUTPUT_DIR}/$model


# MMLU college physics on SmolLM models

In [None]:
# Model under test and the directory handed to lighteval via --output_dir
MODEL = "HuggingFaceTB/SmolLM-135M"
OUTPUT_DIR = "/content/output"

# Run the MMLU college_physics subset for SmolLM-135M with a fixed batch size of 16
# ($MODEL / $OUTPUT_DIR are filled in by IPython from the variables above)
!lighteval accelerate \
    --model_args "pretrained=$MODEL" \
    --tasks "leaderboard|mmlu:college_physics|0|0" \
    --override_batch_size 16 \
    --output_dir $OUTPUT_DIR


In [None]:
# Model under test and the directory handed to lighteval via --output_dir
MODEL = "HuggingFaceTB/SmolLM-360M"
OUTPUT_DIR = "/content/output"

# Run the MMLU college_physics subset for SmolLM-360M with a fixed batch size of 16
# ($MODEL / $OUTPUT_DIR are filled in by IPython from the variables above)
!lighteval accelerate \
    --model_args "pretrained=$MODEL" \
    --tasks "leaderboard|mmlu:college_physics|0|0" \
    --override_batch_size 16 \
    --output_dir $OUTPUT_DIR


In [None]:
# Model under test and the directory handed to lighteval via --output_dir
MODEL = "HuggingFaceTB/SmolLM-1.7B"
OUTPUT_DIR = "/content/output"

# Run the MMLU college_physics subset for SmolLM-1.7B with a fixed batch size of 16
# ($MODEL / $OUTPUT_DIR are filled in by IPython from the variables above)
!lighteval accelerate \
    --model_args "pretrained=$MODEL" \
    --tasks "leaderboard|mmlu:college_physics|0|0" \
    --override_batch_size 16 \
    --output_dir $OUTPUT_DIR


In [None]:
# Model under test and the directory handed to lighteval via --output_dir
MODEL = "HuggingFaceTB/SmolLM-135M"
OUTPUT_DIR = "/content/output"

# Run the `mmlu` task (no subject suffix) for SmolLM-135M
# NOTE(review): presumably this selects every MMLU subject — confirm that lighteval accepts the bare `mmlu` name
!lighteval accelerate \
    --model_args "pretrained=$MODEL" \
    --tasks "leaderboard|mmlu|0|0" \
    --override_batch_size 16 \
    --output_dir $OUTPUT_DIR


## MMLU college physics on SmolLM2 models

In [None]:
# Step 2: Set up model and output directory
MODEL = "HuggingFaceTB/SmolLM2-135M"
OUTPUT_DIR = "/content/output"

# Step 3: Run the MMLU college_physics evaluation for SmolLM2-135M
# ($MODEL / $OUTPUT_DIR are filled in by IPython from the variables above)
!lighteval accelerate \
    --model_args "pretrained=$MODEL" \
    --tasks "leaderboard|mmlu:college_physics|0|0" \
    --override_batch_size 16 \
    --output_dir $OUTPUT_DIR

In [None]:
# Step 2: Set up model and output directory
MODEL = "HuggingFaceTB/SmolLM2-360M"
OUTPUT_DIR = "/content/output"

# Step 3: Run the MMLU college_physics evaluation for SmolLM2-360M
# ($MODEL / $OUTPUT_DIR are filled in by IPython from the variables above)
!lighteval accelerate \
    --model_args "pretrained=$MODEL" \
    --tasks "leaderboard|mmlu:college_physics|0|0" \
    --override_batch_size 16 \
    --output_dir $OUTPUT_DIR

In [None]:
# Step 2: Set up model and output directory
MODEL = "HuggingFaceTB/SmolLM2-1.7B"
OUTPUT_DIR = "/content/output"

# Step 3: Run the MMLU college_physics evaluation for SmolLM2-1.7B
# ($MODEL / $OUTPUT_DIR are filled in by IPython from the variables above)
!lighteval accelerate \
    --model_args "pretrained=$MODEL" \
    --tasks "leaderboard|mmlu:college_physics|0|0" \
    --override_batch_size 16 \
    --output_dir $OUTPUT_DIR

# SmolLM2 on MMLU and SmolLM on HellaSwag

In [None]:
# Step 2: Set up model and output directory
MODEL = "HuggingFaceTB/SmolLM2-135M"
OUTPUT_DIR = "/content/output"

# Step 3: Run the `mmlu` task for SmolLM2-135M
# NOTE(review): this cell runs `leaderboard|mmlu|0|0`, not HellaSwag — confirm that is the intended benchmark here
!lighteval accelerate \
    --model_args "pretrained=$MODEL" \
    --tasks "leaderboard|mmlu|0|0" \
    --override_batch_size 16 \
    --output_dir $OUTPUT_DIR

In [None]:
# Step 2: Set up model and output directory
# NOTE(review): this is the first-generation SmolLM-135M, not a SmolLM2 checkpoint — confirm that is intended
MODEL = "HuggingFaceTB/SmolLM-135M"
OUTPUT_DIR = "/content/output"

# Step 3: Run the HellaSwag evaluation for SmolLM-135M
# ($MODEL / $OUTPUT_DIR are filled in by IPython from the variables above)
!lighteval accelerate \
    --model_args "pretrained=$MODEL" \
    --tasks "leaderboard|hellaswag|0|0" \
    --override_batch_size 16 \
    --output_dir $OUTPUT_DIR

# MMLU-Pro (cloze)

In [None]:
# Model under test and the directory handed to lighteval via --output_dir
MODEL = "HuggingFaceTB/SmolLM-135M"
OUTPUT_DIR = "/content/output"

# Attempt to evaluate SmolLM-135M on MMLU-Pro through lighteval
# NOTE(review): every other cell passes --tasks as `suite|task|n|n`; here the value is a
# dataset repo id ("TIGER-Lab/MMLU-Pro"). Confirm lighteval accepts this form — it may be rejected.
!lighteval accelerate \
    --model_args "pretrained=$MODEL" \
    --tasks "TIGER-Lab/MMLU-Pro" \
    --override_batch_size 16 \
    --output_dir $OUTPUT_DIR


# Saved Model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Specify the path to your local model (relative to the notebook's working directory)
model_name = "./fine_tuned_smolLM2_360M"

# Load the tokenizer and model from the local directory, enforcing local loading
# (local_files_only=True prevents any attempt to fetch from the Hub)
tokenizer = AutoTokenizer.from_pretrained(model_name, local_files_only=True)
model = AutoModelForCausalLM.from_pretrained(model_name, local_files_only=True)

# Re-save into the SAME directory with safetensors serialization disabled,
# then write the tokenizer files next to the weights.
# NOTE(review): presumably done so a downstream loader finds PyTorch-format weights — confirm it is still needed
model.save_pretrained(model_name, safe_serialization=False)
tokenizer.save_pretrained(model_name)


In [None]:
# Set up model and output directory
# MODEL is a local directory path here rather than a Hub repo id
MODEL = "./fine_tuned_smolLM2_360M"  # Use your local model
OUTPUT_DIR = "/content/output"

# Run the MMLU college_physics evaluation with your saved model
# ($MODEL / $OUTPUT_DIR are filled in by IPython from the variables above)
!lighteval accelerate \
    --model_args "pretrained=$MODEL" \
    --tasks "leaderboard|mmlu:college_physics|0|0" \
    --override_batch_size 16 \
    --output_dir $OUTPUT_DIR


In [None]:
# Install the libraries imported by the manual evaluation cells below
!pip install datasets
!pip install transformers

# Evaluation on MMLU (cloze), cais/mmlu dataset

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load the SmolLM2-135M model and tokenizer, and switch the model to inference mode
model_name = "HuggingFaceTB/SmolLM2-135M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()

# Set the pad_token only if the tokenizer does not already define one,
# so a pad token shipped with the checkpoint is never overwritten
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token or tokenizer.bos_token or "[PAD]"


# Load the MMLU dataset with the "all" configuration to get all subjects
dataset = load_dataset("cais/mmlu", "all", split="validation")  # Use "test" for the test set

# Evaluate function for a single question
def evaluate_question(question, choices, correct_answer):
    """Return True if the model ranks the correct choice highest.

    Each choice is scored cloze-style: the question and the choice are
    encoded together as ``f"{question} {choice}"`` and the score is the mean
    log-probability the model assigns to the tokens that follow the
    question. The previous implementation averaged the next-token logits
    over the whole vocabulary, which does not measure how likely the choice
    is, and ran an extra forward pass whose result was never used.

    Relies on the module-level ``tokenizer`` and ``model``.

    Args:
        question: Question text used as the conditioning context.
        choices: Sequence of candidate answer strings.
        correct_answer: Index into ``choices`` of the gold answer.

    Returns:
        bool: True if the highest-scoring choice index equals
        ``correct_answer``.
    """
    # Token count of the question alone; tokens past this offset in the
    # joint encoding are attributed to the choice.
    context_len = len(tokenizer(question).input_ids)

    choice_scores = []
    for choice in choices:
        input_ids = tokenizer(f"{question} {choice}", return_tensors="pt").input_ids.to(model.device)
        with torch.no_grad():
            logits = model(input_ids).logits

        # logits[:, t] predicts token t + 1, so drop the final position and
        # compare against the inputs shifted left by one.
        log_probs = torch.log_softmax(logits[0, :-1, :].float(), dim=-1)
        targets = input_ids[0, 1:]
        token_log_probs = log_probs.gather(1, targets.unsqueeze(1)).squeeze(1)

        # Entry i of token_log_probs scores token i + 1, so the choice tokens
        # start at context_len - 1 (clamped in case the question is empty).
        choice_log_probs = token_log_probs[max(context_len - 1, 0):]
        if choice_log_probs.numel() == 0:
            # Nothing to score (e.g. an empty choice): never prefer it.
            choice_scores.append(float("-inf"))
        else:
            # Average per token so longer choices are not penalised for length.
            choice_scores.append(choice_log_probs.mean().item())

    # Get the choice with the highest score
    predicted_choice = torch.argmax(torch.tensor(choice_scores)).item()
    return predicted_choice == correct_answer

# Iterate through the dataset and evaluate each question
correct_count = 0
total_count = 0

for example in dataset:
    # Each row supplies the question text, the candidate answers and the gold answer index
    question = example["question"]
    choices = example["choices"]
    correct_answer = example["answer"]

    is_correct = evaluate_question(question, choices, correct_answer)
    if is_correct:
        correct_count += 1
    total_count += 1

# Calculate accuracy; an empty split yields 0 instead of raising ZeroDivisionError
accuracy = correct_count / total_count if total_count > 0 else 0
print(f"Accuracy on MMLU (All Subjects): {accuracy * 100:.2f}%")


## Evaluation on MMLU-Pro (cloze), TIGER-Lab/MMLU-Pro dataset

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from tqdm import tqdm

# Load the SmolLM2-135M model and tokenizer, and switch the model to inference mode
model_name = "HuggingFaceTB/SmolLM2-135M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()

# Set the pad_token only if the tokenizer does not already define one,
# so a pad token shipped with the checkpoint is never overwritten
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token or tokenizer.bos_token or "[PAD]"

# Load the validation split of TIGER-Lab/MMLU-Pro (no configuration name is passed)
dataset = load_dataset("TIGER-Lab/MMLU-Pro", split="validation")  # Use "test" for the test set if needed

# Evaluate function for a single question
def evaluate_question(question, options, correct_answer_index):
    """Return True if the model ranks the correct option highest.

    Each option is scored cloze-style: the question and the option are
    encoded together as ``f"{question} {option}"`` and the score is the mean
    log-probability the model assigns to the tokens that follow the
    question. The previous implementation averaged the next-token logits
    over the whole vocabulary, which does not measure how likely the option
    is, and ran an extra forward pass whose result was never used.

    Relies on the module-level ``tokenizer`` and ``model``.

    Args:
        question: Question text used as the conditioning context.
        options: Sequence of candidate answer strings.
        correct_answer_index: Index into ``options`` of the gold answer.

    Returns:
        bool: True if the highest-scoring option index equals
        ``correct_answer_index``.
    """
    # Token count of the question alone; tokens past this offset in the
    # joint encoding are attributed to the option.
    context_len = len(tokenizer(question).input_ids)

    choice_scores = []
    for option in options:
        input_ids = tokenizer(f"{question} {option}", return_tensors="pt").input_ids.to(model.device)
        with torch.no_grad():
            logits = model(input_ids).logits

        # logits[:, t] predicts token t + 1, so drop the final position and
        # compare against the inputs shifted left by one.
        log_probs = torch.log_softmax(logits[0, :-1, :].float(), dim=-1)
        targets = input_ids[0, 1:]
        token_log_probs = log_probs.gather(1, targets.unsqueeze(1)).squeeze(1)

        # Entry i of token_log_probs scores token i + 1, so the option tokens
        # start at context_len - 1 (clamped in case the question is empty).
        option_log_probs = token_log_probs[max(context_len - 1, 0):]
        if option_log_probs.numel() == 0:
            # Nothing to score (e.g. an empty option): never prefer it.
            choice_scores.append(float("-inf"))
        else:
            # Average per token so longer options are not penalised for length.
            choice_scores.append(option_log_probs.mean().item())

    # Get the option with the highest score
    predicted_choice_index = torch.argmax(torch.tensor(choice_scores)).item()
    return predicted_choice_index == correct_answer_index

# Iterate through the dataset and evaluate each question with tqdm progress bar
correct_count = 0
total_count = 0

for example in tqdm(dataset, desc="Evaluating questions"):
    # Each row supplies the question text, the candidate options and the gold option index
    question = example["question"]
    options = example["options"]
    correct_answer_index = example["answer_index"]  # Use 'answer_index' for the correct option

    is_correct = evaluate_question(question, options, correct_answer_index)
    if is_correct:
        correct_count += 1
    total_count += 1

# Calculate accuracy; an empty split yields 0 instead of raising ZeroDivisionError
accuracy = correct_count / total_count if total_count > 0 else 0
# The label names the dataset actually loaded in this cell (it previously said "MMLU (All Subjects)")
print(f"Accuracy on MMLU-Pro (validation split): {accuracy * 100:.2f}%")


In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from tqdm import tqdm

# Load the SmolLM2-135M model and tokenizer, and switch the model to inference mode
model_name = "HuggingFaceTB/SmolLM2-135M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()

# Set the pad_token only if the tokenizer does not already define one,
# so a pad token shipped with the checkpoint is never overwritten
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token or tokenizer.bos_token or "[PAD]"

# Load the validation split of TIGER-Lab/MMLU-Pro (no configuration name is passed)
dataset = load_dataset("TIGER-Lab/MMLU-Pro", split="validation")  # Use "test" for the test set if needed

# Evaluate function for a single question
def evaluate_question(question, options, correct_answer_index):
    """Return True if the model ranks the correct option highest.

    Each option is scored cloze-style: the question and the option are
    encoded together as ``f"{question} {option}"`` and the score is the mean
    log-probability the model assigns to the tokens that follow the
    question. The previous implementation averaged the next-token logits
    over the whole vocabulary, which does not measure how likely the option
    is, and ran an extra forward pass whose result was never used.

    Relies on the module-level ``tokenizer`` and ``model``.

    Args:
        question: Question text used as the conditioning context.
        options: Sequence of candidate answer strings.
        correct_answer_index: Index into ``options`` of the gold answer.

    Returns:
        bool: True if the highest-scoring option index equals
        ``correct_answer_index``.
    """
    # Token count of the question alone; tokens past this offset in the
    # joint encoding are attributed to the option.
    context_len = len(tokenizer(question).input_ids)

    choice_scores = []
    for option in options:
        input_ids = tokenizer(f"{question} {option}", return_tensors="pt").input_ids.to(model.device)
        with torch.no_grad():
            logits = model(input_ids).logits

        # logits[:, t] predicts token t + 1, so drop the final position and
        # compare against the inputs shifted left by one.
        log_probs = torch.log_softmax(logits[0, :-1, :].float(), dim=-1)
        targets = input_ids[0, 1:]
        token_log_probs = log_probs.gather(1, targets.unsqueeze(1)).squeeze(1)

        # Entry i of token_log_probs scores token i + 1, so the option tokens
        # start at context_len - 1 (clamped in case the question is empty).
        option_log_probs = token_log_probs[max(context_len - 1, 0):]
        if option_log_probs.numel() == 0:
            # Nothing to score (e.g. an empty option): never prefer it.
            choice_scores.append(float("-inf"))
        else:
            # Average per token so longer options are not penalised for length.
            choice_scores.append(option_log_probs.mean().item())

    # Get the option with the highest score
    predicted_choice_index = torch.argmax(torch.tensor(choice_scores)).item()
    return predicted_choice_index == correct_answer_index

# Iterate through the dataset and evaluate each question with tqdm progress bar
correct_count = 0
total_count = 0

for example in tqdm(dataset, desc="Evaluating questions"):
    # Each row supplies the question text, the candidate options and the gold option index
    question = example["question"]
    options = example["options"]
    correct_answer_index = example["answer_index"]  # Use 'answer_index' for the correct option

    is_correct = evaluate_question(question, options, correct_answer_index)
    if is_correct:
        correct_count += 1
    total_count += 1

    # Optional debugging output for the first few examples
    if total_count <= 5:  # Adjust the number as needed
        print(f"Question: {question}")
        print(f"Options: {options}")
        print(f"Correct Answer Index: {correct_answer_index}")
        print(f"Correct: {is_correct}")

# Calculate accuracy; an empty split yields 0 instead of raising ZeroDivisionError
accuracy = correct_count / total_count if total_count > 0 else 0
print(f"Correct Count: {correct_count}")
print(f"Total Count: {total_count}")
# The label names the dataset actually loaded in this cell (it previously said "MMLU (All Subjects)")
print(f"Accuracy on MMLU-Pro (validation split): {accuracy * 100:.2f}%")
