<a href="https://colab.research.google.com/github/AlperYildirim1/gemma-pipeline/blob/main/makale_sft_gemma_3_4b_test_pipeline_MMLU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Installation

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# The output directory configured later in this notebook lives under
# /content/drive/MyDrive, so this cell has to run before anything is saved.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%%capture
# Install Unsloth and the libraries it needs. The %%capture cell magic above
# (it must stay on the first line of the cell) hides the installer output.
import os
# Colab is detected by looking for "COLAB_" in the names of the environment
# variables; outside Colab a plain `pip install unsloth` is used.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # The first and last commands pass --no-deps, so pip installs only the
    # listed packages themselves (presumably to leave Colab's preinstalled
    # versions of their dependencies untouched - TODO confirm).
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [None]:
import os
import pandas as pd
import json
from tqdm.auto import tqdm
import torch
from datasets import load_dataset
from unsloth import FastModel
from openai import OpenAI


# ------------------------------------------------------------------------------
# Run configuration: every value a reader might want to tune lives in this cell.
# ------------------------------------------------------------------------------

# Model that is evaluated (loaded through Unsloth's FastModel in a later cell).
MODEL_NAME = "unsloth/gemma-3-4b-it"

# Hugging Face dataset that provides the questions.
DATASET_NAME = "Yujivus/mmlu_grpo_test"

# NOTE(review): DATASET_CONFIG is not referenced by any cell visible in this
# notebook; it is kept so that code relying on the name keeps working.
DATASET_CONFIG = "pqa_labeled"
# Split of DATASET_NAME that is evaluated. The value is "train".
DATASET_SPLIT = "train"

# Judge credentials. An OPENAI_API_KEY that is already present in the
# environment is kept as-is (the previous unconditional assignment replaced it
# with an empty string, which silently disabled Phase 2). When no key is set,
# the variable defaults to "" and Phase 2 is skipped. Prefer providing the key
# through the environment over pasting it into the notebook.
os.environ.setdefault("OPENAI_API_KEY", "")
JUDGE_MODEL = "gpt-4.1-2025-04-14"

# File Paths for Outputs
OUTPUT_DIR = "/content/drive/MyDrive/gemma sft cevaplar"
FINAL_RESULTS_JSON = os.path.join(OUTPUT_DIR, "final_evaluation_results_mmlu_base.json")
INTERMEDIATE_CSV = os.path.join(OUTPUT_DIR, "intermediate_generated_answers_base_grpo.csv")

# Number of leading dataset rows to evaluate; None evaluates the whole split.
MAX_SAMPLES = 100

# Create the output directory up front so later cells can write into it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"Configuration loaded. Model: {MODEL_NAME}, Dataset: {DATASET_NAME}")
print(f"Final output will be saved to: {FINAL_RESULTS_JSON}")


In [None]:
# ==============================================================================
# SECTION 2: LOAD MODEL AND DATASET
# ==============================================================================
print("\nLoading model and tokenizer with Unsloth...")

# Load the model without 4-bit or 8-bit quantization. full_finetuning is off
# because this notebook only runs inference.
model, tokenizer = FastModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=4096,
    load_in_4bit=False,
    load_in_8bit=False,
    full_finetuning=False,
)
print("✅ Model loaded successfully in full precision.")

# Use the configured split instead of a second hard-coded "train", so changing
# DATASET_SPLIT in the configuration cell actually takes effect.
print(f"\nLoading the '{DATASET_SPLIT}' split from dataset '{DATASET_NAME}'...")
test_data = load_dataset(DATASET_NAME, split=DATASET_SPLIT)

if MAX_SAMPLES is not None:
    # Clamp to the dataset size: selecting indices past the end would fail
    # when MAX_SAMPLES is larger than the number of available rows.
    sample_count = min(MAX_SAMPLES, len(test_data))
    print(f"Selecting the first {sample_count} samples for this run.")
    test_data = test_data.select(range(sample_count))

print(f"✅ Dataset ready. Using {len(test_data)} samples for evaluation.")

In [None]:
# ==============================================================================
# SECTION 3: PHASE 1 - GENERATE ANSWERS
# ==============================================================================
# For every dataset row: build a single-turn chat prompt, generate a response
# with greedy decoding, and collect question, subject, ground truth and the
# model's answer. The collected rows are written to INTERMEDIATE_CSV at the end.
print("\n--- Starting Phase 1: Generating Answers ---")

generated_results = []

for idx, item in enumerate(tqdm(test_data, desc="Generating Answers")):
    question_text = item['question']
    ground_truth_answer = item['answer']
    subject = item['subject']

    # The 'question' field is used as the complete problem statement; no
    # separate context is added to the prompt.
    user_prompt = f"Please answer the following multiple-choice question:\n\n{question_text}"

    messages = [
        {"role": "user", "content": user_prompt}
    ]

    text_input = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer([text_input], return_tensors="pt").to("cuda")

    # Greedy decoding. `temperature` is not passed because it has no effect
    # when do_sample=False.
    outputs = model.generate(
        **inputs,
        max_new_tokens=4096,
        do_sample=False
    )

    # The returned sequence starts with the prompt tokens, so the answer is
    # everything after them. Slicing by token count replaces the previous
    # text-level split on the prompt, which kept chat-template role text in
    # the answer and cut the answer short whenever the model repeated the prompt.
    prompt_token_count = inputs["input_ids"].shape[1]
    answer_token_ids = outputs[0][prompt_token_count:]
    model_full_answer = tokenizer.decode(answer_token_ids, skip_special_tokens=True).strip()

    generated_results.append({
        "question_number": idx,
        "question": question_text,
        "subject": subject,
        "ground_truth_answer": ground_truth_answer,
        "model_full_answer": model_full_answer,
    })

# Save intermediate results to a CSV file
df_answers = pd.DataFrame(generated_results)
df_answers.to_csv(INTERMEDIATE_CSV, index=False)

print(f"\n✅ Phase 1 Complete. {len(generated_results)} answers saved to {INTERMEDIATE_CSV}")

In [None]:
# ==============================================================================
# SECTION 4: PHASE 2 - EVALUATE WITH LLM-AS-JUDGE
# ==============================================================================
# Reads the answers saved by Phase 1, asks the judge model for a JSON verdict on
# each one, writes the combined records to FINAL_RESULTS_JSON and prints the
# accuracy. The phase is skipped when no OpenAI API key is configured.
print("\n--- Starting Phase 2: Evaluating and Combining Results ---")

# A missing variable and an empty string are both treated as "no key".
if not os.environ.get("OPENAI_API_KEY"):
    print("🚨 WARNING: OpenAI API key is not set. Skipping Phase 2.")
else:
    try:
        # keep_default_na=False keeps empty answers (and literal strings such as
        # "NA" or "None") as text. With the default parsing they become NaN,
        # which would reach the judge prompt as "nan" and be written to the
        # results file as a non-standard JSON NaN.
        df_to_judge = pd.read_csv(INTERMEDIATE_CSV, keep_default_na=False)
        client = OpenAI()

        final_results_list = []

        # Judge instructions for multiple-choice evaluation: only the chosen
        # option matters, and the reply must be a JSON object.
        judge_system_prompt = """You are an expert, impartial judge for evaluating an AI model's answer to a multiple-choice question.
You will be given the original question (which includes the options), the ground-truth answer, and the model's full response.

Your task is to determine if the model's answer is correct.
- Focus on the selected option (e.g., 'A', 'B', 'C', or 'D').
- The model might provide extra reasoning. The reasoning is not important for the correctness score, only the final chosen option.
- Be lenient with formatting. For example, if the model says "The correct answer is D" and the ground truth is "#### D. Vitamin B12", this is correct. If the model says "A" and the ground truth is "D", this is incorrect.

You MUST respond ONLY with a valid JSON object with the following structure:
{"correctness": boolean, "justification": "A brief explanation of your decision, noting the model's choice vs. the ground truth choice."}
"""

        for _, row in tqdm(df_to_judge.iterrows(), total=len(df_to_judge), desc="Judging Answers"):
            # Start from the Phase 1 record and attach the verdict to it.
            final_record = row.to_dict()

            judge_user_prompt = f"""
Please evaluate the following model output for a multiple-choice question.

**Original Question:**
{row['question']}

**Ground Truth Answer:**
{row['ground_truth_answer']}

**Model's Full Response:**
{row['model_full_answer']}
"""
            try:
                response = client.chat.completions.create(
                    model=JUDGE_MODEL,
                    messages=[
                        {"role": "system", "content": judge_system_prompt},
                        {"role": "user", "content": judge_user_prompt}
                    ],
                    temperature=0.0,
                    response_format={"type": "json_object"}
                )
                judge_assessment = json.loads(response.choices[0].message.content)
                final_record['judge_evaluation'] = judge_assessment
            except Exception as e:
                # Best effort: a failed judgement is recorded on the row and the
                # loop continues with the next question.
                print(f"Error judging question {row['question_number']}: {e}")
                final_record['judge_evaluation'] = {"correctness": "error", "justification": str(e)}

            final_results_list.append(final_record)

        # Save the final combined results to a single JSON file
        with open(FINAL_RESULTS_JSON, 'w', encoding="utf-8") as f:
            json.dump(final_results_list, f, indent=4)
        print(f"\n✅ Phase 2 Complete. Final results saved to {FINAL_RESULTS_JSON}")

        # Final Summary: only a literal boolean True counts as correct.
        correct_count = sum(1 for item in final_results_list if item.get('judge_evaluation', {}).get('correctness') is True)
        error_count = sum(1 for item in final_results_list if item.get('judge_evaluation', {}).get('correctness') == "error")
        total_judged = len(final_results_list)

        if total_judged > 0:
            accuracy = (correct_count / total_judged) * 100
            print("\n--- Final MMLU Results ---")
            print(f"Answer Correctness: {accuracy:.2f}% ({correct_count}/{total_judged})")
            if error_count > 0:
                # Failed judgements stay in the denominator, so make them visible.
                print(f"Note: {error_count} answer(s) could not be judged and are counted as incorrect.")

    except FileNotFoundError:
        print(f"🚨 ERROR: The intermediate answers file was not found at {INTERMEDIATE_CSV}. Cannot run Phase 2.")
    except Exception as e:
        print(f"🚨 An unexpected error occurred during Phase 2: {e}")