<a href="https://colab.research.google.com/github/AlperYildirim1/gemma-pipeline/blob/main/makale_sft_gemma_3_4b_test_pipeline_PubmedQa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

To run this, press "*Runtime*" and press "*Run all*" on a **free** Tesla T4 Google Colab instance!
<div class="align-center">
<a href="https://unsloth.ai/"><img src="https://github.com/unslothai/unsloth/raw/main/images/unsloth%20new%20logo.png" width="115"></a>
<a href="https://discord.gg/unsloth"><img src="https://github.com/unslothai/unsloth/raw/main/images/Discord button.png" width="145"></a>
<a href="https://docs.unsloth.ai/"><img src="https://github.com/unslothai/unsloth/blob/main/images/documentation%20green%20button.png?raw=true" width="125"></a> Join Discord if you need help + ⭐ <i>Star us on <a href="https://github.com/unslothai/unsloth">GitHub</a> </i> ⭐
</div>

To install Unsloth on your own computer, follow the installation instructions on our Github page [here](https://docs.unsloth.ai/get-started/installing-+-updating).

You will learn how to do [data prep](#Data), how to [train](#Train), how to [run the model](#Inference), & [how to save it](#Save)


### News

Unsloth now supports Text-to-Speech (TTS) models. Read our [guide here](https://docs.unsloth.ai/basics/text-to-speech-tts-fine-tuning).

Read our **[Gemma 3N Guide](https://docs.unsloth.ai/basics/gemma-3n-how-to-run-and-fine-tune)** and check out our new **[Dynamic 2.0](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs)** quants, which outperform other quantization methods!

Visit our docs for all our [model uploads](https://docs.unsloth.ai/get-started/all-our-models) and [notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks).


### Installation

In [None]:
# Attach Google Drive to the Colab filesystem at a named mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
%%capture
# Install Unsloth and the packages it needs. The %%capture cell magic above
# suppresses the (long) pip output for this cell.
import os
# The environment is treated as Colab when any environment variable name
# contains the substring "COLAB_".
if "COLAB_" not in "".join(os.environ.keys()):
    # Not on Colab: a plain install lets pip resolve dependencies itself.
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # --no-deps installs only the listed packages without pulling in their
    # dependencies (presumably to leave Colab's preinstalled stack untouched —
    # TODO confirm). xformers is pinned; the other packages are unpinned.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    # Installed with normal dependency resolution; datasets is constrained to
    # the >=3.4.1,<4.0.0 range.
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" huggingface_hub hf_transfer
    # Unsloth itself goes last, again without dependency resolution.
    !pip install --no-deps unsloth

In [None]:
import os
import pandas as pd
import json
from tqdm.auto import tqdm
import torch
from datasets import load_dataset
from unsloth import FastModel
from openai import OpenAI


# ==============================================================================
# SECTION 1: CONFIGURATION
# Every value a reader might want to tune for a run is defined in this cell.
# ==============================================================================

# Hugging Face Hub id of the fine-tuned checkpoint to evaluate.
MODEL_NAME = "Yujivus/gemma-3-1b-sft1"

# Evaluation data: dataset repository, configuration name, and split to read.
DATASET_NAME = "qiaojin/PubMedQA"
DATASET_CONFIG = "pqa_labeled"
# NOTE(review): the split read here is "train"; an earlier comment described it
# as the test split. Confirm which split is intended for evaluation.
DATASET_SPLIT = "train"

# OpenAI credentials for the judge model. A key that is already present in the
# environment is left untouched, so the secret does not need to be written into
# the notebook; the fallback below is only applied when no key is set.
OPENAI_API_KEY_FALLBACK = ""  # <--- PASTE YOUR KEY HERE (only if not already in the environment)
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY_FALLBACK
JUDGE_MODEL = "gpt-4.1-2025-04-14"  # "gpt-4-turbo" or "gpt-4o" are recommended

# File paths for outputs. Both files live directly under OUTPUT_DIR, so
# changing OUTPUT_DIR relocates them together.
OUTPUT_DIR = "/content/drive/MyDrive/gemma sft cevaplar"
# NOTE(review): the filename mentions "4b" while MODEL_NAME is a 1b checkpoint —
# confirm the name is intentional before comparing result files across runs.
FINAL_RESULTS_JSON = os.path.join(OUTPUT_DIR, "final_evaluation_results_4b_sft_model.json")
# Joined with a relative filename: passing an absolute path as the second
# argument would make os.path.join discard OUTPUT_DIR entirely.
INTERMEDIATE_CSV = os.path.join(OUTPUT_DIR, "intermediate_generated_answers.csv")

# Number of leading dataset rows to evaluate; None evaluates the whole split.
MAX_SAMPLES = 100

# Create the output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"Configuration loaded. Model: {MODEL_NAME}, Dataset: {DATASET_NAME}")
print(f"Final output will be saved to: {FINAL_RESULTS_JSON}")


In [None]:


# ==============================================================================
# SECTION 2: LOAD MODEL AND DATASET (REVISED)
# ==============================================================================
print("\nLoading model and tokenizer with Unsloth...")
# Both quantisation flags are off and full fine-tuning is disabled; the
# sequence length is capped at 4096 tokens.
model, tokenizer = FastModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=4096,
    load_in_4bit=False,
    load_in_8bit=False,
    full_finetuning=False,
)
print("✅ Model loaded successfully.")

print(f"\nLoading PubMedQA dataset with config: '{DATASET_CONFIG}'...")
# The configuration name must be passed explicitly to load_dataset.
dataset = load_dataset(DATASET_NAME, DATASET_CONFIG)

# Fail with an actionable message when the configured split does not exist,
# instead of a bare KeyError on the lookup.
if DATASET_SPLIT not in dataset:
    raise KeyError(
        f"Split '{DATASET_SPLIT}' not found in '{DATASET_NAME}' "
        f"(config '{DATASET_CONFIG}'). Available splits: {list(dataset.keys())}"
    )
test_data = dataset[DATASET_SPLIT]

if MAX_SAMPLES is not None:
    # Clamp to the split size: select() raises on out-of-range indices, so a
    # MAX_SAMPLES larger than the split would otherwise abort the run.
    sample_count = min(MAX_SAMPLES, len(test_data))
    print(f"Selecting the first {sample_count} samples for this run.")
    test_data = test_data.select(range(sample_count))

print(f"✅ Dataset loaded. Using {len(test_data)} samples from the '{DATASET_SPLIT}' split.")

In [None]:
# ==============================================================================
# SECTION 3: PHASE 1 - GENERATE ANSWERS (REVISED AND SIMPLIFIED)
# ==============================================================================
# For every dataset item: build a "Context + Question" prompt, let the model
# generate a free-form answer, and collect the raw answer together with the
# ground truth. All records are written to INTERMEDIATE_CSV at the end.
print("\n--- Starting Phase 1: Generating Answers (without parsing) ---")

generated_results = []

# Loop through the dataset with a progress bar
for idx, item in enumerate(tqdm(test_data, desc="Generating Answers")):
    question = item['question']
    # The context is a list of passages; join them into one text block.
    context_text = "\n".join(item['context']['contexts'])
    ground_truth_answer = item['final_decision']
    long_answer_context = item['long_answer']

    user_prompt = f"Context:\n{context_text}\n\nQuestion:\n{question}"

    messages = [
        {"role": "user", "content": [{"type": "text", "text": user_prompt}]}
    ]

    text_input = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer([text_input], return_tensors="pt").to("cuda")
    # Remember how many tokens belong to the prompt so they can be cut off
    # from the generated sequence below.
    prompt_token_count = inputs["input_ids"].shape[1]

    # NOTE: sampling is enabled, so answers vary from run to run. The original
    # author's note says greedy decoding (temperature 0.0) was intended; the
    # settings are kept unchanged here. Switching to greedy decoding would
    # mean do_sample=False rather than temperature=0.0 with sampling on.
    outputs = model.generate(
        **inputs,
        max_new_tokens=4096,
        temperature=0.7,
        top_p=0.95,
        do_sample=True
    )

    # Decode only the newly generated tokens. Splitting the decoded text on
    # the prompt string is unreliable: decoding with skip_special_tokens=True
    # can alter the prompt text, in which case the split finds no match and
    # the whole prompt leaks into the saved answer.
    completion_ids = outputs[0][prompt_token_count:]
    model_full_answer = tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

    generated_results.append({
        "question_number": idx,
        "question": question,
        "context": context_text,
        "ground_truth_answer": ground_truth_answer,
        "ground_truth_long_answer": long_answer_context,
        "model_full_answer": model_full_answer,
    })

# Save intermediate results to a CSV file
df_answers = pd.DataFrame(generated_results)
df_answers.to_csv(INTERMEDIATE_CSV, index=False)

print(f"\n✅ Phase 1 Complete. {len(generated_results)} intermediate answers saved to {INTERMEDIATE_CSV}")

In [None]:

# ==============================================================================
# SECTION 4: PHASE 2 - EVALUATE WITH LLM-AS-JUDGE AND COMBINE
# ==============================================================================
# Reads the answers saved by Phase 1, asks the judge model to grade each one,
# writes the combined records to FINAL_RESULTS_JSON, and prints a summary.
print("\n--- Starting Phase 2: Evaluating and Combining Results ---")

# An unset, empty, or placeholder key all mean "no key". Checking only for the
# literal placeholder would let an empty key through and produce one failed
# API call per row.
openai_api_key = os.environ.get("OPENAI_API_KEY", "").strip()
if not openai_api_key or openai_api_key == "sk-...":
    print("🚨 WARNING: OpenAI API key is not set. Skipping Phase 2.")
else:
    try:
        # Load the generated answers. keep_default_na=False keeps empty
        # answers as "" and literal strings such as "N/A" as text; with the
        # default NA parsing they become NaN, which would reach the judge
        # prompt as "nan" and the output file as non-standard JSON.
        df_to_judge = pd.read_csv(INTERMEDIATE_CSV, keep_default_na=False)
        client = OpenAI()

        # Holds one combined record (inputs + judge verdict) per answer.
        final_results_list = []

        judge_system_prompt = """You are an expert, impartial judge for evaluating AI models in the medical domain.
Your task is to assess a model's full response to a question based on a given context.

First, you must carefully read the model's full response and determine its final, conclusive answer (e.g., 'yes', 'no', 'maybe').

Then, evaluate based on two criteria:
1.  `correctness`: Is the model's final answer correct when compared to the ground truth? (e.g., 'yes' vs 'yes' is correct). Be lenient with synonyms if applicable, but for yes/no questions, it should be precise.
2.  `reasoning_correctness`: Did the model use the provided context and sound logic to arrive at its conclusion, or was the correct answer just a lucky guess? If the reasoning is irrelevant, nonsensical, or contradicts the context, this should be marked as incorrect.

You MUST respond ONLY with a valid JSON object with the following structure:
{"extracted_answer": "The final answer you determined from the model's text", "correctness": boolean, "reasoning_correctness": boolean, "justification": "A brief explanation of your decision."}
"""

        for _, row in tqdm(df_to_judge.iterrows(), total=len(df_to_judge), desc="Judging Answers"):
            # Start from the Phase 1 record; the verdict is attached below.
            final_record = row.to_dict()

            # The context is included because the system prompt asks the judge
            # to check the reasoning against it; without it that criterion
            # cannot be assessed.
            judge_user_prompt = f"""
Please evaluate the following model output.

**Original Question:**
{row['question']}

**Context:**
{row['context']}

**Ground Truth Answer:**
{row['ground_truth_answer']}

**Model's Full Response (including reasoning):**
{row['model_full_answer']}
"""
            try:
                response = client.chat.completions.create(
                    model=JUDGE_MODEL,
                    messages=[
                        {"role": "system", "content": judge_system_prompt},
                        {"role": "user", "content": judge_user_prompt}
                    ],
                    temperature=0.0, # Judge should be deterministic
                    response_format={"type": "json_object"} # Enforce JSON output
                )

                judge_assessment = json.loads(response.choices[0].message.content)
                final_record['judge_evaluation'] = judge_assessment

            except Exception as e:
                # Best effort: one failed call (API error or unparsable reply)
                # must not abort the run; the failure is recorded on the item.
                print(f"Error judging question {row['question_number']}: {e}")
                final_record['judge_evaluation'] = {
                    "correctness": "error",
                    "reasoning_correctness": "error",
                    "justification": str(e)
                }

            final_results_list.append(final_record)

        # Save the final combined list to a single JSON file
        with open(FINAL_RESULTS_JSON, 'w') as f:
            json.dump(final_results_list, f, indent=4)

        print(f"\n✅ Phase 2 Complete. Final combined results for {len(final_results_list)} items saved to {FINAL_RESULTS_JSON}")

        # Final Summary. Only a literal boolean True counts as correct, so
        # items whose judging failed ("error") fall into the incorrect bucket.
        correct_count = sum(1 for item in final_results_list if item.get('judge_evaluation', {}).get('correctness') is True)
        reasoning_correct_count = sum(1 for item in final_results_list if item.get('judge_evaluation', {}).get('reasoning_correctness') is True)
        judge_error_count = sum(1 for item in final_results_list if item.get('judge_evaluation', {}).get('correctness') == "error")
        total_judged = len(final_results_list)

        if total_judged > 0:
            accuracy = (correct_count / total_judged) * 100
            reasoning_accuracy = (reasoning_correct_count / total_judged) * 100
            print("\n--- Final Results ---")
            print(f"Answer Correctness: {accuracy:.2f}% ({correct_count}/{total_judged})")
            print(f"Reasoning Correctness: {reasoning_accuracy:.2f}% ({reasoning_correct_count}/{total_judged})")
            if judge_error_count:
                # Make silent score deflation from failed judge calls visible.
                print(f"⚠️ {judge_error_count} item(s) could not be judged and are counted as incorrect above.")

    except FileNotFoundError:
        print(f"🚨 ERROR: The intermediate answers file was not found at {INTERMEDIATE_CSV}. Cannot run Phase 2.")
    except Exception as e:
        print(f"🚨 An unexpected error occurred during Phase 2: {e}")
