In [1]:
import os
import shutil
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Paths
DRIVE_MODEL_FOLDER = "/content/drive/MyDrive/Model1Folder"
LOCAL_WORK_FOLDER = "/content/hi"
model_folder = os.path.join(LOCAL_WORK_FOLDER, "hi")

# Create destination folder
os.makedirs(model_folder, exist_ok=True)

# Copy model files if available.
# isdir (rather than exists) also rejects the case where the path is a plain file.
if not os.path.isdir(DRIVE_MODEL_FOLDER):
    print(f"❌ Model folder not found at: {DRIVE_MODEL_FOLDER}")
else:
    print(f"✅ Model folder found at: {DRIVE_MODEL_FOLDER}")
    # copytree copies with shutil.copy2 by default and, unlike a flat
    # listdir + copy2 loop, also handles sub-directories inside the folder.
    # dirs_exist_ok=True lets it write into the folder created above.
    shutil.copytree(DRIVE_MODEL_FOLDER, model_folder, dirs_exist_ok=True)
    print(f"✅ All files copied to: {model_folder}")


Mounted at /content/drive
✅ Model folder found at: /content/drive/MyDrive/Model1Folder
✅ All files copied to: /content/hi/hi


In [None]:
# Install dependencies
!pip install llama-cpp-python groq pandas transformers torch --quiet

import os
import re
import pandas as pd
from groq import Groq
from llama_cpp import Llama
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from datetime import datetime

# === Configuration ===
# The Groq API key is read from the GROQ_API_KEY environment variable so the
# secret never has to be pasted into (and saved with) the notebook source.
# The empty-string fallback keeps client construction from failing when the
# variable is unset; API calls will then fail at request time instead.
client = Groq(api_key=os.environ.get("GROQ_API_KEY", ""))

# Model paths
# Same directory the first cell copies the Drive files into (its `model_folder`).
LOCAL_WORK_FOLDER = "/content/hi/hi"
FINETUNED_MODEL = "unsloth.Q4_K_M.gguf"
FINETUNED_PATH = f"{LOCAL_WORK_FOLDER}/{FINETUNED_MODEL}"
ORIGINAL_HF_MODEL = "unsloth/Llama-3.2-1B-Instruct"

# === Load Models ===
def load_finetuned_model():
    """Load the fine-tuned GGUF model through llama-cpp.

    Returns:
        The constructed ``Llama`` instance, or ``None`` if construction
        raised (the error is printed rather than propagated).
    """
    try:
        print("Loading fine-tuned model...")
        # Collect the constructor options first so they read as one config.
        llama_options = {
            "model_path": FINETUNED_PATH,
            "n_ctx": 2048,
            "n_threads": 8,
            "n_gpu_layers": 0,
            "verbose": False,
        }
        loaded = Llama(**llama_options)
        print("✅ Fine-tuned model loaded")
    except Exception as err:
        print(f"❌ Failed to load fine-tuned model: {str(err)}")
        loaded = None
    return loaded

def load_original_hf_model():
    """Load the original (non-fine-tuned) HuggingFace model and tokenizer.

    The weights are loaded as float16 with ``device_map="auto"`` when CUDA is
    available. On a CPU-only runtime they are loaded as float32 instead,
    because half precision on CPU is slow and not supported by every op.

    Returns:
        ``(model, tokenizer)`` on success, or ``(None, None)`` if loading
        raised (the error is printed rather than propagated).
    """
    try:
        print("Loading original HuggingFace model...")
        use_gpu = torch.cuda.is_available()
        tokenizer = AutoTokenizer.from_pretrained(ORIGINAL_HF_MODEL)
        model = AutoModelForCausalLM.from_pretrained(
            ORIGINAL_HF_MODEL,
            # float16 only makes sense on GPU; fall back to float32 on CPU.
            torch_dtype=torch.float16 if use_gpu else torch.float32,
            device_map="auto" if use_gpu else None
        )
        print("✅ Original HuggingFace model loaded")
        return model, tokenizer
    except Exception as e:
        print(f"❌ Failed to load original model: {str(e)}")
        return None, None

# Load models
# Both loaders print the error and return None / (None, None) on failure
# instead of raising, so the names below may be None after this point.
finetuned_llm = load_finetuned_model()
original_model, original_tokenizer = load_original_hf_model()

# === Generate SQL Questions ===
# Line the model is asked to emit between consecutive questions.
_QUESTION_DELIMITER = "====="
# Matches a line consisting only of five or more '=' characters.
_DELIMITER_LINE_RE = re.compile(r"^[ \t]*={5,}[ \t]*$", re.MULTILINE)


def _split_question_blocks(raw_output, num):
    """Split raw LLM output into at most ``num`` non-empty question blocks."""
    if _DELIMITER_LINE_RE.search(raw_output):
        parts = _DELIMITER_LINE_RE.split(raw_output)
    else:
        # Fallback for replies that ignored the delimiter instruction.
        parts = raw_output.split("\n\n")
    blocks = [part.strip() for part in parts if part.strip()]
    return blocks[:num]


def generate_sql_questions_groq(num=10):
    """Generate SQL questions using the Groq API.

    Each question is a multi-part block (table creation, INSERT statements,
    final question) that usually contains blank lines itself. The model is
    therefore asked to separate questions with an explicit delimiter line,
    and the reply is split on that delimiter. Splitting on blank lines is
    only used as a fallback, since it can cut one question into fragments.

    Args:
        num: Number of questions to request and the maximum number returned.

    Returns:
        A list of at most ``num`` question strings; an empty list when
        ``num`` is not positive or when the API call fails (the error is
        printed rather than propagated).
    """
    if num <= 0:
        return []

    prompt = f"""
Generate exactly {num} distinct SQL questions. Each should consist of:
- SQL table creation
- Some INSERT statements
- A final SQL-related question

Format each as a single block. Put a line containing only {_QUESTION_DELIMITER} between consecutive questions.
Only output the questions — no explanation.
"""
    try:
        chat_completion = client.chat.completions.create(
            model="gemma2-9b-it",
            messages=[{"role": "user", "content": prompt}],
        )
        # content may be None on an empty completion; treat it as no output.
        raw_output = (chat_completion.choices[0].message.content or "").strip()
        return _split_question_blocks(raw_output, num)
    except Exception as e:
        print(f"❌ Failed to generate questions with Groq: {str(e)}")
        return []

# === Get Answers from Models ===
def get_finetuned_answers(llm, questions):
    """Ask the fine-tuned model every question and collect its replies.

    Args:
        llm: Object exposing ``create_chat_completion``; ``None`` means the
            model failed to load.
        questions: Iterable of question strings.

    Returns:
        A list of ``{"question", "answer"}`` dicts, one per question. When a
        call raises, the answer is the text ``"Error: <message>"``. Returns
        an empty list when ``llm`` is ``None``.
    """
    if llm is None:
        return []

    def _ask(prompt_text):
        # One chat completion per question; failures become an error string.
        try:
            completion = llm.create_chat_completion(
                messages=[{"role": "user", "content": prompt_text}],
                temperature=0.7,
                max_tokens=256,
                stop=["</s>"]
            )
            reply = completion["choices"][0]["message"]["content"]
        except Exception as err:
            reply = f"Error: {str(err)}"
        return {"question": prompt_text, "answer": reply}

    return [_ask(item) for item in questions]

def get_original_answers(model, tokenizer, questions):
    """Get answers from the original HuggingFace model.

    Args:
        model: Causal LM exposing ``generate`` and ``device``; ``None`` means
            the model failed to load.
        tokenizer: Matching tokenizer (callable, with ``decode`` and
            ``eos_token_id``); ``None`` means loading failed.
        questions: Iterable of question strings.

    Returns:
        A list of ``{"question", "answer"}`` dicts, one per question. The
        answer holds only the newly generated text, not the echoed prompt.
        When generation raises, the answer is ``"Error: <message>"``.
        Returns an empty list when the model or tokenizer is ``None``.
    """
    answers = []
    if model is None or tokenizer is None:
        return answers

    for question in questions:
        try:
            # Llama 3 chat layout: every header is followed by a blank line.
            prompt = (
                "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
                f"{question}<|eot_id|>"
                "<|start_header_id|>assistant<|end_header_id|>\n\n"
            )

            # The prompt already starts with <|begin_of_text|>, so the
            # tokenizer must not prepend a second BOS token.
            inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
            if torch.cuda.is_available():
                inputs = inputs.to(model.device)
            prompt_length = inputs["input_ids"].shape[-1]

            # Generate
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=256,
                    temperature=0.7,
                    do_sample=True,
                    pad_token_id=tokenizer.eos_token_id
                )

            # generate() returns prompt + continuation. Decode only the
            # continuation: with skip_special_tokens=True the header markers
            # are removed, so the prompt cannot be split off by text search.
            generated_ids = outputs[0][prompt_length:]
            answer = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

            answers.append({"question": question, "answer": answer})
        except Exception as e:
            answers.append({"question": question, "answer": f"Error: {str(e)}"})

    return answers

# === Evaluate Answers ===
def extract_score_from_text(text):
    """Extract a 1-10 score from free-form evaluation text.

    Patterns are tried in priority order. Every match of a pattern is
    checked, and the first captured number within 1-10 is returned. The
    labelled pattern tolerates markdown or bracket decoration around the
    label and the number (``**Score:** 8``, ``Score: [8]``, ``Score: **8**``),
    so such replies are read as a labelled score rather than falling through
    to the "first number in the text" heuristic.

    Args:
        text: Evaluation text; ``None`` or an empty string is accepted.

    Returns:
        The extracted score as an int in 1-10, or 0 if none was found.
    """
    if not text:
        return 0

    # Matching is case-insensitive, so one spelling per label is enough.
    patterns = [
        r'(?:score|rating)[\s*_]*:[\s*_\[\(]*(\d+)',  # Score: 8 / **Score:** 8 / Rating: [8]
        r'(\d+)/10',                                  # 8/10
        r'(\d+)\s*out\s*of\s*10',                     # 8 out of 10
        r'(\d+)\s*points',                            # 8 points
        r'\bgive\s*(?:it\s*)?(?:a\s*)?(\d+)',         # give it a 8
        r'\brate\s*(?:it\s*)?(?:a\s*)?(\d+)',         # rate it a 8
    ]

    for pattern in patterns:
        # Look at every match, not just the first: an out-of-range first
        # match must not hide a valid later one.
        for match in re.finditer(pattern, text, re.IGNORECASE):
            score = int(match.group(1))
            if 1 <= score <= 10:
                return score

    # Last resort: the first standalone number between 1 and 10.
    for num in re.findall(r'\b(\d+)\b', text):
        score = int(num)
        if 1 <= score <= 10:
            return score

    return 0  # Default if no valid score found

def evaluate_answers_groq(qa_pairs, model_name):
    """Grade each question/answer pair with the Groq-hosted judge model.

    Args:
        qa_pairs: Sequence of dicts with ``"question"`` and ``"answer"`` keys.
        model_name: Label stored in the ``"model"`` field of every record.

    Returns:
        One record per pair with the keys ``question_id`` (1-based),
        ``question``, ``answer``, ``evaluation``, ``score`` and ``model``.
        When the API call or score extraction raises, ``evaluation`` holds
        ``"Error: <message>"`` and ``score`` is 0.
    """
    results = []

    for question_id, pair in enumerate(qa_pairs, start=1):
        eval_prompt = f"""
You are an expert SQL tutor. Evaluate this answer:

Question: {pair['question']}
Answer: {pair['answer']}

Please rate the correctness and completeness from 1-10 where:
- 1-3: Completely wrong or no answer
- 4-6: Partially correct but missing key elements
- 7-8: Mostly correct with minor issues
- 9-10: Excellent, complete and correct

Format your response as:
Score: [number from 1-10]
Explanation: [brief explanation of the rating]
"""
        graded = True
        try:
            completion = client.chat.completions.create(
                model="gemma2-9b-it",
                messages=[{"role": "user", "content": eval_prompt}],
            )
            verdict_text = completion.choices[0].message.content.strip()
            score = extract_score_from_text(verdict_text)
        except Exception as err:
            # Record the failure as a zero-score entry instead of aborting.
            graded = False
            verdict_text = f"Error: {str(err)}"
            score = 0

        results.append({
            "question_id": question_id,
            "question": pair["question"],
            "answer": pair["answer"],
            "evaluation": verdict_text,
            "score": score,
            "model": model_name
        })

        # Progress line is only emitted for successfully graded pairs.
        if graded:
            print(f"Question {question_id}: Score extracted: {score}")

    return results

# === Generate CSV Report ===
def generate_csv_report(finetuned_evals, original_evals):
    """Write the detailed and summary comparison CSVs and print a recap.

    Rows are paired by position: the i-th fine-tuned evaluation is compared
    with the i-th original evaluation. A missing original entry is reported
    as ``'N/A'`` with a score of 0. Empty inputs are tolerated: averages and
    maxima fall back to 0 instead of raising.

    Args:
        finetuned_evals: List of evaluation dicts with ``question``,
            ``answer``, ``score`` and ``evaluation`` keys.
        original_evals: List of evaluation dicts with the same keys.

    Returns:
        ``(main_filename, summary_filename)`` - the names of the two CSV
        files written to the current working directory.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Prepare comparison data
    comparison_data = []
    for position, ft_eval in enumerate(finetuned_evals):
        orig_eval = original_evals[position] if position < len(original_evals) else None

        # Resolve both scores once; every derived column reuses them.
        ft_score = ft_eval['score']
        orig_score = orig_eval['score'] if orig_eval else 0
        if ft_score > orig_score:
            verdict = 'Better'
        elif ft_score < orig_score:
            verdict = 'Worse'
        else:
            verdict = 'Same'

        comparison_data.append({
            'question_id': position + 1,
            'question': ft_eval['question'],
            'finetuned_answer': ft_eval['answer'],
            'finetuned_score': ft_score,
            'finetuned_evaluation': ft_eval['evaluation'],
            'original_answer': orig_eval['answer'] if orig_eval else 'N/A',
            'original_score': orig_score,
            'original_evaluation': orig_eval['evaluation'] if orig_eval else 'N/A',
            'score_difference': ft_score - orig_score,
            'improvement': verdict
        })

    # Create DataFrame
    df = pd.DataFrame(comparison_data)

    # Calculate summary statistics (guarded against empty score lists)
    ft_scores = [entry['score'] for entry in finetuned_evals]
    orig_scores = [entry['score'] for entry in original_evals]

    ft_avg = sum(ft_scores) / len(ft_scores) if ft_scores else 0
    orig_avg = sum(orig_scores) / len(orig_scores) if orig_scores else 0

    verdict_counts = {'Better': 0, 'Worse': 0, 'Same': 0}
    for row in comparison_data:
        verdict_counts[row['improvement']] += 1

    # Summary statistics
    summary_stats = {
        'metric': ['Questions_Total', 'Finetuned_Avg_Score', 'Original_Avg_Score',
                  'Average_Improvement', 'Improvement_Percentage', 'Questions_Better',
                  'Questions_Worse', 'Questions_Same', 'Max_Finetuned', 'Max_Original'],
        'value': [
            len(finetuned_evals),
            round(ft_avg, 2),
            round(orig_avg, 2),
            round(ft_avg - orig_avg, 2),
            round(((ft_avg - orig_avg) / orig_avg * 100) if orig_avg > 0 else 0, 2),
            verdict_counts['Better'],
            verdict_counts['Worse'],
            verdict_counts['Same'],
            max(ft_scores) if ft_scores else 0,
            max(orig_scores) if orig_scores else 0
        ]
    }

    df_summary = pd.DataFrame(summary_stats)

    # Save files
    main_filename = f'model_comparison_detailed_{timestamp}.csv'
    summary_filename = f'model_comparison_summary_{timestamp}.csv'

    df.to_csv(main_filename, index=False)
    df_summary.to_csv(summary_filename, index=False)

    # Print summary
    print(f"\n📊 EVALUATION COMPLETE")
    print(f"Questions evaluated: {len(finetuned_evals)}")
    print(f"Fine-tuned average: {ft_avg:.2f}/10")
    print(f"Original average: {orig_avg:.2f}/10")
    print(f"Improvement: {ft_avg - orig_avg:+.2f} points")
    print(f"Better on: {verdict_counts['Better']}/{len(comparison_data)} questions")

    print(f"\n📁 Files saved:")
    print(f"• {main_filename} - Detailed comparison")
    print(f"• {summary_filename} - Summary statistics")

    return main_filename, summary_filename

# === Main Pipeline ===
def main():
    """Run the full comparison: generate questions, answer, grade, report.

    Uses the module-level models loaded earlier. The CSV report is only
    produced when both models yielded at least one graded answer; otherwise
    a failure message is printed.
    """
    print("🚀 Starting Model Comparison Evaluation")
    print("Using Groq API for evaluation")

    # Generate questions
    print("Generating SQL questions...")
    questions = generate_sql_questions_groq(10)
    print(f"Generated {len(questions)} questions")

    # Get answers, keyed by the label later stored in each evaluation record
    print("Getting answers from models...")
    answers_by_label = {
        "Fine-tuned": get_finetuned_answers(finetuned_llm, questions),
        "Original": get_original_answers(original_model, original_tokenizer, questions),
    }

    # Evaluate answers (a model without answers is skipped, not sent to the judge)
    print("Evaluating answers...")
    evals_by_label = {
        label: evaluate_answers_groq(answers, label) if answers else []
        for label, answers in answers_by_label.items()
    }

    # Generate report
    if not all(evals_by_label.values()):
        print("❌ Could not complete evaluation - missing model responses")
        return
    generate_csv_report(evals_by_label["Fine-tuned"], evals_by_label["Original"])

# Run the pipeline when this cell/script is executed directly
# (not when the code is imported as a module).
if __name__ == "__main__":
    main()

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.8/49.8 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.8/130.8 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m94.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m66.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

llama_context: n_ctx_per_seq (2048) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility


✅ Fine-tuned model loaded
Loading original HuggingFace model...


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/894 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

✅ Original HuggingFace model loaded
🚀 Starting Model Comparison Evaluation
Using Groq API for evaluation
Generating SQL questions...
Generated 10 questions
Getting answers from models...
Evaluating answers...
Question 1: Score extracted: 8
Question 2: Score extracted: 9
Question 3: Score extracted: 10
Question 4: Score extracted: 9
Question 5: Score extracted: 10
Question 6: Score extracted: 3
Question 7: Score extracted: 8
Question 8: Score extracted: 9
Question 9: Score extracted: 10
Question 10: Score extracted: 10
Question 1: Score extracted: 6
Question 2: Score extracted: 7
Question 3: Score extracted: 4
Question 4: Score extracted: 4
Question 5: Score extracted: 4
Question 6: Score extracted: 1
Question 7: Score extracted: 4
Question 8: Score extracted: 10
Question 9: Score extracted: 7
Question 10: Score extracted: 8

📊 EVALUATION COMPLETE
Questions evaluated: 10
Fine-tuned average: 8.60/10
Original average: 5.50/10
Improvement: +3.10 points
Better on: 9/10 questions

📁 Files sa

In [3]:
import glob

# Relies on `os`, `shutil` and `drive` imported in the first cell.
drive.mount('/content/drive')
llama_drive_folder = '/content/drive/MyDrive/llama'
os.makedirs(llama_drive_folder, exist_ok=True)

# Move all CSV files from current directory to Drive
csv_files = sorted(glob.glob("*.csv"))
for csv_file in csv_files:
    shutil.move(csv_file, os.path.join(llama_drive_folder, os.path.basename(csv_file)))

# Only claim success when something was actually moved.
if csv_files:
    print("✅ All CSV files moved to Google Drive → llama folder.")
else:
    print("⚠️ No CSV files found in the current directory - nothing was moved.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ All CSV files moved to Google Drive → llama folder.
