In [1]:
!pip install -q torch transformers accelerate bitsandbytes sentence-transformers rank_bm25

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [15]:
import torch
import json
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from sentence_transformers import SentenceTransformer, util
from rank_bm25 import BM25Okapi

# 1. Setup the GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"System ready! Using device: {device}")

# 2. Load the small 'Similarity' Model (for checking text overlap)
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# 3. Load the big 'Grading' Model (Qwen 2.5 7B)
# We load it in '4-bit' mode so it fits in the free Colab GPU
model_id = "Qwen/Qwen2.5-7B-Instruct"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

print("Loading Qwen model... this may take 1-2 minutes...")
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)
print("Qwen model loaded successfully!")

System ready! Using device: cuda
Loading Qwen model... this may take 1-2 minutes...


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

Qwen model loaded successfully!


In [16]:
import json
import re

# --- PART A: Hybrid Similarity Logic ---
def calculate_hybrid_similarity(student_ans, model_ans):
    # 1. Semantic Score
    embeddings = embedder.encode([student_ans, model_ans], convert_to_tensor=True)
    semantic_score = util.cos_sim(embeddings[0], embeddings[1]).item()

    # 2. Lexical Score (Jaccard)
    set_student = set(student_ans.lower().split())
    set_model = set(model_ans.lower().split())
    intersection = len(set_student.intersection(set_model))
    union = len(set_student.union(set_model))
    lexical_score = intersection / union if union > 0 else 0

    # 3. Combine
    hybrid_score = (0.9 * semantic_score) + (0.1 * lexical_score)
    return round(hybrid_score, 3)

# --- PART B: The Corrected Grading Function ---
def grade_student(question, student_ans, model_ans):
    sim_score = calculate_hybrid_similarity(student_ans, model_ans)

    prompt = f"""
    You are a strict academic grading assistant. Grade the Student Answer based on the Model Answer and the Similarity Score.

    ### INPUT DATA:
    - **Question**: {question}
    - **Model Answer**: {model_ans}
    - **Student Answer**: {student_ans}
    - **Hybrid Similarity Score**: {sim_score} (Range: 0.0 to 1.0)

    ### GRADING RUBRIC:
    - **1.0**: Perfect. Conceptually identical.
    - **0.75**: Good. Correct concept but minor errors or weak terms.
    - **0.5**: Partial. Main idea present but significant gaps.
    - **0.25**: Poor. Mostly wrong, but one relevant keyword.
    - **0.0**: Wrong.

    ### REQUIREMENTS:
    1. **Feedback**: 2 short sentences max.
    2. **Format**: Output ONLY valid JSON.

    ### OUTPUT JSON:
    {{
        "grade": <number>,
        "feedback": "<text>"
    }}
    """

    messages = [
        {"role": "system", "content": "You are a grading machine. Output only JSON."},
        {"role": "user", "content": prompt}
    ]

    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([text], return_tensors="pt").to(device)

    generated_ids = model.generate(
        model_inputs.input_ids,
        max_new_tokens=150,
        temperature=0.05,
        do_sample=True
    )

    # --- FIX START: Extract only the new tokens ---
    # We slice the output to remove the input prompt
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    # --- FIX END ---

    # Robust JSON Cleaning
    try:
        # 1. Try to find the JSON object using regex (most reliable)
        # Looks for content starting with { and ending with }
        json_match = re.search(r"\{.*\}", response, re.DOTALL)

        if json_match:
            clean_json = json_match.group(0)
            return json.loads(clean_json)
        else:
            return {"error": "No JSON found in response", "raw_output": response}

    except Exception as e:
        return {"error": f"JSON Parsing Failed: {str(e)}", "raw_output": response}

In [17]:
# Example 1: High Similarity
q = "What is the function of the CPU?"
m = "The CPU executes instructions and processes data."
s = "The CPU is responsible for executing instructions and processing data."

print("Grading Example 1...")
result = grade_student(q, s, m)
print(json.dumps(result, indent=2))

print("-" * 30)

# Example 2: Low Similarity but Correct (Qwen decides)
q = "Explain Python lists."
m = "A list is a mutable, ordered sequence of elements."
s = "It's like an array that can change size and hold different stuff."

print("Grading Example 2...")
result = grade_student(q, s, m)
print(json.dumps(result, indent=2))

Grading Example 1...
{
  "grade": 0.889,
  "feedback": "Conceptually correct and very similar to the model answer."
}
------------------------------
Grading Example 2...
{
  "grade": 0.5,
  "feedback": "The student correctly identified that the list can change size and hold different types of data, but missed the key term 'mutable' and the concept of being an ordered sequence."
}
