In [8]:
# Install necessary libraries
!pip install -q torch transformers accelerate bitsandbytes sentence-transformers rank_bm25
!pip install -q chromadb langchain langchain-community langchain-huggingface

In [2]:
import os
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive
Drive mounted successfully.


In [9]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# 1. Setup Device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"System ready! Using device: {device}")

# 2. Setup Your Embeddings (The code you provided)
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# 3. Load Your ChromaDB
# --- UPDATE THIS PATH ---
DB_PATH = "/content/drive/MyDrive/graduation_project/vector_store/chroma_db"

if os.path.exists(DB_PATH):
    vector_db = Chroma(persist_directory=DB_PATH, embedding_function=embeddings)
    print(f"Vector Store loaded. Collection contains {vector_db._collection.count()} chunks.")
else:
    print(f"WARNING: Path {DB_PATH} not found. Please check the path.")

# 4. Load Qwen 7B (4-bit Quantized)
model_id = "Qwen/Qwen2.5-7B-Instruct"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

print("Loading Qwen 7B...")
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)
print("Qwen 7B loaded successfully!")

System ready! Using device: cuda
Vector Store loaded. Collection contains 0 chunks.
Loading Qwen 7B...


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

Qwen 7B loaded successfully!


In [12]:
import json
import re
from sentence_transformers import util # We still need this for the math part (cos_sim)

# --- CONFIGURATION ---
THRESHOLD_HIGH = 0.85
VALID_GRADES = [0.0, 0.25, 0.5, 0.75, 1.0]

# --- Helper 1: Hybrid Similarity (Using your LangChain Embeddings) ---
def calculate_hybrid_similarity(text1, text2):
    # 1. Semantic Score (Meaning)
    # We use your existing 'embeddings' object
    emb1 = embeddings.embed_query(text1)
    emb2 = embeddings.embed_query(text2)

    # Convert to tensor for calculation
    tensor1 = torch.tensor(emb1)
    tensor2 = torch.tensor(emb2)
    semantic_score = util.cos_sim(tensor1, tensor2).item()

    # 2. Lexical Score (Word Overlap - Jaccard)
    set1 = set(text1.lower().split())
    set2 = set(text2.lower().split())
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    lexical_score = intersection / union if union > 0 else 0

    # 3. Weighted Average
    return round((0.7 * semantic_score) + (0.3 * lexical_score), 3)

# --- Helper 2: RAG Retrieval ---
def get_diverse_context(question):
    try:
        # 'mmr' ensures diversity in retrieved chunks
        results = vector_db.search(query=question, search_type="mmr", k=3)
        if not results:
            return "No context found."
        return "\n\n".join([doc.page_content for doc in results])
    except Exception as e:
        return f"Error retrieving context: {str(e)}"

# --- Main Logic ---
def grade_submission(question, student_ans, model_ans):

    # === TIER 1: FAST LANE ===
    direct_score = calculate_hybrid_similarity(student_ans, model_ans)

    if direct_score >= THRESHOLD_HIGH:
        return {
            "grade": 1.0,
            "feedback": "Perfect match with the model answer.",
            "method": "Fast_Tier_Direct_Match"
        }

    # === TIER 2: RAG LANE ===
    context_chunks = get_diverse_context(question)

    prompt = f"""
    You are a strict academic grading AI.
    The student's answer did NOT match the Model Answer directly.
    Decide the grade based on the **Reference Context**.

    ### DATA:
    - **Question**: {question}
    - **Student Answer**: {student_ans}
    - **Model Answer**: {model_ans}
    - **Reference Context**: {context_chunks}

    ### RUBRIC (Pick Closest):
    - 1.0: Correct concept, supported by Context.
    - 0.75: Mostly correct, misses minor details.
    - 0.5: Partially correct, significant gaps.
    - 0.25: Mostly wrong, mentions 1 correct keyword.
    - 0.0: Wrong.

    ### OUTPUT JSON ONLY:
    {{
        "grade": <number>,
        "feedback": "Two sentences max."
    }}
    """

    messages = [
        {"role": "system", "content": "Output only valid JSON."},
        {"role": "user", "content": prompt}
    ]

    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([text], return_tensors="pt").to(device)

    generated_ids = model.generate(
        model_inputs.input_ids,
        max_new_tokens=150,
        temperature=0.05,
        do_sample=True
    )

    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    try:
        json_match = re.search(r"\{.*\}", response, re.DOTALL)
        if json_match:
            result = json.loads(json_match.group(0))
            raw_grade = float(result.get("grade", 0))
            final_grade = min(VALID_GRADES, key=lambda x: abs(x - raw_grade))

            return {
                "grade": final_grade,
                "feedback": result.get("feedback", "No feedback."),
                "method": "RAG_LLM_Check"
            }
        else:
            return {"grade": 0.0, "feedback": "AI Format Error", "raw": response}
    except Exception as e:
        return {"grade": 0.0, "feedback": f"System Error: {str(e)}"}

In [13]:
# Test your system with hardcoded examples
test_cases = [
    {
        "type": "Exact Match",
        "Question": "What is CPU?",
        "Model_Answer": "Central Processing Unit",
        "Student_Answer": "Central Processing Unit"
    },
    {
        "type": "Correct but RAG needed",
        "Question": "Explain the stack data structure.",
        "Model_Answer": "Stack follows LIFO (Last In First Out) principle.",
        "Student_Answer": "It is a linear structure where insertion and deletion happen at one end, like a pile of plates."
    },
    {
        "type": "Wrong Answer",
        "Question": "What is Python?",
        "Model_Answer": "Python is a high-level programming language.",
        "Student_Answer": "Python is a type of snake in the jungle."
    }
]

print(f"{'TEST TYPE':<25} | {'GRADE':<6} | {'METHOD':<20} | FEEDBACK")
print("-" * 100)

for case in test_cases:
    output = grade_submission(case["Question"], case["Student_Answer"], case["Model_Answer"])
    print(f"{case['type']:<25} | {output['grade']:<6} | {output['method']:<20} | {output['feedback']}")

TEST TYPE                 | GRADE  | METHOD               | FEEDBACK
----------------------------------------------------------------------------------------------------
Exact Match               | 1.0    | Fast_Tier_Direct_Match | Perfect match with the model answer.
Correct but RAG needed    | 0.75   | RAG_LLM_Check        | The student's answer is mostly correct, as it describes the nature of a stack as a linear structure with one-end operations. However, it lacks the specific LIFO principle mentioned in the model answer.
Wrong Answer              | 0.0    | RAG_LLM_Check        | The student's answer is incorrect and does not match the question.


In [18]:
import pandas as pd
from google.colab import files
from tqdm import tqdm

# 1. Upload File
print("Upload your 'students.csv' file (Must have columns: question, student_answer, model_answer)")
filename = "/content/drive/MyDrive/graduation_project/data_raw/data_khawaga_4_10.csv"

# 2. Process
df = pd.read_csv(filename)
results = []

print("Starting Batch Grading...")
for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
    try:
        q = row['question']
        s = row['student_answer']
        m = row['model_answer']

        # Run Grading
        res = grade_submission(q, s, m)

        # Save info
        results.append({
            "AI_Grade": res['grade'],
            "AI_Feedback": res['feedback'],
            "Method_Used": res['method']
        })
    except Exception as e:
        results.append({"AI_Grade": 0, "AI_Feedback": "Error", "Method_Used": "Fail"})



Upload your 'students.csv' file (Must have columns: Question, Student_Answer, Model_Answer)
Starting Batch Grading...


 12%|█▏        | 1039/8352 [58:18<6:50:24,  3.37s/it]


KeyboardInterrupt: 

In [20]:
# 3. Save Results
results_df = pd.DataFrame(results)
final_df = pd.concat([df, results_df], axis=1)

output_file = "graded_results.csv"
final_df.to_csv(output_file, index=False)
print(f"Done! Saving {output_file}...")
files.download(output_file)

Done! Saving graded_results.csv...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [23]:
print(len(results_df),'\n')
print(results_df.head())

1039 

   AI_Grade                                        AI_Feedback    Method_Used
0      0.25  Mentions the concept of feasibility but does n...  RAG_LLM_Check
1      0.75  Close to the correct concept but misses the te...  RAG_LLM_Check
2      0.75  The student's answer is mostly correct, as it ...  RAG_LLM_Check
3      0.75  The student's answer is mostly correct, as it ...  RAG_LLM_Check
4      0.75  The student's answer is mostly correct as it d...  RAG_LLM_Check
