In [None]:
'''You want to find students who made mistakes related to “Photosynthesis”.

You’ve already saved each student’s mistakes (gap analysis) into a special searchable database called ChromaDB.

Now, you're asking:

“Hey ChromaDB, show me the top 3 students whose mistakes are similar to the topic ‘Photosynthesis’.”

🔍 What happens step by step?
✅ Step 1: Ask the database
query_result = collection.query(query_texts=["Photosynthesis"], n_results=3)
This searches for the top 3 students whose learning gaps are most similar to “Photosynthesis.”

✅ Step 2: Get the data back
The database gives you:

Student IDs

A distance showing how similar their mistakes are to “Photosynthesis” (a smaller distance means more similar)

Their score

What mistakes they made

✅ Step 3: Show that nicely
You create a table with this info:

similar_df = pd.DataFrame({
    "Similar Rank": ...,       # 1st, 2nd, 3rd closest
    "Student ID": ...,         # Which student
    "Score": ...,              # Their quiz score
    "Distance": ...,           # How close the mistake is to “Photosynthesis”
    "Gap Summary Snippet": ... # A short preview of their mistake
})
Then you show this table on the screen:

display(similar_df)
🧾 Example Output:
Rank	Student	Score	How Close	What Their Mistake Said
1	STUDENT_005	4	0.76	Gave a simple answer like “plants prepare food”…
2	STUDENT_003	3	0.97	Said “sunlight makes energy,” missing real meaning…
3	STUDENT_001	3	1.01	Wrote “plants use sunlight,” missing other parts…

🎯 In the simplest words:
You are finding the students whose recorded mistakes are closest in meaning to a topic you search for.

In this case, you searched for Photosynthesis, and it found 3 students who didn’t fully understand that topic.

If you want, I can:

Turn this into a function (so you can search any topic)

Help you display full gap explanations, not just previews

Add a button to do this interactively'''

In [None]:
# =========================================================
# Cell 1: Imports
# =========================================================
import os
import re

import chromadb
import matplotlib.pyplot as plt
import pandas as pd
from chromadb.utils import embedding_functions
from openai import OpenAI

# =========================================================
# Cell 2: Configuration
# =========================================================
# Read the OpenRouter key from the environment so the secret never lives in the
# notebook source (notebooks are routinely shared and committed).
# Any key that was ever hard-coded here must be treated as leaked and revoked.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
if not OPENROUTER_API_KEY:
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set. Export it in the environment that "
        "launches the notebook kernel before running this cell."
    )

# NOTE(review): OpenRouter model ids are usually provider-prefixed
# (e.g. "openai/gpt-4o-mini") - confirm these bare names resolve as intended.
GAP_ANALYSIS_MODEL = "gpt-4o-mini"
LESSON_GEN_MODEL = "gpt-4o-mini"

# OpenAI-compatible client pointed at the OpenRouter endpoint
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=OPENROUTER_API_KEY,
)

# Chroma vector DB persisted under the relative path "vector_db_progress";
# the collection embeds documents with Chroma's default embedding function.
chroma_client = chromadb.PersistentClient(path="vector_db_progress")
embedding_fn = embedding_functions.DefaultEmbeddingFunction()
collection = chroma_client.get_or_create_collection(
    name="student_progress",
    embedding_function=embedding_fn,
)

# =========================================================
# Cell 3: Quiz Questions & Student Answers
# =========================================================
# Answer key, listed in quiz order as (question text, expected answer).
_QUIZ_ITEMS = [
    ("What is the capital of France?", "Paris"),
    ("What is 9 * 8?", "72"),
    ("Who wrote 'Romeo and Juliet'?", "Shakespeare"),
    ("Define photosynthesis in one sentence.", "Process by which plants make food using sunlight"),
    ("What is the boiling point of water in Celsius?", "100"),
]

# Question number (1-based) -> {"question": ..., "answer": ...}
quiz_questions = {
    number: {"question": text, "answer": expected}
    for number, (text, expected) in enumerate(_QUIZ_ITEMS, start=1)
}

# Student id -> answers in question order (position 0 answers question 1).
students_answers = dict(
    STUDENT_001=["Paris", "72", "Shakespeare", "Plants use sunlight", "90"],
    STUDENT_002=["Paris", "70", "Shakespere", "It is plant food", "100"],
    STUDENT_003=["Lyon", "72", "Shakespeare", "Sunlight makes energy", "100"],
    STUDENT_004=["Paris", "65", "Unknown", "Photosynthesis is energy", "80"],
    STUDENT_005=["Paris", "72", "Shakespeare", "Plants prepare food", "100"],
)

# =========================================================
# Cell 4: Quiz Scoring
# =========================================================
def _normalize_answer(text):
    """Lower-case ``text`` and drop every non-word character for lenient comparison."""
    return re.sub(r"\W+", "", str(text).lower())


def score_quiz(student_id, answers, questions=None):
    """Score one student's answers against the quiz answer key.

    Answers are compared after lower-casing and removing non-word characters,
    so case, spacing and punctuation differences do not cost a point.

    Args:
        student_id: Identifier of the student. Not used by the scoring itself;
            kept so existing callers keep working.
        answers: Sequence of answers where ``answers[q_no - 1]`` is the reply
            to question number ``q_no``. Missing or ``None`` entries are
            treated as blank answers instead of raising.
        questions: Optional mapping ``{q_no: {"question": ..., "answer": ...}}``.
            Defaults to the module-level ``quiz_questions``.

    Returns:
        ``(total_score, details)`` where ``details`` holds one dict per
        question with the keys "Question", "Student Answer", "Correct Answer"
        and "Score" (1 for a match, 0 otherwise).
    """
    if questions is None:
        questions = quiz_questions

    total_score = 0
    details = []

    for q_no, q_data in questions.items():
        position = q_no - 1
        # A short answer sheet must not abort scoring, and a non-positive
        # question number must not wrap around to the end of the list.
        student_answer = answers[position] if 0 <= position < len(answers) else ""
        if student_answer is None:
            student_answer = ""
        correct_answer = q_data["answer"]

        score = int(_normalize_answer(student_answer) == _normalize_answer(correct_answer))
        total_score += score

        details.append({
            "Question": q_data["question"],
            "Student Answer": student_answer,
            "Correct Answer": correct_answer,
            "Score": score
        })

    return total_score, details


# Score every student once; later cells attach more fields to these records.
quiz_results = {}
for student_id, answer_sheet in students_answers.items():
    points, breakdown = score_quiz(student_id, answer_sheet)
    quiz_results[student_id] = dict(score=points, details=breakdown)

# =========================================================
# Cell 5: Gap Analysis
# =========================================================
def perform_gap_analysis(details):
    """Ask the gap-analysis model to describe a student's learning gaps.

    Args:
        details: Per-question dicts with the keys "Question", "Student Answer",
            "Correct Answer" and "Score".

    Returns:
        "No learning gaps detected." when no entry has a Score of 0. Otherwise
        the stripped model reply, or "Gap analysis unavailable." when the
        request (or reading its reply) raises.
    """
    missed = []
    for entry in details:
        if entry["Score"] == 0:
            missed.append(entry)

    if len(missed) == 0:
        return "No learning gaps detected."

    # One line per wrong answer: question, what the student said, what was expected
    gap_text = "\n".join(
        f"Q: {entry['Question']} | Student: {entry['Student Answer']} | Correct: {entry['Correct Answer']}"
        for entry in missed
    )
    prompt = "Analyze the student's mistakes below and list learning gaps:\n" + gap_text

    try:
        response = client.chat.completions.create(
            model=GAP_ANALYSIS_MODEL,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=200,
        )
        reply = response.choices[0].message.content
        return reply.strip()
    except Exception as err:
        # Best effort: report the failure and fall back to a fixed message
        print("Gap analysis error:", err)
        return "Gap analysis unavailable."


# Attach a gap summary to every student's result record
for record in quiz_results.values():
    record["gap_analysis"] = perform_gap_analysis(record["details"])

# =========================================================
# Cell 6: Save to Vector DB
# =========================================================
# Store every student's gap analysis in one batch. ``upsert`` overwrites an
# existing id and inserts a missing one, so re-running the notebook needs no
# delete-then-add, and real storage errors are no longer hidden by a bare
# ``except``.
if quiz_results:  # nothing to send when there are no results
    collection.upsert(
        ids=list(quiz_results),
        documents=[result["gap_analysis"] for result in quiz_results.values()],
        metadatas=[{"score": result["score"]} for result in quiz_results.values()],
    )

# =========================================================
# Cell 7: Lesson Generation
# =========================================================
def generate_lesson(gap_text):
    """Generate a short personalized lesson for the given gap analysis text.

    Args:
        gap_text: Gap analysis text for one student.

    Returns:
        A fixed "performing well" message when ``gap_text`` contains
        "No learning gaps"; "Lesson unavailable." when there is no usable gap
        analysis (blank text or the "Gap analysis unavailable" fallback) or
        when the model request fails; otherwise the stripped model reply.
    """
    if "No learning gaps" in gap_text:
        return "Student is performing well. No extra lessons required."

    # Do not spend a model call on a blank or failed gap analysis: the model
    # would otherwise be asked to "fix" the fallback message itself.
    if not gap_text.strip() or "Gap analysis unavailable" in gap_text:
        return "Lesson unavailable."

    prompt = f"Create a short personalized lesson to fix these gaps:\n{gap_text}"

    try:
        res = client.chat.completions.create(
            model=LESSON_GEN_MODEL,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=300
        )
        return res.choices[0].message.content.strip()
    except Exception as e:
        # Best effort: report the failure and fall back to a fixed message
        print("Lesson generation error:", e)
        return "Lesson unavailable."


# Attach a lesson derived from each student's gap analysis
for record in quiz_results.values():
    record["lesson"] = generate_lesson(record["gap_analysis"])

# =========================================================
# Cell 8: Final Dashboard Table
# =========================================================
# One row per student, in the order the results were produced
dashboard = [
    {
        "Student": student_id,
        "Score": record["score"],
        "Gap Analysis": record["gap_analysis"],
        "Lesson": record["lesson"],
    }
    for student_id, record in quiz_results.items()
]

df_dashboard = pd.DataFrame(dashboard)
# Competition ranking: tied scores share the best (lowest) rank number
score_rank = df_dashboard["Score"].rank(method="min", ascending=False).astype(int)
df_dashboard = df_dashboard.assign(Rank=score_rank).sort_values("Rank")

print("\n=== Final Dashboard Table ===\n")
display(df_dashboard[["Rank", "Student", "Score", "Gap Analysis", "Lesson"]])

# =========================================================
# Cell 9: Bar Chart of Student Scores
# =========================================================
# Bar chart of total score per student, in dashboard (rank) order
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(df_dashboard["Student"], df_dashboard["Score"], color="skyblue")
ax.set_title("Student Quiz Scores")
ax.set_xlabel("Student ID")
ax.set_ylabel("Score")
# Each question is worth one point, so the axis tops out at the question
# count instead of a hard-coded 5 (at least 1 to keep the axis valid).
ax.set_ylim(0, max(len(quiz_questions), 1))
ax.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

# =========================================================
# Cell 10: Query Vector DB (Find Similar Gaps)
# =========================================================
# Parameters of the similarity search
QUERY_TOPIC = "Photosynthesis"
N_SIMILAR = 3          # how many nearest gap analyses to return
SNIPPET_CHARS = 150    # preview length for the gap text column

print(f"\n=== Similar Gaps: Query = '{QUERY_TOPIC}' ===\n")
query_result = collection.query(query_texts=[QUERY_TOPIC], n_results=N_SIMILAR)

# Results come back as one list per query text; only one text was sent
similar_ids = query_result["ids"][0]
distances = query_result["distances"][0]
metadatas = query_result["metadatas"][0]
documents = query_result["documents"][0]


def _snippet(text, limit=SNIPPET_CHARS):
    """Return ``text`` unchanged if short, else its first ``limit`` chars plus "..."."""
    text = text or ""  # tolerate an entry stored without a document
    return text if len(text) <= limit else text[:limit] + "..."


similar_df = pd.DataFrame({
    "Similar Rank": range(1, len(similar_ids) + 1),
    "Student ID": similar_ids,
    # Entries stored without metadata yield None instead of raising
    "Score": [(meta or {}).get("score") for meta in metadatas],
    "Distance": [round(d, 3) for d in distances],
    "Gap Summary Snippet": [_snippet(doc) for doc in documents],
})

display(similar_df)