In [None]:
# CELL 1 – Imports
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import joblib

print("All imports successful!")


In [None]:
# CELL 2 – Load and inspect quiz data
# The path is relative, so it resolves against the kernel's working directory.
csv_path = "../data/quiz/quiz_samples.csv"
quiz_df = pd.read_csv(filepath_or_buffer=csv_path)

# Report (rows, columns), then let the notebook render a preview of the
# first rows as the cell's output.
print(f"Dataset shape: {quiz_df.shape}\n\nFirst few rows:")
quiz_df.head()


In [None]:
# CELL 3 – Rename columns to standard names
# Columns are mapped purely by position: the first four columns of the frame
# receive the names below, whatever they were called in the CSV.
# NOTE(review): assumes the file's column order is question, answer, topic,
# difficulty — confirm against the data source.
quiz_df = quiz_df.rename(
    columns={
        quiz_df.columns[position]: standard_name
        for position, standard_name in enumerate(
            ("question", "answer", "topic", "difficulty")
        )
    }
)

print(f"Columns: {quiz_df.columns.tolist()}")
quiz_df.head()


In [None]:
# CELL 4 – Data cleaning and info
# Drop rows with missing values BEFORE normalising the label. Casting to str
# first can turn a missing value into the literal text "nan", which dropna()
# no longer recognises as missing and which would then survive as a bogus
# difficulty class.
quiz_df = (
    quiz_df
    .dropna(subset=["question", "answer", "topic", "difficulty"])
    # assign() builds a new frame, so the normalised labels are never written
    # into a filtered slice of the original data.
    .assign(difficulty=lambda df: df["difficulty"].astype(str).str.lower().str.strip())
)

# Discard rows whose question or label is blank once whitespace is stripped.
# The str cast keeps the question check working even when the column was not
# parsed as text (the .str accessor is only available on string-like data).
has_question = quiz_df["question"].astype(str).str.strip() != ""
has_label = quiz_df["difficulty"] != ""
quiz_df = quiz_df[has_question & has_label]

print("Cleaned shape:", quiz_df.shape)
print("\nDifficulty distribution:")
print(quiz_df["difficulty"].value_counts())
print("\nTopic distribution:")
print(quiz_df["topic"].value_counts())


In [None]:
# CELL 5 – Visualize difficulty distribution
# Draws two side-by-side bar charts (class counts per difficulty and per
# topic), writes the figure to disk and shows it inline.
# `plt` comes from the imports cell at the top of the notebook.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# One entry per panel: (column to count, axis/title label, bar colour).
panels = [
    ("difficulty", "Difficulty", "skyblue"),
    ("topic", "Topic", "lightcoral"),
]

for ax, (column, label, color) in zip(axes, panels):
    quiz_df[column].value_counts().plot(kind="bar", ax=ax, color=color)
    ax.set_title(f"{label} Distribution")
    ax.set_xlabel(label)
    ax.set_ylabel("Count")

fig.tight_layout()
# Save before show(): the figure may be released once it has been displayed.
fig.savefig("../data/eda_plots.png")
plt.show()

print("EDA plots saved to data/eda_plots.png")


In [None]:
# CELL 6 – Train/test split and vectorizer setup
# Features are the question text; the target is the difficulty label.
X, y = quiz_df["question"], quiz_df["difficulty"]

# Hold out 20% of the rows for evaluation. The split is stratified on the
# label and uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("Training set: {}, Test set: {}".format(len(X_train), len(X_test)))

# Bag-of-words over unigrams and bigrams, with English stop words removed and
# the vocabulary capped at 5000 features.
vectorizer = CountVectorizer(
    stop_words="english",
    max_features=5000,
    ngram_range=(1, 2),
)

# The vocabulary is fitted on the training split only; the test split is
# transformed with that same fitted vocabulary.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

print("Training vector shape: {}".format(X_train_vec.shape))
print("Test vector shape: {}".format(X_test_vec.shape))


In [None]:
# CELL 7 – Train logistic regression
# Build the classifier and fit it on the vectorised training questions in one
# expression; fit() hands back the fitted estimator, which is kept as `clf`.
# class_weight="balanced" asks scikit-learn to reweight classes by frequency.
clf = LogisticRegression(
    solver="liblinear",
    C=0.5,
    class_weight="balanced",
    max_iter=1000,
).fit(X_train_vec, y_train)

print("Model trained successfully!")


In [None]:
# CELL 8 – Evaluate model
# Predict difficulty for the held-out questions and compare with the truth.
y_pred = clf.predict(X_test_vec)

# Overall fraction of correct predictions, shown with two decimals.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}".format(accuracy))

# Per-class metrics and the confusion matrix, each under its own heading
# (sep="\n" puts the heading and the body on separate lines).
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")


In [None]:
# CELL 9 – Manual test predictions
# A few hand-written questions used to eyeball the classifier's output.
test_questions = [
    "What is overfitting in machine learning?",
    "What does the len() function do?",
    "Explain neural networks.",
]

# Reuse the fitted vectorizer so the features match what the model was
# trained on, then predict one label per question.
test_vec = vectorizer.transform(test_questions)
predictions = clf.predict(test_vec)

# Print each question with its predicted label, followed by a blank line.
for question, label in zip(test_questions, predictions):
    print(f"Q: {question}\nPredicted difficulty: {label}\n")


In [None]:
# CELL 10 – Save model and vectorizer
# Make sure the output directory exists; an existing directory is not an error.
models_dir = "../backend/models"
os.makedirs(models_dir, exist_ok=True)

model_path = os.path.join(models_dir, "quiz_difficulty_clf.joblib")
vec_path = os.path.join(models_dir, "quiz_vectorizer.joblib")

# Persist the classifier and the fitted vectorizer as two separate files;
# both are written before either confirmation is printed.
for artifact, destination in ((clf, model_path), (vectorizer, vec_path)):
    joblib.dump(artifact, destination)

print("Model saved to: {}".format(model_path))
print("Vectorizer saved to: {}".format(vec_path))
