# AI Study Pal Capstone: Quiz Model Training

This notebook trains and saves the quiz model (Logistic Regression for difficulty, KMeans for topics) for the AI Study Pal backend. Synthetic MCQ data is used for demonstration.

In [None]:
# 1. Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [None]:
# 2. Load and Explore Dataset
# Synthetic MCQ dataset, written one record per question so that each stem
# sits next to its difficulty and topic labels: (stem, difficulty, topic).
mcq_data = pd.DataFrame(
    [
        ('What is machine learning?', 'easy', 'ML Basics'),
        ('Define supervised learning.', 'easy', 'Supervised'),
        ('Explain unsupervised learning.', 'medium', 'Unsupervised'),
        ('What is a neural network?', 'medium', 'Neural Networks'),
        ('Describe logistic regression.', 'easy', 'Regression'),
        ('What is clustering?', 'medium', 'Clustering'),
        ('Explain KMeans algorithm.', 'medium', 'Clustering'),
        ('What is feature engineering?', 'easy', 'Features'),
        ('Define accuracy in ML.', 'easy', 'Metrics'),
        ('What is overfitting?', 'medium', 'General'),
    ],
    columns=['stem', 'difficulty', 'topic'],
)
# Preview the first rows (rendered as the cell output).
mcq_data.head()

In [None]:
# 3. Data Preprocessing
# Bag-of-words counts over the question stems. The vocabulary is learned from
# every row, so rows that later land in X_test also contributed to it.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(mcq_data['stem'])

# Encode the difficulty labels as integers: easy -> 0, medium -> 1.
y = mcq_data['difficulty'].map({'easy': 0, 'medium': 1})
# .map() yields NaN for any label missing from the dict; fail here with a
# clear message instead of deep inside a later model fit.
assert y.notna().all(), 'difficulty column contains a label other than easy/medium'

# Split for training.
# test_size=0.2 leaves only two test rows on this dataset; stratify=y keeps the
# class proportions so the test split cannot end up with a single class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# 4. Feature Engineering
from scipy.sparse import hstack

# The BoW matrix X already exists; add one hand-made numeric feature:
# the character count of each stem, kept as a column on the frame.
mcq_data['length'] = mcq_data['stem'].map(len)

# Reshape to a single column (n_rows, 1) so it can be stacked beside X.
X_len = mcq_data['length'].to_numpy().reshape(-1, 1)

# X_full = BoW columns followed by the length column (for demonstration).
X_full = hstack([X, X_len])

In [None]:
# 5. Model Selection and Training
# Logistic Regression for difficulty.
# The classifier is fitted on every row of X_full (BoW + length). The
# X_train/X_test split from the preprocessing cell was made on the BoW-only
# matrix X and is not used here.
logreg = LogisticRegression()
logreg.fit(X_full, y)

# KMeans for topic clustering.
# n_init is pinned explicitly: scikit-learn's default moved from 10 to 'auto'
# (a single run for the default k-means++ init), with a FutureWarning in the
# releases in between. Left implicit, the persisted clustering would depend on
# the installed scikit-learn version even though random_state is fixed.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
kmeans.fit(X_full)

# Save vectorizer, logreg, kmeans into the current working directory.
# All three are needed at prediction time: the vectorizer rebuilds the BoW
# columns that both fitted models were trained on.
joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(logreg, 'quiz_model.pkl')
joblib.dump(kmeans, 'quiz_kmeans.pkl')

In [None]:
# 6. Model Evaluation
# Predict on training set (demo).
# `logreg` was fitted on these same rows, so the two reports below measure how
# well the model fits its training data, not how well it generalises.
preds = logreg.predict(X_full)
print('Accuracy:', accuracy_score(y, preds))
print(classification_report(y, preds))

# Held-out estimate: 3-fold cross-validation (the same fold count the grid
# search uses). cross_val_score fits a fresh clone of the estimator on each
# training fold, so the already-fitted `logreg` is left untouched.
cv_scores = cross_val_score(logreg, X_full, y, cv=3)
print('Cross-validated accuracy: %.2f (+/- %.2f)' % (cv_scores.mean(), cv_scores.std()))

In [None]:
# 7. Hyperparameter Tuning
# Demo grid search over the regularisation strength C of LogisticRegression,
# scored with 3-fold cross-validation on the full feature matrix.
from sklearn.model_selection import GridSearchCV

param_grid = dict(C=[0.01, 0.1, 1, 10])
gs = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    cv=3,
)
gs.fit(X_full, y)

# Report the C value that scored best across the folds.
print('Best C:', gs.best_params_['C'])

In [None]:
# 8. Model Deployment Example
# Restore the three artefacts written by the training cell.
kmeans_loaded = joblib.load('quiz_kmeans.pkl')
logreg_loaded = joblib.load('quiz_model.pkl')
vectorizer_loaded = joblib.load('vectorizer.pkl')

sample_questions = ["What is supervised learning?", "Explain KMeans clustering."]

# Rebuild the feature layout used at training time: BoW counts from the
# loaded vectorizer, followed by one column holding each question's length.
X_sample = vectorizer_loaded.transform(sample_questions)
X_sample_full = hstack([
    X_sample,
    np.array([[len(question)] for question in sample_questions]),
])

# Difficulty comes back as the integer codes the classifier was trained on.
difficulty_pred = logreg_loaded.predict(X_sample_full)
print('Predicted difficulty:', difficulty_pred)

# Topic comes back as the index of the nearest KMeans cluster.
topic_pred = kmeans_loaded.predict(X_sample_full)
print('Predicted topic cluster:', topic_pred)