In [2]:
# =============================================
# AI Study Pal - Phase 2: ML Quiz Generation
# =============================================

# 1️⃣ Import Libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.cluster import KMeans

# 2️⃣ Create Sample Quiz Dataset
data = {
    "question": [
        "What is 2+2?", 
        "Solve x^2 + 4x + 4 = 0",
        "Who discovered gravity?", 
        "Explain photosynthesis.",
        "What is the capital of France?", 
        "Describe World War II causes.",
        "Calculate 5 * 6",
        "Name the first president of the USA",
        "Define photosynthesis process",
        "Find the derivative of x^2 + 2x"
    ],
    "subject": [
        "Math", "Math", "Science", "Science", "History", "History", "Math", "History", "Science", "Math"
    ],
    "difficulty": [
        "Easy", "Medium", "Easy", "Medium", "Easy", "Medium", "Easy", "Easy", "Medium", "Medium"
    ]
}

df = pd.DataFrame(data)
df.to_csv("questions.csv", index=False)
print("✅ Sample quiz dataset saved as 'questions.csv'")
df.head()

# 3️⃣ Feature Extraction - Bag of Words
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['question'])
y = df['difficulty']

print("\n✅ Feature matrix shape:", X.shape)

# 4️⃣ Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5️⃣ Train Logistic Regression Model
model = LogisticRegression()
model.fit(X_train, y_train)

# 6️⃣ Evaluate Model
y_pred = model.predict(X_test)
print("\n🔹 Model Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score (Medium):", f1_score(y_test, y_pred, pos_label='Medium'))

# 7️⃣ Function to Generate Quiz
def generate_quiz(questions_list):
    X_new = vectorizer.transform(questions_list)
    difficulty_pred = model.predict(X_new)
    quiz_df = pd.DataFrame({
        "question": questions_list,
        "predicted_difficulty": difficulty_pred
    })
    return quiz_df

# 8️⃣ Test Quiz Generation
new_questions = [
    "Calculate 10 + 15",
    "Explain Newton's second law",
    "Name the first president of the USA",
    "Find the integral of x^2"
]

quiz = generate_quiz(new_questions)
print("\n🔹 Generated Quiz:")
print(quiz)

# 9️⃣ Optional: Cluster Questions for Resource Suggestions
kmeans = KMeans(n_clusters=3, random_state=42)
df['cluster'] = kmeans.fit_predict(X)

print("\n🔹 Clustered Questions:")
print(df[['question', 'subject', 'difficulty', 'cluster']])


✅ Sample quiz dataset saved as 'questions.csv'

✅ Feature matrix shape: (10, 28)

🔹 Model Evaluation:
Accuracy: 0.0
F1 Score (Medium): 0.0

🔹 Generated Quiz:
                              question predicted_difficulty
0                    Calculate 10 + 15                 Easy
1          Explain Newton's second law                 Easy
2  Name the first president of the USA                 Easy
3             Find the integral of x^2                 Easy

🔹 Clustered Questions:
                              question  subject difficulty  cluster
0                         What is 2+2?     Math       Easy        2
1               Solve x^2 + 4x + 4 = 0     Math     Medium        0
2              Who discovered gravity?  Science       Easy        0
3              Explain photosynthesis.  Science     Medium        0
4       What is the capital of France?  History       Easy        2
5        Describe World War II causes.  History     Medium        0
6                      Calculate 5 * 6    