In [81]:
import pandas as pd
import numpy as np
import re

from sentence_transformers import SentenceTransformer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [82]:
# Load the question/answer analysis CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer='/content/question_ans_analysis.csv')

In [83]:
!pip install ydata-profiling

In [84]:
from ydata_profiling import ProfileReport

# Build a profiling report for the loaded frame and render it inline in the notebook.
profile = ProfileReport(df=df, title="Data Profile")
profile.to_notebook_iframe()

In [85]:
# Preview the first five records as plain text, heading on its own line.
print("First 5 rows:", df.head(), sep="\n")

First 5 rows:
                                       question_text           subject  \
0    Solve the quadratic equation scenario number 1.       Mathematics   
1     Implement binary search for scenario number 2.  Computer Science   
2      Apply Newton law to system scenario number 3.           Physics   
3  Evaluate matrix determinant for scenario numbe...       Mathematics   
4    Solve the quadratic equation scenario number 5.       Mathematics   

  cognitive_level_bloom  readability_score  word_count  sentence_count  \
0                create              77.49          22               1   
1            understand              45.01          17               2   
2                create              89.84          36               1   
3              remember              44.38          17               1   
4            understand              54.48          23               2   

   time_taken_minutes  total_students_attempted  correct_attempts  \
0                  23      

In [86]:
# Print a heading, then the frame's summary: column names, non-null counts and dtypes.
print("\nInfo:")
df.info()



Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   question_text             5000 non-null   object 
 1   subject                   5000 non-null   object 
 2   cognitive_level_bloom     5000 non-null   object 
 3   readability_score         5000 non-null   float64
 4   word_count                5000 non-null   int64  
 5   sentence_count            5000 non-null   int64  
 6   time_taken_minutes        5000 non-null   int64  
 7   total_students_attempted  5000 non-null   int64  
 8   correct_attempts          5000 non-null   int64  
 9   incorrect_attempts        5000 non-null   int64  
 10  correct_percentage        5000 non-null   float64
 11  learning_gap_score        5000 non-null   float64
 12  discrimination_index      5000 non-null   float64
 13  difficulty_label          5000 non-null   object 
 14  a

In [87]:
# Count the missing entries in every column (isna is the same check as isnull).
print("\nMissing values:", df.isna().sum(), sep="\n")


Missing values:
question_text               0
subject                     0
cognitive_level_bloom       0
readability_score           0
word_count                  0
sentence_count              0
time_taken_minutes          0
total_students_attempted    0
correct_attempts            0
incorrect_attempts          0
correct_percentage          0
learning_gap_score          0
discrimination_index        0
difficulty_label            0
assessment_quality_score    0
dtype: int64


In [88]:
# Columns removed from the frame before feature construction.
# NOTE(review): presumably outcome-derived or unused fields — confirm the rationale.
columns_to_drop = [
    "correct_percentage", "learning_gap_score", "discrimination_index",
    "assessment_quality_score", "subject", "cognitive_level_bloom",
]

# errors="ignore" skips any listed column that is already absent from the frame.
df = df.drop(labels=columns_to_drop, axis="columns", errors="ignore")

print("\nRemaining Columns:", df.columns, sep="\n")


Remaining Columns:
Index(['question_text', 'readability_score', 'word_count', 'sentence_count',
       'time_taken_minutes', 'total_students_attempted', 'correct_attempts',
       'incorrect_attempts', 'difficulty_label'],
      dtype='object')


In [89]:
# Per-question outcome rates derived from the raw attempt counts.
# NOTE(review): student_accuracy is correct_attempts / total_students_attempted, which
# looks like the dropped "correct_percentage" column on a 0-1 scale. If difficulty_label
# was derived from that percentage, these two features leak the target (the near-perfect
# test scores further down would be consistent with that) — confirm they are intended
# model inputs.
total_attempts = df["total_students_attempted"]

# A question with no attempts would produce inf/NaN rates; fail here with a clear
# message instead of letting those values propagate into the feature matrix.
if (total_attempts <= 0).any():
    raise ValueError(
        f"{int((total_attempts <= 0).sum())} row(s) have total_students_attempted <= 0; "
        "cannot compute student_accuracy / error_rate"
    )

df["student_accuracy"] = df["correct_attempts"] / total_attempts
df["error_rate"] = df["incorrect_attempts"] / total_attempts

In [90]:
def clean_text(text):
    """Normalise a question string before it is embedded.

    The text is lower-cased, every character that is not an ASCII letter, a
    digit or whitespace is removed, and runs of whitespace are collapsed to a
    single space with both ends trimmed.

    Args:
        text: The raw question. Non-string values are converted with ``str()``;
            a missing value (``None`` or a float NaN) is treated as empty text.

    Returns:
        str: The cleaned text, or ``""`` when ``text`` is missing.
    """
    # Without this guard a missing value would be stringified to the literal
    # word "none" / "nan" and treated as if it were real question text.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    # Drop punctuation and symbols (removed outright, not replaced by a space).
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Collapse whitespace runs left behind by the removal above.
    return re.sub(r'\s+', ' ', text).strip()

# Store a normalised copy of each question; this column is what gets embedded.
df["clean_question"] = df["question_text"].map(clean_text)

In [91]:
# Target vector: the difficulty label of every question.
y = df["difficulty_label"]

# Show how many questions fall into each class.
print("\nClass Distribution:", y.value_counts(), sep="\n")


Class Distribution:
difficulty_label
hard      1703
easy      1652
medium    1645
Name: count, dtype: int64


In [92]:
# Load the pre-trained sentence-embedding model by name.
embedder = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

# Encode every cleaned question; the result has one row per question.
X_text = embedder.encode(sentences=df["clean_question"].tolist(), show_progress_bar=True)

print("Embedding Shape:", X_text.shape)

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

Embedding Shape: (5000, 384)


In [93]:
# Numeric model inputs. The column order matters: the single-question prediction
# helper builds its input row in this same order before calling the scaler.
numeric_features = ["readability_score", "word_count", "sentence_count",
                    "student_accuracy", "error_rate"]

X_numeric = df.loc[:, numeric_features].to_numpy()

In [94]:
# Fit the scaler on the training rows only, so the held-out rows' mean/variance do not
# leak into preprocessing. The training row indices are obtained by replaying the same
# stratified split (test_size=0.3, random_state=42, stratify=y) that is applied to the
# full feature matrix later in the notebook; the split is deterministic for a fixed
# seed and label vector, so keep these arguments in sync with that later call.
train_rows, _ = train_test_split(
    np.arange(len(y)),
    test_size=0.3,
    random_state=42,
    stratify=y,
)

scaler = StandardScaler()
scaler.fit(X_numeric[train_rows])

# Every row (train and test) is transformed with the train-derived statistics.
X_numeric_scaled = scaler.transform(X_numeric)

In [95]:
# Place the text embeddings and the scaled numeric columns side by side (column-wise).
X = np.concatenate([X_text, X_numeric_scaled], axis=1)

print("Final Feature Shape:", X.shape)

Final Feature Shape: (5000, 389)


In [96]:
# Hold out 30% of the rows for testing; stratified on the label, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print("Training Samples:", len(X_train))
print("Testing Samples:", len(X_test))

Training Samples: 3500
Testing Samples: 1500


In [97]:
# Fit the logistic-regression classifier; fit() returns the estimator, so the
# construction and the training step can be chained.
lr_model = LogisticRegression(C=0.5, max_iter=1000).fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred_lr = lr_model.predict(X_test)

In [98]:
# Test-set metrics for the logistic-regression predictions.
print("===== Logistic Regression =====")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))

# Each heading keeps its leading blank line and is followed by its table on the next line.
print("\nClassification Report:", classification_report(y_test, y_pred_lr), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred_lr), sep="\n")

===== Logistic Regression =====
Accuracy: 0.9886666666666667

Classification Report:
              precision    recall  f1-score   support

        easy       1.00      0.99      0.99       496
        hard       1.00      0.98      0.99       511
      medium       0.97      1.00      0.98       493

    accuracy                           0.99      1500
   macro avg       0.99      0.99      0.99      1500
weighted avg       0.99      0.99      0.99      1500


Confusion Matrix:
[[489   0   7]
 [  0 503   8]
 [  2   0 491]]


In [99]:
# Fit a depth-limited decision tree with a fixed seed; fit() returns the estimator,
# so the construction and the training step can be chained.
dt_model = DecisionTreeClassifier(random_state=42, max_depth=3).fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred_dt = dt_model.predict(X_test)

In [100]:
# Test-set metrics for the decision-tree predictions.
print("===== Decision Tree =====")
print("Accuracy:", accuracy_score(y_test, y_pred_dt))

# Each heading keeps its leading blank line and is followed by its table on the next line.
print("\nClassification Report:", classification_report(y_test, y_pred_dt), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred_dt), sep="\n")

===== Decision Tree =====
Accuracy: 0.9973333333333333

Classification Report:
              precision    recall  f1-score   support

        easy       1.00      1.00      1.00       496
        hard       1.00      1.00      1.00       511
      medium       1.00      0.99      1.00       493

    accuracy                           1.00      1500
   macro avg       1.00      1.00      1.00      1500
weighted avg       1.00      1.00      1.00      1500


Confusion Matrix:
[[496   0   0]
 [  0 510   1]
 [  2   1 490]]


In [101]:
def predict_difficulty(question_text,
                       readability_score,
                       word_count,
                       sentence_count,
                       correct_attempts,
                       total_students_attempted,
                       incorrect_attempts=None):
    """Predict the difficulty label of a single question with ``lr_model``.

    The feature row is assembled the same way as the training matrix: the
    cleaned question is embedded with ``embedder``, the five numeric features
    are scaled with the fitted ``scaler``, and the two parts are concatenated
    (embedding first) before being passed to ``lr_model``. All three objects
    are read from the notebook's global namespace, so the cells that create
    them must have been run first.

    Args:
        question_text: Raw question text; normalised with ``clean_text``.
        readability_score: Value for the ``readability_score`` feature.
        word_count: Value for the ``word_count`` feature.
        sentence_count: Value for the ``sentence_count`` feature.
        correct_attempts: Correct-attempt count; must be >= 0.
        total_students_attempted: Attempt total used as the denominator of
            both rates; must be > 0.
        incorrect_attempts: Optional incorrect-attempt count (>= 0). When
            given, ``error_rate`` is ``incorrect_attempts /
            total_students_attempted``, the definition used for the training
            feature. When omitted, ``error_rate`` falls back to
            ``1 - student_accuracy``.

    Returns:
        The predicted difficulty label (first element of ``lr_model.predict``).

    Raises:
        ValueError: If ``total_students_attempted`` is not greater than 0, or
            if an attempt count is negative.
    """
    # Validate before the (comparatively expensive) embedding call so bad
    # input fails fast with a clear message instead of a division error or
    # a non-finite feature value.
    if not total_students_attempted > 0:
        raise ValueError("total_students_attempted must be greater than 0")
    if not correct_attempts >= 0:
        raise ValueError("correct_attempts must be non-negative")
    if incorrect_attempts is not None and not incorrect_attempts >= 0:
        raise ValueError("incorrect_attempts must be non-negative")

    student_accuracy = correct_attempts / total_students_attempted
    if incorrect_attempts is None:
        error_rate = 1 - student_accuracy
    else:
        # Same definition as the training-time "error_rate" column.
        error_rate = incorrect_attempts / total_students_attempted

    clean_q = clean_text(question_text)
    emb = embedder.encode([clean_q])

    # Column order must match the order the scaler was fitted with.
    numeric = np.array([[readability_score,
                         word_count,
                         sentence_count,
                         student_accuracy,
                         error_rate]])
    numeric_scaled = scaler.transform(numeric)

    features = np.hstack((emb, numeric_scaled))

    return lr_model.predict(features)[0]

In [102]:
import joblib

# Write the fitted classifier and the fitted scaler to disk.
joblib.dump(value=lr_model, filename="lr_model.pkl")
joblib.dump(value=scaler, filename="scaler.pkl")

# NOTE(review): the whole sentence-embedding model object is pickled here — confirm
# that reloading it works with the library versions used at load time.
joblib.dump(value=embedder, filename="embedder.pkl")

['embedder.pkl']