In [1]:
import json
import pandas as pd
import numpy as np
import re
import joblib

In [2]:
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer

In [3]:
# Path to the JSON Lines dataset: one JSON object per line.
DATA_PATH = "problems_data.jsonl"

# Read through a context manager so the file handle is closed as soon as the
# rows are parsed (a bare open() leaves closing to the garbage collector).
# Whitespace-only lines are skipped because json.loads raises on empty input.
with open(DATA_PATH, "r", encoding="utf-8") as data_file:
    rows = [json.loads(line) for line in data_file if line.strip()]

# One DataFrame row per parsed JSON object.
df = pd.DataFrame(rows)

In [4]:
# Text fields that are concatenated, in this order, into one document per row.
TEXT_COLUMNS = ["title", "description", "input_description", "output_description"]

# Build the document left to right: missing values become "" and consecutive
# fields are separated by a single space.
combined_text = df[TEXT_COLUMNS[0]].fillna("")
for column in TEXT_COLUMNS[1:]:
    combined_text = combined_text + " " + df[column].fillna("")
df["combined_text"] = combined_text

# Title-case the class labels (str.title, e.g. "easy" -> "Easy").
df["problem_class"] = df["problem_class"].str.title()

In [6]:
from feature_utils import text_features

# Branch 1: TF-IDF over unigrams and bigrams, capped at 5000 terms.
tfidf_branch = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)

# Branch 2: features computed by text_features, then standardised.
stats_branch = Pipeline(steps=[
    ("extract", FunctionTransformer(func=text_features, validate=False)),
    ("scale", StandardScaler()),
])

# Both branches read the same "combined_text" column.
feature_union = ColumnTransformer(transformers=[
    ("tfidf", tfidf_branch, "combined_text"),
    ("stats", stats_branch, "combined_text"),
])

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable. The whole frame is passed as X and the class label as y.
X_train, X_test, y_train_cls, y_test_cls = train_test_split(
    df,
    df["problem_class"],
    test_size=0.2,
    random_state=42,
)

# Regression targets for the same rows, looked up by each split's index labels.
y_train_reg, y_test_reg = (
    df.loc[class_labels.index, "problem_score"]
    for class_labels in (y_train_cls, y_test_cls)
)

In [8]:
# --- Classification: predict problem_class from the extracted features ---
# Each pipeline receives its own clone of `feature_union`. Pipeline.fit fits
# its steps in place, so passing the same transformer instance to both
# pipelines would make the second fit overwrite the fitted state that the
# first pipeline relies on.
cls_model = Pipeline([
    ("features", clone(feature_union)),
    ("clf", LogisticRegression(max_iter=500))
])

cls_model.fit(X_train, y_train_cls)
pred_cls = cls_model.predict(X_test)

print("=== Classification Results ===")
print("Accuracy:", accuracy_score(y_test_cls, pred_cls))
print("Confusion Matrix:\n", confusion_matrix(y_test_cls, pred_cls))

# --- Regression: predict problem_score from the same feature definition ---
# Independent clone, so fitting this pipeline leaves cls_model untouched.
reg_model = Pipeline([
    ("features", clone(feature_union)),
    ("reg", RandomForestRegressor(n_estimators=200, random_state=42))
])

reg_model.fit(X_train, y_train_reg)
pred_score = reg_model.predict(X_test)

print("\n=== Regression Results ===")
print("MAE:", mean_absolute_error(y_test_reg, pred_score))
# RMSE is the square root of the mean squared error.
print("RMSE:", np.sqrt(mean_squared_error(y_test_reg, pred_score)))

=== Classification Results ===
Accuracy: 0.5249088699878494
Confusion Matrix:
 [[ 42  47  47]
 [ 17 309  99]
 [ 22 159  81]]

=== Regression Results ===
MAE: 1.6939842041312272
RMSE: 2.027289053200058


In [9]:
# Persist both fitted pipelines to disk, classifier first, then regressor.
for fitted_model, artifact_path in (
    (cls_model, "difficulty_classifier.joblib"),
    (reg_model, "difficulty_regressor.joblib"),
):
    joblib.dump(fitted_model, artifact_path)

print("\nModels Saved Successfully!")


Models Saved Successfully!


In [10]:
# Smoke check: run the feature extractor on a one-element sample and display
# the result as the cell output.
sample_features = text_features(["test graph dp recursion"])
sample_features

Unnamed: 0,text_len,num_digits,num_symbols,contains_graph,contains_dp,contains_math,contains_recursion
0,23,0,0,1,1,0,1
