In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#Dataset loading
df = pd.read_csv("dataset.csv")
df.head()

Unnamed: 0,title,description,input_description,output_description,sample_io,problem_class,problem_score,url
0,Uuu,Unununium (Uuu) was the name of the chemical\n...,The input consists of one line with two intege...,The output consists of $M$ lines where the $i$...,"[{'input': '7 10', 'output': '1 2\n2 3\n1 3\n3...",hard,9.7,https://open.kattis.com/problems/uuu
1,House Building,A number of eccentrics from central New York h...,"The input consists of $10$ test cases, which a...",Print $K$ lines with\n the positions of the...,"[{'input': '0\n2 3 2\n50 60 50\n30 50 40', 'ou...",hard,9.7,https://open.kattis.com/problems/husbygge
2,Mario or Luigi,Mario and Luigi are playing a game where they ...,,,"[{'input': '', 'output': ''}]",hard,9.6,https://open.kattis.com/problems/marioorluigi
3,The Wire Ghost,Žofka is bending a copper wire. She starts wit...,The first line contains two integers $L$ and $...,The output consists of a single line consistin...,"[{'input': '4 3\n3 C\n2 C\n1 C', 'output': 'GH...",hard,9.6,https://open.kattis.com/problems/thewireghost
4,Barking Up The Wrong Tree,"Your dog Spot is let loose in the park. Well, ...",The first line of input consists of two intege...,Write a single line containing the length need...,"[{'input': '2 0\n10 0\n10 10', 'output': '14.1...",hard,9.6,https://open.kattis.com/problems/barktree


In [3]:
#DATA PREPROCESSING-cleaning the dataset

#handling the missing values
df = df.fillna("")

# Combine all important text columns into one
df["combined_text"] = (df["title"] + " " + df["description"] + " " + df["input_description"] + " " + df["output_description"]
)

df[["combined_text"]].head(2)

Unnamed: 0,combined_text
0,Uuu Unununium (Uuu) was the name of the chemic...
1,House Building A number of eccentrics from cen...


In [4]:
#FEATURE ENGINEERING

keywords = ["dp", "graph", "recursion", "tree", "greedy", "bfs", "dfs"]

def extract_numeric_features(text):
    length = len(text.split())
    math_symbols = "+-*/=<>%"
    sym_count = sum(text.count(sym) for sym in math_symbols)
    kw_counts = [text.lower().count(kw) for kw in keywords]
    return [length, sym_count] + kw_counts

X_numeric = np.array(
    df["combined_text"].apply(extract_numeric_features).tolist()
)



#TF_IDF
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features=3000,
    stop_words="english",
    ngram_range=(1, 2)
)

# Convert text to numerical features
X_tfidf = tfidf.fit_transform(df["combined_text"])


# Scale numeric features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_numeric_scaled = scaler.fit_transform(X_numeric)


#stacking the features
from scipy.sparse import hstack
X_final = hstack([X_tfidf, X_numeric_scaled])



# Making the Targets
y_class = df["problem_class"]    #to be solved by classification
y_score = df["problem_score"]    # by regression


In [5]:
#TEST TRAIN DATA
from sklearn.model_selection import train_test_split

# Classification
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_final, y_class, test_size=0.2, random_state=42)

# Regression
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_final, y_score, test_size=0.2, random_state=42)

In [6]:
#Classification model

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


#create model
clf = LogisticRegression(
    max_iter=5000,
    class_weight="balanced",
    solver="saga")


#train
clf.fit(X_train_c, y_train_c)

#predict
y_pred_c = clf.predict(X_test_c)


#evaluate
print("Accuracy:", accuracy_score(y_test_c, y_pred_c))
print("\nConfusion Matrix:\n", confusion_matrix(y_test_c, y_pred_c))
print("\nClassification Report:\n", classification_report(y_test_c, y_pred_c))

Accuracy: 0.4823815309842041

Confusion Matrix:
 [[ 79  19  38]
 [ 85 215 125]
 [ 58 101 103]]

Classification Report:
               precision    recall  f1-score   support

        easy       0.36      0.58      0.44       136
        hard       0.64      0.51      0.57       425
      medium       0.39      0.39      0.39       262

    accuracy                           0.48       823
   macro avg       0.46      0.49      0.47       823
weighted avg       0.51      0.48      0.49       823



In [7]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Regression model: Gradient Boosting usually handles difficulty scores better
reg = GradientBoostingRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=3,
    subsample=0.8,
    random_state=42
)

# Train model
reg.fit(X_train_r, y_train_r)

# Predict
y_pred_r = reg.predict(X_test_r)
y_pred_r = np.clip(y_pred_r, 1, 10)


# Evaluate
mae = mean_absolute_error(y_test_r, y_pred_r)
rmse = np.sqrt(mean_squared_error(y_test_r, y_pred_r))

print("MAE:", mae)
print("RMSE:", rmse)


MAE: 1.6743733530402036
RMSE: 2.0241765994505365


In [8]:
import joblib

joblib.dump(tfidf, "autojudge_tfidf.pkl")
joblib.dump(scaler, "autojudge_scaler.pkl") 
joblib.dump(clf, "autojudge_classifier.pkl")
joblib.dump(reg, "autojudge_regressor.pkl")

['autojudge_regressor.pkl']