In [None]:
import numpy as np
import pandas as pd
import optuna
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score

In [2]:
df = pd.read_json("problems_data.jsonl", lines=True)

def build_full_text(df):
    text_cols = ['title', 'description', 'input_description', 'output_description']
    return (
        df[text_cols]
        .fillna('')
        .agg(' '.join, axis=1)
        .str.lower()
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )

df['full_text'] = build_full_text(df)

le = LabelEncoder()
df['problem_class_enc'] = le.fit_transform(df['problem_class'])
print(le.classes_)

y_cls = df['problem_class_enc']
y_reg = df['problem_score']

['easy' 'hard' 'medium']


In [4]:
def extract_stat_features(df):
    text = df['full_text']

    features = pd.DataFrame({
        'char_len': text.str.len(),
        'word_len': text.str.split().str.len(),
        'sentence_count': text.str.count(r'[.!?]') + 1,

        'digit_count': text.str.count(r'\d'),
        'math_symbol_count': text.str.count(r'[\+\-\*/=<>\^]'),

        # NEW (important)
        'log_char_len': np.log1p(text.str.len()),
        'log_word_len': np.log1p(text.str.split().str.len()),

        'constraint_count': text.str.count(r'(<=|>=|<|>)'),
        'power_count': text.str.count(r'\^'),
        'big_o_count': text.str.count(r'o\('),

        'newline_count': text.str.count(r'\n'),
        'colon_count': text.str.count(':'),

        'avg_word_len': text.str.len() / (text.str.split().str.len() + 1),
    })

    return features

KEYWORDS = {
    # Core paradigms
    'dp': ['dp', 'dynamic programming'],
    'graph': ['graph', 'tree', 'dag'],
    'greedy': ['greedy'],
    'binary_search': ['binary search'],
    'two_pointers': ['two pointers'],

    # Data structures
    'segment_tree': ['segment tree'],
    'fenwick': ['fenwick', 'bit'],
    'heap': ['heap', 'priority queue'],
    'disjoint_set': ['dsu', 'union find'],

    # Math / advanced
    'modulo': ['mod', 'modulo'],
    'combinatorics': ['combination', 'permutation'],
    'probability': ['probability', 'expected'],

    # Geometry / strings
    'geometry': ['geometry', 'convex'],
    'string_algo': ['kmp', 'z', 'suffix'],

    # Brute force indicators
    'bruteforce': ['brute', 'naive']
}

def extract_keyword_features(df):
    text = df['full_text']
    feats = {}

    for k, patterns in KEYWORDS.items():
        feats[f'{k}_present'] = text.apply(
            lambda x: any(p in x for p in patterns)
        ).astype(int)

        feats[f'{k}_count'] = text.apply(
            lambda x: sum(p in x for p in patterns)
        )

    return pd.DataFrame(feats)

In [5]:
X_stat = extract_stat_features(df)
X_kw = extract_keyword_features(df)

In [6]:
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    max_features=30000)
X_tfidf = tfidf.fit_transform(df['full_text'])

X = csr_matrix(hstack([ X_tfidf, X_stat.values, X_kw.values]))
#X = csr_matrix(hstack([X_tfidf, X_stat.values, X_kw.values]))
#y = y_reg.values

## classification

In [7]:
best_params_cls = {'C': 0.00010021197815889274, 'class_weight': 'balanced'}
# X_cls = full feature matrix
# y_cls = full labels

final_clf = Pipeline([
    ('scaler', StandardScaler(with_mean=False)),
    ('clf', LogisticRegression(
        **best_params_cls,
        solver='lbfgs',
        penalty='l2',
        max_iter=3000,
        n_jobs=-1
    ))
])

final_clf.fit(X, y_cls)

In [8]:
y_pred = final_clf.predict(X)

print(classification_report(y_cls, y_pred, target_names=le.classes_))
print(confusion_matrix(y_cls, y_pred))

              precision    recall  f1-score   support

        easy       0.99      1.00      0.99       766
        hard       1.00      0.99      1.00      1941
      medium       1.00      1.00      1.00      1405

    accuracy                           1.00      4112
   macro avg       0.99      1.00      1.00      4112
weighted avg       1.00      1.00      1.00      4112

[[ 766    0    0]
 [   7 1929    5]
 [   4    0 1401]]


## regresssion

In [9]:
best_params_reg = {'alpha': 9942.524780082125, 'fit_intercept': False}

final_reg = Pipeline([
    ('scaler', StandardScaler(with_mean=False)),
    ('reg', Ridge(**best_params_reg))
])

final_reg.fit(X, y_reg)

In [10]:
preds = final_reg.predict(X)
mean_squared_error(y_reg, preds)**0.5

np.float64(0.6227107518819126)

## dumping

In [11]:
import joblib

In [12]:
joblib.dump(tfidf, "models/tfidf.joblib")
joblib.dump(final_clf, "models/clf_model.joblib")
joblib.dump(final_reg, "models/reg_model.joblib")
joblib.dump(le, "models/label_encoder.joblib")

feature_info = {
    "stat_cols": list(X_stat.columns),
    "kw_cols": list(X_kw.columns)
}

joblib.dump(feature_info, "models/feature_info.joblib")

['models/feature_info.joblib']