In [7]:
import numpy as np
import pandas as pd
#import plotly.express as px
#import xgboost as xgb
import optuna
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [8]:
df = pd.read_json("problems_data.jsonl", lines=True)

def build_full_text(df):
    text_cols = ['title', 'description', 'input_description', 'output_description']
    return (
        df[text_cols]
        .fillna('')
        .agg(' '.join, axis=1)
        .str.lower()
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )

df['full_text'] = build_full_text(df)

le = LabelEncoder()
df['problem_class_enc'] = le.fit_transform(df['problem_class'])
print(le.classes_)

y_cls = df['problem_class_enc']
y_reg = df['problem_score']

['easy' 'hard' 'medium']


In [9]:
def extract_stat_features(df):
    text = df['full_text']

    features = pd.DataFrame({
        'char_len': text.str.len(),
        'word_len': text.str.split().str.len(),
        'sentence_count': text.str.count(r'[.!?]') + 1,

        'digit_count': text.str.count(r'\d'),
        'math_symbol_count': text.str.count(r'[\+\-\*/=<>\^]'),

        # NEW (important)
        'log_char_len': np.log1p(text.str.len()),
        'log_word_len': np.log1p(text.str.split().str.len()),

        'constraint_count': text.str.count(r'(<=|>=|<|>)'),
        'power_count': text.str.count(r'\^'),
        'big_o_count': text.str.count(r'o\('),

        'newline_count': text.str.count(r'\n'),
        'colon_count': text.str.count(':'),

        'avg_word_len': text.str.len() / (text.str.split().str.len() + 1),
    })

    return features

KEYWORDS = {
    # Core paradigms
    'dp': ['dp', 'dynamic programming'],
    'graph': ['graph', 'tree', 'dag'],
    'greedy': ['greedy'],
    'binary_search': ['binary search'],
    'two_pointers': ['two pointers'],

    # Data structures
    'segment_tree': ['segment tree'],
    'fenwick': ['fenwick', 'bit'],
    'heap': ['heap', 'priority queue'],
    'disjoint_set': ['dsu', 'union find'],

    # Math / advanced
    'modulo': ['mod', 'modulo'],
    'combinatorics': ['combination', 'permutation'],
    'probability': ['probability', 'expected'],

    # Geometry / strings
    'geometry': ['geometry', 'convex'],
    'string_algo': ['kmp', 'z', 'suffix'],

    # Brute force indicators
    'bruteforce': ['brute', 'naive']
}

def extract_keyword_features(df):
    text = df['full_text']
    feats = {}

    for k, patterns in KEYWORDS.items():
        feats[f'{k}_present'] = text.apply(
            lambda x: any(p in x for p in patterns)
        ).astype(int)

        feats[f'{k}_count'] = text.apply(
            lambda x: sum(p in x for p in patterns)
        )

    return pd.DataFrame(feats)

In [10]:
X_stat = extract_stat_features(df)
X_kw = extract_keyword_features(df)

## classification

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score

In [12]:
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    max_features=30000)
X_tfidf = tfidf.fit_transform(df['full_text'])

X_cls = csr_matrix(hstack([
    X_tfidf,
    X_stat.values,
    X_kw.values
]))

In [24]:
def logreg_objective(trial):
    C = trial.suggest_float("C", 1e-4, 10.0, log=True)
    class_weight = trial.suggest_categorical("class_weight", [None, "balanced"])

    model = Pipeline([
        ('scaler', StandardScaler(with_mean=False)),  # sparse-safe
        ('clf', LogisticRegression(
            C=C,
            solver='lbfgs',
            penalty='l2',
            max_iter=3000,
            class_weight=class_weight,
            n_jobs=-1
        ))
    ])

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    f1s = []

    for tr, va in skf.split(X_cls, y_cls):
        model.fit(X_cls[tr], y_cls[tr])
        preds = model.predict(X_cls[va])
        f1s.append(f1_score(y_cls[va], preds, average='macro'))

    return np.mean(f1s)

study = optuna.create_study(direction="maximize")
study.optimize(logreg_objective, n_trials=100)

print("Best macro-F1:", study.best_value)
print("Best params:", study.best_params)

[I 2026-01-06 14:04:04,161] A new study created in memory with name: no-name-81319586-d8c6-483f-a58a-21b3ca7dbeb9
[I 2026-01-06 14:04:16,517] Trial 0 finished with value: 0.4299453523375288 and parameters: {'C': 0.791562070704174, 'class_weight': None}. Best is trial 0 with value: 0.4299453523375288.
[I 2026-01-06 14:04:28,471] Trial 1 finished with value: 0.4279807186502861 and parameters: {'C': 0.15925810768385337, 'class_weight': None}. Best is trial 0 with value: 0.4299453523375288.
[I 2026-01-06 14:04:40,043] Trial 2 finished with value: 0.4249646920020217 and parameters: {'C': 0.03920712171214759, 'class_weight': None}. Best is trial 0 with value: 0.4299453523375288.
[I 2026-01-06 14:04:51,230] Trial 3 finished with value: 0.4409644604356172 and parameters: {'C': 0.00981933866178707, 'class_weight': 'balanced'}. Best is trial 3 with value: 0.4409644604356172.
[I 2026-01-06 14:05:03,787] Trial 4 finished with value: 0.4068988258607448 and parameters: {'C': 0.00016732722621584583, 

Best macro-F1: 0.4548674007084618
Best params: {'C': 0.00010021197815889274, 'class_weight': 'balanced'}


In [None]:
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y_cls,
    test_size=0.2,
    random_state=42,
    stratify=y_cls
)

final_model = Pipeline([
    ('scaler', StandardScaler(with_mean=False)),
    ('clf', LogisticRegression(
        **study.best_params,
        solver='lbfgs',
        penalty='l2',
        max_iter=3000,
        n_jobs=-1
    ))
])

final_model.fit(X_tr, y_tr)
y_pred = final_model.predict(X_te)

print(classification_report(y_te, y_pred, target_names=le.classes_))
print(confusion_matrix(y_te, y_pred))

              precision    recall  f1-score   support

        easy       0.52      0.37      0.44       153
        hard       0.56      0.64      0.60       389
      medium       0.37      0.35      0.36       281

    accuracy                           0.49       823
   macro avg       0.48      0.46      0.46       823
weighted avg       0.49      0.49      0.49       823

[[ 57  38  58]
 [ 27 250 112]
 [ 25 157  99]]


In [26]:
print(study.best_params)

{'C': 0.00010021197815889274, 'class_weight': 'balanced'}


## regression

In [11]:
X_reg = csr_matrix(hstack([X_tfidf,X_stat.values,X_kw.values]))
y = y_reg.values

In [33]:
def ridge_objective(trial):
    alpha = trial.suggest_float('alpha', 1e-4, 1e4, log=True)
    fit_intercept = trial.suggest_categorical('fit_intercept', [True, False])

    model = Ridge(alpha=alpha, fit_intercept=fit_intercept)
    model = Pipeline([
        ('scaler', StandardScaler(with_mean=False)),  # sparse-safe
        ('reg', Ridge(alpha=alpha, fit_intercept=fit_intercept))
    ])

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    rmses = []

    for tr, va in kf.split(X_reg):
        model.fit(X_reg[tr], y[tr])
        preds = model.predict(X_reg[va])
        rmses.append(mean_squared_error(y[va], preds)**0.5)

    return np.mean(rmses)


study = optuna.create_study(direction='minimize')
study.optimize(ridge_objective, n_trials=100)

print("Best RMSE:", study.best_value)
print("Best Params:", study.best_params)

[I 2026-01-06 15:18:39,494] A new study created in memory with name: no-name-ed34c832-69af-4319-8859-70622e8c11dd
[I 2026-01-06 15:18:40,806] Trial 0 finished with value: 2.1655668909094032 and parameters: {'alpha': 310.2611875038261, 'fit_intercept': False}. Best is trial 0 with value: 2.1655668909094032.
[I 2026-01-06 15:18:44,484] Trial 1 finished with value: 5.9529311089774 and parameters: {'alpha': 0.03378160874182457, 'fit_intercept': True}. Best is trial 0 with value: 2.1655668909094032.
[I 2026-01-06 15:18:48,758] Trial 2 finished with value: 73.91593972231126 and parameters: {'alpha': 0.00015050866556205674, 'fit_intercept': False}. Best is trial 0 with value: 2.1655668909094032.
[I 2026-01-06 15:18:51,714] Trial 3 finished with value: 2.238078948644084 and parameters: {'alpha': 1.4880507139312662, 'fit_intercept': False}. Best is trial 0 with value: 2.1655668909094032.
[I 2026-01-06 15:18:52,156] Trial 4 finished with value: 2.0626434461939285 and parameters: {'alpha': 8500.3

Best RMSE: 2.0547407176407386
Best Params: {'alpha': 9942.524780082125, 'fit_intercept': False}


## version check

In [40]:
import sklearn, joblib


print('numpy',numpy.__version__)
print("sklearn:", sklearn.__version__)
print("pandas",pandas.__version__)
print("scipy",scipy.__version__)
print("joblib:", joblib.__version__)

numpy 2.0.2
sklearn: 1.6.1
pandas 2.2.2
scipy 1.16.3
joblib: 1.5.3


In [41]:
34

34