In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.6.0


In [None]:
import re

import numpy as np
import pandas as pd
import plotly.express as px
import xgboost as xgb
import optuna
from sklearn.preprocessing import LabelEncoder
#from sklearn.decomposition import PCA
#from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [None]:
# Load the problems dataset (JSON Lines: one JSON record per line).
df = pd.read_json("/content/problems_data.jsonl", lines=True)

def build_full_text(df):
    """Merge the four text columns of ``df`` into one normalised string per row.

    Missing values become empty strings, the columns are joined with single
    spaces, the result is lower-cased, every run of whitespace is collapsed to
    one space, and leading/trailing whitespace is removed.

    Returns:
        pandas.Series of strings aligned with ``df``'s index.
    """
    text_cols = ['title', 'description', 'input_description', 'output_description']
    joined = df[text_cols].fillna('').agg(' '.join, axis=1)
    collapsed = joined.str.lower().str.replace(r'\s+', ' ', regex=True)
    return collapsed.str.strip()

# Single normalised text field used by all feature extractors below.
df['full_text'] = build_full_text(df)



# Encode the categorical difficulty label as integers; the printed classes_
# array shows which label maps to which code (position i -> code i).
le = LabelEncoder()
df['problem_class_enc'] = le.fit_transform(df['problem_class'])
print(le.classes_)

# Targets: y_cls for classification (encoded class), y_reg for regression (score).
y_cls = df['problem_class_enc']
y_reg = df['problem_score']

['easy' 'hard' 'medium']


# base

In [None]:
def extract_stat_features(df):
    """Compute simple length and symbol-count statistics from ``df['full_text']``.

    Returns:
        pandas.DataFrame with columns ``char_len``, ``word_len``,
        ``sentence_count``, ``digit_count``, ``math_symbol_count`` and
        ``avg_word_len`` (characters divided by word count + 1).
    """
    text = df['full_text']
    n_chars = text.str.len()
    n_words = text.str.split().str.len()

    columns = {
        'char_len': n_chars,
        'word_len': n_words,
        # Number of sentence terminators plus one.
        'sentence_count': text.str.count(r'[.!?]') + 1,
    }
    regex_counts = {
        'digit_count': r'\d',
        'math_symbol_count': r'[\+\-\*/=<>\^]',
    }
    for column, pattern in regex_counts.items():
        columns[column] = text.str.count(pattern)

    # The +1 in the denominator avoids division by zero for empty texts.
    columns['avg_word_len'] = n_chars / (n_words + 1)
    return pd.DataFrame(columns)


KEYWORDS = ['dp', 'graph', 'tree', 'recursion','segment', 'bit', 'modulo']

def extract_keyword_features(df):
    """Flag, per row, which algorithmic keywords occur in ``df['full_text']``.

    Keywords are matched case-insensitively as whole words (with an optional
    plural "s"), so that e.g. 'bit' does not fire on "arbitrary" and 'tree'
    does not fire on "street".

    Returns:
        pandas.DataFrame with one 0/1 integer column per entry of ``KEYWORDS``.
    """
    text = df['full_text']
    return pd.DataFrame({
        kw: text.str.contains(rf'\b{re.escape(kw)}s?\b', case=False, regex=True).astype(int)
        for kw in KEYWORDS
    })

X_stat_b = extract_stat_features(df)
X_kw_b = extract_keyword_features(df)

# Use the baseline frames computed just above. The previous version referenced
# X_stat / X_kw, which are not defined until a later cell and therefore raise
# NameError on a fresh kernel.
X = pd.concat([X_stat_b, X_kw_b], axis=1)
# Convert once
dtrain = xgb.DMatrix(X.values, label=y_reg.values)

# model (regressor)

In [None]:
def objective(trial):
    """Optuna objective: 5-fold CV RMSE of an XGBoost regressor trained on ``dtrain``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The lowest mean validation RMSE reached during cross-validation.

    Side effects:
        Stores the number of boosting rounds kept by early stopping in the
        trial's user attribute ``'best_num_boost_round'`` so the final model
        can be trained for the same number of rounds.
    """
    # Upper bound on boosting rounds; early stopping normally ends sooner.
    # This is passed as num_boost_round rather than inside `params`, because
    # the native xgb.cv API does not use an 'n_estimators' parameter.
    max_boost_rounds = 5000

    params = {
        'objective': 'reg:squarederror',
        'tree_method': 'hist',
        'eval_metric': 'rmse',
        'verbosity': 0,
        'nthread': -1,  # while using GPU remove it

        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),

        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
        'max_depth': trial.suggest_int('max_depth', 3, 12),

        'subsample': trial.suggest_float('subsample', 0.85, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.85, 1),

        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 20.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 20.0, log=True),

        'min_child_weight': trial.suggest_int('min_child_weight', 1, 15),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),

        'max_bin' : 512
    }

    # max_leaves is only sampled when trees grow loss-guided.
    if params['grow_policy'] == 'lossguide':
        params['max_leaves'] = trial.suggest_int('max_leaves', 32, 512)

    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=max_boost_rounds,
        nfold=5,
        early_stopping_rounds=75,
        seed=42,
        verbose_eval=False
    )

    # With early stopping, xgb.cv returns the history truncated at the best
    # iteration, so the row count is the number of rounds worth training.
    trial.set_user_attr('best_num_boost_round', len(cv_results))

    return cv_results['test-rmse-mean'].min()

# Run the detailed study
# A seeded sampler makes the sequence of suggested hyperparameters repeatable
# across kernel restarts.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=5) # Need more trials for this larger space

print("Best RMSE:", study.best_value)
print("Best Params:", study.best_params)

[I 2026-01-06 05:15:56,233] A new study created in memory with name: no-name-09e7722e-d5a0-48e5-ba04-9caa4a44c840
[I 2026-01-06 05:16:02,117] Trial 0 finished with value: 2.103037858764241 and parameters: {'learning_rate': 0.021833879428815508, 'grow_policy': 'depthwise', 'max_depth': 7, 'subsample': 0.8890433797851562, 'colsample_bytree': 0.914030080581765, 'reg_alpha': 1.0122031643883272e-06, 'reg_lambda': 2.884136937605826e-07, 'min_child_weight': 3, 'gamma': 4.9462383224882636e-05}. Best is trial 0 with value: 2.103037858764241.
[I 2026-01-06 05:16:04,965] Trial 1 finished with value: 2.097233862703798 and parameters: {'learning_rate': 0.009861953371312158, 'grow_policy': 'lossguide', 'max_depth': 6, 'subsample': 0.9244696872432729, 'colsample_bytree': 0.9667337632870183, 'reg_alpha': 4.1792421500890566e-06, 'reg_lambda': 5.7330956074582745e-08, 'min_child_weight': 15, 'gamma': 1.6936505562799922e-05, 'max_leaves': 290}. Best is trial 1 with value: 2.097233862703798.
[I 2026-01-06 

Best RMSE: 2.090253840368315
Best Params: {'learning_rate': 0.05509314498629869, 'grow_policy': 'lossguide', 'max_depth': 3, 'subsample': 0.9910750779428636, 'colsample_bytree': 0.8690314856431749, 'reg_alpha': 1.026458923700637e-08, 'reg_lambda': 2.3637314544574976e-07, 'min_child_weight': 1, 'gamma': 0.001961100558930839, 'max_leaves': 411}


In [None]:
# 1. Extract the best hyperparameters from the study
best_params = dict(study.best_params)

# 2. Add the fixed parameters that were used during tuning, so the final model
#    is trained under the same configuration that was cross-validated.
best_params['objective'] = 'reg:squarederror'
best_params['tree_method'] = 'hist'
best_params['eval_metric'] = 'rmse'
best_params['max_bin'] = 512  # fixed in the objective, hence absent from study.best_params

# 3. Train for the number of rounds early stopping selected during CV when the
#    objective recorded it; otherwise fall back to the previous fixed value.
#    A fixed large round count with no early stopping overfits.
best_params.setdefault(
    'n_estimators',
    study.best_trial.user_attrs.get('best_num_boost_round', 2500),
)

final_model = xgb.XGBRegressor(**best_params)

def eval(y_test, y_pred):
    """Print MSE, RMSE and R2 of ``y_pred`` against ``y_test``; returns None.

    Note: this name shadows Python's built-in ``eval`` for the rest of the notebook.
    """
    mse = mean_squared_error(y_test, y_pred)
    print("MSE : ", mse)
    print("RMSE : ", mse**0.5)
    print("R2 : ", r2_score(y_test, y_pred))

# Hold out 20% of the rows, fit the tuned model on the remaining 80% and report metrics.
# NOTE(review): the Optuna study cross-validated on every row of X (via dtrain),
# so these test rows also took part in hyperparameter selection.
X_train, X_test, y_train, y_test = train_test_split(X, y_reg, test_size=0.2, random_state=42)
final_model.fit(X_train, y_train)
y_pred = final_model.predict(X_test)
eval(y_test,y_pred)

MSE :  5.595162529488553
RMSE :  2.3654095902165766
R2 :  -0.16559944705465224


In [None]:
# Reference point: XGBoost with default hyperparameters on the same split.
temp_model = xgb.XGBRegressor()
temp_model.fit(X_train, y_train)
y_pred = temp_model.predict(X_test)

# Prints MSE, then R2.
print(mean_squared_error(y_test,y_pred))
print(r2_score(y_test,y_pred))

5.638804092275246
-0.1746909758859727


In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Reference point: ordinary least squares on the same split.
temp_model = LinearRegression()
temp_model.fit(X_train, y_train)
y_pred = temp_model.predict(X_test)

# Prints MSE, then R2.
print(mean_squared_error(y_test,y_pred))
print(r2_score(y_test,y_pred))

4.503896088550488
0.06173614032871855


In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Reference point: random forest with default hyperparameters on the same split.
# random_state is fixed so the reported score is reproducible between runs.
temp_model = RandomForestRegressor(random_state=42)
temp_model.fit(X_train, y_train)
y_pred = temp_model.predict(X_test)

# Prints MSE, then R2.
print(mean_squared_error(y_test,y_pred))
print(r2_score(y_test,y_pred))

4.907173374240584
-0.022275678583187686


In [None]:
# Repeat the hold-out evaluation on a different random split (random_state=1)
# to see how much the score depends on the split.
X_train, X_test, y_train, y_test = train_test_split(X, y_reg, test_size=0.2, random_state=1)
final_model.fit(X_train, y_train)
y_pred = final_model.predict(X_test)
eval(y_test,y_pred)

MSE :  4.534497553911927
RMSE :  2.129435970841088
R2 :  0.03511508986132461


In [None]:
# Same hold-out evaluation with split seed random_state=99.
X_train, X_test, y_train, y_test = train_test_split(X, y_reg, test_size=0.2, random_state=99)
final_model.fit(X_train, y_train)
y_pred = final_model.predict(X_test)
eval(y_test,y_pred)

MSE :  4.650258273031444
RMSE :  2.156445750078458
R2 :  0.04689594480565473


In [None]:
# Same hold-out evaluation with split seed random_state=7.
X_train, X_test, y_train, y_test = train_test_split(X, y_reg, test_size=0.2, random_state=7)
final_model.fit(X_train, y_train)
y_pred = final_model.predict(X_test)
eval(y_test,y_pred)

MSE :  4.506656871163509
RMSE :  2.122888803296939
R2 :  0.02119930565535455


In [None]:
# Same hold-out evaluation with split seed random_state=564.
X_train, X_test, y_train, y_test = train_test_split(X, y_reg, test_size=0.2, random_state=564)
final_model.fit(X_train, y_train)
y_pred = final_model.predict(X_test)
eval(y_test,y_pred)

MSE :  4.541801961626288
RMSE :  2.131150384563766
R2 :  0.038922052564470566


# new

In [None]:
# Reload the dataset from disk so this section starts from an unmodified frame.
df = pd.read_json("/content/problems_data.jsonl", lines=True)

def build_full_text(df):
    """Merge the four text columns of ``df`` into one normalised string per row.

    Missing values become empty strings, the columns are joined with single
    spaces, the result is lower-cased, every run of whitespace is collapsed to
    one space, and leading/trailing whitespace is removed.

    Returns:
        pandas.Series of strings aligned with ``df``'s index.
    """
    text_cols = ['title', 'description', 'input_description', 'output_description']
    joined = df[text_cols].fillna('').agg(' '.join, axis=1)
    collapsed = joined.str.lower().str.replace(r'\s+', ' ', regex=True)
    return collapsed.str.strip()

# Single normalised text field used by all feature extractors below.
df['full_text'] = build_full_text(df)

# Encode the categorical difficulty label as integers; the printed classes_
# array shows which label maps to which code (position i -> code i).
le = LabelEncoder()
df['problem_class_enc'] = le.fit_transform(df['problem_class'])
print(le.classes_)

# Targets: y_cls for classification (encoded class), y_reg for regression (score).
y_cls = df['problem_class_enc']
y_reg = df['problem_score']

['easy' 'hard' 'medium']


In [None]:
def extract_stat_features(df):
    """Compute length, punctuation and symbol-count features for each problem.

    Args:
        df: DataFrame with a ``full_text`` column. If any of the raw text
            columns (``title``, ``description``, ``input_description``,
            ``output_description``) are also present, they are used for the
            newline count.

    Returns:
        pandas.DataFrame with columns ``char_len``, ``word_len``,
        ``sentence_count``, ``digit_count``, ``math_symbol_count``,
        ``log_char_len``, ``log_word_len``, ``constraint_count``,
        ``power_count``, ``big_o_count``, ``newline_count``, ``colon_count``
        and ``avg_word_len``.
    """
    text = df['full_text']

    # Computed once and reused for the raw, log-scaled and ratio features.
    char_len = text.str.len()
    word_len = text.str.split().str.len()

    # build_full_text collapses every whitespace run into one space, so the
    # text it produces contains no newlines and counting them there always
    # yields 0. Count line breaks in the raw columns when they are available,
    # and only fall back to 'full_text' otherwise.
    raw_cols = [
        col
        for col in ('title', 'description', 'input_description', 'output_description')
        if col in df.columns
    ]
    if raw_cols:
        newline_count = sum(
            df[col].fillna('').astype(str).str.count(r'\n') for col in raw_cols
        )
    else:
        newline_count = text.str.count(r'\n')

    features = pd.DataFrame({
        'char_len': char_len,
        'word_len': word_len,
        'sentence_count': text.str.count(r'[.!?]') + 1,

        'digit_count': text.str.count(r'\d'),
        'math_symbol_count': text.str.count(r'[\+\-\*/=<>\^]'),

        # Log-scaled lengths.
        'log_char_len': np.log1p(char_len),
        'log_word_len': np.log1p(word_len),

        'constraint_count': text.str.count(r'(<=|>=|<|>)'),
        'power_count': text.str.count(r'\^'),
        # 'full_text' is lower-cased by build_full_text, hence the lowercase "o(".
        'big_o_count': text.str.count(r'o\('),

        'newline_count': newline_count,
        'colon_count': text.str.count(':'),

        # The +1 in the denominator avoids division by zero for empty texts.
        'avg_word_len': char_len / (word_len + 1),
    })

    return features

# Feature group -> lowercase phrases that indicate it. Phrases are matched as
# whole words, so short tokens such as 'mod' or 'bit' do not fire inside
# longer words ("mode", "arbitrary").
KEYWORDS = {
    # Core paradigms
    'dp': ['dp', 'dynamic programming'],
    'graph': ['graph', 'tree', 'dag'],
    'greedy': ['greedy'],
    'binary_search': ['binary search'],
    'two_pointers': ['two pointers'],

    # Data structures
    'segment_tree': ['segment tree'],
    'fenwick': ['fenwick', 'bit'],
    'heap': ['heap', 'priority queue'],
    'disjoint_set': ['dsu', 'union find'],

    # Math / advanced
    'modulo': ['mod', 'modulo'],
    'combinatorics': ['combination', 'permutation'],
    'probability': ['probability', 'expected'],

    # Geometry / strings
    'geometry': ['geometry', 'convex'],
    # A bare 'z' matched every text containing the letter z; spell out the
    # Z-algorithm phrases instead.
    'string_algo': ['kmp', 'z-function', 'z function', 'z algorithm', 'suffix'],

    # Brute force indicators
    'bruteforce': ['brute', 'naive']
}


def _keyword_regex(patterns):
    """Return a regex matching any of ``patterns`` as a whole word, with an optional plural 's'."""
    alternatives = '|'.join(re.escape(p) for p in patterns)
    return rf'\b(?:{alternatives})s?\b'


def extract_keyword_features(df):
    """Build keyword presence/count features from ``df['full_text']``.

    For every group ``k`` in ``KEYWORDS`` two integer columns are produced:

    * ``{k}_present`` - 1 if any phrase of the group occurs, else 0.
    * ``{k}_count``   - total number of occurrences of the group's phrases.

    Matching is case-sensitive against the lowercase phrases in ``KEYWORDS``,
    so the text is expected to be lower-cased already (as build_full_text does).

    Returns:
        pandas.DataFrame with ``2 * len(KEYWORDS)`` columns, ordered
        ``present, count`` per group in ``KEYWORDS`` order.
    """
    text = df['full_text']
    feats = {}

    for k, patterns in KEYWORDS.items():
        occurrences = text.str.count(_keyword_regex(patterns))
        feats[f'{k}_present'] = (occurrences > 0).astype(int)
        feats[f'{k}_count'] = occurrences

    return pd.DataFrame(feats)


In [None]:
# Extended feature set: statistical features plus keyword presence/count features.
X_stat = extract_stat_features(df)
X_kw = extract_keyword_features(df)

X = pd.concat([X_stat,X_kw],axis=1)
# Convert once
dtrain = xgb.DMatrix(X.values, label=y_reg.values)

In [None]:
# Display the combined feature matrix (pandas truncates the rendering of large frames).
X

Unnamed: 0,char_len,word_len,sentence_count,digit_count,math_symbol_count,log_char_len,log_word_len,constraint_count,power_count,big_o_count,...,combinatorics_present,combinatorics_count,probability_present,probability_count,geometry_present,geometry_count,string_algo_present,string_algo_count,bruteforce_present,bruteforce_count
0,1567,285,15,15,3,7.357556,5.655992,0,0,0,...,0,0,0,0,0,0,1,1,0,0
1,1329,239,14,12,3,7.192934,5.480639,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1227,218,12,16,4,7.113142,5.389072,1,3,0,...,0,0,1,1,0,0,1,1,0,0
3,1301,236,13,16,0,7.171657,5.468060,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2120,406,18,8,0,7.659643,6.008813,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4107,425,70,7,13,2,6.054439,4.262680,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4108,137,19,5,4,0,4.927254,2.995732,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4109,140,20,3,0,0,4.948760,3.044522,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4110,102,19,4,2,0,4.634729,2.995732,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
def objective(trial):
    """Optuna objective: 5-fold CV RMSE of an XGBoost regressor trained on ``dtrain``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The lowest mean validation RMSE reached during cross-validation.

    Side effects:
        Stores the number of boosting rounds kept by early stopping in the
        trial's user attribute ``'best_num_boost_round'`` so the final model
        can be trained for the same number of rounds.
    """
    grow_policy = trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide'])

    params = {
        'objective': 'reg:squarederror',
        'tree_method': 'hist',
        'eval_metric': 'rmse',
        'verbosity': 0,

        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.08, log=True),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),

        'reg_alpha': trial.suggest_float('reg_alpha', 1e-6, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-6, 10.0, log=True),

        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'gamma': trial.suggest_float('gamma', 1e-6, 1.0, log=True),

        'max_delta_step': trial.suggest_int('max_delta_step', 0, 10),

        'grow_policy': grow_policy,
        'max_bin': 512,
    }

    # Tree size is controlled by depth for depthwise growth and by leaf count
    # for loss-guided growth, so only the relevant one is sampled.
    if grow_policy == 'depthwise':
        params['max_depth'] = trial.suggest_int('max_depth', 4, 12)
    else:
        params['max_leaves'] = trial.suggest_int('max_leaves', 32, 512)

    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=3000,
        nfold=5,
        early_stopping_rounds=100,
        seed=42,
        verbose_eval=False
    )

    # With early stopping, xgb.cv returns the history truncated at the best
    # iteration, so the row count is the number of rounds worth training.
    trial.set_user_attr('best_num_boost_round', len(cv_results))

    return cv_results['test-rmse-mean'].min()

# Run the detailed study
# A seeded sampler makes the sequence of suggested hyperparameters repeatable
# across kernel restarts.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

print("Best RMSE:", study.best_value)
print("Best Params:", study.best_params)


[I 2026-01-06 06:57:37,364] A new study created in memory with name: no-name-8cc55406-0af8-4dda-850c-1f28b18b6539
[I 2026-01-06 06:57:39,212] Trial 0 finished with value: 2.0897851058781773 and parameters: {'grow_policy': 'depthwise', 'learning_rate': 0.07049082147068857, 'subsample': 0.7914248259976527, 'colsample_bytree': 0.8437447699339847, 'reg_alpha': 1.5049365586240719e-06, 'reg_lambda': 3.611434021003896, 'min_child_weight': 9, 'gamma': 0.09966427385533541, 'max_delta_step': 0, 'max_depth': 7}. Best is trial 0 with value: 2.0897851058781773.
[I 2026-01-06 06:57:41,195] Trial 1 finished with value: 2.0812035015736923 and parameters: {'grow_policy': 'lossguide', 'learning_rate': 0.04055102070020644, 'subsample': 0.9231627555423484, 'colsample_bytree': 0.8705356152907755, 'reg_alpha': 0.002639574097121546, 'reg_lambda': 1.4913050397508451e-06, 'min_child_weight': 6, 'gamma': 0.005432754087529552, 'max_delta_step': 7, 'max_leaves': 234}. Best is trial 1 with value: 2.081203501573692

Best RMSE: 2.0735627085564357
Best Params: {'grow_policy': 'depthwise', 'learning_rate': 0.0426214327214927, 'subsample': 0.8378152132021577, 'colsample_bytree': 0.8849060409286044, 'reg_alpha': 1.4497574443384956e-06, 'reg_lambda': 0.11782796709863723, 'min_child_weight': 8, 'gamma': 0.0012764385483885006, 'max_delta_step': 6, 'max_depth': 4}


In [None]:
# 1. Extract the best hyperparameters from the study
best_params = study.best_params

# 2. Add the necessary fixed parameters
best_params['objective'] = 'reg:squarederror'
best_params['tree_method'] = 'hist'
best_params['eval_metric'] = 'rmse'

if 'n_estimators' not in best_params:
    best_params['n_estimators'] = 2500

final_model = xgb.XGBRegressor(**best_params)

def eval(y_test, y_pred):
    """Print MSE, RMSE and R2 of ``y_pred`` against ``y_test``; returns None.

    Note: this name shadows Python's built-in ``eval`` for the rest of the notebook.
    """
    mse = mean_squared_error(y_test, y_pred)
    print("MSE : ", mse)
    print("RMSE : ", mse**0.5)
    print("R2 : ", r2_score(y_test, y_pred))

# Hold out 20% of the rows, fit the tuned model on the remaining 80% and report metrics.
# NOTE(review): the Optuna study cross-validated on every row of X (via dtrain),
# so these test rows also took part in hyperparameter selection.
X_train, X_test, y_train, y_test = train_test_split(X, y_reg, test_size=0.2, random_state=42)
final_model.fit(X_train, y_train)
y_pred = final_model.predict(X_test)
eval(y_test,y_pred)

# residual learning

In [None]:
# Residual learning, step 1: obtain out-of-fold Ridge predictions for every row,
# then let XGBoost model what Ridge could not explain (y - base_preds).
X_base = pd.concat([X_stat, X_kw], axis=1)
y = y_reg.values

ridge = Ridge(alpha=1.0)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
base_preds = np.zeros(len(X_base))

# Each row is predicted by a Ridge model that was fitted without that row's fold.
for tr, va in kf.split(X_base):
    ridge.fit(X_base.iloc[tr], y[tr])
    base_preds[va] = ridge.predict(X_base.iloc[va])

residuals = y - base_preds
print("Baseline RMSE:", mean_squared_error(y, base_preds)**0.5)
# NOTE(review): the DMatrix is built from X rather than X_base; both are the
# same concat of X_stat and X_kw here, provided X has not been reassigned since.
dtrain_res = xgb.DMatrix(X.values, label=residuals)

In [None]:
def objective(trial):
    """Optuna objective: 5-fold CV RMSE of an XGBoost regressor on the residual targets in ``dtrain_res``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The lowest mean validation RMSE reached during cross-validation.

    Side effects:
        Stores the number of boosting rounds kept by early stopping in the
        trial's user attribute ``'best_num_boost_round'`` so the final model
        can be trained for the same number of rounds.
    """
    grow_policy = trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide'])

    params = {
        'objective': 'reg:squarederror',
        'tree_method': 'hist',
        'eval_metric': 'rmse',
        'verbosity': 0,

        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.08, log=True),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),

        'reg_alpha': trial.suggest_float('reg_alpha', 1e-6, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-6, 10.0, log=True),

        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'gamma': trial.suggest_float('gamma', 1e-6, 1.0, log=True),

        'max_delta_step': trial.suggest_int('max_delta_step', 0, 10),

        'grow_policy': grow_policy,
        'max_bin': 512,
    }

    # Tree size is controlled by depth for depthwise growth and by leaf count
    # for loss-guided growth, so only the relevant one is sampled.
    if grow_policy == 'depthwise':
        params['max_depth'] = trial.suggest_int('max_depth', 4, 12)
    else:
        params['max_leaves'] = trial.suggest_int('max_leaves', 32, 512)

    cv_results = xgb.cv(
        params,
        dtrain_res,
        num_boost_round=3000,
        nfold=5,
        early_stopping_rounds=100,
        seed=42,
        verbose_eval=False
    )

    # With early stopping, xgb.cv returns the history truncated at the best
    # iteration, so the row count is the number of rounds worth training.
    trial.set_user_attr('best_num_boost_round', len(cv_results))

    return cv_results['test-rmse-mean'].min()

# Run the detailed study
# A seeded sampler makes the sequence of suggested hyperparameters repeatable
# across kernel restarts.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=15)

print("Best RMSE:", study.best_value)
print("Best Params:", study.best_params)


[I 2026-01-06 07:24:47,901] A new study created in memory with name: no-name-3fd10a0c-c491-4fb3-a95a-b6f691a0a71c
[I 2026-01-06 07:24:49,979] Trial 0 finished with value: 2.084381103562434 and parameters: {'grow_policy': 'lossguide', 'learning_rate': 0.01811734510680885, 'subsample': 0.9406529356824199, 'colsample_bytree': 0.9097009234634512, 'reg_alpha': 1.1623719536871289, 'reg_lambda': 0.612056352782902, 'min_child_weight': 2, 'gamma': 0.01563841368951787, 'max_delta_step': 8, 'max_leaves': 429}. Best is trial 0 with value: 2.084381103562434.
[I 2026-01-06 07:24:51,946] Trial 1 finished with value: 2.0833628757262033 and parameters: {'grow_policy': 'lossguide', 'learning_rate': 0.010513455695328168, 'subsample': 0.7914876202987818, 'colsample_bytree': 0.7922827778189475, 'reg_alpha': 3.0460936770825775, 'reg_lambda': 0.000253720318143337, 'min_child_weight': 10, 'gamma': 0.02910663924341847, 'max_delta_step': 6, 'max_leaves': 395}. Best is trial 1 with value: 2.0833628757262033.
[I 

Best RMSE: 2.0818460544795405
Best Params: {'grow_policy': 'lossguide', 'learning_rate': 0.0107371107704702, 'subsample': 0.7046386733811062, 'colsample_bytree': 0.9353926019916539, 'reg_alpha': 0.00011505091844551486, 'reg_lambda': 0.06941040371459081, 'min_child_weight': 19, 'gamma': 0.000674333100178526, 'max_delta_step': 8, 'max_leaves': 256}


In [None]:
# 1. Extract the best hyperparameters from the study
best_params = dict(study.best_params)

# 2. Add the fixed parameters that were used during tuning, so the final model
#    is trained under the same configuration that was cross-validated.
best_params['objective'] = 'reg:squarederror'
best_params['tree_method'] = 'hist'
best_params['eval_metric'] = 'rmse'
best_params['max_bin'] = 512  # fixed in the objective, hence absent from study.best_params

# 3. Train for the number of rounds early stopping selected during CV when the
#    objective recorded it; otherwise fall back to the previous fixed value.
#    A fixed large round count with no early stopping overfits.
best_params.setdefault(
    'n_estimators',
    study.best_trial.user_attrs.get('best_num_boost_round', 2500),
)

def eval(y_test, y_pred):
    """Print MSE, RMSE and R2 of ``y_pred`` against ``y_test``; returns None.

    Note: this name shadows Python's built-in ``eval`` for the rest of the notebook.
    """
    mse = mean_squared_error(y_test, y_pred)
    print("MSE : ", mse)
    print("RMSE : ", mse**0.5)
    print("R2 : ", r2_score(y_test, y_pred))

# Two-stage model: Ridge gives the base prediction, XGBoost predicts Ridge's residual.
base_model =  Ridge(alpha=1.0)
xgb_model = xgb.XGBRegressor(**best_params)

X_train, X_test, y_train, y_test = train_test_split(X, y_reg, test_size=0.2, random_state=42)

# Stage 1: fit Ridge and compute its residuals on both splits.
# NOTE(review): train_res are in-sample residuals (Ridge predicts rows it was
# fitted on), whereas the study above was tuned on out-of-fold residuals.
base_model.fit(X_train, y_train)
train_base_pred = base_model.predict(X_train)
test_base_pred = base_model.predict(X_test)
train_res = y_train - train_base_pred
test_res = y_test - test_base_pred
print('performance of base model')
eval(y_test,test_base_pred)

# Stage 2: fit XGBoost on the training residuals.
xgb_model.fit(X_train,train_res)
test_xgb_pred = xgb_model.predict(X_test)

# Scored against the residual targets, not against y_test.
print('performance of XGB model')
eval(test_res,test_xgb_pred)

# Final prediction is the sum of both stages.
y_pred = test_base_pred + test_xgb_pred
print('overall performance')
eval(y_test,y_pred)

# new 2

In [None]:
# Rebuild the feature matrix and the regression target as a NumPy array.
X_stat = extract_stat_features(df)
X_kw = extract_keyword_features(df)

X = pd.concat([X_stat,X_kw],axis=1)
y = y_reg.values

In [None]:
# Out-of-fold Ridge predictions: each row is predicted by a model fitted
# without that row's fold, so the residuals are not in-sample.
base_model = Ridge(alpha=1.0)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
base_preds = np.zeros(len(X))

for tr, va in kf.split(X):
    base_model.fit(X.iloc[tr], y[tr])
    base_preds[va] = base_model.predict(X.iloc[va])

# XGBoost is tuned below to predict these residuals.
residuals = y - base_preds
print("Baseline RMSE:", mean_squared_error(y, base_preds)**0.5)
dtrain_res = xgb.DMatrix(X.values, label=residuals)

Baseline RMSE: 2.0842505758577268


In [None]:
def objective(trial):
    """Optuna objective: 5-fold CV RMSE of an XGBoost regressor on the residual targets in ``dtrain_res``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The lowest mean validation RMSE reached during cross-validation.

    Side effects:
        Stores the number of boosting rounds kept by early stopping in the
        trial's user attribute ``'best_num_boost_round'`` so the final model
        can be trained for the same number of rounds.
    """
    grow_policy = trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide'])

    params = {
        'objective': 'reg:squarederror',
        'tree_method': 'hist',
        'eval_metric': 'rmse',
        'verbosity': 0,

        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.08, log=True),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),

        'reg_alpha': trial.suggest_float('reg_alpha', 1e-6, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-6, 10.0, log=True),

        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'gamma': trial.suggest_float('gamma', 1e-6, 1.0, log=True),

        'max_delta_step': trial.suggest_int('max_delta_step', 0, 10),

        'grow_policy': grow_policy,
        'max_bin': 512,
    }

    # Tree size is controlled by depth for depthwise growth and by leaf count
    # for loss-guided growth, so only the relevant one is sampled.
    if grow_policy == 'depthwise':
        params['max_depth'] = trial.suggest_int('max_depth', 4, 12)
    else:
        params['max_leaves'] = trial.suggest_int('max_leaves', 32, 512)

    cv_results = xgb.cv(
        params,
        dtrain_res,
        num_boost_round=3000,
        nfold=5,
        early_stopping_rounds=100,
        seed=42,
        verbose_eval=False
    )

    # With early stopping, xgb.cv returns the history truncated at the best
    # iteration, so the row count is the number of rounds worth training.
    trial.set_user_attr('best_num_boost_round', len(cv_results))

    return cv_results['test-rmse-mean'].min()

# Run the detailed study
# A seeded sampler makes the sequence of suggested hyperparameters repeatable
# across kernel restarts.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=15)

print("Best RMSE:", study.best_value)
print("Best Params:", study.best_params)

[I 2026-01-06 10:53:29,524] A new study created in memory with name: no-name-a155ebd9-71c2-4172-89fc-c2088cd91a24
[I 2026-01-06 10:53:34,392] Trial 0 finished with value: 2.083277734655115 and parameters: {'grow_policy': 'lossguide', 'learning_rate': 0.017711089000257196, 'subsample': 0.8850142101397589, 'colsample_bytree': 0.9129447546355627, 'reg_alpha': 0.011978876106446422, 'reg_lambda': 0.1365333140038803, 'min_child_weight': 5, 'gamma': 1.9905520476609824e-06, 'max_delta_step': 9, 'max_leaves': 49}. Best is trial 0 with value: 2.083277734655115.
[I 2026-01-06 10:53:36,066] Trial 1 finished with value: 2.0853369041389573 and parameters: {'grow_policy': 'depthwise', 'learning_rate': 0.07857126345501612, 'subsample': 0.8085844215365594, 'colsample_bytree': 0.7330890104482536, 'reg_alpha': 5.647425044451438e-06, 'reg_lambda': 9.026401298655605, 'min_child_weight': 4, 'gamma': 0.0022419209780294154, 'max_delta_step': 7, 'max_depth': 7}. Best is trial 0 with value: 2.083277734655115.
[

Best RMSE: 2.0827027865928835
Best Params: {'grow_policy': 'lossguide', 'learning_rate': 0.030763710949571452, 'subsample': 0.9175042749877119, 'colsample_bytree': 0.842436521259408, 'reg_alpha': 7.830549021819599, 'reg_lambda': 0.0010015913887415513, 'min_child_weight': 15, 'gamma': 1.0590110495780703e-05, 'max_delta_step': 5, 'max_leaves': 151}


# new 3

In [None]:
def extract_stat_features(df):
    """Compute simple length and symbol-count statistics from ``df['full_text']``.

    Returns:
        pandas.DataFrame with columns ``char_len``, ``word_len``,
        ``sentence_count``, ``digit_count``, ``math_symbol_count`` and
        ``avg_word_len`` (characters divided by word count + 1).
    """
    text = df['full_text']
    n_chars = text.str.len()
    n_words = text.str.split().str.len()

    columns = {
        'char_len': n_chars,
        'word_len': n_words,
        # Number of sentence terminators plus one.
        'sentence_count': text.str.count(r'[.!?]') + 1,
    }
    regex_counts = {
        'digit_count': r'\d',
        'math_symbol_count': r'[\+\-\*/=<>\^]',
    }
    for column, pattern in regex_counts.items():
        columns[column] = text.str.count(pattern)

    # The +1 in the denominator avoids division by zero for empty texts.
    columns['avg_word_len'] = n_chars / (n_words + 1)
    return pd.DataFrame(columns)


KEYWORDS = ['dp', 'graph', 'tree', 'recursion','segment', 'bit', 'modulo']

def extract_keyword_features(df):
    """Flag, per row, which algorithmic keywords occur in ``df['full_text']``.

    Keywords are matched case-insensitively as whole words (with an optional
    plural "s"), so that e.g. 'bit' does not fire on "arbitrary" and 'tree'
    does not fire on "street".

    Returns:
        pandas.DataFrame with one 0/1 integer column per entry of ``KEYWORDS``.
    """
    text = df['full_text']
    return pd.DataFrame({
        kw: text.str.contains(rf'\b{re.escape(kw)}s?\b', case=False, regex=True).astype(int)
        for kw in KEYWORDS
    })

# Baseline feature frames, built with the baseline extractors redefined just above in this cell.
X_stat_b = extract_stat_features(df)
X_kw_b = extract_keyword_features(df)

In [None]:
# X_base: baseline features for the Ridge stage.
# X_full: built from X_stat / X_kw, which were computed in an earlier cell;
# they are not recomputed here, so they keep whatever those cells produced.
X_base = pd.concat([X_stat_b, X_kw_b], axis=1)
X_full = pd.concat([X_stat,X_kw], axis=1)
y = y_reg.values

# Split both feature sets and the target with the same row partition.
Xb_tr, Xb_te, Xf_tr, Xf_te, y_tr, y_te = train_test_split( X_base, X_full, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.linear_model import RidgeCV

# Stage 1: Ridge on the baseline features, with alpha chosen by RidgeCV from
# 20 log-spaced candidates between 1e-3 and 1e3.
ridge = RidgeCV(alphas=np.logspace(-3, 3, 20))
ridge.fit(Xb_tr, y_tr)

base_tr_pred = ridge.predict(Xb_tr)
base_te_pred = ridge.predict(Xb_te)

# Residuals that the XGBoost stage will model; train_res is in-sample.
train_res = y_tr - base_tr_pred
test_res  = y_te - base_te_pred

print("Baseline RMSE:", mean_squared_error(y_te, base_te_pred)**0.5)

Baseline RMSE: 2.1224586068901385


In [None]:
# Training-split residuals only, so the held-out rows stay out of hyperparameter tuning.
dtrain_res = xgb.DMatrix(Xf_tr.values, label=train_res)

def objective(trial):
    """Optuna objective: 5-fold CV RMSE of an XGBoost regressor on the residual targets in ``dtrain_res``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The lowest mean validation RMSE reached during cross-validation.

    Side effects:
        Stores the number of boosting rounds kept by early stopping in the
        trial's user attribute ``'best_num_boost_round'`` so the final model
        can be trained for the same number of rounds.
    """
    grow_policy = trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide'])

    params = {
        'objective': 'reg:squarederror',
        'tree_method': 'hist',
        'eval_metric': 'rmse',
        'verbosity': 0,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.08, log=True),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-6, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-6, 10.0, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'gamma': trial.suggest_float('gamma', 1e-6, 1.0, log=True),
        'max_delta_step': trial.suggest_int('max_delta_step', 0, 10),
        'grow_policy': grow_policy,
        'max_bin': 512,
    }

    # Tree size is controlled by depth for depthwise growth and by leaf count
    # for loss-guided growth, so only the relevant one is sampled.
    if grow_policy == 'depthwise':
        params['max_depth'] = trial.suggest_int('max_depth', 4, 12)
    else:
        params['max_leaves'] = trial.suggest_int('max_leaves', 32, 512)

    cv_results = xgb.cv(
        params,
        dtrain_res,
        num_boost_round=3000,
        nfold=5,
        early_stopping_rounds=100,
        seed=42,
        verbose_eval=False,
    )

    # With early stopping, xgb.cv returns the history truncated at the best
    # iteration, so the row count is the number of rounds worth training.
    trial.set_user_attr('best_num_boost_round', len(cv_results))

    return cv_results['test-rmse-mean'].min()

# Run the detailed study
# A seeded sampler makes the sequence of suggested hyperparameters repeatable
# across kernel restarts.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=15)

print("Best RMSE:", study.best_value)
print("Best Params:", study.best_params)

[I 2026-01-06 11:08:39,884] A new study created in memory with name: no-name-f5c5d181-6340-4a39-af91-efdb2f7b5f02
[I 2026-01-06 11:08:44,111] Trial 0 finished with value: 2.060536391386007 and parameters: {'grow_policy': 'lossguide', 'learning_rate': 0.020454324943686184, 'subsample': 0.8335406782589164, 'colsample_bytree': 0.8722871787535669, 'reg_alpha': 0.00043878940688485803, 'reg_lambda': 8.72698030287171e-06, 'min_child_weight': 16, 'gamma': 0.009686477419101634, 'max_delta_step': 0, 'max_leaves': 454}. Best is trial 0 with value: 2.060536391386007.
[I 2026-01-06 11:08:45,327] Trial 1 finished with value: 2.0599909656764845 and parameters: {'grow_policy': 'depthwise', 'learning_rate': 0.027681362240579228, 'subsample': 0.7195863970554476, 'colsample_bytree': 0.9681171156379416, 'reg_alpha': 5.719117843533548e-06, 'reg_lambda': 4.737612533056337e-06, 'min_child_weight': 5, 'gamma': 0.0011681535622686358, 'max_delta_step': 9, 'max_depth': 4}. Best is trial 1 with value: 2.059990965

Best RMSE: 2.0584730517408536
Best Params: {'grow_policy': 'depthwise', 'learning_rate': 0.0643519580423065, 'subsample': 0.7044228965420287, 'colsample_bytree': 0.9966342029537365, 'reg_alpha': 0.02423503202587419, 'reg_lambda': 5.956310469409797, 'min_child_weight': 1, 'gamma': 1.0811500159914476e-06, 'max_delta_step': 4, 'max_depth': 4}


In [None]:
# Final residual model. It mirrors the tuning configuration:
#  - max_bin=512 was fixed inside the objective and is therefore not part of
#    study.best_params; it has to be passed explicitly.
#  - n_estimators uses the round count early stopping selected during CV when
#    the objective recorded it. Falling back to a fixed 3000 rounds with no
#    early stopping overfits the training residuals.
xgb_model = xgb.XGBRegressor(
    **study.best_params,
    objective='reg:squarederror',
    tree_method='hist',
    eval_metric='rmse',
    max_bin=512,
    n_estimators=study.best_trial.user_attrs.get('best_num_boost_round', 3000)
)

xgb_model.fit(Xf_tr, train_res)

In [None]:
# Final prediction = Ridge base prediction + XGBoost residual prediction, scored on the held-out split.
test_xgb_pred = xgb_model.predict(Xf_te)
y_pred = base_te_pred + test_xgb_pred

print("FINAL MODEL PERFORMANCE")
print("RMSE:", mean_squared_error(y_te, y_pred)**0.5)
print("MAE :", mean_absolute_error(y_te, y_pred))
print("R2  :", r2_score(y_te, y_pred))

FINAL MODEL PERFORMANCE
RMSE: 2.3804738233244094
MAE : 1.9503778232409565
R2  : -0.18049308248470974


# ridge


In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold

In [None]:
# Feature matrix and target for the Ridge-only experiment.
# X_stat / X_kw come from an earlier cell; they are not recomputed here.
X = pd.concat([X_stat, X_kw], axis=1)
y = y_reg.values

def ridge_objective(trial):
    """Optuna objective: mean 5-fold CV RMSE of a Ridge regressor on ``X`` / ``y``.

    Samples ``alpha`` (log-uniform in [1e-4, 1e4]) and ``fit_intercept``, then
    returns the average of the per-fold validation RMSEs.
    """
    estimator = Ridge(
        alpha=trial.suggest_float('alpha', 1e-4, 1e4, log=True),
        fit_intercept=trial.suggest_categorical('fit_intercept', [True, False]),
    )
    splitter = KFold(n_splits=5, shuffle=True, random_state=42)

    def fold_rmse(train_idx, valid_idx):
        # Fit on the training fold and score on the validation fold.
        estimator.fit(X.iloc[train_idx], y[train_idx])
        fold_preds = estimator.predict(X.iloc[valid_idx])
        return mean_squared_error(y[valid_idx], fold_preds)**0.5

    return np.mean([fold_rmse(tr, va) for tr, va in splitter.split(X)])


# A seeded sampler makes the sequence of suggested hyperparameters repeatable
# across kernel restarts.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(ridge_objective, n_trials=50)

print("Best RMSE:", study.best_value)
print("Best Params:", study.best_params)

[I 2026-01-06 11:32:03,087] A new study created in memory with name: no-name-6c58cc27-a357-4c8e-bf72-2e8be5a5f0a9
[I 2026-01-06 11:32:03,142] Trial 0 finished with value: 2.0860860721445937 and parameters: {'alpha': 0.04688460071519945, 'fit_intercept': True}. Best is trial 0 with value: 2.0860860721445937.
[I 2026-01-06 11:32:03,195] Trial 1 finished with value: 2.083402464834485 and parameters: {'alpha': 86.35446239127042, 'fit_intercept': False}. Best is trial 1 with value: 2.083402464834485.
[I 2026-01-06 11:32:03,243] Trial 2 finished with value: 2.0844815642901047 and parameters: {'alpha': 0.4864984878259797, 'fit_intercept': False}. Best is trial 1 with value: 2.083402464834485.
[I 2026-01-06 11:32:03,294] Trial 3 finished with value: 2.085942875231835 and parameters: {'alpha': 0.06782785834580757, 'fit_intercept': True}. Best is trial 1 with value: 2.083402464834485.
[I 2026-01-06 11:32:03,344] Trial 4 finished with value: 2.0854181017695717 and parameters: {'alpha': 0.18329069

Best RMSE: 2.0819781324649664
Best Params: {'alpha': 20.008192174296234, 'fit_intercept': False}


In [None]:
# Re-run 5-fold CV with the best Ridge hyperparameters and collect per-fold metrics.
# The folds are the same ones used inside ridge_objective (same KFold settings).
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse = []
mae = []
r2 = []
best_ridge = Ridge(**study.best_params)

for tr, va in kf.split(X):
    best_ridge.fit(X.iloc[tr], y[tr])
    preds = best_ridge.predict(X.iloc[va])
    rmse.append(mean_squared_error(y[va], preds)**0.5)
    mae.append(mean_absolute_error(y[va], preds))
    r2.append(r2_score(y[va], preds))

In [None]:
# Per-fold metrics of the tuned Ridge model, one row per CV fold.
pd.DataFrame({'RMSE':rmse, 'MAE':mae, 'r2':r2})

Unnamed: 0,RMSE,MAE,r2
0,2.115387,1.774598,0.067784
1,2.072498,1.767816,0.088299
2,2.034818,1.710994,0.083383
3,2.104768,1.77119,0.083619
4,2.08242,1.800557,0.091384


In [None]:
# Here the R2 score is < 0.1, so these features carry almost no meaningful signal about the target.
# That is why a sophisticated model like XGBoost performs about the same as ridge regression.

In [None]:
# Best Ridge hyperparameters found by the study.
print(study.best_params)

{'alpha': 20.008192174296234, 'fit_intercept': False}
