In [None]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit,GroupKFold
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy import stats
from datetime import datetime
import json
import optuna
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 100)
import warnings
warnings.filterwarnings("ignore")

# CV Control

In [None]:
# Load the sessions and order them by `min_date`. The sort runs on the raw CSV
# values; the two date columns are parsed to datetimes only afterwards.
train = pd.read_csv("train_final.csv").sort_values("min_date")
group = train["user_id"]

for date_col in ("max_date", "min_date"):
    train[date_col] = pd.to_datetime(train[date_col])

cv = TimeSeriesSplit(n_splits=4)

# For every split, report the boundary timestamps, the time span covered by each
# side, how many users / sessions appear on both sides ("ortak" = shared), and
# the shapes of the two frames.
for fold, (train_idx, test_idx) in enumerate(cv.split(train)):
    train_cv = train.iloc[train_idx]
    val_cv = train.iloc[test_idx]

    train_start, train_end = train_cv["min_date"].min(), train_cv["min_date"].max()
    val_start, val_end = val_cv["min_date"].min(), val_cv["min_date"].max()
    print(train_end, val_start)

    print("Train CV duration:", train_end - train_start)
    print("VAL CV duration:", val_end - val_start)

    shared_users = set(train_cv["user_id"]).intersection(val_cv["user_id"])
    shared_sessions = set(train_cv["user_session"]).intersection(val_cv["user_session"])
    print("Train-Test ortak user:", len(shared_users))
    print("Train-Test ortak session:", len(shared_sessions))
    print(train_cv.shape, val_cv.shape)
    print("-" * 50)

2025-06-04 13:25:10+00:00 2025-06-04 13:25:11+00:00
Train CV duration: 3 days 13:24:46
VAL CV duration: 3 days 15:58:28
Train-Test ortak user: 761
Train-Test ortak session: 0
(14148, 44) (14147, 44)
--------------------------------------------------
2025-06-08 05:23:39+00:00 2025-06-08 05:23:59+00:00
Train CV duration: 7 days 05:23:15
VAL CV duration: 4 days 08:43:17
Train-Test ortak user: 1481
Train-Test ortak session: 0
(28295, 44) (14147, 44)
--------------------------------------------------
2025-06-12 14:07:16+00:00 2025-06-12 14:09:05+00:00
Train CV duration: 11 days 14:06:52
VAL CV duration: 4 days 18:26:48
Train-Test ortak user: 1959
Train-Test ortak session: 0
(42442, 44) (14147, 44)
--------------------------------------------------
2025-06-17 08:35:53+00:00 2025-06-17 08:35:54+00:00
Train CV duration: 16 days 08:35:29
VAL CV duration: 4 days 15:22:11
Train-Test ortak user: 2384
Train-Test ortak session: 0
(56589, 44) (14147, 44)
----------------------------------------------

In [None]:
# Reload the sessions ordered by `min_date`, keep `user_id` aside in `group`,
# then drop the identifier / date columns so only features and target remain.
train = pd.read_csv("train_final.csv").sort_values("min_date")
group = train["user_id"]
train = train.drop(columns=["user_session", "min_date", "max_date", "user_id"])

# Target is `session_value`; every remaining column is a feature.
y = train["session_value"]
x = train.drop(columns="session_value")

In [None]:
# Fixed XGBoost configuration used by the CV check below. The values match the
# best parameters printed by the XGBoost Optuna study output in this notebook.
param = {
    # task / runtime
    'objective': 'reg:squarederror',
    'device': 'gpu',
    'random_state': 22,
    # tree growth
    'eta': 0.06805925858288127,
    'gamma': 0.007598560661707519,
    'max_depth': 5,
    'min_child_weight': 26,
    'max_delta_step': 10,
    # row / column subsampling
    'subsample': 0.6501883007229573,
    'colsample_bytree': 0.8024275970546287,
    'colsample_bylevel': 0.83207869249364,
    'colsample_bynode': 0.8330850030340746,
    # regularisation
    'lambda': 0.20263332224150976,
    'alpha': 0.04126849049726349,
    # number of boosting rounds
    'n_estimators': 1262,
}

In [None]:
# Score the fixed XGBoost configuration (`param`) with a 4-fold TimeSeriesSplit
# and collect one validation MSE per fold in `scores_list`.
cv = TimeSeriesSplit(n_splits=4)
scores_list = []
for fold, (train_idx, test_idx) in enumerate(cv.split(train)):
    # Group the slices by role: fitting pair first, validation pair second.
    train_x, train_y = x.iloc[train_idx], y.iloc[train_idx]
    val_x, val_y = x.iloc[test_idx], y.iloc[test_idx]

    # A fresh model per fold; nothing is carried over between folds.
    model = XGBRegressor(**param)
    model.fit(train_x, train_y)

    prediction = model.predict(val_x)
    mse = mean_squared_error(val_y, prediction)
    scores_list.append(mse)

sh: 1: nvidia-smi: not found


In [None]:
# Display the per-fold validation MSE values collected by the CV loop above.
scores_list

[387.7487337135722, 279.3182013846541, 433.3302086551719, 266.1794187788691]

# Create x-y

In [None]:
# Reload the sessions and clamp infinite values to finite sentinels
# (+inf -> 99, -inf -> -99) before building the modelling frames.
train = (
    pd.read_csv("train_final.csv")
    .replace(np.inf, 99)
    .replace(-np.inf, -99)
    .sort_values("min_date")
)

# Keep `user_id` aside in `group`, then drop the identifier / date columns.
group = train["user_id"]
train = train.drop(columns=["user_session", "min_date", "max_date", "user_id"])

# Target is `session_value`; every remaining column is a feature.
y = train["session_value"]
x = train.drop(columns="session_value")

In [None]:
# Shared random seed: passed as `seed` / `random_state` to LightGBM and as
# `random_state` to XGBoost and CatBoost inside the Optuna objectives below.
seed = 22

# LGBM

In [None]:
def objective_lgbm(trial, n_splits=4, results_path="./results.csv"):
    """Optuna objective: mean time-series-CV MSE of an ``LGBMRegressor``.

    Samples a LightGBM configuration from ``trial``, fits one model per
    ``TimeSeriesSplit`` fold on the notebook-level ``x`` / ``y`` frames, appends
    one row (per-fold scores + sampled hyperparameters) to ``results_path`` and
    returns the mean fold MSE.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.
    n_splits : int, default 4
        Number of ``TimeSeriesSplit`` folds. One ``fold<i>`` column is logged
        per fold, so the log row always matches the number of folds.
    results_path : str, default "./results.csv"
        CSV file the trial row is appended to (created when it does not exist).

    Returns
    -------
    float
        Mean validation MSE over the folds.
    """
    # Constants are registered through single-choice categoricals so that they
    # are reported together with the tuned values in the trial parameters.
    param_constant = {
        "objective" : trial.suggest_categorical("objective", ["regression"]),
        "metric" : trial.suggest_categorical("metric", ["mse"]),
        "boosting_type" : trial.suggest_categorical("boosting_type", ["gbdt"]),
        "bagging_freq" : trial.suggest_categorical("bagging_freq", [1]),
        "device" : trial.suggest_categorical("device", ["gpu"]),
        "verbose" : trial.suggest_categorical("verbose", [-1]),
        "seed" : trial.suggest_categorical("seed", [seed]),
    }
    param = {
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "num_leaves": trial.suggest_int("num_leaves", 31, 255),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 10.0, log=True),
    }

    cv = TimeSeriesSplit(n_splits=n_splits)
    scores_list = []
    # Split on `x` itself so the fold indices always refer to the frame that is
    # actually indexed below, independent of any other notebook-level frame.
    for train_idx, test_idx in cv.split(x):
        train_x, val_x = x.iloc[train_idx], x.iloc[test_idx]
        train_y, val_y = y.iloc[train_idx], y.iloc[test_idx]
        # `seed` (in param_constant) and `random_state` are both set to the same
        # value; LightGBM documents `random_state` as an alias of `seed`.
        model = LGBMRegressor(**param, **param_constant, random_state=seed)
        model.fit(train_x, train_y)
        prediction = model.predict(val_x)
        scores_list.append(mean_squared_error(val_y, prediction))

    # Build the log row with one column per fold instead of assuming exactly four.
    row = {"model": "lgbm"}
    row.update((f"fold{i}", score) for i, score in enumerate(scores_list))
    row["hyperparameters"] = {**param, **param_constant}
    trial_row = pd.DataFrame([row])

    # Append to the existing log (if any) and rewrite the file.
    if os.path.exists(results_path):
        frames = [pd.read_csv(results_path), trial_row]
    else:
        frames = [trial_row]
    pd.concat(frames, ignore_index=True).to_csv(results_path, index=False)

    return np.mean(scores_list)

In [None]:
# Seed the TPE sampler (Optuna's default sampler) so the sequence of suggested
# hyperparameters is reproducible across runs.
study = optuna.create_study(
    direction='minimize',
    study_name="BTK",
    sampler=optuna.samplers.TPESampler(seed=seed),
)
study.optimize(objective_lgbm, n_trials=30)

print("En iyi parametreler:", study.best_params)
# The objective returns the mean fold MSE (minimised), so label it as MSE
# rather than "accuracy".
print("En iyi ortalama MSE:", study.best_value)

[I 2025-08-29 15:07:28,776] A new study created in memory with name: BTK
[I 2025-08-29 15:07:30,294] Trial 0 finished with value: 339.20758779726543 and parameters: {'objective': 'regression', 'metric': 'mse', 'boosting_type': 'gbdt', 'bagging_freq': 1, 'device': 'gpu', 'verbose': -1, 'seed': 22, 'max_depth': 7, 'num_leaves': 202, 'learning_rate': 0.1593616573453373, 'feature_fraction': 0.6845528204389949, 'bagging_fraction': 0.8906117784121719, 'min_child_samples': 20, 'reg_alpha': 2.300565606887274, 'reg_lambda': 0.6547322514108548}. Best is trial 0 with value: 339.20758779726543.
[I 2025-08-29 15:07:31,759] Trial 1 finished with value: 1623.77800310935 and parameters: {'objective': 'regression', 'metric': 'mse', 'boosting_type': 'gbdt', 'bagging_freq': 1, 'device': 'gpu', 'verbose': -1, 'seed': 22, 'max_depth': 6, 'num_leaves': 176, 'learning_rate': 0.001691576425962088, 'feature_fraction': 0.8243809589412546, 'bagging_fraction': 0.9273472433933018, 'min_child_samples': 96, 'reg_alp

En iyi parametreler: {'objective': 'regression', 'metric': 'mse', 'boosting_type': 'gbdt', 'bagging_freq': 1, 'device': 'gpu', 'verbose': -1, 'seed': 22, 'max_depth': 10, 'num_leaves': 229, 'learning_rate': 0.06157366271765317, 'feature_fraction': 0.6545899638170656, 'bagging_fraction': 0.7516865875740123, 'min_child_samples': 13, 'reg_alpha': 0.013081812306701291, 'reg_lambda': 0.06803827535232987}
En iyi doğruluk: 313.28299165943724


# XGBoost

In [None]:
def objective_xgb(trial, n_splits=4, results_path="./results.csv"):
    """Optuna objective: mean time-series-CV MSE of an ``XGBRegressor``.

    Samples an XGBoost configuration from ``trial``, fits one model per
    ``TimeSeriesSplit`` fold on the notebook-level ``x`` / ``y`` frames, appends
    one row (per-fold scores + sampled hyperparameters) to ``results_path`` and
    returns the mean fold MSE.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.
    n_splits : int, default 4
        Number of ``TimeSeriesSplit`` folds. One ``fold<i>`` column is logged
        per fold, so the log row always matches the number of folds.
    results_path : str, default "./results.csv"
        CSV file the trial row is appended to (created when it does not exist).

    Returns
    -------
    float
        Mean validation MSE over the folds.
    """
    # Constants are registered through single-choice categoricals so that they
    # are reported together with the tuned values in the trial parameters.
    param_constant = {
        "objective" : trial.suggest_categorical("objective", ["reg:squarederror"]),
        "device" : trial.suggest_categorical("device", ["gpu"]),
        "random_state" : trial.suggest_categorical("random_state", [seed]),
    }
    param = {
        "eta": trial.suggest_float("eta", 0.01, 0.3, log=True),
        "gamma": trial.suggest_float("gamma", 1e-3,10.0,log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 100),
        "max_delta_step": trial.suggest_int("max_delta_step", 0,  10),
        "subsample": trial.suggest_float("subsample", 0.2 , 1.0),

        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.2, 1.0),
        "colsample_bynode": trial.suggest_float("colsample_bynode", 0.2, 1.0),

        "lambda": trial.suggest_float("lambda", 1e-3, 20.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-3, 20.0, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 2000),
    }

    cv = TimeSeriesSplit(n_splits=n_splits)
    scores_list = []
    # Split on `x` itself so the fold indices always refer to the frame that is
    # actually indexed below, independent of any other notebook-level frame.
    for train_idx, test_idx in cv.split(x):
        train_x, val_x = x.iloc[train_idx], x.iloc[test_idx]
        train_y, val_y = y.iloc[train_idx], y.iloc[test_idx]
        model = XGBRegressor(**param, **param_constant)
        model.fit(train_x, train_y)
        prediction = model.predict(val_x)
        scores_list.append(mean_squared_error(val_y, prediction))

    # Build the log row with one column per fold instead of assuming exactly four.
    row = {"model": "xgb"}
    row.update((f"fold{i}", score) for i, score in enumerate(scores_list))
    row["hyperparameters"] = {**param, **param_constant}
    trial_row = pd.DataFrame([row])

    # Append to the existing log (if any) and rewrite the file.
    if os.path.exists(results_path):
        frames = [pd.read_csv(results_path), trial_row]
    else:
        frames = [trial_row]
    pd.concat(frames, ignore_index=True).to_csv(results_path, index=False)

    return np.mean(scores_list)

In [None]:
# Seed the TPE sampler (Optuna's default sampler) so the sequence of suggested
# hyperparameters is reproducible across runs.
study = optuna.create_study(
    direction='minimize',
    study_name="BTK",
    sampler=optuna.samplers.TPESampler(seed=seed),
)
study.optimize(objective_xgb, n_trials=50)

print("En iyi parametreler:", study.best_params)
# The objective returns the mean fold MSE (minimised), so label it as MSE
# rather than "accuracy".
print("En iyi ortalama MSE:", study.best_value)

[I 2025-08-28 13:46:33,280] A new study created in memory with name: BTK
sh: 1: nvidia-smi: not found
[I 2025-08-28 13:47:05,647] Trial 0 finished with value: 393.3774447861857 and parameters: {'objective': 'reg:squarederror', 'device': 'gpu', 'random_state': 22, 'eta': 0.06135367156054932, 'gamma': 1.3420806916262773, 'max_depth': 12, 'min_child_weight': 20, 'max_delta_step': 0, 'subsample': 0.9442896999743711, 'colsample_bytree': 0.9917181509433546, 'colsample_bylevel': 0.7085777137062459, 'colsample_bynode': 0.5914579780006612, 'lambda': 0.0027758118406004347, 'alpha': 0.4524196550597537, 'n_estimators': 885}. Best is trial 0 with value: 393.3774447861857.
[I 2025-08-28 13:47:30,007] Trial 1 finished with value: 383.7940578048996 and parameters: {'objective': 'reg:squarederror', 'device': 'gpu', 'random_state': 22, 'eta': 0.09360749750936005, 'gamma': 2.8897520629707674, 'max_depth': 9, 'min_child_weight': 10, 'max_delta_step': 7, 'subsample': 0.24062899210207497, 'colsample_bytree'

En iyi parametreler: {'objective': 'reg:squarederror', 'device': 'gpu', 'random_state': 22, 'eta': 0.06805925858288127, 'gamma': 0.007598560661707519, 'max_depth': 5, 'min_child_weight': 26, 'max_delta_step': 10, 'subsample': 0.6501883007229573, 'colsample_bytree': 0.8024275970546287, 'colsample_bylevel': 0.83207869249364, 'colsample_bynode': 0.8330850030340746, 'lambda': 0.20263332224150976, 'alpha': 0.04126849049726349, 'n_estimators': 1262}
En iyi doğruluk: 341.64414063306685


# CatBoost

In [None]:
def objective_catboost(trial, n_splits=4, results_path="./results.csv"):
    """Optuna objective: mean time-series-CV MSE of a ``CatBoostRegressor``.

    Samples a CatBoost configuration from ``trial``, fits one model per
    ``TimeSeriesSplit`` fold on the notebook-level ``x`` / ``y`` frames, appends
    one row (per-fold scores + sampled hyperparameters) to ``results_path`` and
    returns the mean fold MSE.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.
    n_splits : int, default 4
        Number of ``TimeSeriesSplit`` folds. One ``fold<i>`` column is logged
        per fold, so the log row always matches the number of folds.
    results_path : str, default "./results.csv"
        CSV file the trial row is appended to (created when it does not exist).

    Returns
    -------
    float
        Mean validation MSE over the folds. The model itself is trained with
        the RMSE loss; the reported score is computed with
        ``mean_squared_error``.
    """
    # Constants are registered through single-choice categoricals so that they
    # are reported together with the tuned values in the trial parameters.
    param_constant = {
        "loss_function" : trial.suggest_categorical("loss_function", ["RMSE"]),
        "task_type" : trial.suggest_categorical("task_type", ["GPU"]),
        "verbose" : trial.suggest_categorical("verbose", [False]),
        "random_state" : trial.suggest_categorical("random_state", [seed]),
    }
    param = {
        "iterations": trial.suggest_int("iterations", 100, 3500),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "depth": trial.suggest_int("depth", 3, 12),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10, log=True),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 10),
        "random_strength": trial.suggest_float("random_strength", 0, 10),
    }

    cv = TimeSeriesSplit(n_splits=n_splits)
    scores_list = []
    # Split on `x` itself so the fold indices always refer to the frame that is
    # actually indexed below, independent of any other notebook-level frame.
    for train_idx, test_idx in cv.split(x):
        train_x, val_x = x.iloc[train_idx], x.iloc[test_idx]
        train_y, val_y = y.iloc[train_idx], y.iloc[test_idx]
        model = CatBoostRegressor(**param, **param_constant)
        model.fit(train_x, train_y, verbose=False)
        prediction = model.predict(val_x)
        scores_list.append(mean_squared_error(val_y, prediction))

    # Build the log row with one column per fold instead of assuming exactly four.
    row = {"model": "catboost"}
    row.update((f"fold{i}", score) for i, score in enumerate(scores_list))
    row["hyperparameters"] = {**param, **param_constant}
    trial_row = pd.DataFrame([row])

    # Append to the existing log (if any) and rewrite the file.
    if os.path.exists(results_path):
        frames = [pd.read_csv(results_path), trial_row]
    else:
        frames = [trial_row]
    pd.concat(frames, ignore_index=True).to_csv(results_path, index=False)

    return np.mean(scores_list)

In [None]:
# Seed the TPE sampler (Optuna's default sampler) so the sequence of suggested
# hyperparameters is reproducible across runs.
study = optuna.create_study(
    direction='minimize',
    study_name="BTK",
    sampler=optuna.samplers.TPESampler(seed=seed),
)
study.optimize(objective_catboost, n_trials=50)

print("En iyi parametreler:", study.best_params)
# The objective returns the mean fold MSE (minimised), so label it as MSE
# rather than "accuracy".
print("En iyi ortalama MSE:", study.best_value)