In [1]:
import sys; sys.path.append("../../automl/")

In [153]:
from pathlib import Path
import yaml
import joblib

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from src.automl.model.lightgbm import LightGBMClassification
from src.automl.loggers import configure_root_logger
from src.automl.constants import create_ml_data_dir
from src.automl.model.metrics import RocAuc

In [3]:
# One-time notebook setup using helpers imported from src.automl.
# NOTE(review): presumably these create the ML data directory and configure the
# root logger (the tuning log lines further down look like its output) — confirm
# in src.automl.constants / src.automl.loggers.
create_ml_data_dir()
configure_root_logger()

## Constants

In [4]:
# Notebook-wide settings. Both paths are relative to the notebook's working directory.
RANDOM_SEED = 77  # seed used when sub-sampling the majority class
N_JOBS = 16

_REPO_ROOT = Path("../..")
DATA_PATH = _REPO_ROOT / "data"
CONFIG_PATH = _REPO_ROOT / "configs" / "config.yaml"

In [5]:
# Parse the project config with PyYAML's safe loader; later cells read
# cfg["selected_features"] from it.
with CONFIG_PATH.open() as config_file:
    cfg = yaml.safe_load(config_file)

## Data

In [136]:
# Load the preprocessed training table and order it by `id` a single time
# (the previous version sorted the full frame twice, once per slice).
N_TRAIN_ROWS = 300_000  # rows (in id order) kept for training; the remainder is the hold-out set

df_full = pd.read_parquet(DATA_PATH / "train_preproc.parquet").sort_values(by="id")
df_train, df_test = df_full.iloc[:N_TRAIN_ROWS], df_full.iloc[N_TRAIN_ROWS:]

In [137]:
# Absolute class counts of the training split, shown before any undersampling.
df_train["target"].value_counts()

0    282256
1     17744
Name: target, dtype: int64

In [138]:
# Undersample the 0 class: keep every target == 1 row and a seeded random draw
# of 200_000 target == 0 rows, then stack them with a fresh index.
positives = df_train.loc[df_train.target == 1]
negatives = df_train.loc[df_train.target == 0].sample(200_000, random_state=RANDOM_SEED)
df_train = pd.concat([positives, negatives], ignore_index=True)

In [139]:
def _columns_with_prefix(frame, prefix):
    """Return the names of `frame`'s columns that start with `prefix`, in column order."""
    has_prefix = frame.columns.str.startswith(prefix)
    return frame.columns[has_prefix].values.tolist()


# Column groups are identified by the name prefix of the columns.
ohe_cols = _columns_with_prefix(df_train, "OneHotEncoder")
oe_cols = _columns_with_prefix(df_train, "OrdinalEncoder")
te_cols = _columns_with_prefix(df_train, "MeanTargetEncoder")

In [140]:
# Feature set A: configured features + mean-target-encoded + one-hot columns.
# The two sibling cells below assign the same X_*/y_* names, so whichever of the
# three cells ran last decides what the model sees.
feature_cols = cfg["selected_features"] + te_cols + ohe_cols
X_train = df_train[feature_cols]
y_train = df_train["target"]
X_test = df_test[feature_cols]
y_test = df_test["target"]

In [109]:
# Feature set B: every column except the label and the two bookkeeping columns.
non_feature_cols = ["target", "id", "smpl"]
X_train = df_train.drop(columns=non_feature_cols)
y_train = df_train["target"]
X_test = df_test.drop(columns=non_feature_cols)
y_test = df_test["target"]

In [73]:
# Feature set C: configured features + ordinal-encoded + one-hot columns.
feature_cols = cfg["selected_features"] + oe_cols + ohe_cols
X_train = df_train[feature_cols]
y_train = df_train["target"]
X_test = df_test[feature_cols]
y_test = df_test["target"]

In [141]:
# Show the class proportions of the (undersampled) training labels, then of the hold-out labels.
for labels in (y_train, y_test):
    display(labels.value_counts(normalize=True))

0    0.91851
1    0.08149
Name: target, dtype: float64

0    0.941322
1    0.058678
Name: target, dtype: float64

In [142]:
# Columns passed to the model as categorical. This is the same list object as
# `ohe_cols` (no copy is made).
categorical_features = ohe_cols  # ordinal-encoded columns are currently left out: "+ oe_cols" is disabled

## Model

In [143]:
# LightGBM hyper-parameter set. The long float values look like the result of
# an earlier tuning run — TODO confirm.
# NOTE(review): `params` is not referenced by any other cell visible in this
# notebook; the model below is built with defaults and tuned instead.
params = {
    'objective_type': 'binary',
    'boosting': 'gbdt',
    'num_iterations': 284,
    'max_depth': 7,
    'learning_rate': 0.03,
    'num_leaves': 85,
    'min_data_in_leaf': 162,
    'bagging_fraction': 0.9073942790005392,
    'bagging_freq': 20,
    'feature_fraction': 0.4632002607000075,
    'early_stopping_round': 100,
    'lambda_l1': 0.9117760912120141,
    'lambda_l2': 6.182177721979992,
    'min_gain_to_split': 2.764898005468358,
    # Reuse the notebook-level constants instead of repeating 16 / 77 so the
    # values cannot drift apart.
    'n_jobs': N_JOBS,
    'random_state': RANDOM_SEED,
    'is_unbalance': False,
}
# Metric object used for tuning and for every score printed or saved below.
metric = RocAuc()

In [146]:
model.tune?

[0;31mSignature:[0m
[0mmodel[0m[0;34m.[0m[0mtune[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mX[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mpandas[0m[0;34m.[0m[0mcore[0m[0;34m.[0m[0mframe[0m[0;34m.[0m[0mDataFrame[0m[0;34m,[0m [0mnumpy[0m[0;34m.[0m[0mndarray[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0my[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mpandas[0m[0;34m.[0m[0mcore[0m[0;34m.[0m[0mframe[0m[0;34m.[0m[0mDataFrame[0m[0;34m,[0m [0mpandas[0m[0;34m.[0m[0mcore[0m[0;34m.[0m[0mseries[0m[0;34m.[0m[0mSeries[0m[0;34m,[0m [0mnumpy[0m[0;34m.[0m[0mndarray[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmetric[0m[0;34m=[0m[0;34m<[0m[0msrc[0m[0;34m.[0m[0mautoml[0m[0;34m.[0m[0mmodel[0m[0;34m.[0m[0mmetrics[0m[0;34m.[0m[0mregression_metrics[0m[0;34m.[0m[0mMSE[0m [0mobject[0m [0mat[0m [0;36m0x7f814d543b80[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtimeout[0m[0;34m

In [147]:
# Tune, fit and evaluate the LightGBM classifier.
TUNE_TIMEOUT = 60 * 60  # tuning budget handed to tune(); presumably seconds (one hour) — confirm

# Use the notebook-level N_JOBS instead of a second hard-coded 16.
model = LightGBMClassification(n_jobs=N_JOBS)
model.tune(X_train, y_train, metric, timeout=TUNE_TIMEOUT, categorical_features=categorical_features)

# `fit` returns predictions for the training rows (presumably out-of-fold, given
# the name); a later cell indexes them as a 2-D array via oof[:, 1].
oof = model.fit(X_train, y_train, categorical_features=categorical_features)
y_pred = model.predict(X_test)

# Training-set score first, hold-out score second.
print(metric(y_train, oof))
print(metric(y_test, y_pred))

[2024-11-06 07:57:08,551] - [   START    ] - Tuning LightGBMClassification
[2024-11-06 07:57:13,468] - [   OPTUNA   ] - Trial 0. New best score 0.7917480438929874 with parameters {'max_depth': 6, 'num_leaves': 488, 'min_data_in_leaf': 188, 'bagging_fraction': 0.7993292420985183, 'bagging_freq': 0, 'feature_fraction': 0.49359671220172163, 'lambda_l1': 0.5808361216819946, 'lambda_l2': 8.661761457749352, 'min_gain_to_split': 12.022300234864176, 'is_unbalance': True, 'num_iterations': 4}
[2024-11-06 07:57:29,896] - [   OPTUNA   ] - Trial 2. New best score 0.7933652131390204 with parameters {'max_depth': 5, 'num_leaves': 194, 'min_data_in_leaf': 117, 'bagging_fraction': 0.8925879806965068, 'bagging_freq': 0, 'feature_fraction': 0.708540663048167, 'lambda_l1': 5.924145688620425, 'lambda_l2': 0.46450412719997725, 'min_gain_to_split': 12.150897038028766, 'is_unbalance': True, 'num_iterations': 4}
[2024-11-06 07:57:36,159] - [   OPTUNA   ] - Trial 3. New best score 0.8023578174727592 with param

In [165]:
# Directory that receives every artefact of this run (OOF predictions, model, params, scores).
MODEL_NAME = "lgb_8055"
# Build the location from DATA_PATH so the data root is defined in one place;
# this resolves to the same ../../data/models/<MODEL_NAME> directory as before.
MODEL_DIR = DATA_PATH / "models" / MODEL_NAME
# parents=True: also create data/models when it does not exist yet, instead of
# raising FileNotFoundError on a fresh checkout.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

In [167]:
# Persist the run: the second column of the OOF predictions, the fitted model,
# its parameters and both scores.
res = pd.DataFrame({MODEL_NAME: oof[:, 1]})
res.to_csv(MODEL_DIR / "oof.csv", index=False)

model_path = MODEL_DIR / f"{MODEL_NAME}.joblib"
joblib.dump(model, model_path)

params_path = MODEL_DIR / "params.yaml"
with params_path.open("w") as params_file:
    yaml.dump(model.params, params_file)

score_path = MODEL_DIR / "score.txt"
with score_path.open("w") as score_file:
    # One "<label> <score>" line per split, OOF first.
    for label, truth, preds in (("OOF:", y_train, oof), ("Test:", y_test, y_pred)):
        print(label, metric(truth, preds), file=score_file)

In [154]:
# NOTE(review): writes the same file as the
# `joblib.dump(model, MODEL_DIR / f"{MODEL_NAME}.joblib")` call in the save cell
# above, but with the path hard-coded.
joblib.dump(model, "../../data/models/lgb_8055/lgb_8055.joblib")


['../../data/models/lgb_8055/lgb_8055.joblib']

In [158]:
# Write the model's parameters as YAML (hard-coded location of the lgb_8055 run).
params_path = Path("../../data/models/lgb_8055/params.yaml")
with params_path.open("w") as params_file:
    yaml.dump(model.params, params_file)

In [161]:
# Write one "<label> <score>" line per split to the run's score file, OOF first.
score_rows = (("OOF:", y_train, oof), ("Test:", y_test, y_pred))
with open("../../data/models/lgb_8055/score.txt", "w") as score_file:
    for label, truth, preds in score_rows:
        print(label, metric(truth, preds), file=score_file)

In [145]:
# Training-set (OOF) score first, hold-out score second.
for truth, preds in ((y_train, oof), (y_test, y_pred)):
    print(metric(truth, preds))

0.8047647204688908
0.7972289963856585


In [86]:
# Training-set (OOF) score first, hold-out score second.
for truth, preds in ((y_train, oof), (y_test, y_pred)):
    print(metric(truth, preds))

0.8009657912533814
0.7932043263129364


In [47]:
# Training-set (OOF) score first, hold-out score second.
for truth, preds in ((y_train, oof), (y_test, y_pred)):
    print(metric(truth, preds))

0.8036673066952209
0.7969396115051322


In [37]:
# Training-set (OOF) score first, hold-out score second.
for truth, preds in ((y_train, oof), (y_test, y_pred)):
    print(metric(truth, preds))

0.8042716798166517
0.7975841531905153


In [15]:
# Score the competition test set and write the submission file.
test = pd.read_parquet(DATA_PATH / "test_preproc.parquet")

# Feed the model exactly the columns (and column order) of the training matrix.
# The previous hard-coded TE + OHE list silently mismatched the model whenever
# one of the other feature-set cells had produced X_train.
submission_features = X_train.columns.tolist()
# Column 1 of the prediction array is used as the score, as in the OOF export.
test["target"] = model.predict(test[submission_features])[:, 1]
test[["id", "target"]].to_csv("lgb_full_dataset.csv", index=False)

In [25]:
# Load the three prediction files that are blended below (paths are relative to
# the notebook's working directory).
prediction_files = ("lama_utilized.csv", "lgmb_oe_ohe_cols_0805.csv", "catboost_ts.csv")
pred_1, pred_2, pred_3 = (pd.read_csv(file_name) for file_name in prediction_files)

In [26]:
# Weighted average of the three `target` columns, written back into pred_1.
# Rows are combined by index label; re-running this cell blends the already
# blended pred_1 again.
blend_weights = (0.6, 0.2, 0.2)
blend_inputs = (pred_1, pred_2, pred_3)
pred_1["target"] = sum(weight * frame["target"] for weight, frame in zip(blend_weights, blend_inputs))

In [29]:
# Write pred_1 (whose `target` column was overwritten with the blend in the
# previous cell) to the working directory, without the index column.
pred_1.to_csv("blend.csv", index=False)

In [166]:
# NOTE(review): leftover interactive help lookup (IPython `?`); it only displays
# the signature/docstring shown below and does not affect any saved artefact.
MODEL_DIR.open?

[0;31mSignature:[0m
[0mMODEL_DIR[0m[0;34m.[0m[0mopen[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mmode[0m[0;34m=[0m[0;34m'r'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbuffering[0m[0;34m=[0m[0;34m-[0m[0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mencoding[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0merrors[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnewline[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Open the file pointed by this path and return a file object, as
the built-in open() function does.
[0;31mFile:[0m      /usr/lib/python3.10/pathlib.py
[0;31mType:[0m      method