In [5]:
import pandas as pd
import numpy as np

# Read the training and test CSV files from the working directory,
# in that order, into two separate DataFrames.
train_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ("training_data.csv", "test_data.csv")
)


In [6]:
# Pull the label column out of the training frame; every remaining
# column (including "id") stays in the feature matrix X.
y = train_df["target"]
X = train_df.drop("target", axis="columns")

# Independent copy, so later changes to X_test never touch test_df.
X_test = test_df.copy(deep=True)


In [19]:
# Number of cross-validation folds.
N_FOLDS = 7

# One out-of-fold prediction slot per training row.
lgb_oof = np.zeros(X.shape[0])

# One column of test-set predictions per fold (rows = test rows).
lgb_test_folds = np.zeros((X_test.shape[0], N_FOLDS))


In [20]:
# Imported in this cell so the notebook survives Restart & Run All:
# no other visible cell brings StratifiedKFold into scope.
from sklearn.model_selection import StratifiedKFold

# Shuffled, stratified splitter with N_FOLDS folds; the fixed seed makes
# the fold assignment reproducible across runs.
skf = StratifiedKFold(
    n_splits=N_FOLDS,
    shuffle=True,
    random_state=42
)


In [21]:
import lightgbm as lgb
# Count the rows labelled 1 and 0, then express the imbalance as
# negatives per positive. `ratio` is fed to the model as scale_pos_weight.
pos = y.eq(1).sum()
neg = y.eq(0).sum()
ratio = neg / pos
print(ratio)

26.436992221261885


In [22]:
from lightgbm import LGBMClassifier

# Keyword arguments unpacked into LGBMClassifier(**model_params) for every fold.
model_params = dict(
    # Task and class weighting (ratio = negatives per positive).
    objective="binary",
    scale_pos_weight=ratio,
    # Boosting schedule and tree size.
    learning_rate=0.05,
    num_leaves=31,
    min_data_in_leaf=200,
    n_estimators=500,
    # Column / row subsampling.
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=3,
    # L1 / L2 regularisation.
    reg_alpha=0.5,
    reg_lambda=1.0,
    # Reproducibility and parallelism.
    random_state=42,
    n_jobs=-1,
)


In [23]:
from lightgbm import LGBMClassifier, log_evaluation
from sklearn.metrics import roc_auc_score # Added this import, assuming it was meant to be present

# The "id" column is not a model feature. Drop it once, outside the loop,
# instead of re-dropping it from every train/validation slice and from the
# full test frame on each fold (loop-invariant work).
X_features = X.drop(columns=["id"])
X_test_features = X_test.drop(columns=["id"])

# Stratified K-fold training: for each fold, fit a fresh model on the
# training split, store its validation probabilities in lgb_oof, and store
# its test-set probabilities in column `fold` of lgb_test_folds.
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, X_val = X_features.iloc[train_idx], X_features.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = LGBMClassifier(**model_params)

    # NOTE(review): eval_set is only used for logging here; no early-stopping
    # callback is passed, so every fold trains all n_estimators rounds.
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric="auc",
        callbacks=[log_evaluation(100)]  # print eval metrics every 100 rounds
    )

    # Positive-class probability for the held-out rows of this fold.
    val_preds = model.predict_proba(X_val)[:, 1]
    lgb_oof[val_idx] = val_preds

    # Positive-class probability for the full test set from this fold's model.
    lgb_test_folds[:, fold] = model.predict_proba(X_test_features)[:, 1]

    fold_auc = roc_auc_score(y_val, val_preds)
    print(f"Fold AUC: {fold_auc:.4f}")


[LightGBM] [Info] Number of positive: 14875, number of negative: 393269
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.074107 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1306
[LightGBM] [Info] Number of data points in the train set: 408144, number of used features: 49
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036445 -> initscore=-3.274812
[LightGBM] [Info] Start training from score -3.274812
[100]	valid_0's auc: 0.626925	valid_0's binary_logloss: 0.639847
[200]	valid_0's auc: 0.626564	valid_0's binary_logloss: 0.630549
[300]	valid_0's auc: 0.627063	valid_0's binary_logloss: 0.618923
[400]	valid_0's auc: 0.625984	valid_0's binary_logloss: 0.606503
[500]	valid_0's auc: 0.62541	valid_0's binary_logloss: 0.596533
Fold AUC: 0.6254
[LightGBM] [Info] Number of positive: 14876, number of negative: 393269
[LightGBM] [Info] Auto-choo

In [24]:
# Score the pooled out-of-fold predictions against the training labels,
# then derive the Gini coefficient from the AUC (Gini = 2 * AUC - 1).
auc = roc_auc_score(y_true=y, y_score=lgb_oof)
gini = 2.0 * auc - 1.0

print("CV AUC:", auc)
print("CV Gini:", gini)


CV AUC: 0.6333105590716481
CV Gini: 0.26662111814329625
