In [None]:
import pandas as pd
import numpy as np

# Load the labelled training set and the test set from the working directory,
# in that order.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ("training_data.csv", "test_data.csv")
)


In [None]:
# Separate the label from the predictors. Every other training column
# (including "id") is still present in X at this point.
y = train_df["target"]
X = train_df.drop("target", axis=1)

# Independent copy, so later changes to X_test never reach test_df.
X_test = test_df.copy()


In [None]:
N_FOLDS = 5  # number of cross-validation splits

# One out-of-fold prediction slot per training row.
lgb_oof = np.zeros(X.shape[0])

# One column of test-set predictions per fold.
lgb_test_folds = np.zeros((X_test.shape[0], N_FOLDS))


In [None]:
from sklearn.model_selection import StratifiedKFold

# Shuffled, seeded stratified splitter producing N_FOLDS train/validation splits.
skf = StratifiedKFold(N_FOLDS, shuffle=True, random_state=42)

In [None]:
import lightgbm as lgb
# Class balance of the binary target, expressed as negatives per positive.
neg, pos = y.eq(0).sum(), y.eq(1).sum()
ratio = neg / pos
print(ratio)

26.436992221261885


In [None]:
# LightGBM hyper-parameters shared by every cross-validation fold.
model_params = {
    "objective": "binary",
    # Up-weight the positive class by the negative/positive ratio computed above.
    "scale_pos_weight": ratio,
    # Tree-growth settings.
    "learning_rate": 0.01,
    "num_leaves": 31,
    "min_data_in_leaf": 200,
    "n_estimators": 800,
    # Column sub-sampling per tree, and row sub-sampling re-drawn every 3 iterations.
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 3,
    # L1 / L2 regularisation.
    "reg_alpha": 0.5,
    "reg_lambda": 1.0,
    # Reproducibility and parallelism.
    "random_state": 42,
    "n_jobs": -1
}


In [None]:
from lightgbm import LGBMClassifier, log_evaluation
from sklearn.metrics import roc_auc_score

# Stratified K-fold training: fit one LGBMClassifier per fold, store its
# out-of-fold predictions for the held-out training rows, and store its
# predictions for the whole test set in that fold's column.
lgb_models = []

# "id" is excluded from the model inputs. Dropping it once here, rather than
# inside the loop, avoids re-copying both frames on every fold.
X_features = X.drop(columns=["id"])
X_test_features = X_test.drop(columns=["id"])

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):

    # X_train / X_val keep their names: later cells read X_train.columns.
    X_train = X_features.iloc[train_idx]
    X_val   = X_features.iloc[val_idx]

    y_train = y.iloc[train_idx]
    y_val   = y.iloc[val_idx]

    model = LGBMClassifier(**model_params)

    # The validation fold is only monitored (AUC logged every 100 rounds);
    # no early stopping is configured, so all n_estimators rounds are trained.
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric="auc",
        callbacks=[log_evaluation(100)]
    )

    lgb_models.append(model)

    # Positive-class probability for the held-out rows of this fold.
    val_preds = model.predict_proba(X_val)[:, 1]
    lgb_oof[val_idx] = val_preds

    # Positive-class probability for every test row, stored per fold.
    lgb_test_folds[:, fold] = model.predict_proba(X_test_features)[:, 1]

    fold_auc = roc_auc_score(y_val, val_preds)
    print(f"Fold {fold} AUC: {fold_auc:.4f}")


[LightGBM] [Info] Number of positive: 13884, number of negative: 367051
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.120856 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1302
[LightGBM] [Info] Number of data points in the train set: 380935, number of used features: 49
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036447 -> initscore=-3.274764
[LightGBM] [Info] Start training from score -3.274764
[100]	valid_0's auc: 0.63191	valid_0's binary_logloss: 0.404895
[200]	valid_0's auc: 0.633381	valid_0's binary_logloss: 0.549774
[300]	valid_0's auc: 0.634543	valid_0's binary_logloss: 0.609391
[400]	valid_0's auc: 0.635135	valid_0's binary_logloss: 0.63041
[500]	valid_0's auc: 0.635549	valid_0's binary_logloss: 0.636433
[600]	valid_0's auc: 0.63576	valid_0's binary_logloss: 0.636509
[700]	valid_0's auc: 0.636022	valid_0's binary_loglos

In [None]:
# Score the pooled out-of-fold predictions against the training labels.
auc = roc_auc_score(y, lgb_oof)
gini = auc * 2 - 1  # Gini coefficient derived from the AUC

for label, value in (("CV AUC:", auc), ("CV Gini:", gini)):
    print(label, value)


CV AUC: 0.6401863502376878
CV Gini: 0.28037270047537555


In [None]:
# Reload the test set and split it into submission ids and model features.
X_test_new = pd.read_csv("test_data.csv")

# The "id" column is required, not optional: it is read unconditionally below
# and is what the submission rows are keyed on. Fail with a clear message
# instead of a bare KeyError if it is missing.
if "id" not in X_test_new.columns:
    raise KeyError('test_data.csv has no "id" column; it is required for the submission ids')
test_ids = X_test_new["id"]

# Everything except the identifier is a model feature.
X_test_new = X_test_new.drop(columns=["id"])


In [None]:
# Sanity check: compare the last fold's training features with the test features.
for frame in (X_train, X_test_new):
    print(frame.shape)

train_cols, test_cols = set(X_train.columns), set(X_test_new.columns)

# Columns present on one side only; both sets should be empty.
print(train_cols - test_cols)
print(test_cols - train_cols)



(380936, 50)
(119043, 50)
set()
set()


In [None]:
# Put the test columns in the same order as the training features
# (raises KeyError if a training column is missing from the test frame).
X_test_new = X_test_new.loc[:, X_train.columns]


In [None]:
# Average the positive-class probability over all trained fold models.
# Summation starts from a zero vector, so the accumulation order is the same
# as a running "+=" over the models.
test_probs = sum(
    (fold_model.predict_proba(X_test_new)[:, 1] for fold_model in lgb_models),
    np.zeros(len(X_test_new)),
) / len(lgb_models)



In [None]:
# Quick visual check of the fold-averaged test probabilities.
print(test_probs)

[0.36450181 0.57505288 0.2895587  ... 0.66674672 0.50072682 0.33083109]


In [None]:
# Pair each test id with its fold-averaged probability and write the submission file.
submission_lgb = pd.DataFrame(dict(id=test_ids, target=test_probs))
submission_lgb.to_csv("submission_lgb_raw.csv", index=False)


In [None]:
# Collapse the per-fold test predictions into one averaged prediction per test row.
lgb_test = np.mean(lgb_test_folds, axis=1)


In [None]:
# Persist the fold-averaged test predictions, keyed by row position.
# The path is relative to the working directory, like the notebook's other
# CSV outputs, so the cell also runs where "/content" does not exist.
# On Colab the default working directory is /content, so the file lands in
# the same place as before.
lgb_test_out = pd.DataFrame({
    "row_idx": np.arange(len(lgb_test)),
    "lgb_test": lgb_test
})
lgb_test_out.to_csv("lgb_test.csv", index=False)


In [None]:
# Persist the out-of-fold predictions, keyed by training-row position.
oof_frame = pd.DataFrame(
    dict(row_idx=np.arange(lgb_oof.shape[0]), lgb_oof=lgb_oof)
)
oof_frame.to_csv("lgb_oof.csv", index=False)
