In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
import gc

TRAIN_PATH = "dataset/train.csv"
TEST_PATH = "dataset/test.csv"
SUBMISSION_PATH = "dataset/submission_lgbm.csv"

In [7]:
def load_data(train_path, test_path):
    print("Loading data...")
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    test_ids = test_df['id']
    test_df['y'] = np.nan
    full_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

    print(f"Combined data shape: {full_df.shape}")
    return full_df, test_ids

full_df, test_ids = load_data(TRAIN_PATH, TEST_PATH)
full_df.head()

Loading data...
Combined data shape: (1000000, 18)


Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,0,42,technician,married,secondary,no,7,no,no,cellular,25,aug,117,3,-1,0,unknown,0.0
1,1,38,blue-collar,married,secondary,no,514,no,no,unknown,18,jun,185,1,-1,0,unknown,0.0
2,2,36,blue-collar,married,secondary,no,602,yes,no,unknown,14,may,111,2,-1,0,unknown,0.0
3,3,27,student,single,secondary,no,34,yes,no,unknown,28,may,10,2,-1,0,unknown,0.0
4,4,26,technician,married,secondary,no,889,yes,no,cellular,3,feb,902,1,-1,0,unknown,1.0


In [8]:
def feature_engineering(df):
    print("Starting feature engineering...")
    binary_map = {'yes': 1, 'no': 0}
    for col in ['default', 'housing', 'loan']:
        if col in df.columns:
            df[col] = df[col].map(binary_map)
    df['was_previously_contacted'] = (df['pdays'] != -1).astype(int)

    for col in df.select_dtypes(include='object').columns:
        df[col] = df[col].astype('category')
    
    print("feature engineering complete")
    return df

engineered_df = feature_engineering(full_df)
engineered_df.head()

Starting feature engineering...
feature engineering complete


Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y,was_previously_contacted
0,0,42,technician,married,secondary,0,7,0,0,cellular,25,aug,117,3,-1,0,unknown,0.0,0
1,1,38,blue-collar,married,secondary,0,514,0,0,unknown,18,jun,185,1,-1,0,unknown,0.0,0
2,2,36,blue-collar,married,secondary,0,602,1,0,unknown,14,may,111,2,-1,0,unknown,0.0,0
3,3,27,student,single,secondary,0,34,1,0,unknown,28,may,10,2,-1,0,unknown,0.0,0
4,4,26,technician,married,secondary,0,889,1,0,cellular,3,feb,902,1,-1,0,unknown,1.0,0


In [9]:
def train_lightgbm_model(full_df, test_ids):
    print("Starting model training...")

    train_df = full_df[full_df['y'].notna()].copy()
    test_df = full_df[full_df['y'].isna()].copy()

    features = [col for col in train_df.columns if col not in ['id', 'y']]
    X = train_df[features]
    y = train_df['y']
    X_test = test_df[features]

    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'n_estimators': 2000,
        'learning_rate': 0.01,
        'num_leaves': 20,
        'max_depth': 5,
        'seed': 42,
        'n_jobs': -1,
        'verbose': -1,
        'colsample_bytree': 0.7,
        'subsample': 0.7,
        'reg_alpha': 0.1,
        'reg_lambda': 0.1,
        'is_unbalance': True
    }
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    oof_preds = np.zeros(len(train_df))
    test_preds = np.zeros(len(test_df))
    fold_scores = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"--- Fold {fold+1}/5 ---")
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model = lgb.LGBMClassifier(**params)
        model.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  eval_metric='auc',
                  callbacks=[lgb.early_stopping(100, verbose=False)])

        val_preds = model.predict_proba(X_val)[:, 1]
        oof_preds[val_idx] = val_preds
        fold_auc = roc_auc_score(y_val, val_preds)
        fold_scores.append(fold_auc)
        print(f"Fold {fold+1} AUC: {fold_auc:.5f}")

        test_preds += model.predict_proba(X_test)[:, 1] / skf.n_splits
        
        del X_train, y_train, X_val, y_val, model
        gc.collect()

    overall_auc = roc_auc_score(y, oof_preds)
    print(f"\nOverall OOF AUC: {overall_auc:.5f}")
    print(f"Mean Fold AUC: {np.mean(fold_scores):.5f} (+/- {np.std(fold_scores):.5f})")

    return test_preds


final_predictions = train_lightgbm_model(engineered_df, test_ids)

Starting model training...
--- Fold 1/5 ---
Fold 1 AUC: 0.96551
--- Fold 2/5 ---
Fold 2 AUC: 0.96447
--- Fold 3/5 ---
Fold 3 AUC: 0.96464
--- Fold 4/5 ---
Fold 4 AUC: 0.96548
--- Fold 5/5 ---
Fold 5 AUC: 0.96490

Overall OOF AUC: 0.96500
Mean Fold AUC: 0.96500 (+/- 0.00043)


In [10]:
def generate_submission(test_ids, predictions, file_path):
    print("Generating submission file...")
    submission_df=pd.DataFrame({'id':test_ids, 'y': predictions})
    submission_df.to_csv(file_path, index=False)
    print(f"Submission file saved to {file_path}")
    return submission_df.head()

generate_submission(test_ids, final_predictions, SUBMISSION_PATH)

Generating submission file...
Submission file saved to dataset/submission_lgbm.csv


Unnamed: 0,id,y
0,750000,0.02353
1,750001,0.465735
2,750002,0.003958
3,750003,0.000983
4,750004,0.148244
