<a href="https://colab.research.google.com/github/Faryalrifaz/First-Notebook-on-Kaggle/blob/main/bank_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# Notebook setup cell: imports kagglehub and calls its login() helper before
# any competition data is requested.
# NOTE(review): login() presumably prompts for Kaggle credentials
# interactively -- confirm before running this unattended.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Request the 'playground-series-s5e8' competition files through kagglehub and
# keep the value it returns (presumably the local download path -- confirm
# against the kagglehub documentation).
playground_series_s5e8_path = kagglehub.competition_download('playground-series-s5e8')

print('Data source import complete.')


In [None]:
# Kaggle Playground 2025-08: Bank Term Deposit — CV-safe CatBoost baseline
# Author: Faryal | Seeded, fold-logged, submission-ready
# Notes:
# - Auto-detects train/test under /kaggle/input where possible.
# - Uses StratifiedKFold, CatBoost (categoricals), early stopping.
# - Falls back to LightGBM with simple categorical handling if CatBoost missing.

import os
import sys
import glob
import random
import gc
from typing import List, Tuple, Optional, Dict

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# Global reproducibility settings shared by every CV routine in this file.
SEED = 42
FOLDS = 5
RANDOM_STATE = SEED


def seed_everything(seed: int = SEED):
    """Seed Python's and NumPy's global RNGs and export PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to ``random``, ``numpy.random`` and written
            (as a string) to the ``PYTHONHASHSEED`` environment variable.
    """
    for reseed in (random.seed, np.random.seed):
        reseed(seed)
    # NOTE: the hash seed of the running interpreter is fixed at start-up, so
    # this assignment only influences processes launched afterwards.
    os.environ["PYTHONHASHSEED"] = str(seed)


# Seed once at import / cell-execution time.
seed_everything(SEED)

def find_competition_root(input_root: str = "/kaggle/input") -> Optional[str]:
    """Locate the competition data folder directly under ``input_root``.

    A sub-directory qualifies when it contains ``train.csv``, ``test.csv`` and
    ``sample_submission.csv``.  Candidates are examined in sorted order so the
    selection is reproducible; ``glob`` itself gives no ordering guarantee.

    Args:
        input_root: Directory whose immediate children are scanned.

    Returns:
        The first qualifying directory whose name contains "playground" or
        "ps"; otherwise the first qualifying directory; ``None`` when
        ``input_root`` is not a directory or nothing qualifies.
    """
    if not os.path.isdir(input_root):
        return None

    required = ("train.csv", "test.csv", "sample_submission.csv")
    candidates = [
        path
        for path in sorted(glob.glob(os.path.join(input_root, "*")))
        if os.path.isdir(path)
        and all(os.path.isfile(os.path.join(path, fname)) for fname in required)
    ]
    if not candidates:
        return None

    # Prefer paths that look like a playground series dataset.
    for path in candidates:
        name = os.path.basename(path).lower()
        if "playground" in name or "ps" in name:
            return path
    return candidates[0]

def load_data(root: Optional[str]=None) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Read the train, test and sample-submission CSVs from one folder.

    Args:
        root: Folder holding ``train.csv``, ``test.csv`` and
            ``sample_submission.csv``.  When ``None`` the folder is looked up
            with ``find_competition_root()``.

    Returns:
        ``(train, test, sample)`` as DataFrames, in that order.

    Raises:
        FileNotFoundError: If no folder was given and none could be detected.
    """
    data_root = find_competition_root() if root is None else root
    if data_root is None:
        raise FileNotFoundError("Could not auto-detect competition data. Set DATA_ROOT manually to the folder containing train.csv/test.csv.")
    print(f"[Info] Using data root: {data_root}")

    file_names = ("train.csv", "test.csv", "sample_submission.csv")
    frames = [pd.read_csv(os.path.join(data_root, fname)) for fname in file_names]
    return frames[0], frames[1], frames[2]

def infer_columns(df: pd.DataFrame) -> Dict[str, Optional[str]]:
    """Work out which columns hold the row id and the target.

    Args:
        df: Training frame to inspect.

    Returns:
        ``{"id": ..., "target": ...}`` where ``id`` is ``"id"`` when that
        column exists (else ``None``) and ``target`` is ``"y"`` if present,
        otherwise ``"target"``.

    Raises:
        KeyError: If neither ``"y"`` nor ``"target"`` is a column of ``df``.
    """
    names = df.columns.tolist()

    # "y" takes precedence over "target" when both exist.
    target_col = next((name for name in ("y", "target") if name in names), None)
    if target_col is None:
        raise KeyError("Target column not found. Expected 'y' or 'target'.")

    id_col = None
    if "id" in names:
        id_col = "id"
    return {"id": id_col, "target": target_col}

def detect_feature_types(df: pd.DataFrame, id_col: Optional[str], target_col: Optional[str], max_int_categories: int = 32) -> Tuple[List[str], List[str]]:
    """Split the feature columns of ``df`` into numeric and categorical lists.

    A column is categorical when its dtype is object, a pandas string dtype or
    a pandas categorical dtype, or when it is an integer column (NumPy or
    nullable) with at most ``max_int_categories`` distinct non-null values.
    Everything else is numeric.

    Args:
        df: Frame to inspect.
        id_col: Identifier column to exclude (may be ``None``).
        target_col: Target column to exclude (may be ``None``).
        max_int_categories: Largest distinct-value count at which an integer
            column is still treated as categorical.

    Returns:
        ``(num_cols, cat_cols)``, each in the column order of ``df``.
    """
    feature_cols = [c for c in df.columns if c not in [id_col, target_col] and c is not None]
    cat_cols: List[str] = []
    num_cols: List[str] = []
    for c in feature_cols:
        dtype = df[c].dtype
        # Comparing against "object" alone would miss string/category dtypes,
        # which would then be coerced to NaN by numeric conversion downstream.
        non_numeric = (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )
        if non_numeric:
            cat_cols.append(c)
        elif (
            pd.api.types.is_integer_dtype(dtype)
            and df[c].nunique(dropna=True) <= max_int_categories
        ):
            # Heuristic: small unique counts => treat as categorical
            cat_cols.append(c)
        else:
            num_cols.append(c)
    print(f"[Info] Detected {len(feature_cols)} features -> {len(num_cols)} numeric, {len(cat_cols)} categorical")
    return num_cols, cat_cols

def safe_numeric_transform(df: pd.DataFrame, num_cols: List[str]) -> pd.DataFrame:
    """Return a copy of ``df`` with numeric columns clipped to their own tails.

    Each column of ``num_cols`` whose dtype kind is one of ``b i u f c`` is
    clipped to its 0.5th and 99.5th percentiles; other columns are left alone.

    Args:
        df: Input frame (not modified).
        num_cols: Names of the columns to consider for clipping.

    Returns:
        A new DataFrame holding the clipped values.
    """
    clipped = df.copy()
    for col in num_cols:
        series = clipped[col]
        if series.dtype.kind not in "biufc":
            continue
        bounds = series.quantile([0.005, 0.995])
        lower, upper = bounds.iloc[0], bounds.iloc[1]
        clipped[col] = series.clip(lower, upper)
    return clipped

def prepare_catboost_data(train: pd.DataFrame, test: pd.DataFrame, id_col: Optional[str], target_col: str) -> Tuple[pd.DataFrame, pd.DataFrame, List[int]]:
    """Build model-ready train/test frames and the categorical column indices.

    Feature types are detected on ``train`` via ``detect_feature_types``.
    Categorical columns become string-valued pandas categoricals that share
    one category set across train and test; numeric columns are coerced with
    ``pd.to_numeric`` and clipped to the 0.5th/99.5th percentiles *of the
    training data*, so both frames receive the same transformation.

    The inputs are not modified; the function works on copies.

    Args:
        train: Training frame including ``target_col``.
        test: Test frame with the same feature columns.
        id_col: Identifier column to exclude from the features (or ``None``).
        target_col: Name of the target column in ``train``.

    Returns:
        ``(train_out, test_out, cat_idx)`` where ``train_out`` holds the
        features followed by the target, ``test_out`` holds the features, and
        ``cat_idx`` lists the positions of the categorical features.
    """
    num_cols, cat_cols = detect_feature_types(train, id_col, target_col)
    features = num_cols + cat_cols

    # Copy only what is returned so the caller's frames stay untouched.
    train = train[features + [target_col]].copy()
    test = test[features].copy()

    # Align dtypes: string-valued categories, identical in train and test.
    missing_token = "__MISSING__"
    for c in cat_cols:
        tr_obj = train[c].astype(object)
        te_obj = test[c].astype(object)
        tr_str = tr_obj.where(tr_obj.notna(), missing_token).astype(str)
        te_str = te_obj.where(te_obj.notna(), missing_token).astype(str)
        shared = pd.CategoricalDtype(categories=sorted(set(tr_str) | set(te_str)))
        train[c] = tr_str.astype(shared)
        test[c] = te_str.astype(shared)

    for c in num_cols:
        # Ensure numeric
        train[c] = pd.to_numeric(train[c], errors="coerce")
        test[c] = pd.to_numeric(test[c], errors="coerce")
        # Clip outliers with bounds learned on train only, applied to both.
        if train[c].dtype.kind in "biufc":
            lo, hi = train[c].quantile([0.005, 0.995])
            train[c] = train[c].clip(lo, hi)
            test[c] = test[c].clip(lo, hi)

    # Categorical features sit after the numeric ones in ``features``.
    cat_idx = list(range(len(num_cols), len(features)))
    return train, test, cat_idx

def train_catboost_cv(train_df: pd.DataFrame, test_df: pd.DataFrame, target_col: str, cat_idx: List[int]) -> Tuple[np.ndarray, np.ndarray, float, dict]:
    """Train CatBoost with stratified K-fold CV and average test predictions.

    If ``catboost`` cannot be imported the call is delegated to
    ``train_lightgbm_cv``.

    Args:
        train_df: Features plus ``target_col``.
        test_df: Features only, in the same column order as ``train_df``.
        target_col: Name of the target column in ``train_df``.
        cat_idx: Positions of the categorical features.

    Returns:
        ``(oof, test_pred, cv_auc, extras)``: out-of-fold probabilities, the
        fold-averaged test probabilities, the AUC over all OOF predictions,
        and a dict with ``fold_scores`` and the last fold's
        ``feature_importance``.
    """
    try:
        from catboost import CatBoostClassifier, Pool
    except Exception as e:
        # Deliberately broad: any import-time failure triggers the fallback.
        print("[Warn] CatBoost not available, falling back to LightGBM. Error:", e)
        return train_lightgbm_cv(train_df, test_df, target_col)

    X = train_df.drop(columns=[target_col])
    y = train_df[target_col].values

    skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=RANDOM_STATE)
    oof = np.zeros(len(train_df), dtype=float)
    test_pred = np.zeros(len(test_df), dtype=float)
    fold_scores = []
    feature_importance = None

    params = dict(
        loss_function="Logloss",
        eval_metric="AUC",
        random_state=RANDOM_STATE,
        learning_rate=0.03,
        depth=6,
        l2_leaf_reg=3.0,
        iterations=10000,
        od_type="Iter",
        od_wait=500,
        verbose=200
    )

    # The test pool does not depend on the fold, so build it once.
    test_pool = Pool(test_df, cat_features=cat_idx)

    for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), 1):
        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
        y_tr, y_va = y[tr_idx], y[va_idx]

        train_pool = Pool(X_tr, label=y_tr, cat_features=cat_idx)
        valid_pool = Pool(X_va, label=y_va, cat_features=cat_idx)

        model = CatBoostClassifier(**params)
        model.fit(train_pool, eval_set=valid_pool, use_best_model=True)

        preds_va = model.predict_proba(valid_pool)[:, 1]
        preds_te = model.predict_proba(test_pool)[:, 1]

        oof[va_idx] = preds_va
        # Equal-weight average over the folds.
        test_pred += preds_te / FOLDS

        fold_auc = roc_auc_score(y_va, preds_va)
        fold_scores.append(fold_auc)
        print(f"[Fold {fold}] AUC: {fold_auc:.6f}")

        # Save FI from last fold (CatBoost: PredictionValuesChange)
        feature_importance = dict(zip(X.columns, model.get_feature_importance(type='PredictionValuesChange')))

        # Release per-fold objects before the next fold allocates its own.
        del model, train_pool, valid_pool
        gc.collect()

    del test_pool
    gc.collect()

    cv_auc = roc_auc_score(y, oof)
    print(f"[CV] OOF AUC: {cv_auc:.6f} | Folds: {[round(s,6) for s in fold_scores]}")
    return oof, test_pred, cv_auc, {"fold_scores": fold_scores, "feature_importance": feature_importance}

def train_lightgbm_cv(train_df: pd.DataFrame, test_df: pd.DataFrame, target_col: str) -> Tuple[np.ndarray, np.ndarray, float, dict]:
    """Train LightGBM with stratified K-fold CV (fallback when CatBoost is absent).

    Categorical columns (object, string or categorical dtype) with at most 32
    distinct values are one-hot encoded; the rest are replaced by their
    training-set frequency.  Early stopping and periodic logging are passed as
    callbacks, because ``lgb.train`` no longer accepts the
    ``early_stopping_rounds`` / ``verbose_eval`` keywords in LightGBM 4.x.

    Args:
        train_df: Features plus ``target_col``.
        test_df: Features only.
        target_col: Name of the target column in ``train_df``.

    Returns:
        ``(oof, test_pred, cv_auc, extras)``: out-of-fold predictions, the
        fold-averaged test predictions, the AUC over all OOF predictions, and
        a dict with ``fold_scores``.
    """
    import lightgbm as lgb

    # Simple categorical handling: one-hot <= 32 unique, frequency encode > 32
    y = train_df[target_col].values
    X = train_df.drop(columns=[target_col]).copy()
    X_te = test_df.copy()

    def _is_categorical(col: pd.Series) -> bool:
        dtype = col.dtype
        return (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )

    # Determine cat vs numeric again
    cat_cols = [c for c in X.columns if _is_categorical(X[c])]
    low_card = [c for c in cat_cols if X[c].nunique(dropna=True) <= 32]
    high_card = [c for c in cat_cols if c not in low_card]

    # One-hot low-card
    X = pd.get_dummies(X, columns=low_card, dummy_na=True)
    X_te = pd.get_dummies(X_te, columns=low_card, dummy_na=True)

    # Align columns: keep train's columns, zero-fill those missing from test.
    X, X_te = X.align(X_te, join="left", axis=1, fill_value=0)

    # Frequency encode high-card
    for c in high_card:
        # Go through object dtype so mapping a categorical column yields plain
        # numbers instead of another categorical that rejects fillna(0).
        tr_raw = train_df[c].astype(object)
        te_raw = test_df[c].astype(object)
        freq = tr_raw.value_counts(dropna=True)
        X[c+"_freq"] = tr_raw.map(freq).fillna(0).astype(float).values
        X_te[c+"_freq"] = te_raw.map(freq).fillna(0).astype(float).values
        # Drop raw col
        X.drop(columns=[c], inplace=True, errors="ignore")
        X_te.drop(columns=[c], inplace=True, errors="ignore")

    skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=RANDOM_STATE)
    oof = np.zeros(len(X), dtype=float)
    test_pred = np.zeros(len(X_te), dtype=float)
    fold_scores = []

    params = dict(
        objective="binary",
        metric="auc",
        learning_rate=0.03,
        num_leaves=64,
        feature_fraction=0.8,
        bagging_fraction=0.8,
        bagging_freq=1,
        min_data_in_leaf=20,
        lambda_l1=0.0,
        lambda_l2=0.0,
        max_depth=-1,
        random_state=RANDOM_STATE,
        verbose=-1
    )
    # Upper bound on boosting rounds; early stopping normally ends sooner.
    max_rounds = 10000

    # ``log_evaluation`` replaced ``print_evaluation`` in newer releases.
    log_evaluation = getattr(lgb, "log_evaluation", None) or lgb.print_evaluation

    for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), 1):
        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
        y_tr, y_va = y[tr_idx], y[va_idx]

        lgb_train = lgb.Dataset(X_tr, label=y_tr)
        lgb_valid = lgb.Dataset(X_va, label=y_va, reference=lgb_train)

        model = lgb.train(
            params,
            lgb_train,
            num_boost_round=max_rounds,
            valid_sets=[lgb_train, lgb_valid],
            valid_names=["train", "valid"],
            # Fresh callbacks per fold: the early-stopping callback is stateful.
            callbacks=[
                lgb.early_stopping(stopping_rounds=500),
                log_evaluation(period=200),
            ],
        )

        preds_va = model.predict(X_va, num_iteration=model.best_iteration)
        preds_te = model.predict(X_te, num_iteration=model.best_iteration)

        oof[va_idx] = preds_va
        # Equal-weight average over the folds.
        test_pred += preds_te / FOLDS

        fold_auc = roc_auc_score(y_va, preds_va)
        fold_scores.append(fold_auc)
        print(f"[Fold {fold}] AUC: {fold_auc:.6f}")

        del model, lgb_train, lgb_valid
        gc.collect()

    cv_auc = roc_auc_score(y, oof)
    print(f"[CV] OOF AUC: {cv_auc:.6f} | Folds: {[round(s,6) for s in fold_scores]}")
    return oof, test_pred, cv_auc, {"fold_scores": fold_scores}

def _resolve_data_root() -> Optional[str]:
    """Return an explicitly configured data folder, or ``None`` to auto-detect.

    Checked in order: a module-level ``DATA_ROOT`` variable, the ``DATA_ROOT``
    environment variable, and the module-level ``playground_series_s5e8_path``
    set by the kagglehub download cell.  A candidate is used only when it
    contains ``train.csv``.
    """
    namespace = globals()
    configured = [
        ("DATA_ROOT variable", namespace.get("DATA_ROOT")),
        ("DATA_ROOT environment variable", os.environ.get("DATA_ROOT")),
        ("kagglehub download path", namespace.get("playground_series_s5e8_path")),
    ]
    for label, path in configured:
        if not path:
            continue
        path = str(path)
        if os.path.isfile(os.path.join(path, "train.csv")):
            return path
        print(f"[Warn] Ignoring {label} '{path}': train.csv not found there.")
    return None


def main():
    """Run the pipeline: load data, train with CV, report, write a submission.

    Writes ``submission_<cv_auc>.csv`` to the current working directory.

    Raises:
        ValueError: If the sample submission's ids do not line up row-for-row
            with the test ids, since predictions are assigned positionally.
    """
    # Use an explicitly configured / downloaded folder when there is one;
    # load_data() auto-detects under /kaggle/input when given None.
    train, test, sample = load_data(_resolve_data_root())
    cols = infer_columns(train)
    id_col, target_col = cols["id"], cols["target"]
    print(f"[Info] id_col={id_col}, target_col={target_col}")

    # Basic target check
    pos_rate = train[target_col].mean()
    print(f"[Info] Training rows: {len(train)} | Pos rate (y=1): {pos_rate:.4f}")

    # Prepare data for CatBoost (handles categorical cleanly)
    train_pre, test_pre, cat_idx = prepare_catboost_data(train, test, id_col, target_col)

    # Train CV
    oof, test_pred, cv_auc, extras = train_catboost_cv(train_pre, test_pre, target_col, cat_idx)

    # Simple error slicing example (optional, cheap)
    if id_col and id_col in train.columns:
        try:
            df_oof = train[[id_col, target_col]].copy()
            df_oof["oof"] = oof
            # Mean target per OOF-score decile: a quick ranking sanity check.
            quant = pd.qcut(df_oof["oof"], 10, duplicates="drop")
            by_dec = df_oof.groupby(quant, observed=True)[target_col].mean()
            print("[Info] Target rate by OOF decile (low→high):")
            print(by_dec.to_string())
        except Exception as e:
            # Diagnostics only: never let them block the submission.
            print("[Warn] OOF slicing skipped:", e)

    # Submission
    sub = sample.copy()
    # Ensure correct column names
    if "y" in sub.columns:
        pred_col = "y"
    elif "target" in sub.columns:
        pred_col = "target"
    else:
        # Fallback to 'y' (assumes a two-column id/prediction layout).
        sub.columns = ["id", "y"]
        pred_col = "y"

    # Predictions follow the test row order; refuse a silent misalignment.
    if id_col and id_col in sub.columns and id_col in test.columns:
        if not np.array_equal(sub[id_col].to_numpy(), test[id_col].to_numpy()):
            raise ValueError(
                "sample_submission ids do not match test ids row-for-row; "
                "refusing to write a misaligned submission."
            )
    sub[pred_col] = test_pred

    out_name = f"submission_{cv_auc:.5f}.csv"
    sub.to_csv(out_name, index=False)
    print(f"[Done] CV AUC: {cv_auc:.6f} | Wrote {out_name}")

# Entry point: run the pipeline only when this file is the main module.
if __name__ == "__main__":
    main()