In [None]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score

from catboost import CatBoostClassifier, Pool

# =========================
# File Upload Handling
# =========================
# Choose the training CSV: prefer the Colab upload widget, otherwise fall back
# to a CSV file that already sits in the working directory.
uploaded = {}
try:
    from google.colab import files
    uploaded = files.upload()
except ImportError:
    # Not running inside Colab; use the local-directory fallback below.
    pass
except Exception as exc:
    # Keep the best-effort fallback, but say why the upload path was skipped.
    # Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit through.
    print(f"[WARN] Colab upload failed ({exc}); looking for a local CSV instead.")

if uploaded:
    # When several files are uploaded, the last one is used.
    TRAIN_PATH = list(uploaded)[-1]
    print(f"[OK] Using dataset: {TRAIN_PATH}")
else:
    # Sorted so the chosen file does not depend on os.listdir() ordering.
    files_in_dir = sorted(f for f in os.listdir('.') if f.endswith('.csv'))
    if not files_in_dir:
        raise FileNotFoundError("No CSV file found. Please upload your dataset.")
    TRAIN_PATH = files_in_dir[0]
    print(f"[OK] Using dataset: {TRAIN_PATH}")

# =========================
# Load Dataset
# =========================
# Read the selected CSV into a DataFrame and report its dimensions.
df = pd.read_csv(TRAIN_PATH)
print("[INFO] Loaded shape:", df.shape)

# Separate the label column from the feature matrix.
TARGET = "TARGET"
y = df[TARGET]
X = df.drop(columns=[TARGET])

# Group feature names by dtype: object/category columns are treated as
# categorical, anything numpy considers a number as numeric.
categorical_view = X.select_dtypes(include=["object", "category"])
numeric_view = X.select_dtypes(include=[np.number])
cat_cols_all = categorical_view.columns.tolist()
num_cols_all = numeric_view.columns.tolist()

# =========================
# Preprocessing for Logistic Regression
# =========================
# Numeric features: fill gaps with the column median, then standardize.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=True)),
])

# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_cols_all),
    ("cat", categorical_transformer, cat_cols_all),
])

# =========================
# Train/Validation Split
# =========================
# Hold out 20% of the rows for validation; the split is stratified on y and
# uses a fixed seed so it is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# =========================
# Logistic Regression Model
# =========================
print("[INFO] Training Logistic Regression ...")

# Chain the preprocessor with the classifier so imputation, scaling and
# encoding are fitted on the training rows only.
log_reg = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000)),
])
log_reg.fit(X_train, y_train)

# Keep the second predict_proba column (second entry of classes_) and score
# it against the validation labels.
valid_proba_lr = log_reg.predict_proba(X_valid)
probs_lr = valid_proba_lr[:, 1]
auc_lr = roc_auc_score(y_valid, probs_lr)
print(f"[RESULT] Logistic Regression AUC: {auc_lr:.4f}")

# =========================
# CatBoost Model
# =========================
print("[INFO] Training CatBoost ...")

# Work on explicit copies so the column assignments below never write into
# frames derived from X (avoids pandas chained-assignment warnings).
X_train = X_train.copy()
X_valid = X_valid.copy()

# Categorical columns are handed to CatBoost as strings with missing values
# replaced by "Missing". The replacement must happen BEFORE the string cast:
# astype(str) turns NaN into the literal "nan", after which fillna() has
# nothing left to fill. Casting to object first also lets "Missing" be
# inserted into columns of dtype category.
for col in cat_cols_all:
    for frame in (X_train, X_valid):
        values = frame[col].astype(object)
        frame[col] = values.where(values.notna(), "Missing").astype(str)

# Positional indices of the categorical columns within the training frame.
cat_indices = [X_train.columns.get_loc(c) for c in cat_cols_all]

train_pool = Pool(X_train, y_train, cat_features=cat_indices)
valid_pool = Pool(X_valid, y_valid, cat_features=cat_indices)

cat_model = CatBoostClassifier(
    iterations=500,
    depth=8,
    learning_rate=0.1,
    loss_function="Logloss",
    eval_metric="AUC",
    verbose=False,
    random_seed=42
)

# NOTE: use_best_model=True picks the iteration count using valid_pool, and
# the AUC below is computed on that same pool, so it is an optimistic
# estimate. A separate test split would be needed for an unbiased score.
cat_model.fit(train_pool, eval_set=valid_pool, use_best_model=True)
probs_cat = cat_model.predict_proba(valid_pool)[:, 1]
auc_cat = roc_auc_score(y_valid, probs_cat)
print(f"[RESULT] CatBoost AUC: {auc_cat:.4f}")

# =========================
# Business Cost Optimization
# =========================
# Per-error costs. confusion_matrix treats label 1 as the positive class, so a
# false positive is "predicted 1, actual 0" and a false negative is
# "predicted 0, actual 1".
# NOTE(review): the business wording assumes TARGET == 1 marks an applicant
# who defaults and that a positive prediction means the loan is refused;
# confirm against the dataset's data dictionary.
COST_FALSE_POSITIVE = 100  # Rejecting good customer
COST_FALSE_NEGATIVE = 500  # Approving bad loan

# Candidate cut-offs: 0.00, 0.01, ..., 1.00.
thresholds = np.linspace(0, 1, 101)

best_threshold = 0.5
best_cost = float("inf")

for t in thresholds:
    preds = (probs_cat >= t).astype(int)
    # Pin the label order so the matrix is always 2x2, even when a threshold
    # (or the validation labels) leaves one class absent; otherwise the
    # four-way unpacking below fails.
    cm = confusion_matrix(y_valid, preds, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    total_cost = fp * COST_FALSE_POSITIVE + fn * COST_FALSE_NEGATIVE
    # Strict "<" keeps the lowest threshold when several tie on cost.
    if total_cost < best_cost:
        best_cost = total_cost
        best_threshold = t

print(f"[OPTIMIZATION] Best Threshold: {best_threshold:.2f}, Min Business Cost: {best_cost}")

# =========================
# Evaluation at Optimal Threshold
# =========================
# Turn the CatBoost probabilities into 0/1 labels at the selected cut-off,
# then report the confusion matrix and accuracy on the validation labels.
final_preds = np.where(probs_cat >= best_threshold, 1, 0)
cm_final = confusion_matrix(y_valid, final_preds)
final_accuracy = accuracy_score(y_valid, final_preds)
print("[CONFUSION MATRIX at Optimal Threshold]:\n", cm_final)
print("[Accuracy]", final_accuracy)

# =========================
# Feature Importances
# =========================
print("[Top 20 CatBoost Features]:")
# Label each importance score with its feature column name, then show the
# 20 largest scores in descending order.
feat_importances = pd.Series(
    cat_model.get_feature_importance(),
    index=X.columns,
)
ranked_importances = feat_importances.sort_values(ascending=False)
print(ranked_importances.iloc[:20])

# Save results
# Collect the validation labels, CatBoost probabilities and thresholded
# predictions side by side and write them to disk without the index column.
results = pd.DataFrame({"true": y_valid}).assign(
    probability=probs_cat,
    prediction=final_preds,
)
results.to_csv("validation_scores.csv", index=False)
print("[SAVED] validation_scores.csv")


Saving HC_application_train.csv to HC_application_train (2).csv
[OK] Using dataset: HC_application_train (2).csv
[INFO] Loaded shape: (307511, 122)
[INFO] Training Logistic Regression ...
[RESULT] Logistic Regression AUC: 0.7483
[INFO] Training CatBoost ...
[RESULT] CatBoost AUC: 0.7620
[OPTIMIZATION] Best Threshold: 0.18, Min Business Cost: 2064700
[CONFUSION MATRIX at Optimal Threshold]:
 [[52341  4197]
 [ 3290  1675]]
[Accuracy] 0.8782661008406094
[Top 20 CatBoost Features]:
EXT_SOURCE_3                  11.391126
EXT_SOURCE_2                  10.636839
DAYS_BIRTH                     5.399223
EXT_SOURCE_1                   4.878071
AMT_CREDIT                     4.728810
AMT_GOODS_PRICE                4.337385
AMT_ANNUITY                    4.196658
DAYS_EMPLOYED                  3.071948
DAYS_LAST_PHONE_CHANGE         2.967281
DAYS_ID_PUBLISH                2.962375
DAYS_REGISTRATION              2.742720
AMT_INCOME_TOTAL               2.458882
CODE_GENDER                    2.1141