In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import requests

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, roc_auc_score, roc_curve, precision_recall_curve, 
                             average_precision_score, confusion_matrix, classification_report)

# SHAP is optional: attempt the import and record the outcome in _HAS_SHAP.
try:
    import shap
except Exception:
    # Any exception raised while importing shap is treated as "not available".
    _HAS_SHAP = False
else:
    _HAS_SHAP = True

# Global notebook configuration: seaborn "whitegrid" plot style, and a fixed
# seed (42) for NumPy's global random state.
sns.set(style="whitegrid")
np.random.seed(42)

# Local copy of the dataset, relative to the current working directory.
# NOTE(review): the file name ends in ".csv", but the download below stores the
# ".xls" workbook from the URL under this name unchanged.
DATA_PATH = Path("default of credit card clients.csv")

if not DATA_PATH.exists():
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls"
    # A timeout keeps the notebook from hanging indefinitely on a stalled connection.
    response = requests.get(url, timeout=60)
    # Fail loudly on an HTTP error (404, 5xx, ...) instead of saving the
    # server's error page as the dataset; a bad file would otherwise satisfy
    # the exists() check above on every later run and never be re-downloaded.
    response.raise_for_status()
    DATA_PATH.write_bytes(response.content)
    print(f"Downloaded dataset to {DATA_PATH}")
else:
    print(f"Dataset already exists at {DATA_PATH}")

# Load dataset
# Try the file as an Excel workbook first and fall back to CSV parsing if that
# fails. skiprows=1 drops the first line in both cases, so the second line
# supplies the column names (presumably the source file carries an extra label
# row above the real header -- confirm against the data).
try:
    df = pd.read_excel(DATA_PATH, skiprows=1)
except Exception:
    # Catch Exception rather than using a bare except, so KeyboardInterrupt and
    # SystemExit still propagate; any ordinary read_excel failure (not an
    # Excel file, missing Excel engine, ...) triggers the CSV fallback.
    df = pd.read_csv(DATA_PATH, encoding='utf-8', skiprows=1)

# Clean column names
def clean_col(c):
    """Return *c* as a lowercase, underscore-separated column name.

    The value is converted with ``str()``, stripped of surrounding whitespace
    and lowercased. Parentheses are removed, hyphens and dots become
    underscores, and each run of whitespace collapses to a single underscore.
    """
    # One translation pass: drop "(" and ")", map "-" and "." to "_".
    table = str.maketrans({"(": None, ")": None, "-": "_", ".": "_"})
    normalized = str(c).strip().lower().translate(table)
    words = normalized.split()
    return "_".join(words)

# Normalize every column label with clean_col.
df.columns = list(map(clean_col, df.columns))

# Ensure numeric types
# Object-dtype columns are parsed as numbers after stripping commas and percent
# signs. A column is replaced only when more than 80% of its values parse;
# otherwise it is left untouched.
for col in df.columns:
    if df[col].dtype != object:
        continue
    as_text = df[col].astype(str)
    stripped = as_text.str.replace(",", "").str.replace("%", "")
    # errors="coerce" turns unparseable entries into NaN instead of raising.
    parsed = pd.to_numeric(stripped, errors="coerce")
    parse_rate = parsed.notna().mean()
    if parse_rate > 0.8:
        df[col] = parsed

# Target column: the name is fixed here, not detected. Values are coerced to
# numbers (unparseable entries become missing) and stored as nullable Int64.
target = "default_payment_next_month"
df[target] = (
    df[target]
    .pipe(pd.to_numeric, errors="coerce")
    .astype("Int64")
)

# Quick EDA
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
# Counts per value of the target column.
print(df[target].value_counts())
