# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# ============================================================
# 1. LOAD DATA
# ============================================================
# Read the two competition CSVs; the first path is the labelled training data,
# the second is the file to be scored.
train_df, test_df = map(pd.read_csv, ('/content/train.csv', '/content/test.csv'))


# ============================================================
# 2. DROP USELESS COLUMNS
# ============================================================
# Columns removed from both frames before modelling.
cols_to_drop = ['id', 'CustomerId', 'Surname']

# Keep the test ids aside first: they are needed again for the submission file
# and would otherwise be lost when 'id' is dropped.
test_ids = test_df['id']

train_df = train_df.drop(columns=cols_to_drop)
test_df = test_df.drop(columns=cols_to_drop)


# ============================================================
# 3. ONE HOT ENCODE CATEGORICAL FEATURES
# ============================================================
categorical_cols = ['Gender', 'Geography']

# Pin both frames to the category levels observed in the training data.
# Encoding each frame independently with drop_first=True drops whichever level
# sorts first *in that frame*, so a test file with a missing or extra level
# would silently get a different baseline than the training data.
# Test values not seen in training become NaN here and therefore encode as
# all-zero dummies (i.e. the baseline level).
for col in categorical_cols:
    levels = sorted(train_df[col].dropna().unique())
    train_df[col] = pd.Categorical(train_df[col], categories=levels)
    test_df[col] = pd.Categorical(test_df[col], categories=levels)

train_df = pd.get_dummies(train_df, columns=categorical_cols, drop_first=True)
test_df = pd.get_dummies(test_df, columns=categorical_cols, drop_first=True)

# Give the test frame exactly the training feature columns, in the same order
# (everything except the 'Exited' target); absent columns are filled with 0.
test_df = test_df.reindex(columns=train_df.drop('Exited', axis=1).columns, fill_value=0)


# ============================================================
# 4. OUTLIER REMOVAL (IQR)
# ============================================================
# Columns screened by the IQR outlier filter, in the order they are processed.
numeric_cols = [
    'CreditScore',
    'Age',
    'Tenure',
    'Balance',
    'EstimatedSalary',
]

def remove_outliers_iqr(df, columns, factor=1.5):
    """Drop rows that fall outside the IQR fences of the given columns.

    Columns are processed one after another, and each column's quartiles are
    computed on the rows that survived the previous columns' filters. The
    result can therefore depend on the order of ``columns``.

    Args:
        df: DataFrame to filter. It is not modified; a copy is filtered.
        columns: Iterable of column labels to screen, in processing order.
        factor: Multiplier applied to the IQR to place the fences. The
            default of 1.5 reproduces the original hard-coded behaviour.

    Returns:
        A new DataFrame holding only the rows whose value in every listed
        column lies within ``[Q1 - factor * IQR, Q3 + factor * IQR]``
        (bounds inclusive). Original index labels are kept; the index is not
        reset. Rows with a missing value in a screened column fail the range
        check and are dropped as well.
    """
    df_clean = df.copy()
    for col in columns:
        q1 = df_clean[col].quantile(0.25)
        q3 = df_clean[col].quantile(0.75)
        spread = q3 - q1
        lower = q1 - factor * spread
        upper = q3 + factor * spread
        # between() is inclusive on both ends, matching `>= lower & <= upper`.
        df_clean = df_clean[df_clean[col].between(lower, upper)]
    return df_clean

# Only the training rows are filtered; the test frame is left untouched.
train_df = remove_outliers_iqr(df=train_df, columns=numeric_cols)


# ============================================================
# 5. TRAIN–VALID SPLIT
# ============================================================
# Separate the features from the 'Exited' target.
X = train_df.drop('Exited', axis=1)
y = train_df['Exited']

# Split BEFORE scaling. Fitting the scaler on all rows would let the
# validation rows influence the mean/std used to transform the training data,
# which leaks validation information into preprocessing.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training part only, then apply it unchanged to the
# validation part (and, later, to the test set).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)

# Kept so that any other cell still referring to X_scaled keeps working;
# it is no longer used to fit the scaler or to build the split.
X_scaled = scaler.transform(X)


# ============================================================
# 6. RANDOM FOREST MODEL
# ============================================================
# Hyperparameters gathered in one mapping so they can be inspected or logged
# alongside the fitted model.
rf_params = {
    'n_estimators': 200,
    'max_depth': 20,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'max_features': 'sqrt',
    'random_state': 42,
}

model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)


# ============================================================
# 7. VALIDATION SCORES
# ============================================================
# Hard labels feed accuracy, F1 and the confusion matrix; ROC-AUC is computed
# from column 1 of predict_proba instead.
y_pred = model.predict(X_val)
y_proba = model.predict_proba(X_val)[:, 1]

validation_report = (
    ("Accuracy:", accuracy_score(y_val, y_pred)),
    ("ROC-AUC:", roc_auc_score(y_val, y_proba)),
    ("F1 Score:", f1_score(y_val, y_pred)),
    ("Confusion Matrix:\n", confusion_matrix(y_val, y_pred)),
)
for label, value in validation_report:
    print(label, value)


# ============================================================
# 8. TEST SET PREDICTION
# ============================================================
# Transform only: the scaler is reused as already fitted, not refitted on test data.
X_test_scaled = scaler.transform(test_df)

# predict_proba yields one column per class; column 1 is kept as the score
# (presumably the Exited == 1 class — confirm against model.classes_).
test_class_proba = model.predict_proba(X_test_scaled)
test_proba = test_class_proba[:, 1]


# ============================================================
# 9. SUBMISSION FILE
# ============================================================
# Two-column submission: the ids saved before the drop, next to the predicted
# probabilities for the test rows.
submission_columns = {'id': test_ids, 'Exited': test_proba}
sub_df = pd.DataFrame(submission_columns)

# index=False keeps the DataFrame index out of the written file.
sub_df.to_csv('submission_Bank.csv', index=False)

# Last expression of the cell: shows the first rows as a quick sanity check.
sub_df.head()
