# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

# Read the train and test splits from the /content directory.
train_df = pd.read_csv('/content/train.csv')
test_df = pd.read_csv('/content/test.csv')

# First look at the schema and the leading rows.
train_df.info()
train_df.head()

# Drop the id / CustomerId / Surname columns in place, then run the basic
# sanity checks: missing values, duplicated rows, cardinality per column.
train_df.drop(columns=['id', 'CustomerId', 'Surname'], inplace=True)
train_df.head()
train_df.isnull().sum()
train_df.duplicated().sum()
train_df.nunique()

# Frequency of each Geography value.
train_df['Geography'].value_counts()

# One-hot encode the two categorical columns as 0/1 integers; drop_first
# removes one indicator per column.
df_encoded = pd.get_dummies(
    train_df,
    columns=['Gender', 'Geography'],
    drop_first=True,
    dtype=int,
)
df_encoded.info()

# From here on train_df refers to the encoded frame.
train_df = df_encoded
train_df.head()

# Target column used to group the plots, and the numeric features to inspect.
category_col = 'Exited'
numeric_cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'EstimatedSalary']

# One boxplot per numeric feature, split by the target value.
for col in numeric_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(data=train_df, x=category_col, y=col, ax=ax)
    ax.set_title(f'Boxplot of {col} by {category_col}')
    ax.set_xlabel(category_col)
    ax.set_ylabel(col)
    fig.tight_layout()
    plt.show()
def remove_outliers_iqr(df, columns):
    """Return the rows of *df* that fall inside the 1.5*IQR fences.

    The columns are processed one after another: for each column the
    quartiles are computed on the rows that survived the previous columns,
    and rows outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` (bounds inclusive) are
    dropped. Rows whose value is NaN in a processed column are dropped as
    well, because NaN fails both comparisons.

    Args:
        df: Input DataFrame; it is not modified.
        columns: Iterable of column names to filter on, in order.

    Returns:
        The filtered DataFrame (``df`` itself when ``columns`` is empty).
    """
    filtered = df
    for name in columns:
        series = filtered[name]
        first_quartile = series.quantile(0.25)
        third_quartile = series.quantile(0.75)
        whisker = 1.5 * (third_quartile - first_quartile)
        # between() is inclusive on both ends, i.e. lower <= value <= upper.
        in_range = series.between(first_quartile - whisker, third_quartile + whisker)
        filtered = filtered[in_range]
    return filtered
# Filter the numeric features with the IQR rule and report how many rows remain.
df_clean = remove_outliers_iqr(train_df, numeric_cols)
print(f"Before: {train_df.shape[0]} rows, After: {df_clean.shape[0]} rows")

# Redraw the per-feature boxplots on the filtered frame for comparison.
for col in numeric_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(data=df_clean, x=category_col, y=col, ax=ax)
    ax.set_title(f'Boxplot of {col} by {category_col}')
    ax.set_xlabel(category_col)
    ax.set_ylabel(col)
    fig.tight_layout()
    plt.show()
# Pairwise correlations between the numeric columns of train_df, as a heatmap.
corr = train_df.corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap", fontsize=14)
plt.show()

# Scatter matrix of four features coloured by the target, KDE on the diagonal.
sns.pairplot(
    train_df,
    vars=['CreditScore', 'Balance', 'Age', 'EstimatedSalary'],
    hue='Exited',
    diag_kind='kde',
    palette='coolwarm',
)
plt.suptitle("Pairplot of Features w.r.t. Target (Exited)", y=1.02, fontsize=14)
plt.show()

# Separate the feature matrix from the target column.
X = train_df.drop('Exited', axis=1)
y = train_df['Exited']

# Standardise the numeric features with StandardScaler.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fix: the original referenced X_res / y_res, which are never defined in this
# file (NameError). Scale and split the X / y built above instead.
# NOTE(review): the scaler is fit on all rows before the split, so validation
# rows contribute to the scaling statistics; kept this way so `scaler` and
# `X_scaled` stay available to the later cells that use them.
X_scaled = scaler.fit_transform(X)
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# random_state pins the forest so the reported metrics are reproducible,
# matching the seed already used for the split.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=30,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    random_state=42,
)
model.fit(X_train, y_train)

# Hard class labels for the threshold-based metrics.
y_pred = model.predict(X_val)
# ROC-AUC is computed from scores, so use the predicted probability of the
# positive class rather than the 0/1 labels.
y_val_proba = model.predict_proba(X_val)[:, 1]

accuracy = accuracy_score(y_val, y_pred)
roc_auc = roc_auc_score(y_val, y_val_proba)
f1score = f1_score(y_val, y_pred)
conf_matrix = confusion_matrix(y_val, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(f"ROC-AUC Score: {roc_auc:.4f}")
print(f"F1 Score: {f1score:.4f}")
print("Confusion Matrix:")
print(conf_matrix)

# rf=RandomForestClassifier(n_estimators=100, max_depth=30, min_samples_split=2, min_samples_leaf=1, max_features='sqrt',oob_score=True, random_state=42)
# rf.fit(X_scaled, y_res)

# Inspect the test set and keep its ids for the submission file before the
# id / CustomerId / Surname columns are dropped.
test_df.info()
test_ids = test_df['id']
test_df.drop(['id', 'CustomerId', 'Surname'], axis=1, inplace=True)
test_df.duplicated().sum()

# Apply the same one-hot encoding that was used on the training data.
df_encoded = pd.get_dummies(test_df, columns=['Gender', 'Geography'], drop_first=True, dtype=int)
df_encoded.info()
X_test = df_encoded
X_test.info()

# Reuse the scaler fitted on the training features.
X_test_scaled = scaler.transform(X_test)
# Fix: the original called rf.predict_proba, but `rf` only appears in
# commented-out code (NameError). Use the fitted `model` instead.
y_pred = model.predict_proba(X_test_scaled)

# Column 1 of predict_proba is the probability of the second class in
# model.classes_ (the positive class for a 0/1 target).
sub_df = pd.DataFrame({
    'id': test_ids,
    'Exited': y_pred[:, 1]
})
sub_df.to_csv('submission_Bank.csv', index=False, float_format='%.6f')