# 1. Load data

In [96]:
import pandas as pd

df = pd.read_csv('assets/BankChurners.csv')

df.columns

Index(['CLIENTNUM', 'Attrition_Flag', 'Customer_Age', 'Gender',
       'Dependent_count', 'Education_Level', 'Marital_Status',
       'Income_Category', 'Card_Category', 'Months_on_book',
       'Total_Relationship_Count', 'Months_Inactive_12_mon',
       'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
       'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
       'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio',
       'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
       'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'],
      dtype='object')

# 2. Data Cleaning

In [97]:
df.drop([
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
    'CLIENTNUM'],
    axis=1,
    inplace=True)

In [98]:
education_level = {
    'Uneducated':0, 'High School':1, 'College':2, 'Graduate':3,
    'Post-Graduate':4, 'Doctorate':5, 'Unknown':6
    }
df['Education_Level'] = df['Education_Level'].replace(education_level)

marital_status = {'Single':0, 'Married':1, 'Divorced':2, 'Unknown':3}
df['Marital_Status'] = df['Marital_Status'].replace(marital_status)

income_category = {
    'Less than $40K': 0, '$40K - $60K': 1, '$60K - $80K': 2,
    '$80K - $120K': 3, '$120K +': 4, 'Unknown': 5
    }
df['Income_Category'] = df['Income_Category'].replace(income_category)

attrition_flag = {'Existing Customer': 0, 'Attrited Customer': 1}
df['Attrition_Flag'] = df['Attrition_Flag'].replace(attrition_flag)

gender = {'F': 0, 'M': 1}
df['Gender'] = df['Gender'].replace(gender)

card_category = {'Blue': 0, 'Silver': 1, 'Gold': 2, 'Platinum': 3}
df['Card_Category'] = df['Card_Category'].replace(card_category)

  df['Education_Level'] = df['Education_Level'].replace(education_level)
  df['Marital_Status'] = df['Marital_Status'].replace(marital_status)
  df['Income_Category'] = df['Income_Category'].replace(income_category)
  df['Attrition_Flag'] = df['Attrition_Flag'].replace(attrition_flag)
  df['Gender'] = df['Gender'].replace(gender)
  df['Card_Category'] = df['Card_Category'].replace(card_category)


# 3. Data Preprocessing Pipeline

# 4. Prepare Data for Training and Testing

### 4.1 Split features and target

In [99]:
X = df.drop(['Attrition_Flag'], axis=1)
y = df['Attrition_Flag']

### 4.2 Handling Imbalanced Data

In [100]:
from imblearn.over_sampling import SMOTE

In [101]:
X_res, y_res = SMOTE().fit_resample(X, y)

### 4.3 Split Data

In [102]:
from sklearn.model_selection import train_test_split

In [103]:
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# 5. Model Building

### 5.1. Random Forest Classifier

In [104]:
from sklearn.ensemble import RandomForestClassifier

In [106]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# 6. Model Evaluation

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print('-----')
print("Classification Report:\n", classification_report(y_test, y_pred))
print('-----')
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9615004935834156
-----
Classification Report:
                    precision    recall  f1-score   support

Attrited Customer       0.93      0.82      0.87       325
Existing Customer       0.97      0.99      0.98      1701

         accuracy                           0.96      2026
        macro avg       0.95      0.90      0.92      2026
     weighted avg       0.96      0.96      0.96      2026

-----
Confusion Matrix:
 [[ 266   59]
 [  19 1682]]


In [107]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print('-----')
print("Classification Report:\n", classification_report(y_test, y_pred))
print('-----')
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9802941176470589
-----
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98      1676
           1       0.97      0.99      0.98      1724

    accuracy                           0.98      3400
   macro avg       0.98      0.98      0.98      3400
weighted avg       0.98      0.98      0.98      3400

-----
Confusion Matrix:
 [[1626   50]
 [  17 1707]]


In [110]:
import numpy as np
y_pred_proba = model.predict_proba(X_test)

churn_probabilities = y_pred_proba[:, 1]
high_churn_probabilities = churn_probabilities > 0.9
num_high_churn_samples = np.sum(high_churn_probabilities)
print("Number of samples with churn probability > 90%:", num_high_churn_samples)

Number of samples with churn probability > 90%: 1319
