# 1. Load data

In [214]:
import pandas as pd

df = pd.read_csv('assets/BankChurners.csv')

clientnums_df = df[['CLIENTNUM']]

df.columns

Index(['CLIENTNUM', 'Attrition_Flag', 'Customer_Age', 'Gender',
       'Dependent_count', 'Education_Level', 'Marital_Status',
       'Income_Category', 'Card_Category', 'Months_on_book',
       'Total_Relationship_Count', 'Months_Inactive_12_mon',
       'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
       'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
       'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio',
       'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
       'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'],
      dtype='object')

# 2. Data Cleaning

In [215]:
df.drop([
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
    'CLIENTNUM'],
    axis=1,
    inplace=True)

In [216]:
education_level = {
    'Uneducated':0, 'High School':1, 'College':2, 'Graduate':3,
    'Post-Graduate':4, 'Doctorate':5, 'Unknown':6
    }
df['Education_Level'] = df['Education_Level'].replace(education_level)

marital_status = {'Single':0, 'Married':1, 'Divorced':2, 'Unknown':3}
df['Marital_Status'] = df['Marital_Status'].replace(marital_status)

income_category = {
    'Less than $40K': 0, '$40K - $60K': 1, '$60K - $80K': 2,
    '$80K - $120K': 3, '$120K +': 4, 'Unknown': 5
    }
df['Income_Category'] = df['Income_Category'].replace(income_category)

attrition_flag = {'Existing Customer': 0, 'Attrited Customer': 1}
df['Attrition_Flag'] = df['Attrition_Flag'].replace(attrition_flag)

gender = {'F': 0, 'M': 1}
df['Gender'] = df['Gender'].replace(gender)

card_category = {'Blue': 0, 'Silver': 1, 'Gold': 2, 'Platinum': 3}
df['Card_Category'] = df['Card_Category'].replace(card_category)

  df['Education_Level'] = df['Education_Level'].replace(education_level)
  df['Marital_Status'] = df['Marital_Status'].replace(marital_status)
  df['Income_Category'] = df['Income_Category'].replace(income_category)
  df['Attrition_Flag'] = df['Attrition_Flag'].replace(attrition_flag)
  df['Gender'] = df['Gender'].replace(gender)
  df['Card_Category'] = df['Card_Category'].replace(card_category)


# 3. Data Preprocessing Pipeline (TBD)

# 4. Prepare Data for Training and Testing

### 4.1 Split features and target

In [217]:
X = df.drop(['Attrition_Flag'], axis=1)
y = df['Attrition_Flag']

### 4.2 Handling Imbalanced Data

In [218]:
from imblearn.over_sampling import SMOTE

In [219]:
X_res, y_res = SMOTE().fit_resample(X, y)

### 4.3 Split Data

In [220]:
from sklearn.model_selection import train_test_split

In [221]:
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# 5. Model Building

### 5.1. Random Forest Classifier

In [222]:
from sklearn.ensemble import RandomForestClassifier

In [223]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# 6. Model Evaluation

In [224]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [225]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print('-----')
print("Classification Report:\n", classification_report(y_test, y_pred))
print('-----')
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9791176470588235
-----
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98      1676
           1       0.97      0.99      0.98      1724

    accuracy                           0.98      3400
   macro avg       0.98      0.98      0.98      3400
weighted avg       0.98      0.98      0.98      3400

-----
Confusion Matrix:
 [[1625   51]
 [  20 1704]]


In [226]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print('-----')
print("Classification Report:\n", classification_report(y_test, y_pred))
print('-----')
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9791176470588235
-----
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98      1676
           1       0.97      0.99      0.98      1724

    accuracy                           0.98      3400
   macro avg       0.98      0.98      0.98      3400
weighted avg       0.98      0.98      0.98      3400

-----
Confusion Matrix:
 [[1625   51]
 [  20 1704]]


In [227]:
import numpy as np
y_pred_proba = model.predict_proba(X_test)

churn_probabilities = y_pred_proba[:, 1]
high_churn_probabilities = churn_probabilities > 0.5
num_high_churn_samples = np.sum(high_churn_probabilities)
print("Number of samples with churn probability > 50%:", num_high_churn_samples)

Number of samples with churn probability > 50%: 1755


In [228]:
X_test.shape

(3400, 19)

In [229]:
attrited_customer_prediction_total = model.predict(X)
print(f'Accuracy: {accuracy_score(y, attrited_customer_prediction_total)}')
print("-----")
print("Classification Report:\n", classification_report(y, attrited_customer_prediction_total))
print("-----")
print("Confusion Matrix:\n", confusion_matrix(y, attrited_customer_prediction_total))

Accuracy: 0.9938777525427076
-----
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00      8500
           1       0.97      0.99      0.98      1627

    accuracy                           0.99     10127
   macro avg       0.98      0.99      0.99     10127
weighted avg       0.99      0.99      0.99     10127

-----
Confusion Matrix:
 [[8449   51]
 [  11 1616]]


In [230]:
import numpy as np
y_pred_proba = model.predict_proba(X)

churn_probabilities = y_pred_proba[:, 1]
high_churn_probabilities = churn_probabilities > 0.5
num_high_churn_samples = np.sum(high_churn_probabilities)
print("Number of samples with churn probability > 50%:", num_high_churn_samples)

Number of samples with churn probability > 50%: 1667


# Finding potential attrited customers


In [231]:
df_not_attrited_customers = df[df['Attrition_Flag'] == 0]

In [232]:
df_not_attrited_customers.Attrition_Flag.value_counts()

Attrition_Flag
0    8500
Name: count, dtype: int64

In [233]:
X_not_attrited = df_not_attrited_customers.drop(['Attrition_Flag'], axis=1)
y_not_attrited = df_not_attrited_customers['Attrition_Flag']

In [234]:
import numpy as np
y_pred_proba = model.predict_proba(X_not_attrited)

churn_probabilities = y_pred_proba[:, 1]
high_churn_probabilities = churn_probabilities > 0.50
num_high_churn_samples = np.sum(high_churn_probabilities)
print("Number of samples with churn probability > 50%:", num_high_churn_samples)

Number of samples with churn probability > 50%: 51


In [235]:
clientnums = clientnums_df['CLIENTNUM']
probas = model.predict_proba(X_not_attrited)
result_dict = {clientnum: proba[1] for clientnum, proba in zip(clientnums, probas)}

In [236]:
print(result_dict)

{768805383: 0.02, 818770008: 0.02, 713982108: 0.07, 769911858: 0.09, 709106358: 0.21, 713061558: 0.01, 810347208: 0.02, 818906208: 0.18, 710930508: 0.04, 719661558: 0.01, 708790833: 0.03, 710821833: 0.0, 710599683: 0.07, 816082233: 0.0, 712396908: 0.09, 714885258: 0.01, 709967358: 0.02, 753327333: 0.03, 806160108: 0.1, 709327383: 0.04, 806165208: 0.0, 708508758: 0.11, 784725333: 0.2, 811604133: 0.1, 789124683: 0.01, 771071958: 0.05, 720466383: 0.05, 804424383: 0.18, 718813833: 0.04, 806624208: 0.11, 778348233: 0.18, 712991808: 0.01, 709029408: 0.0, 788658483: 0.01, 787937058: 0.12, 715318008: 0.0, 713962233: 0.23, 785432733: 0.12, 715190283: 0.39, 708300483: 0.08, 827111283: 0.03, 758551608: 0.05, 773146383: 0.18, 778493808: 0.08, 720572508: 0.01, 712661433: 0.02, 789172683: 0.02, 738406533: 0.03, 799723908: 0.01, 771490833: 0.0, 720756708: 0.23, 779471883: 0.07, 711525033: 0.14, 712813458: 0.0, 714374133: 0.01, 717891558: 0.04, 716632758: 0.05, 768563658: 0.01, 711427458: 0.63, 714091

In [237]:
result = dict((k, v) for k, v in result_dict.items() if v > 0.6)

In [238]:
print(result)

{711427458: 0.63, 712454058: 0.66, 778867533: 0.82, 807226683: 0.96, 721347408: 0.77, 771492408: 0.92, 719788833: 0.7, 714306258: 0.66, 721129383: 0.82, 709624533: 0.67, 714244383: 0.83, 711848658: 0.85, 789611283: 0.63, 719358333: 0.74, 712820208: 0.7, 711258108: 0.86, 739117233: 0.61, 713769258: 0.88, 768928833: 0.81, 720532908: 0.77, 716670783: 0.62, 714345783: 0.89, 778693458: 0.61, 714840858: 0.74, 718373658: 0.65, 721023708: 0.78, 716154258: 0.62, 779833908: 0.67, 710248233: 0.74, 717655458: 0.62, 796330983: 0.67, 715659333: 0.63, 715356258: 0.74, 715581633: 0.62, 709714008: 0.62}


# Finding attrited customers who are likely to become customers again


In [239]:
df_attrited_customers = df[df['Attrition_Flag'] == 1]

In [240]:
df_attrited_customers.Attrition_Flag.value_counts()

Attrition_Flag
1    1627
Name: count, dtype: int64

In [241]:
X_attrited = df_attrited_customers.drop(['Attrition_Flag'], axis=1)
y_attrited = df_attrited_customers['Attrition_Flag']

In [242]:
import numpy as np
y_pred_proba = model.predict_proba(X_attrited)

churn_probabilities = y_pred_proba[:, 0]
high_churn_probabilities = churn_probabilities > 0.5
num_high_churn_samples = np.sum(high_churn_probabilities)
print("Number of samples with churn probability > 50%:", num_high_churn_samples)

Number of samples with churn probability > 50%: 11


TEST