# library preparation 

In [51]:
!pip install pycaret

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Data source:
https://www.kaggle.com/datasets/thedevastator/predicting-credit-card-customer-attrition-with-m?select=BankChurners.csv

In [52]:
pip install opendatasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [53]:
# Download the Kaggle dataset into a folder under the current working directory.
# The recorded output shows the download is skipped when the files already exist.
import opendatasets as od
od.download(
    "https://www.kaggle.com/datasets/thedevastator/predicting-credit-card-customer-attrition-with-m")

Skipping, found downloaded files in "./predicting-credit-card-customer-attrition-with-m" (use force=True to force download)


In [54]:
# Stop the kernel after the installs; the execution counter restarting at In [1] in the next cell
# shows the rest of the notebook ran on a fresh kernel.
# NOTE(review): this halts "Run All" at this cell — presumably needed so the newly installed
# packages are picked up; confirm it is still required.
exit()

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

# 1. Exploratory data analysis

In [2]:
# Load the churn dataset from the folder that od.download() creates in the working directory.
# A relative path matches where the download is written and keeps the notebook runnable
# outside Colab, where the working directory is not /content.
df = pd.read_csv('predicting-credit-card-customer-attrition-with-m/BankChurners.csv')

In [None]:
# Keep an independent copy of the raw frame for the PyCaret sections, since df itself is renamed below.
df_pycaret = df.copy()

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Shorten the two very long Naive Bayes classifier column names to 'nb1' and 'nb2'.
short_names = {
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1': 'nb1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2': 'nb2',
}
df = df.rename(columns=short_names)

In [None]:
# Preview the frame again after the column rename.
df.head()

In [None]:
# Summarise the columns: dtypes and non-null counts.
df.info()

In [None]:
# Plot a histogram for every numerical column of the full dataset.
df.hist(figsize = (15,15))

In [None]:
# Plot the frequency of each level for every categorical (object-dtype) column of the full dataset.
categorical_features = df.dtypes[df.dtypes == object].index
for col in categorical_features:
  fig, ax = plt.subplots(figsize = (7,3))
  # seaborn >= 0.12 accepts only `data` positionally, so the column name must be passed as `x=`;
  # passing it positionally collides with the `data=` keyword and raises a TypeError.
  sns.countplot(x = col, data = df, color = 'lightblue', ax = ax)
  plt.show()

In [None]:
# Separate the customers into attrited and existing groups; this frame holds the attrited customers.
attrition = df.loc[df['Attrition_Flag'] == 'Attrited Customer']

In [None]:
# Rows whose Attrition_Flag marks the customer as still existing.
exist = df.loc[df['Attrition_Flag'] == 'Existing Customer']

In [None]:
# Summary statistics of the attrited customers.
attrition.describe()

In [None]:
# Summary statistics of the existing customers, for comparison with the attrited group.
exist.describe()

In [None]:
# Histograms of the numerical columns for attrited customers only.
attrition.hist(figsize = (15,15))

In [None]:
# Histograms of the numerical columns for existing customers only.
exist.hist(figsize = (15,15))

In [None]:
# Frequency of each categorical level among attrited customers, bars ordered from most to least common.
categorical_features = df.dtypes[df.dtypes == object].index
for col in categorical_features:
  fig, ax = plt.subplots(figsize = (7,3))
  level_order = attrition[col].value_counts().index
  # seaborn >= 0.12 accepts only `data` positionally, so the column name must be passed as `x=`.
  sns.countplot(x = col, data = attrition, color = 'lightblue', order = level_order, ax = ax)
  plt.show()

In [None]:
# Frequency of each categorical level among existing customers, bars ordered from most to least common.
categorical_features = df.dtypes[df.dtypes == object].index
for col in categorical_features:
  fig, ax = plt.subplots(figsize = (7,3))
  level_order = exist[col].value_counts().index
  # seaborn >= 0.12 accepts only `data` positionally, so the column name must be passed as `x=`.
  sns.countplot(x = col, data = exist, color = 'lightblue', order = level_order, ax = ax)
  plt.show()

In [None]:
# Most frequent income category within each education level.
df.groupby(by = 'Education_Level')[['Income_Category']].agg(pd.Series.mode)

In [None]:
# Distinct education levels, in order of first appearance, used to drive the per-level plots below.
educationLevel = df['Education_Level'].unique().tolist()

In [None]:
# One horizontal bar chart per education level showing how its customers are spread over the income categories.
plt.figure(figsize = (50,100))
for i, education in enumerate(educationLevel):
  ax = plt.subplot(10, 3, i+1)
  # Count on the Series rather than a one-column DataFrame: DataFrame.value_counts() returns a
  # MultiIndex, which makes the tick labels render as 1-tuples instead of plain category names.
  income_counts = df.loc[df['Education_Level'] == education, 'Income_Category'].value_counts()
  income_counts.plot(kind = 'barh', fontsize = 30, ax = ax)
  ax.set_title(education, fontsize = 30)
  ax.set_ylabel('Income Category', fontsize = 30)
plt.tight_layout()
plt.show()

In [None]:
# Most frequent income category within each card category.
df.groupby(by = 'Card_Category')[['Income_Category']].agg(pd.Series.mode)

In [None]:
# Distinct card categories, in order of first appearance, used to drive the per-card plots below.
CardCat = df['Card_Category'].unique().tolist()

In [None]:
# One horizontal bar chart per card category showing how its holders are spread over the income categories.
plt.figure(figsize = (50,100))
for i, card in enumerate(CardCat):
  ax = plt.subplot(10, 3, i+1)
  # Count on the Series rather than a one-column DataFrame: DataFrame.value_counts() returns a
  # MultiIndex, which makes the tick labels render as 1-tuples instead of plain category names.
  income_counts = df.loc[df['Card_Category'] == card, 'Income_Category'].value_counts()
  income_counts.plot(kind = 'barh', fontsize = 30, ax = ax)
  ax.set_title(card, fontsize = 30)
  ax.set_ylabel('Income Category', fontsize = 30)
plt.tight_layout()
plt.show()

In [None]:
# Histogram of the total revolving balance, drawn separately for each card category.
for card in CardCat:
  holders = df.loc[df['Card_Category'] == card, ['Total_Revolving_Bal']]
  ax = holders.plot(kind = 'hist', legend = False)
  ax.set_title(card, fontsize = 15)
  ax.set_ylabel('Frequency', fontsize = 10)
  ax.set_xlabel('Total_Revolving_Bal', fontsize = 10)
  plt.tight_layout()
  plt.show()

In [None]:
# Working copy without the customer identifier and the two renamed Naive Bayes columns.
df2 = df.drop(columns = ['CLIENTNUM', 'nb1', 'nb2'])

In [None]:
# Check the remaining columns and their dtypes after the drop.
df2.info()

In [None]:
# Annotated correlation heatmap of the numeric features.
fig = plt.figure(figsize = (12,10))
# df2 still contains string columns (gender, education, ...). pandas >= 2.0 no longer drops them
# silently inside corr() and raises instead, so restrict the frame to numeric columns first.
numeric_corr = df2.select_dtypes(include = 'number').corr()
chart = sns.heatmap(numeric_corr, vmin = -1, vmax = 1, annot = True)
chart.set_xticklabels(chart.get_xticklabels(), rotation=45)
chart.set_yticklabels(chart.get_yticklabels(), rotation=0)
plt.show()

# 2. Preprocessing and Modeling using PyCaret 
- Clustering group of customers
- Classification for attrition prediction 

### 2.1 Clustering

In [None]:
from pycaret.clustering import *

In [None]:
# Preview the untouched copy of the raw data that is handed to PyCaret.
df_pycaret.head()

In [None]:
# Configure the PyCaret clustering experiment on the untouched copy of the raw data.
# NOTE(review): normalize=False keeps every feature on its original scale; the model fitted below
# is k-means, which is distance based, so large-valued columns may dominate — confirm this is intended.

# Identifier and the two Naive Bayes classifier columns are excluded from the features.
ignored_columns = [
    'CLIENTNUM',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
]
# Small-integer count columns that are explicitly declared as numeric.
count_columns = ['Dependent_count', 'Total_Relationship_Count', 'Months_Inactive_12_mon', 'Contacts_Count_12_mon']
nominal_columns = ['Gender', 'Marital_Status', 'Card_Category']
# Level order for the ordinal columns; 'Unknown' is placed last in both.
ordinal_levels = {
    'Education_Level': ['Uneducated', 'High School', 'College', 'Graduate',
                        'Post-Graduate', 'Doctorate', 'Unknown'],
    'Income_Category': ['Less than $40K', '$40K - $60K', '$60K - $80K', '$80K - $120K', '$120K +', 'Unknown'],
}

s = setup(
    df_pycaret,
    normalize = False,
    ignore_features = ignored_columns,
    numeric_features = count_columns,
    categorical_features = nominal_columns,
    ordinal_features = ordinal_levels,
    session_id = 123,
)

In [None]:
# Fit a k-means clustering model using the experiment configured by setup().
kmeans = create_model('kmeans')

In [None]:
# Show the fitted estimator and its parameters.
print(kmeans)

In [None]:
# Open PyCaret's evaluation view for the clustering model.
evaluate_model(kmeans)

In [None]:
# List the metrics registered for the current experiment.
get_metrics()

In [None]:
# Attach the cluster assignment of each row to the data (the 'Cluster' column used below) and preview it.
kmean_results = assign_model(kmeans)
kmean_results.head()

In [None]:
# Save the fitted clustering model under the name 'clustering_model_k4'.
save_model(kmeans, 'clustering_model_k4')

In [None]:
# Scatter of customer age against average open-to-buy, coloured by the assigned cluster.
# (The previously defined `plot_kws` dict was never passed to any call and has been removed.)
sns.scatterplot(data = kmean_results, x = 'Customer_Age', y = 'Avg_Open_To_Buy', hue = 'Cluster')
plt.show()

CLIENTNUM: Unique identifier for each customer. (Integer)
Attrition_Flag: Flag indicating whether or not the customer has churned out. (String: 'Attrited Customer' or 'Existing Customer')
Customer_Age: Age of customer. (Integer)
Gender: Gender of customer. (String)
Dependent_count: Number of dependents that customer has. (Integer)
Education_Level: Education level of customer. (String)
Marital_Status: Marital status of customer. (String)
Income_Category: Income category of customer. (String)
Card_Category: Type of card held by customer. (String)
Months_on_book: How long customer has been on the books. (Integer)
Total_Relationship_Count: Total number of relationships customer has with the credit card provider. (Integer)
Months_Inactive_12_mon: Number of months customer has been inactive in the last twelve months. (Integer)
Contacts_Count_12_mon: Number of contacts customer has had in the last twelve months. (Integer)
Credit_Limit: Credit limit of customer. (Integer)
Total_Revolving_Bal: Total revolving balance of customer. (Integer)
Avg_Open_To_Buy: Average open to buy ratio of customer. (Integer)
Total_Amt_Chng_Q4_Q1: Total amount changed from quarter 4 to quarter 1. (Integer)
Total_Trans_Amt: Total transaction amount. (Integer)
Total_Trans_Ct: Total transaction count. (Integer)
Total_Ct_Chng_Q4_Q1: Total count changed from quarter 4 to quarter 1. (Integer)
Avg_Utilization_Ratio: Average utilization ratio of customer. (Integer)
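Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1: Naive Bayes classifier for predicting whether or not someone will churn based on characteristics such as card category, contacts count, dependent count, education level and months inactive (as listed in the column name).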

In [None]:
# Explore the characteristics of each cluster: most frequent education level, marital status,
# income category and card category per cluster.
profile_columns = ['Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category']
kmean_results.groupby(['Cluster'])[profile_columns].agg(pd.Series.mode)

In [None]:
# Most frequent education level, marital status and income category for every cluster / card category pair.
profile_columns = ['Education_Level', 'Marital_Status', 'Income_Category']
by_cluster_and_card = kmean_results.groupby(['Cluster', 'Card_Category'])
by_cluster_and_card[profile_columns].agg(pd.Series.mode)

In [None]:
# Most frequent marital status and income category for every cluster / education level pair.
profile_columns = ['Marital_Status', 'Income_Category']
by_cluster_and_education = kmean_results.groupby(['Cluster', 'Education_Level'])
by_cluster_and_education[profile_columns].agg(pd.Series.mode)

In [None]:
# Average of the main numeric features for every cluster / marital status pair.
numeric_columns = ['Customer_Age', 'Dependent_count', 'Total_Relationship_Count', 'Credit_Limit',
                   'Avg_Open_To_Buy', 'Total_Trans_Amt',
                   'Total_Trans_Ct', 'Avg_Utilization_Ratio']
# Call the built-in groupby mean directly: passing the callable pd.Series.mean to agg() yields the
# same result but emits a FutureWarning on pandas >= 2.1.
kmean_results.groupby(['Cluster', 'Marital_Status'])[numeric_columns].mean()

From the data each customer cluster has characteristics as following;
- Cluster 0: Most are married with low income, even though they have varied educational levels. They have low purchasing power and a low credit score (utilization ratio).  

- Cluster 1: Most are single with high income. They have high purchasing power and great credit score. This group might represent upper class people or upper-middle class.

- Cluster 2: Most are married with low income. Although they do not have much income, their average credit limit and credit score suggest that they manage money better than the cluster 0 group. 

- Cluster 3: Most are married with high income. This group might represent middle to upper-middle class customers who have moderate purchasing power and a good credit score. 

### 2.2 Classification 

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state keeps the split reproducible.
df_pycaret_train, df_pycaret_test = train_test_split(
    df_pycaret,
    test_size = 0.2,
    random_state = 42,
)

In [None]:
# Classification for attrition prediction.
# NOTE(review): this star import follows the earlier one from pycaret.clustering, so names exported
# by both modules (e.g. setup, evaluate_model) refer to the classification versions from here on.
from pycaret.classification import *

# Identifier and the two Naive Bayes classifier columns are excluded from the features.
ignored_columns = [
    'CLIENTNUM',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
]
# Small-integer count columns that are explicitly declared as numeric.
count_columns = ['Dependent_count', 'Total_Relationship_Count', 'Months_Inactive_12_mon', 'Contacts_Count_12_mon']
nominal_columns = ['Gender', 'Marital_Status', 'Card_Category']
# Level order for the ordinal columns; 'Unknown' is placed last in both.
ordinal_levels = {
    'Education_Level': ['Uneducated', 'High School', 'College', 'Graduate',
                        'Post-Graduate', 'Doctorate', 'Unknown'],
    'Income_Category': ['Less than $40K', '$40K - $60K', '$60K - $80K', '$80K - $120K', '$120K +', 'Unknown'],
}

# Set up the experiment on the training split only, with Attrition_Flag as the target.
s2 = setup(
    data = df_pycaret_train,
    target = 'Attrition_Flag',
    ignore_features = ignored_columns,
    numeric_features = count_columns,
    categorical_features = nominal_columns,
    ordinal_features = ordinal_levels,
    session_id = 123,
)

In [None]:
# Compare the candidate classifiers with PyCaret and keep the returned model as `best`.
best = compare_models()

In [None]:
# Show which estimator was selected and its parameters.
print(best)

In [None]:
# Open PyCaret's evaluation view for the selected classifier.
evaluate_model(best)

In [None]:
# Score the selected model without passing data.
# NOTE(review): presumably this uses the hold-out split that setup() carved out of the training data — confirm.
predict_model(best)

In [None]:
# Predict on the 20% test split that was held back before setup(); raw_score=True is requested
# so that the scores are returned alongside the predicted label.
predictions = predict_model(best, data=df_pycaret_test, raw_score=True)
predictions.head()