# 2. BankChurn - Imbalanced Sampling Evaluation

In [11]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import  accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE


df = pd.read_csv('data/BankChurn_le.csv')
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 500)

print(df['Attrition_Flag'].value_counts() / len(df['Attrition_Flag']), '\n')
df.info()
df.head()

0    0.83934
1    0.16066
Name: Attrition_Flag, dtype: float64 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Attrition_Flag            10127 non-null  int64  
 1   Customer_Age              10127 non-null  int64  
 2   Gender                    10127 non-null  int64  
 3   Dependent_count           10127 non-null  int64  
 4   Education_Level           10127 non-null  int64  
 5   Marital_Status            10127 non-null  int64  
 6   Income_Category           10127 non-null  int64  
 7   Card_Category             10127 non-null  int64  
 8   Months_on_book            10127 non-null  int64  
 9   Total_Relationship_Count  10127 non-null  int64  
 10  Months_Inactive_12_mon    10127 non-null  int64  
 11  Contacts_Count_12_mon     10127 non-null  int64  
 12  Credit_Limit              10127 non-null  float64
 

Unnamed: 0,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,0,45,0,3,0,0,0,0,39,5,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061
1,0,49,1,5,1,1,1,0,44,6,1,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105
2,0,51,0,3,1,0,2,0,36,4,1,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.0
3,0,40,1,4,0,2,1,0,34,3,4,1,3313.0,2517,796.0,1.405,1171,20,2.333,0.76
4,0,40,0,3,2,0,0,0,21,5,1,0,4716.0,0,4716.0,2.175,816,28,2.5,0.0


No missing values but some categorical variables to take care of

In [12]:
# Minority Churn class is now 0
print(df['Attrition_Flag'].value_counts())

0    8500
1    1627
Name: Attrition_Flag, dtype: int64


### Reducing the variance in float columns 

In [13]:
df.select_dtypes(include=[int, float]).var().round()

Attrition_Flag                     0.0
Customer_Age                      64.0
Gender                             0.0
Dependent_count                    2.0
Education_Level                    3.0
Marital_Status                     1.0
Income_Category                    2.0
Card_Category                      0.0
Months_on_book                    64.0
Total_Relationship_Count           2.0
Months_Inactive_12_mon             1.0
Contacts_Count_12_mon              1.0
Credit_Limit                82605861.0
Total_Revolving_Bal           664204.0
Avg_Open_To_Buy             82640560.0
Total_Amt_Chng_Q4_Q1               0.0
Total_Trans_Amt             11540487.0
Total_Trans_Ct                   551.0
Total_Ct_Chng_Q4_Q1                0.0
Avg_Utilization_Ratio              0.0
dtype: float64

In [14]:
# Scaling data
scaler = StandardScaler()

# z = (x - u) / s
# where the sample/feature is x, u is the mean of the training samples or zero if with_mean=False, 
# and s is the standard deviation of the training samples or one if with_std=False.

cols_to_scale = ['Customer_Age', 'Months_on_book', 'Credit_Limit', 'Total_Revolving_Bal',
                 'Avg_Open_To_Buy', 'Total_Trans_Amt', 'Total_Trans_Ct']

df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])

df[cols_to_scale].var()

Customer_Age           1.000099
Months_on_book         1.000099
Credit_Limit           1.000099
Total_Revolving_Bal    1.000099
Avg_Open_To_Buy        1.000099
Total_Trans_Amt        1.000099
Total_Trans_Ct         1.000099
dtype: float64

In [15]:
df.to_csv('data/BankChurn_Processed.csv', index= False)

# Baseline Data Modelling

In [16]:
# Instantiating DecisionTreeClassifier 
dtc = DecisionTreeClassifier()

In [17]:
#Creating input/output variables
X = df.drop('Attrition_Flag', axis='columns')
y = df['Attrition_Flag']

In [18]:
# Splitting the df
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=32, stratify=y)
print(y_train.value_counts())
print(y_test.value_counts())

0    5695
1    1090
Name: Attrition_Flag, dtype: int64
0    2805
1     537
Name: Attrition_Flag, dtype: int64


In [19]:
# Creating a simple function to load the model 

def run_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_preds = model.predict(X_test)
    score = accuracy_score(y_test, y_preds)
    cm = confusion_matrix(y_test, y_preds, labels = [0, 1])  

    print('Accuracy Score:', score, '\n')
    print(cm)

In [20]:
# Calling the custom function
run_model(dtc, X_train, y_train, X_test, y_test)

Accuracy Score: 0.9356672651107122 

[[2701  104]
 [ 111  426]]


## Imbalanced Data: Undersampling Majority Class
Balancing the data by reducing the majority class count, the downside less training data for the model learn from, not ideal with an untuned model. Nevertheless let's try it.

In [21]:
# Creating class filters
df_class_0 = df[df['Attrition_Flag'] == 0]
df_class_1 = df[df['Attrition_Flag'] == 1]

count_class_1, count_class_0 = df['Attrition_Flag'].value_counts()
print(count_class_0, count_class_1)

1627 8500


In [22]:
# Balanced target variable dataframe
df_class_1_under = df_class_1.sample(count_class_0)
df_balanced_us = pd.concat([df_class_1_under, df_class_0], axis='rows')

df_balanced_us['Attrition_Flag'].value_counts()

0    8500
1    1627
Name: Attrition_Flag, dtype: int64

In [23]:
# Splitting the balanced df into train test
X = df_balanced_us.drop('Attrition_Flag', axis='columns')
y = df_balanced_us['Attrition_Flag']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state = 32, stratify = y)

In [24]:
# Calling the custom function to load and score the model
run_model(dtc, X_train, y_train, X_test, y_test)

Accuracy Score: 0.9347695990424896 

[[2697  108]
 [ 110  427]]


## Imbalanced Data: Oversampling Minority Class
Balancing the data by increasing the minority class count, this increases the liklihood over overfitting the model. Nevertheless let's try it.



In [25]:
# Calling the class filters from earlier and reversing the order
df_class_0_over = df_class_0.sample(count_class_1, replace = True)
df_balanced_os = pd.concat([df_class_0_over, df_class_1], axis = 'rows')

print('df_balanced_os shape', df_balanced_os.shape)

df_balanced_os shape (10127, 20)


In [26]:
# Splitting the balanced df into train test
X = df_balanced_os.drop('Attrition_Flag', axis = 'columns')
y = df_balanced_os['Attrition_Flag']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 32, stratify = y)

In [27]:
run_model(dtc, X_train, y_train, X_test, y_test)

Accuracy Score: 0.957211250748055 

[[2749   56]
 [  87  450]]


## Imbalanced Data: SMOTE
Using Imbalanced Learns synthetic minority oversampling technique to create synthtic copies of the minority class and bring the class count in line with the majority. This alleviates overfitting caused by random oversampling as synthetic examples are generated rather than replication of instances and theres no loss of information.

However, while generating synthetic examples, SMOTE does not take into consideration neighboring examples can be from other classes. This can increase the overlapping of classes and can introduce additional noise. Also, SMOTE is not  practical for high dimensional data.

In [28]:
X = df.drop('Attrition_Flag', axis='columns')
y = df['Attrition_Flag']

In [29]:
# Instatiating the imblearn library 
# Fitting the data to the model and X, y
smote = SMOTE(sampling_strategy = 'minority')
X_sm, y_sm = smote.fit_sample(X,y)

y_sm.value_counts()

1    8500
0    8500
Name: Attrition_Flag, dtype: int64

In [30]:
# Splitting the data into train test
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size = 0.33, random_state = 32, stratify = y_sm)

In [31]:
# Running the custom function again this with the SMOTE df
run_model(dtc, X_train, y_train, X_test, y_test)

Accuracy Score: 0.9404634581105169 

[[2639  166]
 [ 168 2637]]


In [32]:
# Seems like the synthetic copies created are bleeding into the majority class. 
# As mentioned above SMOTE does not take into consideration neighboring examples can be from other classes

## Imbalanced Data: Ensemble with Undersampling
Undersampling the majority class in the training data by splitting it in proportion to the minority class and modelling each split against a test set. In essence breaking one imbalanced problem into many balanced problems. 

Then using a majority voting system has the potential to be more robust than simply undersampling the majority class.

In [33]:
# Regain Original features and labels
X = df.drop('Attrition_Flag', axis='columns')
y = df['Attrition_Flag'] 

In [34]:
# Split the data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 32, stratify = y)
y_train.shape, y_test.shape

((6785,), (3342,))

In [35]:
# Duplicate the train data for splitting
df_en = X_train.copy()
df_en['Attrition_Flag'] = y_train

In [36]:
# Creating class filters from the training set
df_en_class_0 = df_en[df_en.Attrition_Flag == 0]
df_en_class_1 = df_en[df_en.Attrition_Flag == 1]

In [37]:
df_en_class_0.shape, df_en_class_1.shape

((5695, 20), (1090, 20))

In [38]:
# Get X, y train by spliting the data and concatonating a better balance of classes 
def get_batches(majority, minority, start, end):
    df_train = pd.concat([majority[start:end], minority], axis='rows')
    X_train = df_train.drop('Attrition_Flag', axis='columns')
    y_train = df_train['Attrition_Flag']
    return X_train, y_train

In [39]:
# Sample 1
X_train, y_train = get_batches(df_en_class_1, df_en_class_0, 0, 1899)

dtc.fit(X_train, y_train)
y_preds1 = dtc.predict(X_test)

In [40]:
# Sample 2
X_train, y_train = get_batches(df_en_class_1, df_en_class_0, 1898, 3797)

dtc.fit(X_train, y_train)
y_preds2 = dtc.predict(X_test)

In [41]:
# Sample 3
X_train, y_train = get_batches(df_en_class_1, df_en_class_0, 3796, 5696)

dtc.fit(X_train, y_train)
y_preds3 = dtc.predict(X_test)

In [42]:
len(y_preds1), len(y_test)

(3342, 3342)

In [47]:
# Capturing the votes from each chunk
y_preds_final = y_preds1.copy()

for i in range(len(y_preds1)):
    votes = y_preds1[i] + y_preds2[i] + y_preds3[i]
    if votes >=1:
        y_preds_final[i] + 1
    else:
        y_preds_final[i] = 0
    

In [48]:
score = accuracy_score(y_preds_final, y_test)
cm = confusion_matrix(y_preds_final, y_test)
print('Accuracy Score:', score, '\n')
print(cm)

Accuracy Score: 0.9353680430879713 

[[2698  109]
 [ 107  428]]


## Summary
* In general this dataset could be classified without an sampling technuiques, though out of interest I will continue with SMOTE pending some visual inspection of the synthetic entries.

* The highest percision is seen from oversampling the minority class, though the potential for overfitting is severe.

### Next Steps
* I will visualize the SMOTE samples and run the dataset through a more complex function which will iterate over and evaluate the performance of several classifier models (Bayesian and Decision Trees) to find the best fit.

In [45]:
smote_df = pd.concat([X_sm, y_sm], axis=1)
smote_df.head()

Unnamed: 0,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Attrition_Flag
0,-0.165406,0,3,0,0,0,0,0.384621,5,1,3,0.446622,-0.473422,0.488971,1.335,-0.959707,-0.973895,1.625,0.061,0
1,0.33357,1,5,1,1,1,0,1.010715,6,1,2,-0.041367,-0.366667,-0.008486,1.541,-0.916433,-1.35734,3.714,0.105,0
2,0.583058,0,3,1,0,2,0,0.008965,4,1,0,-0.573698,-1.426858,-0.445658,2.594,-0.740982,-1.911206,2.333,0.0,0
3,-0.789126,1,4,0,2,1,0,-0.241473,3,4,1,-0.585251,1.661686,-0.7341,1.405,-0.951758,-1.911206,2.333,0.76,0
4,-0.789126,0,3,2,0,0,0,-1.869317,5,1,0,-0.430877,-1.426858,-0.302868,2.175,-1.056263,-1.570365,2.5,0.0,0


In [46]:
smote_df.to_csv('data/BankChurn_Smote.csv', index=False)