# Supervised machine learning process:

## Import dependencies

First import of libraries that will help with wrangling and to visualize data. 

In [16]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 

## Preprocessing 

Taking a second look at the data after the cleaning process, to identify the features and outcomes of the machine learning model.

In [17]:
# Load the data and display first rows 

path = '../all_data.csv'

df = pd.read_csv(path)

df.head(5)

Unnamed: 0,clientnum,attrition_flag,customer_age,gender,dependent_count,education_level,marital_status,income_category,card_category,months_on_book,...,credit_limit,total_revolving_bal,avg_open_to_buy,total_amt_chng_q4_q1,total_trans_amt,total_trans_ct,total_ct_chng_q4_q1,avg_utilization_ratio,Weekly_median_earnings,Yearly_median_earnings
0,768805383,Existing Customer ...,45,M ...,3,High School ...,Married ...,$60K - $80K ...,Blue ...,39,...,12691.0,777,11914.0,1.335,1144,42,1.625,0.061,809.0,42068.0
1,818770008,Existing Customer ...,49,F ...,5,Graduate ...,Single ...,Less than $40K ...,Blue ...,44,...,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,963.0,50076.0
2,818770008,Existing Customer ...,49,F ...,5,Graduate ...,Single ...,Less than $40K ...,Blue ...,44,...,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,1334.0,69368.0
3,818770008,Existing Customer ...,49,F ...,5,Graduate ...,Single ...,Less than $40K ...,Blue ...,44,...,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,1924.0,100048.0
4,713982108,Existing Customer ...,51,M ...,3,Graduate ...,Married ...,$80K - $120K ...,Blue ...,36,...,3418.0,0,3418.0,2.594,1887,20,2.333,0.0,963.0,50076.0


In [10]:
# Drop the unique values that won't serve to the model and display first rows 

df = df.drop(columns=['clientnum'], axis =1)

df.head(5)

Unnamed: 0,attrition_flag,customer_age,gender,dependent_count,education_level,marital_status,income_category,card_category,months_on_book,total_relationship_count,...,credit_limit,total_revolving_bal,avg_open_to_buy,total_amt_chng_q4_q1,total_trans_amt,total_trans_ct,total_ct_chng_q4_q1,avg_utilization_ratio,Weekly_median_earnings,Yearly_median_earnings
0,Existing Customer ...,45,M ...,3,High School ...,Married ...,$60K - $80K ...,Blue ...,39,5,...,12691.0,777,11914.0,1.335,1144,42,1.625,0.061,809.0,42068.0
1,Existing Customer ...,49,F ...,5,Graduate ...,Single ...,Less than $40K ...,Blue ...,44,6,...,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,963.0,50076.0
2,Existing Customer ...,49,F ...,5,Graduate ...,Single ...,Less than $40K ...,Blue ...,44,6,...,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,1334.0,69368.0
3,Existing Customer ...,49,F ...,5,Graduate ...,Single ...,Less than $40K ...,Blue ...,44,6,...,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,1924.0,100048.0
4,Existing Customer ...,51,M ...,3,Graduate ...,Married ...,$80K - $120K ...,Blue ...,36,4,...,3418.0,0,3418.0,2.594,1887,20,2.333,0.0,963.0,50076.0


### Identify and separate dependent and independent variables 

In [21]:
# Identifying categorical and non-categorical variables 

categorical = ['gender','education_level', 'marital_status', 'income_category', 'card_category']

non_categorical = ['customer_age', 'dependent_count',
       'months_on_book', 'total_relationship_count', 'months_inactive_12_mon',
       'contacts_count_12_mon', 'credit_limit', 'total_revolving_bal',
       'avg_open_to_buy', 'total_amt_chng_q4_q1', 'total_trans_amt',
       'total_trans_ct', 'total_ct_chng_q4_q1', 'avg_utilization_ratio',
       'Weekly_median_earnings', 'Yearly_median_earnings']

In [28]:
# Separating dependent and independent variables 

y = df[['attrition_flag']]

X = df[['customer_age', 'gender', 'dependent_count',
       'education_level', 'marital_status', 'income_category', 'card_category',
       'months_on_book', 'total_relationship_count', 'months_inactive_12_mon',
       'contacts_count_12_mon', 'credit_limit', 'total_revolving_bal',
       'avg_open_to_buy', 'total_amt_chng_q4_q1', 'total_trans_amt',
       'total_trans_ct', 'total_ct_chng_q4_q1', 'avg_utilization_ratio',
       'Weekly_median_earnings', 'Yearly_median_earnings']] 

In [29]:
# Encode categorical features (object types) as a one-hot numeric array 

# Get dummies 

X = pd.get_dummies(data =X, columns=categorical)

X.head()

Unnamed: 0,customer_age,dependent_count,months_on_book,total_relationship_count,months_inactive_12_mon,contacts_count_12_mon,credit_limit,total_revolving_bal,avg_open_to_buy,total_amt_chng_q4_q1,...,income_category_$120K +,income_category_$40K - $60K,income_category_$60K - $80K,income_category_$80K - $120K,income_category_Less than $40K,income_category_Unknown,card_category_Blue,card_category_Gold,card_category_Platinum,card_category_Silver
0,45,3,39,5,1,3,12691.0,777,11914.0,1.335,...,0,0,1,0,0,0,1,0,0,0
1,49,5,44,6,1,2,8256.0,864,7392.0,1.541,...,0,0,0,0,1,0,1,0,0,0
2,49,5,44,6,1,2,8256.0,864,7392.0,1.541,...,0,0,0,0,1,0,1,0,0,0
3,49,5,44,6,1,2,8256.0,864,7392.0,1.541,...,0,0,0,0,1,0,1,0,0,0
4,51,3,36,4,1,0,3418.0,0,3418.0,2.594,...,0,0,0,1,0,0,1,0,0,0


In [67]:
# Split train-test with Stratification

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, stratify=y, random_state=18)

## Logistic Regression

In [68]:
# Train the Logistic Regression model 

from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=18)

model.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=18)

In [69]:
# Calculate the balanced accuracy score

from sklearn.metrics import balanced_accuracy_score

y_pred = model.predict(X_test)

balanced_accuracy_score(y_test,y_pred)

0.6093181503495745

In [70]:
# Display the confusion matrix

from sklearn.metrics import confusion_matrix

confusion_matrix(y_test,y_pred)

array([[ 168,  508],
       [ 106, 3441]])

In [71]:
# Print the imbalanced classification report

from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_test,y_pred))

                         pre       rec       spe        f1       geo       iba       sup

Attrited Customer       0.61      0.25      0.97      0.35      0.49      0.22       676
Existing Customer       0.87      0.97      0.25      0.92      0.49      0.26      3547

      avg / total       0.83      0.85      0.36      0.83      0.49      0.25      4223



# Boosting Algorithm

Bag of balanced boosted learners also known as EasyEnsemble.

In [72]:
# Train the EasyEnsembleClassifier

from imblearn.ensemble import EasyEnsembleClassifier

emodel = EasyEnsembleClassifier(n_estimators=50, random_state=18)

emodel.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


EasyEnsembleClassifier(n_estimators=50, random_state=18)

In [73]:
# Calculated the balanced accuracy score

y_pred = emodel.predict(X_test)

balanced_accuracy_score(y_test,y_pred)

0.9443689808705749

In [74]:
# Display the confusion matrix

confusion_matrix(y_test,y_pred)

array([[ 641,   35],
       [ 211, 3336]])

In [75]:
# Print the imbalanced classification report

print(classification_report_imbalanced(y_test,y_pred))

                         pre       rec       spe        f1       geo       iba       sup

Attrited Customer       0.75      0.95      0.94      0.84      0.94      0.89       676
Existing Customer       0.99      0.94      0.95      0.96      0.94      0.89      3547

      avg / total       0.95      0.94      0.95      0.94      0.94      0.89      4223



# Bagging Algorithm

A balanced random forest classifier.

In [76]:
# Resample the training data with the BalancedRandomForestClassifier

from imblearn.ensemble import BalancedRandomForestClassifier 

rmodel = BalancedRandomForestClassifier(random_state=18)

rmodel.fit(X_train,y_train)

  import sys


BalancedRandomForestClassifier(random_state=18)

In [77]:
# Calculated the balanced accuracy score

y_pred = rmodel.predict(X_test)

balanced_accuracy_score(y_test,y_pred)

0.971493745026633

In [78]:
# Display the confusion matrix

confusion_matrix(y_test,y_pred)

array([[ 667,    9],
       [ 155, 3392]])

In [79]:
# Print the imbalanced classification report

print(classification_report_imbalanced(y_test,y_pred))

                         pre       rec       spe        f1       geo       iba       sup

Attrited Customer       0.81      0.99      0.96      0.89      0.97      0.95       676
Existing Customer       1.00      0.96      0.99      0.98      0.97      0.94      3547

      avg / total       0.97      0.96      0.98      0.96      0.97      0.94      4223



In [81]:
# List the features sorted in descending order by feature importance

sorted(zip(rmodel.feature_importances_,X.columns), reverse=True)

[(0.20226453780520584, 'total_trans_ct'),
 (0.17537330595627235, 'total_trans_amt'),
 (0.11918319958011743, 'total_revolving_bal'),
 (0.08693640879160756, 'total_ct_chng_q4_q1'),
 (0.06061446088597262, 'total_amt_chng_q4_q1'),
 (0.058928931705023106, 'avg_utilization_ratio'),
 (0.04450067212270005, 'total_relationship_count'),
 (0.03204209260592183, 'avg_open_to_buy'),
 (0.03179934705130955, 'credit_limit'),
 (0.027440179500341173, 'customer_age'),
 (0.02652177918468796, 'contacts_count_12_mon'),
 (0.025463534363409472, 'months_inactive_12_mon'),
 (0.024757086739955626, 'months_on_book'),
 (0.012992750873485741, 'dependent_count'),
 (0.0067925192357183445, 'Weekly_median_earnings'),
 (0.006747657502712946, 'Yearly_median_earnings'),
 (0.005785969538032246,
  'gender_F                                                 '),
 (0.005513951631453769,
  'gender_M                                                 '),
 (0.005246286617669329,
  'marital_status_Married                                

## Reducing features for model 
Noticing that categorical values are not strongly influencing the model, neither the values for median salaries.
They will be put aside and just consider the other numerical variables.

In [150]:
# Selecting features for new model 

important_features = ['total_trans_ct','total_trans_amt','total_revolving_bal','total_ct_chng_q4_q1',\
                      'total_amt_chng_q4_q1',\
'avg_utilization_ratio','total_relationship_count','avg_open_to_buy','credit_limit','customer_age',\
'contacts_count_12_mon','months_inactive_12_mon','months_on_book','dependent_count']

In [151]:
# Creating new target and features 

new_y = new_df[['attrition_flag']]

new_y['attrition_flag'] = new_y['attrition_flag'].map(str.strip)

new_X = new_df[['total_trans_ct','total_trans_amt','total_revolving_bal','total_ct_chng_q4_q1',\
                      'total_amt_chng_q4_q1',\
'avg_utilization_ratio','total_relationship_count','avg_open_to_buy','credit_limit','customer_age',\
'contacts_count_12_mon','months_inactive_12_mon','months_on_book','dependent_count']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [155]:
# Splitting training and test data 

new_X_train, new_X_test, new_y_train, new_y_test = train_test_split(new_X,new_y, stratify=new_y, random_state=18)

In [156]:
# Creating a new instance for model selected 

new_rmodel = BalancedRandomForestClassifier(random_state=18)

new_rmodel.fit(new_X_train,new_y_train)

  """


BalancedRandomForestClassifier(random_state=18)

In [157]:
# Calculated the balanced accuracy score

new_y_pred = new_rmodel.predict(new_X_test)

balanced_accuracy_score(new_y_test,new_y_pred)

0.9697325266956158

In [158]:
# Display the confusion matrix

confusion_matrix(new_y_test,new_y_pred)

array([[ 665,   11],
       [ 157, 3390]])

In [160]:
# Print the imbalanced classification report

print(classification_report_imbalanced(new_y_test,new_y_pred))

                         pre       rec       spe        f1       geo       iba       sup

Attrited Customer       0.81      0.98      0.96      0.89      0.97      0.94       676
Existing Customer       1.00      0.96      0.98      0.98      0.97      0.94      3547

      avg / total       0.97      0.96      0.98      0.96      0.97      0.94      4223



In [161]:
# Saving machine learning model file

from joblib import dump

dump(new_rmodel, 'new_rmodel.joblib')

['new_rmodel.joblib']

In [191]:
import pickle

pickle.dump(new_rmodel, open('model.pkl','wb'))