In [137]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, StackingClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
%matplotlib inline

In [138]:
df = pd.read_csv('../../data/Customer Churn Data.csv')

In [139]:
df.head()

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [140]:
df.columns

Index(['state', 'account length', 'area code', 'phone number',
       'international plan', 'voice mail plan', 'number vmail messages',
       'total day minutes', 'total day calls', 'total day charge',
       'total eve minutes', 'total eve calls', 'total eve charge',
       'total night minutes', 'total night calls', 'total night charge',
       'total intl minutes', 'total intl calls', 'total intl charge',
       'customer service calls', 'churn'],
      dtype='object')

In [141]:
df['international plan'] = (df['international plan']=='yes').astype(int)
df['voice mail plan'] = (df['voice mail plan']=='yes').astype(int)
## using sparse to compite the matrix to one row. 
ohe = OneHotEncoder(sparse = False)
ohe_states = pd.DataFrame(ohe.fit_transform(pd.DataFrame(df['state'])), columns = ohe.get_feature_names())
df = pd.concat([df, ohe_states], axis = 1)
df.head()

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,x0_SD,x0_TN,x0_TX,x0_UT,x0_VA,x0_VT,x0_WA,x0_WI,x0_WV,x0_WY
0,KS,128,415,382-4657,0,1,25,265.1,110,45.07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,OH,107,415,371-7191,0,1,26,161.6,123,27.47,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,NJ,137,415,358-1921,0,0,0,243.4,114,41.38,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,OH,84,408,375-9999,1,0,0,299.4,71,50.9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,OK,75,415,330-6626,1,0,0,166.7,113,28.34,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [142]:
df = df.drop(['phone number', 'state', 'area code'], axis=1).copy()

In [143]:
df.head()

### 'area code' needs to dummy code. 

Unnamed: 0,account length,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,...,x0_SD,x0_TN,x0_TX,x0_UT,x0_VA,x0_VT,x0_WA,x0_WI,x0_WV,x0_WY
0,128,0,1,25,265.1,110,45.07,197.4,99,16.78,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,107,0,1,26,161.6,123,27.47,195.5,103,16.62,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,137,0,0,0,243.4,114,41.38,121.2,110,10.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,84,1,0,0,299.4,71,50.9,61.9,88,5.26,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,75,1,0,0,166.7,113,28.34,148.3,122,12.61,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [144]:
X = df.drop('churn', axis=1)
y = df['churn']

In [145]:
X_train, X_hold, y_train, y_hold = train_test_split(X, y)

X_train1, X_test1, y_train1, y_test1 = train_test_split(X_train, y_train)

In [146]:
ss = StandardScaler()
X_train1 = ss.fit_transform(X_train1)
X_test1 = ss.transform(X_test1)

In [147]:
estimators = [('knn', KNeighborsClassifier(n_neighbors = 20)),   
              ('rf', RandomForestClassifier(n_estimators = 100)),
              ('log', LogisticRegression(solver = 'liblinear')),
              ('grad', GradientBoostingClassifier())]
stack = StackingClassifier(estimators = estimators, final_estimator = LogisticRegression(), cv = 5)
stack.fit(X_train1, y_train1);
stack.score(X_train1, y_train1)

0.9834578441835645

In [152]:
def metrics(y_true, y_pred):
    print('Accuracy: ' + str(accuracy_score(y_true, y_pred)))
    print('Precision: ' + str(precision_score(y_true, y_pred)))
    print('Recall: ' + str(recall_score(y_true, y_pred)))
    print('F1: ' + str(f1_score(y_true, y_pred)))
    print('\n')

In [154]:
for i in stack.estimators_:
    print(i)
    metrics(y_test1, i.predict(X_test1))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=20, p=2,
                     weights='uniform')
Accuracy: 0.8672
Precision: 0.0
Recall: 0.0
F1: 0.0


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Accuracy: 0.9472
Precision: 0.9310344827586207
Recall: 0.6506024096385542
F1: 0.7659574468085105


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scalin

  _warn_prf(average, modifier, msg_start, len(result))


In [160]:
stack.final_estimator_.coef_

array([[0.20620601, 3.33292118, 0.67159377, 4.83531121]])

In [171]:
stack.estimators_[3].feature_importances_

array([6.69325625e-03, 1.01933879e-01, 2.81718567e-02, 4.87059751e-02,
       1.65258591e-01, 8.45096965e-03, 1.28362750e-01, 6.19150100e-02,
       3.71325287e-03, 5.16994041e-02, 1.05539984e-02, 1.03181822e-02,
       9.43131756e-03, 2.50229048e-02, 1.01469496e-01, 5.73376331e-02,
       1.62335196e-01, 0.00000000e+00, 0.00000000e+00, 2.94337098e-04,
       5.58279357e-04, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       3.77881331e-04, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       4.86624994e-04, 1.08832127e-03, 1.37108906e-04, 0.00000000e+00,
       0.00000000e+00, 1.48614519e-03, 0.00000000e+00, 0.00000000e+00,
       2.97909502e-03, 5.91667959e-04, 1.13120913e-04, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 9.52001978e-04,
       0.00000000e+00, 0.00000000e+00, 2.58386208e-04, 0.00000000e+00,
       3.85955009e-03, 0.00000000e+00, 0.00000000e+00, 4.49296175e-05,
       5.48168680e-04, 0.00000000e+00, 2.04862947e-03, 0.00000000e+00,
      

In [172]:
stack.estimators_[1].feature_importances_

array([3.49208356e-02, 6.57665960e-02, 1.65633990e-02, 2.20001688e-02,
       1.06145143e-01, 3.61974805e-02, 1.41065589e-01, 5.25013047e-02,
       3.02147227e-02, 5.55451002e-02, 3.68512595e-02, 3.41233884e-02,
       3.71615106e-02, 4.60261223e-02, 3.51762175e-02, 4.52125583e-02,
       1.14965681e-01, 1.33124404e-03, 8.37316185e-04, 2.15404962e-03,
       2.04959175e-03, 1.81779802e-03, 1.53753334e-03, 2.32331196e-03,
       1.92486135e-03, 1.25945033e-03, 1.50179439e-03, 8.36182493e-04,
       7.57414829e-04, 8.46729220e-04, 1.37802731e-03, 1.38540578e-03,
       2.64647448e-03, 2.03606567e-03, 1.45614451e-03, 1.27484538e-03,
       3.23133050e-03, 5.13562717e-03, 2.34743067e-03, 2.79446351e-03,
       3.20899545e-03, 8.33410221e-04, 2.40980456e-03, 1.51393777e-03,
       2.18257378e-03, 1.45008050e-03, 1.30744798e-03, 1.43342252e-03,
       4.22617118e-03, 1.03557873e-03, 1.99738571e-03, 1.67174959e-03,
       2.86133749e-03, 5.96378190e-04, 2.55549824e-03, 8.41012053e-04,
      