In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, StackingClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
%matplotlib inline

In [2]:
df = pd.read_csv('../../data/Customer Churn Data.csv')

In [3]:
df.head()

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [4]:
df.columns

Index(['state', 'account length', 'area code', 'phone number',
       'international plan', 'voice mail plan', 'number vmail messages',
       'total day minutes', 'total day calls', 'total day charge',
       'total eve minutes', 'total eve calls', 'total eve charge',
       'total night minutes', 'total night calls', 'total night charge',
       'total intl minutes', 'total intl calls', 'total intl charge',
       'customer service calls', 'churn'],
      dtype='object')

In [5]:
df['international plan'] = (df['international plan']=='yes').astype(int)
df['voice mail plan'] = (df['voice mail plan']=='yes').astype(int)
## using sparse to compite the matrix to one row. 
ohe = OneHotEncoder(sparse = False)
ohe_states = pd.DataFrame(ohe.fit_transform(pd.DataFrame(df['state'])), columns = ohe.get_feature_names())
df = pd.concat([df, ohe_states], axis = 1)
df.head()

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,x0_SD,x0_TN,x0_TX,x0_UT,x0_VA,x0_VT,x0_WA,x0_WI,x0_WV,x0_WY
0,KS,128,415,382-4657,0,1,25,265.1,110,45.07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,OH,107,415,371-7191,0,1,26,161.6,123,27.47,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,NJ,137,415,358-1921,0,0,0,243.4,114,41.38,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,OH,84,408,375-9999,1,0,0,299.4,71,50.9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,OK,75,415,330-6626,1,0,0,166.7,113,28.34,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
df = df.drop(['phone number', 'state', 'area code'], axis=1).copy()

In [7]:
df.head()

### 'area code' needs to dummy code. 

Unnamed: 0,account length,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,...,x0_SD,x0_TN,x0_TX,x0_UT,x0_VA,x0_VT,x0_WA,x0_WI,x0_WV,x0_WY
0,128,0,1,25,265.1,110,45.07,197.4,99,16.78,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,107,0,1,26,161.6,123,27.47,195.5,103,16.62,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,137,0,0,0,243.4,114,41.38,121.2,110,10.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,84,1,0,0,299.4,71,50.9,61.9,88,5.26,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,75,1,0,0,166.7,113,28.34,148.3,122,12.61,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
X = df.drop('churn', axis=1)
y = df['churn']

In [9]:
X_train, X_hold, y_train, y_hold = train_test_split(X, y)

X_train1, X_test1, y_train1, y_test1 = train_test_split(X_train, y_train)

In [10]:
ss = StandardScaler()
X_train1 = ss.fit_transform(X_train1)
X_test1 = ss.transform(X_test1)

In [11]:
estimators = [('knn', KNeighborsClassifier(n_neighbors = 20)),   
              ('rf', RandomForestClassifier(n_estimators = 100)),
              ('log', LogisticRegression(solver = 'liblinear')),
              ('grad', GradientBoostingClassifier())]
stack = StackingClassifier(estimators = estimators, final_estimator = LogisticRegression(), cv = 5)
stack.fit(X_train1, y_train1);
stack.score(X_train1, y_train1)

0.9909284951974386

In [12]:
def metrics(y_true, y_pred):
    print('Accuracy: ' + str(accuracy_score(y_true, y_pred)))
    print('Precision: ' + str(precision_score(y_true, y_pred)))
    print('Recall: ' + str(recall_score(y_true, y_pred)))
    print('F1: ' + str(f1_score(y_true, y_pred)))
    print('\n')

In [13]:
for i in stack.estimators_:
    print(i)
    metrics(y_test1, i.predict(X_test1))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=20, p=2,
                     weights='uniform')
Accuracy: 0.8512
Precision: 0.0
Recall: 0.0
F1: 0.0


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Accuracy: 0.9264
Precision: 1.0
Recall: 0.5053763440860215
F1: 0.6714285714285714


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=N

  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
stack.final_estimator_.coef_

array([[0.15466237, 4.26383839, 1.12504949, 4.39163782]])

In [15]:
stack.estimators_[3].feature_importances_

array([1.02291050e-02, 7.21558759e-02, 2.94478687e-02, 3.92226229e-02,
       1.56579144e-01, 1.00533946e-02, 1.55084444e-01, 6.40887048e-02,
       2.05577302e-03, 7.13214681e-02, 1.54710960e-02, 4.20178846e-03,
       1.26998812e-02, 4.20125000e-02, 7.71357812e-02, 4.85038792e-02,
       1.71352899e-01, 0.00000000e+00, 0.00000000e+00, 2.69906175e-04,
       2.22081167e-04, 1.31474854e-03, 0.00000000e+00, 1.02191774e-03,
       5.80148855e-04, 0.00000000e+00, 2.58294094e-04, 0.00000000e+00,
       3.23155932e-04, 4.29370833e-04, 5.68388140e-04, 6.03918621e-05,
       0.00000000e+00, 3.42784117e-04, 0.00000000e+00, 0.00000000e+00,
       7.18662223e-05, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       2.58713087e-05, 0.00000000e+00, 3.49197570e-03, 1.85100481e-04,
       0.00000000e+00, 5.35727987e-05, 8.95600635e-04, 0.00000000e+00,
       0.00000000e+00, 5.29368533e-04, 0.00000000e+00, 8.56544766e-04,
       4.05226796e-04, 0.00000000e+00, 9.86652602e-04, 0.00000000e+00,
      

In [16]:
stack.estimators_[1].feature_importances_

array([0.0338313 , 0.05023668, 0.01365397, 0.02213225, 0.12390476,
       0.03514609, 0.14236173, 0.05387273, 0.03136086, 0.05537498,
       0.03479567, 0.03504564, 0.03655307, 0.04074137, 0.04297663,
       0.04012062, 0.11545523, 0.00062378, 0.00129144, 0.00199915,
       0.00174512, 0.00328581, 0.00083084, 0.00229489, 0.00243525,
       0.00221795, 0.00090919, 0.00090968, 0.00057465, 0.00121216,
       0.0027022 , 0.00224815, 0.00163384, 0.00167914, 0.00063136,
       0.00100279, 0.00225371, 0.00202564, 0.00243435, 0.00268948,
       0.0015212 , 0.00171187, 0.00555959, 0.00279277, 0.00077911,
       0.00176711, 0.00124121, 0.00147998, 0.00315738, 0.00142361,
       0.00134645, 0.00201774, 0.00296463, 0.00075593, 0.00136949,
       0.00167323, 0.0011958 , 0.00290366, 0.00120387, 0.00124291,
       0.00467907, 0.00236926, 0.00078713, 0.00154387, 0.00174114,
       0.00181568, 0.00093636, 0.00082583])

In [18]:
stack.estimators_[2].predict(X_test1)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,