In [42]:
import pickle
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [50]:
# Load the raw churn table and recode the two yes/no plan columns as 0/1 integers.
df = pd.read_csv('../../data/Customer Churn Data.csv')
df['international plan'] = df['international plan'].eq('yes').astype(int)
df['voice mail plan'] = df['voice mail plan'].eq('yes').astype(int)

# One-hot encode `state`. The encoded frame is built but NOT joined back onto
# `df` (the concat below is disabled), so `state` is simply dropped.
# NOTE(review): `sparse=` and `get_feature_names()` appear to have been renamed
# in later scikit-learn releases -- confirm against the pinned version.
ohe = OneHotEncoder(sparse = False)
ohe_states = pd.DataFrame(
    ohe.fit_transform(df[['state']]),
    columns = ohe.get_feature_names(),
)
#df = pd.concat([df, ohe_states], axis = 1)
df = df.drop(columns = ['state'])

df.head()

Unnamed: 0,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,128,415,382-4657,0,1,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,107,415,371-7191,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,137,415,358-1921,0,0,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,84,408,375-9999,1,0,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,75,415,330-6626,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [39]:
# Target is the `churn` column; the feature frame is a new DataFrame holding
# every other column except `area code` and `phone number`. `df` is untouched.
# NOTE(review): the displayed head lists an `Unnamed: 0` first column; if that
# is a real column rather than the row index, it is still part of X -- verify.
y = df['churn']
X = df.drop(columns = ['churn', 'area code', 'phone number'])

In [40]:
def metrics(y_true, y_pred):
    """Print accuracy, precision, recall and F1 for a set of predictions.

    Each score is computed by the matching scikit-learn scoring function,
    called with only ``y_true`` and ``y_pred``, and printed on its own line.
    Nothing is returned.
    """
    scorers = (
        ('Accuracy: ', accuracy_score),
        ('Precision: ', precision_score),
        ('Recall: ', recall_score),
        ('F1: ', f1_score),
    )
    for label, scorer in scorers:
        print(label + str(scorer(y_true, y_pred)))
    
def model(X_train, X_test, y_train, y_test, cv = 5, random_state = None):
    """Fit a stacked classifier and print its scores on the test split.

    The stack combines a random forest, a liblinear logistic regression and a
    gradient-boosting classifier, with a logistic regression as the final
    estimator.

    Parameters
    ----------
    X_train, y_train : training features and labels used to fit the stack.
    X_test, y_test : evaluation features and labels; their scores are printed
        through ``metrics``.
    cv : cross-validation setting forwarded to ``StackingClassifier``
        (default 5, the value that was previously hard-coded).
    random_state : seed forwarded to the three base learners. The default
        ``None`` keeps the original unseeded behaviour; pass an int to make
        repeated fits reproducible.

    Returns
    -------
    The fitted ``StackingClassifier``.
    """
    estimators = [('rf', RandomForestClassifier(random_state = random_state)),
                  ('log', LogisticRegression(solver = 'liblinear', random_state = random_state)),
                  ('grad', GradientBoostingClassifier(random_state = random_state))]
    stack = StackingClassifier(estimators = estimators,
                               final_estimator = LogisticRegression(),
                               cv = cv)
    stack.fit(X_train, y_train)
    metrics(y_test, stack.predict(X_test))
    return stack

# First split: set aside a holdout partition. X_holdout / y_holdout are not
# referenced again in the cells visible here.
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, random_state = 42)
# Second split: carve a train/test pair out of the remaining training data.
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_train, y_train, random_state = 420)

# Fit the scaler on the training part only, then reuse its statistics for the
# test part. Both names are rebound to the scaler's output.
ss = StandardScaler()
X_train1 = ss.fit_transform(X_train1)
X_test1 = ss.transform(X_test1)

# NOTE(review): this rebinds the name `model` from the function defined above
# to the fitted classifier it returns. Re-running this line without first
# re-running the `def model` cell will therefore fail, because `model` is no
# longer the function. Later cells rely on `model` being the classifier.
model = model(X_train1, X_test1, y_train1, y_test1);

Accuracy: 0.9584
Precision: 0.9493670886075949
Recall: 0.7731958762886598
F1: 0.8522727272727272


In [41]:
# Confusion matrix of the current `model` on the scaled test split.
confusion_matrix(y_true = y_test1, y_pred = model.predict(X_test1))

array([[524,   4],
       [ 22,  75]])

In [14]:
# Score every fitted base learner of the stack on its own; `metrics` prints
# one group of four scores per learner.
for fitted_base in model.estimators_:
    metrics(y_test1, fitted_base.predict(X_test1))

Accuracy: 0.9152
Precision: 1.0
Recall: 0.4536082474226804
F1: 0.624113475177305
Accuracy: 0.848
Precision: 0.525
Recall: 0.21649484536082475
F1: 0.30656934306569344
Accuracy: 0.9504
Precision: 0.9342105263157895
Recall: 0.7319587628865979
F1: 0.8208092485549132


In [49]:
# Disabled scratch code: everything below sits inside a string literal, so none
# of it runs; the trailing `;` only suppresses the cell's echo of that string.
# NOTE(review): if revived, the bare `except:` would hide any error (not just a
# missing `feature_importances_`) -- consider deleting or rewriting this cell.
'''
features_importances = {}
for i in model.estimators_:
    try:
        features = pd.DataFrame(i.feature_importances_).transpose()
        features.columns = X.columns
        features_importances[i] = features
    except:
        features_importances[i] = ':('
''';

In [43]:
def pickler(model, file_name = 'model.pickle'):
    """Serialize ``model`` to ``file_name`` with pickle.

    Parameters
    ----------
    model : any picklable object (here, the fitted classifier).
    file_name : destination path. Defaults to ``'model.pickle'``, the path
        that was previously hard-coded and that ``read_pickle`` reads by
        default.

    Returns
    -------
    None. An existing file at ``file_name`` is overwritten.
    """
    # The context manager closes the handle even when pickle.dump raises,
    # which the manual open()/close() pair did not guarantee.
    with open(file_name, 'wb') as output_file:
        pickle.dump(model, output_file)

def read_pickle(file_name = 'model.pickle'):
    """Load and return the object pickled at ``file_name``.

    Parameters
    ----------
    file_name : path of the pickle file (default ``'model.pickle'``).

    Returns
    -------
    The unpickled object.

    Raises
    ------
    Whatever ``open`` raises if the file cannot be opened (for example
    ``FileNotFoundError``), and whatever ``pickle.load`` raises on bad data.
    """
    # NOTE: unpickling can execute arbitrary code, so only load files that
    # this notebook (or another trusted source) produced.
    # The context manager closes the handle even when pickle.load raises.
    with open(file_name, "rb") as model_file:  # "rb" means "read as bytes"
        return pickle.load(model_file)

In [45]:
# Persist the current `model` object to model.pickle in the working directory.
pickler(model = model)

In [46]:
# Read the pickled object back from the default path.
loaded_model = read_pickle(file_name = 'model.pickle')

In [47]:
# Re-score the reloaded model on the same test split to check the round trip.
# metrics() only prints and returns None, so its result is not assigned:
# binding it to `model` would overwrite the fitted classifier with None.
metrics(y_test1, loaded_model.predict(X_test1))

Accuracy: 0.9584
Precision: 0.9493670886075949
Recall: 0.7731958762886598
F1: 0.8522727272727272
