In [4]:
#Import Libraries

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [5]:
#Load data
df = pd.read_csv('../../data/Customer Churn Data.csv')
#Turn International Plan from a categorical variable to binary (yes = 1, no = 0)
df['international plan'] = (df['international plan'] == 'yes').astype(int)
#Turn Voice Mail Plan from a categorical variable to binary (yes = 1, no = 0)
df['voice mail plan'] = (df['voice mail plan'] == 'yes').astype(int)
#Initiate OneHotEncoder with its default settings.
#The sparse= argument and get_feature_names() are not available in newer scikit-learn
#releases, so the dense array and the column names are built by hand below instead.
ohe = OneHotEncoder()
#Encode the state column: one indicator column per distinct state, converted from sparse to dense
state_matrix = ohe.fit_transform(df[['state']]).toarray()
#Name the columns 'x0_<state>' (the names the older get_feature_names() call produced)
state_columns = ['x0_' + str(category) for category in ohe.categories_[0]]
#Create an ohe_states DF; reuse df's index so the concat below lines rows up exactly
ohe_states = pd.DataFrame(state_matrix, columns = state_columns, index = df.index)
#Combine the 2 dataframes
df = pd.concat([df, ohe_states], axis = 1)
#Drop the original state column now that it is encoded
#(area code is not dropped in this cell)
df = df.drop(['state'], axis = 1)

In [6]:
#Features: everything in df except the target (churn), area code and phone number
X = df.drop(columns = ['churn', 'area code', 'phone number'])
#Target variable: churn
y = df['churn']

In [7]:
#Split the initial data into train and holdout (holdout is for final evaluation)
#random_state is fixed so that Restart & Run All reproduces the same partitions and scores
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, random_state = 42)
#Split train into a train and test set (to build your model)
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_train, y_train, random_state = 42)

#Initiate a standard scaler 
ss = StandardScaler()
#Fit the scaler on the sub-train data only, then apply the same scaling to the sub-test data
#so no statistics from the test rows leak into training
X_train1 = ss.fit_transform(X_train1)
X_test1 = ss.transform(X_test1)
#NOTE: X_holdout is not scaled in this cell; pass it through ss.transform before scoring on it

In [8]:
#Set our estimators, 4 classification models
#'liblinear' is one of the optimisation algorithms LogisticRegression can use;
#the scikit-learn docs suggest it as a good choice for small datasets
#random_state is fixed on every estimator that accepts it so the fit is reproducible

estimators = [('knn', KNeighborsClassifier(n_neighbors = 20)),   
              ('rf', RandomForestClassifier(n_estimators = 100, random_state = 42)),
              ('log', LogisticRegression(solver = 'liblinear', random_state = 42)),
              ('grad', GradientBoostingClassifier(random_state = 42))]

#Initiate a stack classifier: the 4 models above feed a final logistic regression,
#which is trained on their cross-validated (cv = 5) predictions

stack = StackingClassifier(estimators = estimators, final_estimator = LogisticRegression(), cv = 5)

#Fit the model to our sub-train data 

stack.fit(X_train1, y_train1);

#Calculate accuracy score on the same sub-train data the model was fit on
#(this is training accuracy, so it is an optimistic figure; see the sub-test metrics for a fairer one)

stack.score(X_train1, y_train1)

0.9861259338313767

In [9]:
#Create a function that prints the scores
#It is defined before its first call so the notebook runs top to bottom on a fresh kernel

def metrics(y_true, y_pred):
    """Print accuracy, precision, recall and F1 for a set of predictions.

    Accuracy:  share of all predictions that are correct (correct / total).
    Precision: of the rows predicted positive, how many really are positive
               (true positives / (true positives + false positives)).
    Recall:    of the rows that really are positive, how many were predicted
               positive (true positives / (true positives + false negatives)).
    F1:        harmonic mean of precision and recall.

    Parameters
    ----------
    y_true : array-like
        The actual labels.
    y_pred : array-like
        The predicted labels, in the same order as y_true.

    Returns
    -------
    None
        The four scores are printed, one per line, as '<name>: <value>'.
    """
    scorers = [('Accuracy', accuracy_score),
               ('Precision', precision_score),
               ('Recall', recall_score),
               ('F1', f1_score)]
    for label, scorer in scorers:
        print(label + ': ' + str(scorer(y_true, y_pred)))

#Evaluate metrics of our model based on the sub-test data 

metrics(y_test1, stack.predict(X_test1))

In [None]:
#Print metrics for each fitted base model inside the stack, on the sub-test data
#named_estimators_ maps the names given in the estimators list to the fitted models,
#so each group of scores is labelled with the model it belongs to

for model_name, fitted_model in stack.named_estimators_.items():
    print(model_name)
    metrics(y_test1, fitted_model.predict(X_test1))