In [7]:
import pickle
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Relative path of the churn CSV; get_data() passes it to pd.read_csv.
file_name = '../../data/Customer Churn Data.csv'

def pipeline(pickle = True):
    '''
    Builds the stacking model end to end: loads and splits the data, then fits
    the model on the training portion.

    Parameters
    ----------
    pickle: when truthy, the fitted model is also written to 'model.pickle'

    Returns
    -------
    The fitted model
    '''
    # Only the training half of the split is needed to fit the model.
    train_features, _, train_target, _ = get_train_and_test_data()
    fitted = make_model(train_features, train_target)
    if not pickle:
        return fitted
    pickler(fitted, 'model.pickle')
    return fitted

    
def get_train_and_test_data():
    '''
    Loads the cleaned churn data with get_data and hands it to split_data.

    Returns
    -------
    Whatever split_data returns for the cleaned data frame
    (training and testing data)
    '''
    return split_data(get_data())
    
    
def get_data():
    '''
    Loads the churn csv named by file_name and prunes it for modelling.

    The two plan columns become 1 ('yes') / 0 (anything else) integers, the
    four charge columns are aggregated into a single 'total charge' column
    (this helps the model), and columns that worsen the model are dropped.

    Returns
    -------
    The data frame to be used in making the model
    '''
    charge_columns = ['total day charge', 'total eve charge', 'total intl charge', 'total night charge']
    unused_columns = ['area code', 'phone number', 'state']

    frame = pd.read_csv(file_name)

    for plan_column in ('international plan', 'voice mail plan'):
        frame[plan_column] = (frame[plan_column] == 'yes').astype(int)

    # Add the charges left to right, in the order listed above.
    total_charge = frame[charge_columns[0]]
    for charge_column in charge_columns[1:]:
        total_charge = total_charge + frame[charge_column]
    frame['total charge'] = total_charge

    return frame.drop(charge_columns + unused_columns, axis = 1)
    
    
def split_data(data):
    '''
    Does a train test split on the passed in data, with churn as the target.

    Parameters
    ----------
    data: churn data frame to be split; must contain a 'churn' column

    Returns
    -------
    Training predictors, test predictors, training target, test target
    (test_size is 0.30 and random_state is fixed at 42)
    '''
    predictors = data.drop(columns = ['churn'])
    return train_test_split(predictors, data['churn'], test_size = 0.30, random_state = 42)


def make_model(X_train, y_train, random_state = 42):
    '''
    Fits and returns a stacking model based on the data passed in.

    The stack combines a random forest, a liblinear logistic regression and a
    gradient boosting classifier, with a logistic regression as the final
    estimator and cv = 5.

    Parameters
    ----------
    X_train: training predictors
    y_train: training target
    random_state: seed handed to the base estimators so that repeated runs
        produce the same fitted model; pass None for unseeded behaviour

    Returns
    -------
    The fitted StackingClassifier
    '''
    # Without a seed the forest and boosting models differ on every run,
    # which makes the notebook's recorded scores impossible to reproduce.
    estimators = [('rf', RandomForestClassifier(random_state = random_state)),
                  ('log', LogisticRegression(solver = 'liblinear', random_state = random_state)),
                  ('grad', GradientBoostingClassifier(random_state = random_state))]
    stack = StackingClassifier(estimators = estimators, final_estimator = LogisticRegression(), cv = 5)
    stack.fit(X_train, y_train)
    return stack
    

def metrics(y_true, y_pred):
    '''
    Scores a set of predictions against the true labels.

    Returns
    -------
    A dictionary holding the accuracy, precision, recall and F1 scores as
    strings, plus the confusion matrix under 'confusion_matrix'
    '''
    scorers = (('Accuracy', accuracy_score),
               ('Precision', precision_score),
               ('Recall', recall_score),
               ('F1', f1_score))
    # The scalar scores are stored as strings.
    results = {label: str(scorer(y_true, y_pred)) for label, scorer in scorers}
    results['confusion_matrix'] = confusion_matrix(y_true, y_pred)
    return results
    
    
def pickler(model, file_name):
    '''
    Turns a model into a pickle file.

    Parameters
    ----------
    model: object to serialise (anything pickle.dump accepts)
    file_name: path of the file to write; it is opened in 'wb' mode

    Raises
    ------
    Whatever open or pickle.dump raise; the file handle is closed either way
    '''
    # The with-block closes the file even when pickle.dump raises.
    with open(file_name, 'wb') as output_file:
        pickle.dump(model, output_file)

    
def read_pickle(file_name):
    '''
    Reads a pickle file and returns the object stored in it.

    Parameters
    ----------
    file_name: path of the pickle file to read

    Returns
    -------
    The unpickled object

    Raises
    ------
    Whatever open or pickle.load raise; the file handle is closed either way
    '''
    # SECURITY: pickle.load can execute arbitrary code while unpickling, so
    # only point this at pickle files from a trusted source.
    with open(file_name, 'rb') as model_file:
        return pickle.load(model_file)

In [8]:
# Fit the stacking model; pickle = False skips writing it to 'model.pickle'.
model = pipeline(pickle = False)

In [19]:
# Rebuild the train/test split (split_data fixes random_state = 42, so the split is deterministic).
X_train, X_test, y_train, y_test = get_train_and_test_data()

In [13]:
# Score the model's predictions on the training split.
metrics(y_train, model.predict(X_train))

{'Accuracy': '0.9841405915130733',
 'Precision': '1.0',
 'Recall': '0.8911764705882353',
 'F1': '0.9424572317262832',
 'confusion_matrix': array([[1993,    0],
        [  37,  303]])}

In [44]:
# Class probabilities from the base estimator at index 1 on the transformed test predictors.
# NOTE(review): `scaler` is not defined in any visible cell, so this fails on a fresh kernel - confirm where it was fitted.
model.estimators_[1].predict_proba(scaler.transform(X_test))

array([[0.99157499, 0.00842501],
       [0.99491524, 0.00508476],
       [0.9785255 , 0.0214745 ],
       ...,
       [0.99037921, 0.00962079],
       [0.99286747, 0.00713253],
       [0.99585668, 0.00414332]])

In [4]:
# Load the raw churn CSV.
# NOTE(review): the output recorded under this cell lists a 'total charge' column, which only get_data() adds - it looks stale; confirm.
df = pd.read_csv(file_name)

Index(['account length', 'international plan', 'voice mail plan', 'number vmail messages', 'total day minutes', 'total day calls', 'total eve minutes', 'total eve calls', 'total night minutes', 'total night calls', 'total intl minutes', 'total intl calls', 'customer service calls', 'churn', 'total charge'], dtype='object')

In [2]:
# Load a previously pickled model from disk.
# NOTE(review): pipeline() writes 'model.pickle' to the working directory, not '../../src/' - confirm this is the intended file.
model = read_pickle('../../src/model.pickle')

In [3]:
# Recreate the deterministic (random_state = 42) train/test split.
X_train, X_test, y_train, y_test = get_train_and_test_data()

In [4]:
# Score the loaded model's predictions on the test split.
metrics(y_test, model.predict(X_test))

{'Accuracy': '0.984',
 'Precision': '1.0',
 'Recall': '0.8881118881118881',
 'F1': '0.9407407407407408',
 'confusion_matrix': array([[857,   0],
        [ 16, 127]])}

In [5]:
# Score each fitted base estimator of the stack on the test split, in stack order.
estimator_metrics = [metrics(y_test, base_estimator.predict(X_test)) for base_estimator in model.estimators_]

In [6]:
# Display the per-estimator metric dictionaries.
estimator_metrics

[{'Accuracy': '0.983',
  'Precision': '1.0',
  'Recall': '0.8811188811188811',
  'F1': '0.9368029739776951',
  'confusion_matrix': array([[857,   0],
         [ 17, 126]])},
 {'Accuracy': '0.858',
  'Precision': '0.5116279069767442',
  'Recall': '0.15384615384615385',
  'F1': '0.23655913978494622',
  'confusion_matrix': array([[836,  21],
         [121,  22]])},
 {'Accuracy': '0.982',
  'Precision': '0.9844961240310077',
  'Recall': '0.8881118881118881',
  'F1': '0.9338235294117646',
  'confusion_matrix': array([[855,   2],
         [ 16, 127]])}]

In [7]:
# Display the fitted base estimators held by the stack.
model.estimators_

[RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                        criterion='gini', max_depth=None, max_features='auto',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=100,
                        n_jobs=None, oob_score=False, random_state=None,
                        verbose=0, warm_start=False),
 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                    warm_start=False),
 GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                            

In [18]:
#Import Libraries

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

#Load data
df = pd.read_csv('../../data/Customer Churn Data.csv')
#Turn International Plan from a categorical variable to binary (yes = 1, no = 0)
df['international plan'] = (df['international plan'] == 'yes').astype(int)
#Turn Voice Mail Plan from a categorical variable to binary (yes = 1, no = 0)
df['voice mail plan'] = (df['voice mail plan'] == 'yes').astype(int)
#Initiate OneHotEncoder (dense output so the result can go straight into a DataFrame)
#NOTE(review): newer scikit-learn releases replace `sparse` with `sparse_output` and
#`get_feature_names()` with `get_feature_names_out()` - confirm the installed version
ohe = OneHotEncoder(sparse = False)
#Create an ohe_states DF where the state column is split into one column per state
ohe_states = pd.DataFrame(ohe.fit_transform(pd.DataFrame(df['state'])), columns = ohe.get_feature_names())
#Combine the 2 dataframes
df = pd.concat([df, ohe_states], axis = 1)
#Drop state (it is now represented by the one-hot columns)
df = df.drop(['state'], axis = 1)

#Set target variable as churn
y = df['churn']
#Predictors: everything except the target, area code and phone number
X = df.drop(['churn', 'area code', 'phone number'], axis = 1)

#Seed shared by the splits and the stochastic estimators so a re-run reproduces the same model and scores
RANDOM_STATE = 42

#Split the initial data into train and holdout (holdout is for final evaluation)
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, random_state = RANDOM_STATE)
#Split train into a train and test set (to build your model)
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_train, y_train, random_state = RANDOM_STATE)

#Base estimators for the stack
estimators = [('knn', KNeighborsClassifier(n_neighbors = 20)),
              ('rf', RandomForestClassifier(n_estimators = 100, random_state = RANDOM_STATE)),
              ('grad', GradientBoostingClassifier(random_state = RANDOM_STATE))]

#Initiate a stack classifier

stack = StackingClassifier(estimators = estimators, final_estimator = LogisticRegression(), cv = 5)

#Fit the model to our sub-train data

stack.fit(X_train1, y_train1);

#Create a function that prints the scores 

def metrics(y_true, y_pred):
    '''
    Prints the accuracy, precision, recall and F1 scores of the predictions,
    one per line. Returns nothing.

    Note: this redefines the dictionary-returning metrics function defined
    earlier in the notebook.
    '''
    labelled_scorers = [('Accuracy', accuracy_score),
                        ('Precision', precision_score),
                        ('Recall', recall_score),
                        ('F1', f1_score)]
    for label, scorer in labelled_scorers:
        print(label + ': ' + str(scorer(y_true, y_pred)))
    
#Score the stack on the test split carved out of the training data
metrics(y_test1, stack.predict(X_test1))

Accuracy: 0.9472
Precision: 0.9552238805970149
Recall: 0.6808510638297872
F1: 0.7950310559006212


In [19]:
# Write the fitted stack to '../../src/base_model.pickle'.
pickler(stack, '../../src/base_model.pickle')