In [47]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import KFold
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

In [48]:
churn_data = pd.read_csv('/Users/amybrown/Thinkful/Capstone/Data/WA_Fn-UseC_-Telco-Customer-Churn.csv')
churn_data['SeniorCitizen'] = churn_data['SeniorCitizen'].astype(str)

In [49]:
churn_data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [50]:
df_sex = pd.get_dummies(churn_data['gender'])
df_partner = pd.get_dummies(churn_data['Partner'], prefix='Partner',prefix_sep=':')
df_depend = pd.get_dummies(churn_data['Dependents'], prefix='Dependent',prefix_sep=':')
df_phone = pd.get_dummies(churn_data['PhoneService'], prefix='Phone',prefix_sep=':')
df_lines = pd.get_dummies(churn_data['MultipleLines'], prefix='Multi-lines',prefix_sep=':')
df_internet = pd.get_dummies(churn_data['InternetService'], prefix='Internet', prefix_sep=':')
df_secure = pd.get_dummies(churn_data['OnlineSecurity'], prefix='Security', prefix_sep=':')
df_backup =  pd.get_dummies(churn_data['OnlineBackup'], prefix='Backup', prefix_sep=':')
df_protect = pd.get_dummies(churn_data['DeviceProtection'], prefix='Protection', prefix_sep=':')
df_support = pd.get_dummies(churn_data['TechSupport'], prefix='Support', prefix_sep=':')
df_streamtv = pd.get_dummies(churn_data['StreamingTV'], prefix='StreamTV', prefix_sep=':')
df_streammov = pd.get_dummies(churn_data['StreamingMovies'], prefix='StreamMov', prefix_sep=':')
df_contract = pd.get_dummies(churn_data['Contract'], prefix='Contract', prefix_sep=':')
df_billing = pd.get_dummies(churn_data['PaperlessBilling'], prefix='PaperlessBill', prefix_sep=':')
df_payment = pd.get_dummies(churn_data['PaymentMethod'], prefix='Method', prefix_sep=':')
df_churn = pd.get_dummies(churn_data['Churn'], prefix='Churn',prefix_sep=':')

In [51]:
churn_dummies = pd.concat([churn_data, df_sex, df_partner, df_depend, df_phone, df_lines, df_internet,
                           df_secure, df_backup, df_protect, df_support, df_streamtv, df_streammov, df_contract,
                           df_billing, df_payment, df_churn], axis=1)

In [52]:
churn_dummies['CountProducts'] = churn_dummies['Security:Yes'] + churn_dummies['Backup:Yes'] + churn_dummies['Protection:Yes'] + churn_dummies['Support:Yes'] + churn_dummies['StreamTV:Yes'] + churn_dummies['StreamMov:Yes'] 

In [53]:
churn_dummies['CountProducts'].value_counts()

0.0    2219
3.0    1118
2.0    1033
1.0     966
4.0     852
5.0     571
6.0     284
Name: CountProducts, dtype: int64

In [54]:
# outcome data
churn_outcome = churn_dummies['Churn:Yes']
y = np.where(churn_outcome == 1, 1, 0)

In [55]:
# features
churn_features = churn_dummies[['Female', 'SeniorCitizen', 'Partner:Yes', 'Dependent:Yes', 'tenure',
                                'Phone:Yes', 'Multi-lines:Yes', 'Multi-lines:No', 
                                'Internet:DSL', 'Internet:Fiber optic', 'CountProducts', 
                                'Contract:One year', 'Contract:Two year', 'PaperlessBill:Yes', 
                                'Method:Mailed check', 'Method:Bank transfer (automatic)', 
                                'Method:Credit card (automatic)']]

In [56]:
X = churn_features
X = churn_features.as_matrix().astype(np.float)
scale = StandardScaler()
X = scale.fit_transform(X)

In [57]:
""" 0 rate classifier """

y_true = y

all_churn_df = churn_dummies
all_churn_df['all_churn'] = 1.0
all_churn = all_churn_df['all_churn']
y_pred = np.where(all_churn == 1, 1, 0)

In [58]:
""" Models """

from sklearn.linear_model import LogisticRegressionCV as LR
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.ensemble import RandomForestClassifier as RF

classifiers = (LR,DT,RF)

In [59]:
"""Hyperparameter Optimization"""

score_metric = 'accuracy'

def optimization(classifier):
    """This function is designed to assign a parameter grid to each of the classification models and 
       obtain information about their performance."""
    if classifier == LR:
        param_grid = {'class_weight': ['balanced'], 'solver': ['liblinear', 'sag'], 'cv': [5], 'refit': ['True', 'False']}
    if classifier == DT:
        param_grid = {'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random'], 'class_weight': ['balanced']}
    if classifier == RF:
        param_grid = {'n_estimators': [10,20,30], 'criterion': ['gini', 'entropy'], 'bootstrap': ['True']}
    print(str(classifier))
    print('Number of tested models: %i' % np.prod([len(param_grid[element]) for element in param_grid]))
    search = GridSearchCV(classifier(), param_grid, cv=10)
    search.fit(X,y)
    print('Best parameters: %s' % search.best_params_)
    print('Best score: ' + str(search.best_score_))
    
# note to self: I need to do research on these hyperparameters, or maybe I just need to throw all parameters in there 
# and see what happens. I just threw these in there to obtain sort of a proof of concept. 

# also, I would like to figure out a way to have this function output a dictionary of the best parameters for
# use in the model fitting stage. Maybe not a dictionary, would need it to output as a series of key-word args?
    
for i in classifiers:
    optimization(i) # want to clean up what gets output here

<class 'sklearn.linear_model.logistic.LogisticRegressionCV'>
Number of tested models: 4
Best parameters: {'cv': 5, 'solver': 'sag', 'refit': 'True', 'class_weight': 'balanced'}
Best score: 0.741871361636
<class 'sklearn.tree.tree.DecisionTreeClassifier'>
Number of tested models: 4
Best parameters: {'splitter': 'best', 'class_weight': 'balanced', 'criterion': 'entropy'}
Best score: 0.732784324862
<class 'sklearn.ensemble.forest.RandomForestClassifier'>
Number of tested models: 6
Best parameters: {'bootstrap': 'True', 'n_estimators': 30, 'criterion': 'entropy'}
Best score: 0.776231719438


In [60]:
""" cross validation """

def run_cv(X,y,clf_class,**kwargs):
    kf = KFold(len(y),n_folds=5,shuffle=True, random_state=42)
    y_pred = y.copy()

    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        clf = clf_class(**kwargs)
        clf.fit(X_train, y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred

In [61]:
# I don't think this is the best measure of model fit. Use confusion matrices below to obtain false negative rate.
def avg_correct(y_true, y_pred):
    return np.mean(y_true == y_pred)

In [62]:
print("Zero Rate:")
print("%.3f" % avg_correct(y_true, y_pred))

print('Logistic Regression:')
print("%.3f" % avg_correct(y, run_cv(X,y,LR,refit=True, solver='sag', class_weight='balanced',cv=5)))

print('Decision Tree:')
print("%.3f" % avg_correct(y, run_cv(X,y,DT,splitter='best',criterion='entropy',class_weight='balanced')))

print('Random Forest:')
print("%.3f" % avg_correct(y, run_cv(X,y,RF,n_estimators=10, criterion='entropy',bootstrap=True)))

Zero Rate:
0.265
Logistic Regression:
0.741
Decision Tree:
0.726
Random Forest:
0.773


In [63]:
# Confusion Matrices
y = np.array(y)
class_names = np.unique(y)

In [64]:
zr_cm = confusion_matrix(y_true, y_pred)
print('Zero Rate Classifier Confusion Matrix:' + '\n' + str(zr_cm))

lr_cm = confusion_matrix(y,run_cv(X,y,LR))
print('Logistic Regression Classifier Confusion Matrix:' + '\n' + str(lr_cm))

dt_cm = confusion_matrix(y,run_cv(X,y,DT))
print('Decision Tree Classifier Confusion Matrix:' + '\n' + str(zr_cm))

rf_cm = confusion_matrix(y,run_cv(X,y,DT))
print('Random Forest Classifier Confusion Matrix:' + '\n' + str(rf_cm))

# I think I need to make a dictionary with the model as key and any sort of text to annotate blocks as values
# so I can just reference that and run this in a loop

Zero Rate Classifier Confusion Matrix:
[[   0 5174]
 [   0 1869]]
Logistic Regression Classifier Confusion Matrix:
[[4662  512]
 [ 882  987]]
Decision Tree Classifier Confusion Matrix:
[[   0 5174]
 [   0 1869]]
Random Forest Classifier Confusion Matrix:
[[4192  982]
 [ 995  874]]


In [65]:
def false_neg(confusion_matrix):
    false_negs = confusion_matrix[0,1]
    true_pos = confusion_matrix[0,0]
    denom = false_negs + true_pos
    false_negative_rate = false_negs/denom
    print(false_negative_rate)

In [66]:
cm_list = zr_cm, lr_cm, dt_cm, rf_cm

for x in cm_list:
    false_neg(x)

1.0
0.0989563200618
0.191921144182
0.189795129494
