In [1]:
#the basics
import numpy as np
import pandas as pd

#modeling tools
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, confusion_matrix, f1_score

#My modules
import wrangle
import utils

### Acquire Data

In [2]:
#Get the data
df = wrangle.getData('telco')
#Prep and split into 70/15/15
tr, te, val = wrangle.prep_telco(df,val_ratio=.15,test_ratio=.15)

In [3]:
tr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4922 entries, 828 to 4434
Data columns (total 47 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   gender                                 4922 non-null   object 
 1   senior_citizen                         4922 non-null   int64  
 2   partner                                4922 non-null   object 
 3   dependents                             4922 non-null   object 
 4   tenure                                 4922 non-null   int64  
 5   phone_service                          4922 non-null   object 
 6   multiple_lines                         4922 non-null   object 
 7   online_security                        4922 non-null   object 
 8   online_backup                          4922 non-null   object 
 9   device_protection                      4922 non-null   object 
 10  tech_support                           4922 non-null   object 
 11  st

In [4]:
#Split my encoded data in X and target into y

#Drop all object columns and has_churned
X_tr = tr.select_dtypes(exclude=object).drop(columns='has_churned')
y_tr = tr.churn

X_te = te.select_dtypes(exclude=object).drop(columns='has_churned')
y_te = te.churn

X_val = val.select_dtypes(exclude=object).drop(columns='has_churned')
y_val = val.churn

### Begin modeling:
- want to try decision trees, random forest, and knn
- will be wary of 75/25 split of target variable as it may adversley affect knn
- If logistic regression, want to consider some of the following columns:
  - Tenure
  - contract_type
  - internet_service_type
  - monthly_charges
  - payment_type


#### Set up prediction result dataframe

In [5]:
#Add actual values and rename column
tr_res = pd.DataFrame(tr.churn)
tr_res.rename(columns={'churn':'actual'},inplace=True)

#Create a baseline column from mode of churn
tr_res['baseline'] = tr.churn.mode()[0]

#validate
tr_res.describe().T

Unnamed: 0,count,unique,top,freq
actual,4922,2,No,3613
baseline,4922,1,No,4922


### Decision Tree Classifiers
- will try a max_depth of 2 to 7

In [6]:
#create models dictionary
model_dict = {}

#Loop over max depth values
for md in range(2,8):
    #generate model name
    model_name = 'dt_md' + str(md)
    
    #define decision tree params
    clf = DecisionTreeClassifier(max_depth=md,random_state=88)
    #fit training data
    clf = clf.fit(X_tr,y_tr)
    
    #Generate train predictions and store to result dataframe
    tr_res[model_name] = clf.predict(X_tr)
    
    #Store model information in a dictionary
    model_dict[model_name] = {
        "model_name": model_name,
        "model": clf,
        "model_type": "DecisionTreeClassifier",
        "max_depth":md
    }

### Random Forest
- Our train dataset has ~5000 values.  I think I am less concerned about 
- If it is randomly choosing the features in the tree...I don't want a low max depth as I have lots of attributes....so really I want to put a smaller subset of features through t

In [7]:
#Create parameter list for model generation [max_depth,n_estimators]
params = [[5,250],[10,150],[15,100]] 

#Loop over our desired parameter combinations and create, fit and predict model for each
for i in params:
    #grab parameters
    md = i[0]
    ne = i[1]
    #create model name
    model_name = 'rf_md'+ str(md) +'_ne'+ str(ne)
    
    #create model & fit
    rf = RandomForestClassifier(max_depth=md,n_estimators=ne,random_state=88)
    rf = rf.fit(X_tr,y_tr)
    
    #Get predictions and store to train results - SAME DF FOR ALL TITANIC    
    tr_res[model_name] = rf.predict(X_tr)
    
    #Now store everything in our dictionary
    model_dict[model_name] = {
        "model_name": model_name,
        "model":rf,
        "model_type": "RandomForestClassifier",
        "max_depth":md,
        "n_estimators": ne
    }

### K-Nearest Neighbors
- Be wary of split of target outcomes
- will use low # neighbors for distance calculation and mid (5,10) for uniform

In [8]:
#Create parameter list for model generation [n_neighbors,weights]
u='uniform'
d='distance'
params = [[3,d],[5,d],[5,u],[10,u]] 

#Loop over our desired parameter combinations and create, fit and predict model for each
for i in params:
    #grab parameters
    nn = i[0]
    w = i[1]
    #create model name
    model_name = 'knn_n'+str(nn)+'_w'+str(w)[0].upper()
    
    #create model & fit
    knn = KNeighborsClassifier(n_neighbors=nn,weights=w)
    knn = knn.fit(X_tr,y_tr)
    
    #Get predictions and store to train results - SAME DF FOR ALL TITANIC    
    tr_res[model_name] = knn.predict(X_tr)
    
    #Now store everything in our dictionary
    model_dict[model_name] = {
        "model_name": model_name,
        "model":knn,
        "model_type": "KNeighborsClassifier",
        "n_neighbors":nn,
        "weight":w
    }

### Model Evaluation

In [9]:
#Positive is churn = Yes
pos = 'Yes'

#Run model outcomes through stats evaluation and store results in dataframe
#create model performance df
mod_perf = pd.DataFrame()

#loop over columns (models) in training results df
for model in tr_res.columns:
    #skip actual
    if model=='actual': continue
    #get model stats in df, don't print
    df = utils.get_model_stats(tr_res.actual,tr_res[model],pos,to_screen=False,ret_df=True)
    #concat together
    mod_perf = pd.concat([mod_perf,df], axis=0)
        

We are most interested in accurately identifying people who will churn (True positives).  We are most worried about missing those who would churn (False Negatives), but at the same time we don't want to think too many people are churning that aren't (False Positives).

If we will use this for marketing, we want to minimize False Negatives while False Positives aren't that big of a deal:
- Target a high sensitivity/recall, can tolerate lower precision

If we will use this for promotions, we want to minimize False Negatives and False Positives
- Target a high recall, with decent F1 score and accuracy

In [10]:
mod_perf

Unnamed: 0,Accuracy,precision,recall,F1,FNR,FPR,support_pos,support_neg,TP,FP,FN,TN
baseline,0.734051,0.0,0.0,0.0,1.0,0.0,1309,3613,0,0,1309,3613
dt_md2,0.790532,0.675949,0.407945,0.508814,0.592055,0.070855,1309,3613,534,256,775,3357
dt_md3,0.790532,0.675949,0.407945,0.508814,0.592055,0.070855,1309,3613,534,256,775,3357
dt_md4,0.791751,0.637864,0.50191,0.561779,0.49809,0.103238,1309,3613,657,373,652,3240
dt_md5,0.798862,0.707953,0.41482,0.523121,0.58518,0.061998,1309,3613,543,224,766,3389
dt_md6,0.811459,0.661031,0.597403,0.627608,0.402597,0.110988,1309,3613,782,401,527,3212
dt_md7,0.823649,0.678254,0.640947,0.659073,0.359053,0.110158,1309,3613,839,398,470,3215
rf_md5_ne250,0.79805,0.698113,0.423988,0.527567,0.576012,0.066427,1309,3613,555,240,754,3373
rf_md10_ne150,0.869159,0.818792,0.652406,0.72619,0.347594,0.052311,1309,3613,854,189,455,3424
rf_md15_ne100,0.992686,0.983295,0.989305,0.986291,0.010695,0.006089,1309,3613,1295,22,14,3591


Looking at the above:
- Decision Trees:
  - depths of 2 and 3 have the exact same results.
  - depth 4 has better recall than 2,3 and 5
  - depth 6 and 7 show improvements, however they are likely overfitting
  - RUN max depth 4 past validate
- Random Forest:
  - max depth of 15, with 100 estimators shows very high performance and is likely overfitting
  - max depth of 10 with 150 estimators shows better results than the decision trees
  - max depth 5, with 250 estimators has poor recall.  Low depth probably insufficient for this quantity of features
  - RUN max depth 10, with 150 estimators past validate
- K-Nearest Neighbors:
  - Distance weights have same results, and really high performance. Train set may just align better for this
  - Uniform weights show poorer results with more neighbors, likely due to outcome ratios in target variable
  - RUN distance weight, w/ 3 neighbors and uniform weight with 5 neighbors
    - distance w/ less neighbors will be less process intensive for likely the same results as more neighbors
    - uniform w/ less neighbors may be sufficient >> more neighbors likely to be offset by portion of outcomes   
 

In [60]:
#Given the above notes, here's a subset of model names to run past validate
val_model_list = ['dt_md4', 'rf_md10_ne150', 'knn_n3_wD', 'knn_n5_wU']

#create val results dataframe:  in this case, columns are models
val_res = pd.DataFrame(y_val)
val_res.rename(columns={'churn':'actual'},inplace=True)
#add baseline
val_res['baseline']=val.churn.mode()[0]

#create new model performance dataframe
mod_perf_comp = pd.DataFrame()

#Loop over models:
for model_name in val_model_list:    
    #new indices to use
    ind_val = model_name + "_val"
    ind_diff = model_name + "_diff"

    #grab model from dictionary:
    clf = model_dict[model_name]['model']
    
    #generate prediction on val dataset - adds new column
    val_res[model_name] = clf.predict(X_val)
    
    #calculate val model stats - df where rows are models, stats are columns
    v_perf = utils.get_model_stats(val_res.actual,val_res[model_name],pos,to_screen=False,ret_df=True)
    #rename index to represent validate results
    v_perf.rename(index={model_name:ind_val},inplace=True)
    #add another column with difference between val and train
    v_perf.loc[ind_diff] = v_perf.loc[ind_val,'Accuracy':'FPR'] - mod_perf.loc[model_name,'Accuracy':'FPR']

    #store train, validate model stats and difference in new dataframe
    mod_perf_comp = pd.concat([mod_perf_comp,mod_perf[mod_perf.index == model_name],v_perf],axis=0)


In [70]:
for i in range(0,4):
    rs = i*3
    print(mod_perf_comp.iloc[rs:rs+3,0:4])
    print('\n')
    
# mod_perf_comp[['Accuracy','recall','F1']]

             Accuracy  precision    recall        F1
dt_md4       0.791751   0.637864  0.501910  0.561779
dt_md4_val   0.796209   0.640693  0.528571  0.579256
dt_md4_diff  0.004457   0.002829  0.026662  0.017478


                    Accuracy  precision    recall        F1
rf_md10_ne150       0.869159   0.818792  0.652406  0.726190
rf_md10_ne150_val   0.799052   0.661905  0.496429  0.567347
rf_md10_ne150_diff -0.070107  -0.156887 -0.155978 -0.158844


                Accuracy  precision    recall        F1
knn_n3_wD       0.997968   0.999231  0.993125  0.996169
knn_n3_wD_val   0.728910   0.487395  0.414286  0.447876
knn_n3_wD_diff -0.269058  -0.511836 -0.578839 -0.548292


                Accuracy  precision    recall        F1
knn_n5_wU       0.833401   0.745236  0.567609  0.644406
knn_n5_wU_val   0.750711   0.538813  0.421429  0.472946
knn_n5_wU_diff -0.082690  -0.206423 -0.146180 -0.171460




### Model Evaluation con't

- The decision tree performed well against the validation subset, seeing a slight increase in recall and accuracy.
- The random forest model saw a major decrease in performance both on accuracy and recall.
- Both knns did poorly on the validation set.  the distance weight saw the recal cut in half, while the uniform weight saw a smaller, but notable decrease in recall and accuracy.



In [71]:
import inspect

def stats_result(p,null_h,**kwargs):
    """
    Compares p value to alpha and outputs whether or not the null hypothesis
    is rejected or if it failed to be rejected.
    DOES NOT HANDLE 1-TAILED T TESTS
    
    Required inputs:  p, null_h (str)
    Optional inputs: alpha (default = .05), chi2, r, t
    
    """
    #get r value if passed, else none
    t=kwargs.get('t',None)
    r=kwargs.get('r',None)
    chi2=kwargs.get('chi2',None)
    alpha=kwargs.get('alpha',.05) #default value of alpha is .05
    print(f'\n\033[1mH\u2080:\033[0m {null_h}')
    
    if p < alpha: print(f"\033[1mWe reject the null hypothesis\033[0m, p = {p} | α = {alpha}")
    else: print(f"We failed to reject the null hypothesis, p = {p} | α = {alpha}")
    
    if 't' in kwargs: print(f'  t: {t}')
    if 'r' in kwargs: print(f'  r: {r}')
    if 'chi2' in kwargs: print(f'  chi2: {chi2}')

    return None

