In [1]:
# import the usual libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

# import skleand funtions needed for this project 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score, precision_score, recall_score

#Turn off warning 
import warnings
warnings.filterwarnings("ignore")

#import local libraries
import prepare
import acquire
import env

In [2]:
df= acquire.get_telco_data()
df.head()

Reading from csv file...


Unnamed: 0,payment_type_id,contract_type_id,internet_service_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,internet_service_type,contract_type,payment_type
0,2,2,1,0002-ORFBO,Female,0,Yes,Yes,9,Yes,...,Yes,Yes,No,Yes,65.6,593.3,No,DSL,One year,Mailed check
1,2,1,1,0003-MKNFE,Male,0,No,No,9,Yes,...,No,No,Yes,No,59.9,542.4,No,DSL,Month-to-month,Mailed check
2,1,1,2,0004-TLHLJ,Male,0,No,No,4,Yes,...,No,No,No,Yes,73.9,280.85,Yes,Fiber optic,Month-to-month,Electronic check
3,1,1,2,0011-IGKFF,Male,1,Yes,No,13,Yes,...,No,Yes,Yes,Yes,98.0,1237.85,Yes,Fiber optic,Month-to-month,Electronic check
4,2,1,2,0013-EXCHZ,Female,1,Yes,No,3,Yes,...,Yes,Yes,No,Yes,83.9,267.4,Yes,Fiber optic,Month-to-month,Mailed check


In [3]:
df = prepare.prep_telco(df)
df.head()

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,...,contract_type_month-to-month,contract_type_one_year,contract_type_two_year,internet_service_type_dsl,internet_service_type_fiber_optic,internet_service_type_none,payment_type_bank_transfer_automatic),payment_type_credit_card_automatic),payment_type_electronic_check,payment_type_mailed_check
0,Female,0,Yes,Yes,9,Yes,No,No,Yes,No,...,0,1,0,1,0,0,0,0,0,1
1,Male,0,No,No,9,Yes,Yes,No,No,No,...,1,0,0,1,0,0,0,0,0,1
2,Male,0,No,No,4,Yes,No,No,No,Yes,...,1,0,0,0,1,0,0,0,1,0
3,Male,1,Yes,No,13,Yes,No,No,Yes,Yes,...,1,0,0,0,1,0,0,0,1,0
4,Female,1,Yes,No,3,Yes,No,No,No,No,...,1,0,0,0,1,0,0,0,0,1


In [4]:
obj_cols = df.columns[[df[col].dtype == 'O' for col in df.columns]]
obj_cols

Index(['gender', 'partner', 'dependents', 'phone_service', 'multiple_lines',
       'online_security', 'online_backup', 'device_protection', 'tech_support',
       'streaming_tv', 'streaming_movies', 'paperless_billing', 'churn',
       'internet_service_type', 'contract_type', 'payment_type'],
      dtype='object')

In [5]:
df.drop(columns= obj_cols, inplace=True)

In [6]:
train, validate, test = prepare.split_telco_data(df)
train.shape, validate.shape, test.shape

((3937, 41), (1688, 41), (1407, 41))

In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3937 entries, 1392 to 414
Data columns (total 41 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   senior_citizen                         3937 non-null   int64  
 1   tenure                                 3937 non-null   int64  
 2   monthly_charges                        3937 non-null   float64
 3   total_charges                          3937 non-null   float64
 4   gender_encoded                         3937 non-null   int64  
 5   partner_encoded                        3937 non-null   int64  
 6   dependents_encoded                     3937 non-null   int64  
 7   phone_service_encoded                  3937 non-null   int64  
 8   paperless_billing_encoded              3937 non-null   int64  
 9   churn_encoded                          3937 non-null   int64  
 10  multiple_lines_no                      3937 non-null   uint8  
 11  mu

In [8]:
x_train = train.drop(columns=['churn_encoded'])
y_train = train.churn_encoded

x_validate = validate.drop(columns=['churn_encoded'])
y_validate = validate.churn_encoded

x_test = test.drop(columns=['churn_encoded'])
y_test = test.churn_encoded

In [9]:
baseline = y_train.mode()[0]
baseline

0

In [53]:
baseline_accuracy = (train.churn_encoded==0).mean()
baseline_accuracy
print(f'Baseline accuracy: {baseline_accuracy:.4%}')

Baseline accuracy: 73.4315%


## Decission Tree Classifier

In [11]:
for i in range(1, 16):
    # Make the thing
    tree = DecisionTreeClassifier(max_depth=i, random_state=177)

    # Fit the thing
    tree = tree.fit(x_train, y_train)

    # Use the thing
    y_predictions = tree.predict(x_train)

    report = classification_report(y_train, y_predictions, output_dict=True)
    print(f"Tree with max depth of {i}")
    print(pd.DataFrame(report))
    print('______________________________________________________________________')

Tree with max depth of 1
                     0       1  accuracy    macro avg  weighted avg
precision     0.734315     0.0  0.734315     0.367158      0.539219
recall        1.000000     0.0  0.734315     0.500000      0.734315
f1-score      0.846807     0.0  0.734315     0.423404      0.621824
support    2891.000000  1046.0  0.734315  3937.000000   3937.000000
______________________________________________________________________
Tree with max depth of 2
                     0            1  accuracy    macro avg  weighted avg
precision     0.849002     0.532995  0.754128     0.690998      0.765044
recall        0.809063     0.602294  0.754128     0.705679      0.754128
f1-score      0.828551     0.565530  0.754128     0.697040      0.758670
support    2891.000000  1046.000000  0.754128  3937.000000   3937.000000
______________________________________________________________________
Tree with max depth of 3
                     0            1  accuracy    macro avg  weighted avg
preci

In [55]:
metrics = []

for i in range(1, 16):
    tree = DecisionTreeClassifier(max_depth=i, random_state=177)
    tree = tree.fit(x_train, y_train)
    in_sample_accuracy = tree.score(x_train, y_train)
    out_of_sample_accuracy = tree.score(x_validate, y_validate)
    output = {
        "max_depth": i,
        "train_accuracy": in_sample_accuracy,
        "validate_accuracy": out_of_sample_accuracy
    }
    metrics.append(output)
    
df = pd.DataFrame(metrics)
df["difference"] = df.train_accuracy - df.validate_accuracy
df

Unnamed: 0,max_depth,train_accuracy,validate_accuracy,difference
0,1,0.734315,0.734005,0.000311
1,2,0.754128,0.783768,-0.02964
2,3,0.788672,0.799763,-0.011091
3,4,0.797308,0.793246,0.004061
4,5,0.801118,0.787322,0.013795
5,6,0.811786,0.798578,0.013207
6,7,0.824486,0.792062,0.032424
7,8,0.843282,0.774289,0.068993
8,9,0.869444,0.763033,0.106411
9,10,0.892304,0.768957,0.123346


## Random Forest Classifier

In [47]:
for i in range(1, 11):
    # Make the model
    forest = RandomForestClassifier(max_depth=i, random_state=177)

    # Fit the model (on train and only train)
    forest = forest.fit(x_train, y_train)

    # Use the model
    # We'll evaluate the model's performance on train, first
    y_predictions = forest.predict(x_train)

    # Produce the classification report on the actual y values and this model's predicted y values
    report = classification_report(y_train, y_predictions, output_dict=True)
    print(f"Tree with max depth of {i}")
    print(pd.DataFrame(report))
    print()

Tree with max depth of 1
                     0       1  accuracy    macro avg  weighted avg
precision     0.734315     0.0  0.734315     0.367158      0.539219
recall        1.000000     0.0  0.734315     0.500000      0.734315
f1-score      0.846807     0.0  0.734315     0.423404      0.621824
support    2891.000000  1046.0  0.734315  3937.000000   3937.000000

Tree with max depth of 2
                     0       1  accuracy    macro avg  weighted avg
precision     0.734315     0.0  0.734315     0.367158      0.539219
recall        1.000000     0.0  0.734315     0.500000      0.734315
f1-score      0.846807     0.0  0.734315     0.423404      0.621824
support    2891.000000  1046.0  0.734315  3937.000000   3937.000000

Tree with max depth of 3
                     0            1  accuracy    macro avg  weighted avg
precision     0.792426     0.745882  0.787402     0.769154      0.780060
recall        0.962643     0.303059  0.787402     0.632851      0.787402
f1-score      0.869280  

In [49]:
metrics = []

for i in range(1, 25):
    # Make the model
    forest = RandomForestClassifier(max_depth=i, random_state=123)

    # Fit the model (on train and only train)
    forest = forest.fit(x_train, y_train)

    # Use the model
    # We'll evaluate the model's performance on train, first
    in_sample_accuracy = forest.score(x_train, y_train)
    
    out_of_sample_accuracy = forest.score(x_validate, y_validate)

    output = {
        "max_depth": i,
        "train_accuracy": in_sample_accuracy,
        "validate_accuracy": out_of_sample_accuracy
    }
    
    metrics.append(output)
    
df = pd.DataFrame(metrics)
df["difference"] = df.train_accuracy - df.validate_accuracy
df

Unnamed: 0,max_depth,train_accuracy,validate_accuracy,difference
0,1,0.734315,0.734005,0.000311
1,2,0.752096,0.763626,-0.01153
2,3,0.786894,0.795616,-0.008723
3,4,0.796292,0.804502,-0.008211
4,5,0.802388,0.800948,0.00144
5,6,0.81077,0.797986,0.012784
6,7,0.824486,0.798578,0.025907
7,8,0.839218,0.806872,0.032346
8,9,0.86284,0.807464,0.055375
9,10,0.895606,0.803318,0.092288


In [51]:
metrics = []
max_depth = 10

for i in range(1, max_depth):
    # Make the thing
    depth = max_depth -i
    n_samples = i
    forest = RandomForestClassifier(max_depth=depth, min_samples_leaf=n_samples, random_state=1349)

    # Fit the thing
    forest = forest.fit(x_train, y_train)

    # Use the thing
    sample_accuracy = forest.score(x_train, y_train)
    
    out_of_sample_accuracy = forest.score(x_validate, y_validate)

    output = {
        "min_samples_per_leaf": n_samples,
        "max_depth": depth,
        "train_accuracy": sample_accuracy,
        "validate_accuracy": out_of_sample_accuracy
    }
    
    metrics.append(output)
    
df = pd.DataFrame(metrics)
df["difference"] = df.train_accuracy - df.validate_accuracy
df

Unnamed: 0,min_samples_per_leaf,max_depth,train_accuracy,validate_accuracy,difference
0,1,9,0.859284,0.805687,0.053597
1,2,8,0.835408,0.807464,0.027943
2,3,7,0.822708,0.80391,0.018798
3,4,6,0.815342,0.798578,0.016763
4,5,5,0.803912,0.80391,2e-06
5,6,4,0.797308,0.80154,-0.004233
6,7,3,0.790196,0.793839,-0.003643
7,8,2,0.75743,0.768957,-0.011528
8,9,1,0.734315,0.734005,0.000311


## K Neighbors Classifier

In [31]:
knc = KNeighborsClassifier(20)
knc.fit(x_train, y_train)
y_predictions = knc.predict(x_train)

mod_score = knc.score(x_train, y_train)
con_matrix = pd.DataFrame(confusion_matrix(y_train, y_predictions))
class_report = classification_report(y_train, y_predictions)

print(f'Accuracy using Model Score: {mod_score:.2%}')
print(f'Using Confusion Matrix:\n{con_matrix}')
print(f'Class report:\n{class_report}')

tn = con_matrix.loc[0,0]
fn = con_matrix.loc[1,0]
fp = con_matrix.loc[0,1]
tp = con_matrix.loc[1,1]
all = tp+fp+fn+tn
print(f'True Positive(tp): {tp} \nFalse Positive(fp): {fp} \nFalse Negative(fn): {fn} \nTrue Negative(tn): {tn}')
accuracy = (tp + tn)/all
print(f"Accuracy: {accuracy:.4}")
true_positive_rate = tp/(tp+fn)
print(f"True Positive Rate: {true_positive_rate:.4}")
false_positive_rate = fp/(fp+tn)
print(f"False Positive Rate: {false_positive_rate:.4}")
true_negative_rate = tn/(tn+fp)
print(f"True Negative Rate: {true_negative_rate:.4}")
false_negative_rate = fn/(fn+tp)
print(f"False Negative Rate: {false_negative_rate:.4}")
precision = tp/(tp+fp)
print(f"Precision: {precision:.4}")
recall = tp/(tp+fn)
print(f"Recall: {recall:.4}")
f1_score = 2*(precision*recall)/(precision+recall)
print(f"F1 Score: {f1_score:.4}")
support_pos = tp+fn
print(f"Support (0): {support_pos}")
support_neg = fp+tn
print(f"Support (1): {support_neg}")

Accuracy using Model Score: 79.50%
Using Confusion Matrix:
      0    1
0  2740  151
1   656  390
Class report:
              precision    recall  f1-score   support

           0       0.81      0.95      0.87      2891
           1       0.72      0.37      0.49      1046

    accuracy                           0.80      3937
   macro avg       0.76      0.66      0.68      3937
weighted avg       0.78      0.80      0.77      3937

True Positive(tp): 390 
False Positive(fp): 151 
False Negative(fn): 656 
True Negative(tn): 2740
Accuracy: 0.795
True Positive Rate: 0.3728
False Positive Rate: 0.05223
True Negative Rate: 0.9478
False Negative Rate: 0.6272
Precision: 0.7209
Recall: 0.3728
F1 Score: 0.4915
Support (0): 1046
Support (1): 2891


In [32]:
y_predictions = knc.predict(x_validate)
mod_score = knc.score(x_validate, y_validate)
con_matrix = pd.DataFrame(confusion_matrix(y_validate, y_predictions))
class_report = classification_report(y_validate, y_predictions)

print(f'Accuracy using Model Score: {mod_score:.2%}')
print(f'Using Confusion Matrix:\n{con_matrix}')
print(f'Class report:\n{class_report}')

tn = con_matrix.loc[0,0]
fn = con_matrix.loc[1,0]
fp = con_matrix.loc[0,1]
tp = con_matrix.loc[1,1]
all = tp+fp+fn+tn
print(f'True Positive(tp): {tp} \nFalse Positive(fp): {fp} \nFalse Negative(fn): {fn} \nTrue Negative(tn): {tn}')
accuracy = (tp + tn)/all
print(f"Accuracy: {accuracy:.4}")
true_positive_rate = tp/(tp+fn)
print(f"True Positive Rate: {true_positive_rate:.4}")
false_positive_rate = fp/(fp+tn)
print(f"False Positive Rate: {false_positive_rate:.4}")
true_negative_rate = tn/(tn+fp)
print(f"True Negative Rate: {true_negative_rate:.4}")
false_negative_rate = fn/(fn+tp)
print(f"False Negative Rate: {false_negative_rate:.4}")
precision = tp/(tp+fp)
print(f"Precision: {precision:.4}")
recall = tp/(tp+fn)
print(f"Recall: {recall:.4}")
f1_score = 2*(precision*recall)/(precision+recall)
print(f"F1 Score: {f1_score:.4}")
support_pos = tp+fn
print(f"Support (0): {support_pos}")
support_neg = fp+tn
print(f"Support (1): {support_neg}")

Accuracy using Model Score: 78.08%
Using Confusion Matrix:
      0    1
0  1161   78
1   292  157
Class report:
              precision    recall  f1-score   support

           0       0.80      0.94      0.86      1239
           1       0.67      0.35      0.46       449

    accuracy                           0.78      1688
   macro avg       0.73      0.64      0.66      1688
weighted avg       0.76      0.78      0.76      1688

True Positive(tp): 157 
False Positive(fp): 78 
False Negative(fn): 292 
True Negative(tn): 1161
Accuracy: 0.7808
True Positive Rate: 0.3497
False Positive Rate: 0.06295
True Negative Rate: 0.937
False Negative Rate: 0.6503
Precision: 0.6681
Recall: 0.3497
F1 Score: 0.4591
Support (0): 449
Support (1): 1239


In [33]:
knc = KNeighborsClassifier(10)
knc.fit(x_train, y_train)
y_predictions = knc.predict(x_train)

mod_score = knc.score(x_train, y_train)
con_matrix = pd.DataFrame(confusion_matrix(y_train, y_predictions))
class_report = classification_report(y_train, y_predictions)

print(f'Accuracy using Model Score: {mod_score:.2%}')
print(f'Using Confusion Matrix:\n{con_matrix}')
print(f'Class report:\n{class_report}')

tn = con_matrix.loc[0,0]
fn = con_matrix.loc[1,0]
fp = con_matrix.loc[0,1]
tp = con_matrix.loc[1,1]
all = tp+fp+fn+tn
print(f'True Positive(tp): {tp} \nFalse Positive(fp): {fp} \nFalse Negative(fn): {fn} \nTrue Negative(tn): {tn}')
accuracy = (tp + tn)/all
print(f"Accuracy: {accuracy:.4}")
true_positive_rate = tp/(tp+fn)
print(f"True Positive Rate: {true_positive_rate:.4}")
false_positive_rate = fp/(fp+tn)
print(f"False Positive Rate: {false_positive_rate:.4}")
true_negative_rate = tn/(tn+fp)
print(f"True Negative Rate: {true_negative_rate:.4}")
false_negative_rate = fn/(fn+tp)
print(f"False Negative Rate: {false_negative_rate:.4}")
precision = tp/(tp+fp)
print(f"Precision: {precision:.4}")
recall = tp/(tp+fn)
print(f"Recall: {recall:.4}")
f1_score = 2*(precision*recall)/(precision+recall)
print(f"F1 Score: {f1_score:.4}")
support_pos = tp+fn
print(f"Support (0): {support_pos}")
support_neg = fp+tn
print(f"Support (1): {support_neg}")

Accuracy using Model Score: 80.90%
Using Confusion Matrix:
      0    1
0  2735  156
1   596  450
Class report:
              precision    recall  f1-score   support

           0       0.82      0.95      0.88      2891
           1       0.74      0.43      0.54      1046

    accuracy                           0.81      3937
   macro avg       0.78      0.69      0.71      3937
weighted avg       0.80      0.81      0.79      3937

True Positive(tp): 450 
False Positive(fp): 156 
False Negative(fn): 596 
True Negative(tn): 2735
Accuracy: 0.809
True Positive Rate: 0.4302
False Positive Rate: 0.05396
True Negative Rate: 0.946
False Negative Rate: 0.5698
Precision: 0.7426
Recall: 0.4302
F1 Score: 0.5448
Support (0): 1046
Support (1): 2891


In [34]:
y_predictions = knc.predict(x_validate)
mod_score = knc.score(x_validate, y_validate)
con_matrix = pd.DataFrame(confusion_matrix(y_validate, y_predictions))
class_report = classification_report(y_validate, y_predictions)

print(f'Accuracy using Model Score: {mod_score:.2%}')
print(f'Using Confusion Matrix:\n{con_matrix}')
print(f'Class report:\n{class_report}')

tn = con_matrix.loc[0,0]
fn = con_matrix.loc[1,0]
fp = con_matrix.loc[0,1]
tp = con_matrix.loc[1,1]
all = tp+fp+fn+tn
print(f'True Positive(tp): {tp} \nFalse Positive(fp): {fp} \nFalse Negative(fn): {fn} \nTrue Negative(tn): {tn}')
accuracy = (tp + tn)/all
print(f"Accuracy: {accuracy:.4}")
true_positive_rate = tp/(tp+fn)
print(f"True Positive Rate: {true_positive_rate:.4}")
false_positive_rate = fp/(fp+tn)
print(f"False Positive Rate: {false_positive_rate:.4}")
true_negative_rate = tn/(tn+fp)
print(f"True Negative Rate: {true_negative_rate:.4}")
false_negative_rate = fn/(fn+tp)
print(f"False Negative Rate: {false_negative_rate:.4}")
precision = tp/(tp+fp)
print(f"Precision: {precision:.4}")
recall = tp/(tp+fn)
print(f"Recall: {recall:.4}")
f1_score = 2*(precision*recall)/(precision+recall)
print(f"F1 Score: {f1_score:.4}")
support_pos = tp+fn
print(f"Support (0): {support_pos}")
support_neg = fp+tn
print(f"Support (1): {support_neg}")

Accuracy using Model Score: 77.13%
Using Confusion Matrix:
      0    1
0  1136  103
1   283  166
Class report:
              precision    recall  f1-score   support

           0       0.80      0.92      0.85      1239
           1       0.62      0.37      0.46       449

    accuracy                           0.77      1688
   macro avg       0.71      0.64      0.66      1688
weighted avg       0.75      0.77      0.75      1688

True Positive(tp): 166 
False Positive(fp): 103 
False Negative(fn): 283 
True Negative(tn): 1136
Accuracy: 0.7713
True Positive Rate: 0.3697
False Positive Rate: 0.08313
True Negative Rate: 0.9169
False Negative Rate: 0.6303
Precision: 0.6171
Recall: 0.3697
F1 Score: 0.4624
Support (0): 449
Support (1): 1239
