In [1]:
#imports to be able to pull data and prepare it for us
import acquire
import prepare

#imports for needed libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import graphviz
import pydataset

# ignoring warnings
import warnings
warnings.filterwarnings("ignore")

#imports for modeling
from sklearn.model_selection import train_test_split
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
    #import to be able to do decision tress
from sklearn.neighbors import KNeighborsClassifier
    #import to be able to do KNN
from sklearn.linear_model import LogisticRegression
    #import to be able to do logistic regression
from sklearn.ensemble import RandomForestClassifier
    #import to be able to do random forest

In [2]:
df = acquire.get_churn_data()
df = prepare.clean_telco(df)


In [3]:
train, test, validate = prepare.train_validate_test_split(df)

# Baseline Accuracy

In [4]:
X_train = train.drop(columns = ['churn'])
y_train = train.churn

X_validate = validate.drop(columns=['churn'])
y_validate = validate.churn

X_test = test.drop(columns=['churn'])
y_test = test.churn

#x is feature
# y is traget variable

In [5]:
y_train.value_counts()
# most customers have not churned

0    2891
1    1046
Name: churn, dtype: int64

In [6]:
baseline_accuracy = (train.churn == 0).mean()
round(baseline_accuracy, 3)

0.734

In [7]:
train.head()

Unnamed: 0,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,tech_support,...,total_charges,churn,DSL,fiber_optic,no_internet,bank_transfer,credit_card,electronic_check,mailed_check,is_female
6191,0,0,0,3,1,0,0,0,0,0,...,71.2,0,0,0,1,0,0,0,1,1
1850,0,1,1,66,1,1,1,1,1,1,...,5958.85,0,1,0,0,0,1,0,0,1
5438,0,1,1,72,1,1,1,1,1,1,...,8306.05,0,0,1,0,1,0,0,0,0
2171,0,1,1,71,1,1,1,1,1,1,...,5224.95,0,1,0,0,0,0,0,1,1
6458,0,0,1,62,1,0,0,0,0,0,...,1250.1,0,0,0,1,1,0,0,0,0


In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3937 entries, 6191 to 4689
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   senior_citizen     3937 non-null   int64  
 1   partner            3937 non-null   int64  
 2   dependents         3937 non-null   int64  
 3   tenure             3937 non-null   int64  
 4   phone_service      3937 non-null   int64  
 5   multiple_lines     3937 non-null   int64  
 6   online_security    3937 non-null   int64  
 7   online_backup      3937 non-null   int64  
 8   device_protection  3937 non-null   int64  
 9   tech_support       3937 non-null   int64  
 10  streaming_tv       3937 non-null   int64  
 11  streaming_movies   3937 non-null   int64  
 12  paperless_billing  3937 non-null   int64  
 13  monthly_charges    3937 non-null   float64
 14  total_charges      3937 non-null   float64
 15  churn              3937 non-null   int64  
 16  DSL                39

In [9]:
logit1 = LogisticRegression(random_state=123)
#the features we are using
features = ["monthly_charges", "tenure", 
            "fiber_optic", "electronic_check",
           "no_internet"]

# Fit a model using only these specified features
# logit.fit(X_train[["age", "pclass", "fare"]], y_train)
logit1.fit(X_train[features], y_train)

# Since we .fit on a subset, we .predict on that same subset of features
y_pred = logit1.predict(X_train[features])

print("Baseline is", round(baseline_accuracy, 2))
print("Logistic Regression using age, pclass, and fare features")
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit1.score(X_train[features], y_train)))

Baseline is 0.73
Logistic Regression using age, pclass, and fare features
Accuracy of Logistic Regression classifier on training set: 0.80


- There are four ways to check if the predictions are right or wrong:
1. TN / True Negative: the case was negative and predicted negative
2. TP / True Positive: the case was positive and predicted positive
3. FN / False Negative: the case was positive but predicted negative
4. FP / False Positive: the case was negative but predicted positive
- **Precision**
    - what percent of my predictions wer correct?
        - precision for not churned: 0.83
        - precision for churned: 0.66
- **Recall**
    - what percent of the postive cases did I catch?
        - recall for not churned: 0.91
        - recall for churned: 0.49
- **F1 score**
    - what percent of positive predictions were correct?
        - f1 score for not churned: 0.87
        - f1 score for churned: 0.56
    - what is the accuracy of the f1 score prediction?
        - 0.8

In [10]:
print(classification_report(y_train, y_pred))
# accuracy of .8 looks good

              precision    recall  f1-score   support

           0       0.83      0.91      0.87      2891
           1       0.66      0.49      0.56      1046

    accuracy                           0.80      3937
   macro avg       0.75      0.70      0.72      3937
weighted avg       0.79      0.80      0.79      3937



## Lets find the best model!

In [11]:
logit1 = LogisticRegression(random_state=123)
#the features we are using
features = ["monthly_charges", "tenure", 
            "fiber_optic", "electronic_check",
           "no_internet"]
# Fit a model using only these specified features
# logit.fit(X_train[["age", "pclass", "fare"]], y_train)
logit1.fit(X_train[features], y_train)

# Since we .fit on a subset, we .predict on that same subset of features
y_pred = logit1.predict(X_train[features])

print("Baseline is", round(baseline_accuracy, 2))
print("Logistic Regression using age, pclass, and fare features")
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit1.score(X_train[features], y_train)))

Baseline is 0.73
Logistic Regression using age, pclass, and fare features
Accuracy of Logistic Regression classifier on training set: 0.80


In [12]:
# All features, all default hyperparameters
logit2 = LogisticRegression(random_state=123)

logit2.fit(X_train, y_train)

y_pred = logit2.predict(X_train)

print("Model trained on all features")
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit2.score(X_train, y_train)))

Model trained on all features
Accuracy of Logistic Regression classifier on training set: 0.80


In [13]:
# All features, but we'll use the class_weights to hold the actual ratios`
logit3 = LogisticRegression(random_state=123, class_weight='balanced')

logit3.fit(X_train, y_train)

y_pred = logit3.predict(X_train)

accuracy = logit3.score(X_train, y_train)

print("All Features and we're setting the class_weight hyperparameter")
print(f'Accuracy of Logistic Regression classifier on training set: {accuracy:.2}')

All Features and we're setting the class_weight hyperparameter
Accuracy of Logistic Regression classifier on training set: 0.76


In [14]:
# Only Monthly Charges 
features = ["monthly_charges"]

# All features, but we'll use the class_weights to hold the actual ratios
logit4 = LogisticRegression(random_state=123)

logit4.fit(X_train[features], y_train)

y_pred = logit4.predict(X_train[features])

accuracy = logit4.score(X_train[features], y_train)

print("All Features and we're setting the class_weight hyperparameter")
print(f'Accuracy of Logistic Regression classifier on training set: {accuracy:.2}')

All Features and we're setting the class_weight hyperparameter
Accuracy of Logistic Regression classifier on training set: 0.73


In [15]:
# Only tenure
features = ["tenure"]

# All features, but we'll use the class_weights to hold the actual ratios
logit5 = LogisticRegression(random_state=123)

logit5.fit(X_train[features], y_train)

y_pred = logit5.predict(X_train[features])
accuracy = logit5.score(X_train[features], y_train)

print("All Features and we're setting the class_weight hyperparameter")
print(f'Accuracy of Logistic Regression classifier on training set: {accuracy:.2}')

All Features and we're setting the class_weight hyperparameter
Accuracy of Logistic Regression classifier on training set: 0.73


In [16]:
# Only fiber optics
features = ["fiber_optic"]

# All features, but we'll use the class_weights to hold the actual ratios
logit5 = LogisticRegression(random_state=123)

logit5.fit(X_train[features], y_train)

y_pred = logit5.predict(X_train[features])
accuracy = logit5.score(X_train[features], y_train)

print("All Features and we're setting the class_weight hyperparameter")
print(f'Accuracy of Logistic Regression classifier on training set: {accuracy:.2}')

All Features and we're setting the class_weight hyperparameter
Accuracy of Logistic Regression classifier on training set: 0.73


In [17]:
# Only electronic check
feature = ["electronic_check"]

# All features, but we'll use the class_weights to hold the actual ratios
logit5 = LogisticRegression(random_state=123)

logit5.fit(X_train[features], y_train)

y_pred = logit5.predict(X_train[features])
accuracy = logit5.score(X_train[features], y_train)

print("All Features and we're setting the class_weight hyperparameter")
print(f'Accuracy of Logistic Regression classifier on training set: {accuracy:.2}')

All Features and we're setting the class_weight hyperparameter
Accuracy of Logistic Regression classifier on training set: 0.73


In [18]:
# Only no internet
features = ["no_internet"]

# All features, but we'll use the class_weights to hold the actual ratios
logit5 = LogisticRegression(random_state=123)

logit5.fit(X_train[features], y_train)

y_pred = logit5.predict(X_train[features])
accuracy = logit5.score(X_train[features], y_train)

print("All Features and we're setting the class_weight hyperparameter")
print(f'Accuracy of Logistic Regression classifier on training set: {accuracy:.2}')

All Features and we're setting the class_weight hyperparameter
Accuracy of Logistic Regression classifier on training set: 0.73


# My best model is logit 1!
- with the feature:
    - monthly charges
    - tenure
    - fiber optic
    - electronic_check
    - no internet service
With a baseline of 0.73 and a n accuracy of 0.8 none of the other models could rival it.

In [21]:
logit1 = LogisticRegression(random_state=123)
#the features we are using
features = ["monthly_charges", "tenure", 
            "fiber_optic", "electronic_check",
           "no_internet"]
# Fit a model using only these specified features
# logit.fit(X_train[["age", "pclass", "fare"]], y_train)
logit1.fit(X_train[features], y_train)

# Since we .fit on a subset, we .predict on that same subset of features
y_pred = logit1.predict(X_train[features])

print("Baseline is", round(baseline_accuracy, 2))
print("Logistic Regression using age, pclass, and fare features")
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit1.score(X_train[features], y_train)))

Baseline is 0.73
Logistic Regression using age, pclass, and fare features
Accuracy of Logistic Regression classifier on training set: 0.80


# Decision tree for our model

In [46]:
clf1 = DecisionTreeClassifier(max_depth=3)

In [47]:
X_train = train.drop(columns='churn')
y_train = train.churn

In [48]:
clf1.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=3)

In [49]:
y_pred = clf1.predict(X_train)

In [50]:
pd.Series(y_pred).value_counts()
# did not churn is higher

0    3598
1     339
dtype: int64

In [51]:
#model accuracy
clf1.score(X_train, y_train)


0.7802895605791211

In [52]:
confusion_matrix(y_train, y_pred)


array([[2812,   79],
       [ 786,  260]])

In [53]:
class_report = classification_report(y_train, y_pred, output_dict=True)

In [54]:
pd.DataFrame(class_report).rename(columns={'0': 'did not churn', '1': 'churned'}).T

Unnamed: 0,precision,recall,f1-score,support
did not churn,0.781545,0.972674,0.866697,2891.0
churned,0.766962,0.248566,0.375451,1046.0
accuracy,0.78029,0.78029,0.78029,0.78029
macro avg,0.774253,0.61062,0.621074,3937.0
weighted avg,0.777671,0.78029,0.736181,3937.0


In [55]:
pd.DataFrame(conf, columns=['predict_didnt_churn', 'predict_churn'], index=['actual_didnt_churn', 'actual_churn'])

Unnamed: 0,predict_didnt_churn,predict_churn
actual_didnt_churn,2890,1
actual_churn,3,1043


In [56]:
pd.DataFrame([['true negative', 'false positive'],['false negative', 'true positive']], columns=['predict_death', 'predict_survive'], index=['actual_death', 'actual_survive'])

Unnamed: 0,predict_death,predict_survive
actual_death,true negative,false positive
actual_survive,false negative,true positive


In [57]:
def get_metrics_binary(clf):
    '''
    get_metrics_binary takes in a confusion matrix (cnf) for a binary classifier and prints out metrics based on
    values in variables named X_train, y_train, and y_pred.
    
    return: a classification report as a transposed DataFrame
    '''
    accuracy = clf.score(X_train, y_train)
    class_report = pd.DataFrame(classification_report(y_train, y_pred, output_dict=True)).T
    conf = confusion_matrix(y_train, y_pred)
    tpr = conf[1][1] / conf[1].sum()
    fpr = conf[0][1] / conf[0].sum()
    tnr = conf[0][0] / conf[0].sum()
    fnr = conf[1][0] / conf[1].sum()
    print(f'''
    The accuracy for our model is {accuracy:.4}
    The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
    The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
    ''')
    return class_report

In [58]:
report_df = get_metrics_binary(clf1)


    The accuracy for our model is 0.7803
    The True Positive Rate is 0.249, The False Positive Rate is 0.0273,
    The True Negative Rate is 0.973, and the False Negative Rate is 0.751
    


# KNN for best model

In [59]:
knn = KNeighborsClassifier()

In [60]:
# Now let's train the model!
knn.fit(X_train, y_train)

KNeighborsClassifier()

In [61]:
# Let's check the accuracy
accuracy = knn.score(X_train, y_train)
print(f"accuracy is {accuracy:.3}")

accuracy is 0.834


In [62]:
# Evaluate the model
y_pred = knn.predict(X_train)

In [63]:
# Let's check our other classification metrics
# y_train is the actual labels for the target variable
# y_pred is the predictions that the model makes based off our X features
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.93      0.89      2891
           1       0.75      0.56      0.64      1046

    accuracy                           0.83      3937
   macro avg       0.80      0.75      0.77      3937
weighted avg       0.83      0.83      0.83      3937



In [64]:
# Let's see how well this model performs on out of sample data!
print('Accuracy of KNN classifier on test set: {:.2f}'
     .format(knn.score(X_validate, y_validate)))

Accuracy of KNN classifier on test set: 0.78


In [66]:
# Obtain the predictions from the model
y_pred = knn.predict(X_validate)
y_pred[0:4]

array([0, 0, 0, 0])

In [67]:
y_validate.head(4)

3374    0
6498    0
6366    0
5666    0
Name: churn, dtype: int64

In [68]:
# Let's check our other classification metrics
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.90      0.86      1033
           1       0.61      0.45      0.52       374

    accuracy                           0.78      1407
   macro avg       0.72      0.67      0.69      1407
weighted avg       0.76      0.78      0.77      1407



In [81]:
print('Accuracy of Decision Tree model with monthly_charges, tenure, fiber_optic, electronic_check, and no_internet on Validate:', clf.score(X_train, y_train))

Accuracy of Decision Tree model with monthly_charges, tenure, fiber_optic, electronic_check, and no_internet on Validate: 0.8948437896875794


In [82]:
print('Accuracy of KNN model with monthly_charges, tenure, fiber_optic, electronic_check, and no_internet on Validate:',knn.score(X_validate, y_validate))

Accuracy of KNN model with monthly_charges, tenure, fiber_optic, electronic_check, and no_internet on Validate: 0.7782515991471215
