In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import warnings
warnings.filterwarnings("ignore")

import acquire as aq
import prepare as pp

Reading from csv file...


# Exercises

### Using the titanic data, in your classification-exercises repository, create a notebook, model.ipynb where you will do the following:

   #### What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.


In [2]:
df = aq.get_titanic_data()
df.head()

Reading from csv file...


Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [3]:
df = pp.prep_titanic(df)
df.head()

Unnamed: 0,survived,passenger_class,sex,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,male,1,0,7.25,Southampton,0,1,0,1
1,1,1,female,1,0,71.2833,Cherbourg,0,0,0,0
2,1,3,female,0,0,7.925,Southampton,1,0,0,1
3,1,1,female,1,0,53.1,Southampton,0,0,0,1
4,0,3,male,0,0,8.05,Southampton,1,1,0,1


In [4]:
df.survived.value_counts()

0    549
1    342
Name: survived, dtype: int64

In [5]:
# Our baseline prediction will be that everyone does not survive(0)

df['baseline'] = 0
df = df.drop(columns= ['fare', 'sex', 'embark_town'])
df.head()

Unnamed: 0,survived,passenger_class,sibsp,parch,alone,sex_male,embark_town_Queenstown,embark_town_Southampton,baseline
0,0,3,1,0,0,1,0,1,0
1,1,1,1,0,0,0,0,0,0
2,1,3,0,0,1,0,0,1,0
3,1,1,1,0,0,0,0,1,0
4,0,3,0,0,1,1,0,1,0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   survived                 891 non-null    int64
 1   passenger_class          891 non-null    int64
 2   sibsp                    891 non-null    int64
 3   parch                    891 non-null    int64
 4   alone                    891 non-null    int64
 5   sex_male                 891 non-null    uint8
 6   embark_town_Queenstown   891 non-null    uint8
 7   embark_town_Southampton  891 non-null    uint8
 8   baseline                 891 non-null    int64
dtypes: int64(6), uint8(3)
memory usage: 44.5 KB


In [7]:
# determine baseline accuracy

baseline_accuracy = (df.baseline == df.survived).mean()
print(f'Our baseline accuracy is {baseline_accuracy: .2%}')


Our baseline accuracy is  61.62%


In [8]:
# split data into train, validate, test

train, test = train_test_split(df, 
                               train_size = 0.8,
                               random_state=9337)
train.survived.value_counts()

0    441
1    271
Name: survived, dtype: int64

In [9]:
train, validate = train_test_split(train,
                                  train_size = 0.7,
                                  random_state=9337)

validate.survived.value_counts()

0    135
1     79
Name: survived, dtype: int64

In [10]:
test.survived.value_counts()

0    108
1     71
Name: survived, dtype: int64

In [11]:
# set as variables now for later use

x_train = train.drop(columns=['survived'])
y_train = train.survived

x_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

x_test = test.drop(columns=['survived'])
y_test = test.survived

## Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [12]:
# Construct Model

tree = DecisionTreeClassifier(max_depth=2, random_state=9337)

# Fit Model

tree = tree.fit(x_train, y_train)

# Use Model to make Predictions

y_model = tree.predict(x_train)

In [13]:
print(export_text(tree, feature_names=x_train.columns.tolist()))

|--- sex_male <= 0.50
|   |--- passenger_class <= 2.50
|   |   |--- class: 1
|   |--- passenger_class >  2.50
|   |   |--- class: 0
|--- sex_male >  0.50
|   |--- passenger_class <= 1.50
|   |   |--- class: 0
|   |--- passenger_class >  1.50
|   |   |--- class: 0



## Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [14]:
# Model Score

m_score = tree.score(x_train, y_train)

print(f'The accuracy of the Decision Tree classifier on the training data set is {m_score: .02%}')

The accuracy of the Decision Tree classifier on the training data set is  79.72%


In [15]:
# Classification Report

class_report = classification_report(y_train, y_model, output_dict=True)

pd.DataFrame(class_report).transpose()

Unnamed: 0,precision,recall,f1-score,support
0,0.755611,0.990196,0.857143,306.0
1,0.969072,0.489583,0.650519,192.0
accuracy,0.797189,0.797189,0.797189,0.797189
macro avg,0.862342,0.73989,0.753831,498.0
weighted avg,0.837909,0.797189,0.777481,498.0


In [16]:
# Confusion Matrix

pd.DataFrame(confusion_matrix(y_train, y_model))

Unnamed: 0,0,1
0,303,3
1,98,94


### Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [17]:
# A classification report gives us all of these values

pd.DataFrame(class_report).transpose()

Unnamed: 0,precision,recall,f1-score,support
0,0.755611,0.990196,0.857143,306.0
1,0.969072,0.489583,0.650519,192.0
accuracy,0.797189,0.797189,0.797189,0.797189
macro avg,0.862342,0.73989,0.753831,498.0
weighted avg,0.837909,0.797189,0.777481,498.0


In [18]:
# We can also calculate ourselves
# positive = did not survive (0)

tp = 303
fp = 3
fn = 98
tn = 94


accuracy = (tp + tn) / (tp + tn + fp + fn)
recall = tp / (tp + fn)
precision = tp / (tp + fp)
f1_score = (2 * (precision * recall) / (precision + recall))
true_pos = recall
true_neg = tn / (tn + fp)
false_pos = fp / (tn + fp)
false_neg = fn / (tp + fn)
support_pos = tp + fn
support_neg = tn + fp

print(f'Accuracy: {accuracy: .2%}')
print(f'---------------')
print(f'Recall: {recall: .2%}')
print(f'---------------')
print(f'Precision: {precision: .2%}')
print(f'---------------')
print(f'F1 Score: {f1_score: .2%}')
print(f'---------------')
print(f'True Positive Rate: {true_pos: .2%}')
print(f'---------------')
print(f'True Negative Rate: {true_neg: .2%}')
print(f'---------------')
print(f'False Positive Rate: {false_pos: .2%}')
print(f'---------------')
print(f'False Negative Rate: {false_neg: .2%}')
print(f'---------------')
print(f'Support (Did Not Survive(0)): {support_pos}')
print(f'---------------')
print(f'Support (Survived(1)): {support_neg}')


Accuracy:  79.72%
---------------
Recall:  75.56%
---------------
Precision:  99.02%
---------------
F1 Score:  85.71%
---------------
True Positive Rate:  75.56%
---------------
True Negative Rate:  96.91%
---------------
False Positive Rate:  3.09%
---------------
False Negative Rate:  24.44%
---------------
Support (Did Not Survive(0)): 401
---------------
Support (Survived(1)): 97


### Run through steps 2-4 using a different max_depth value.

In [19]:
for num in range(1, 11):
    # Make the model
    tree = DecisionTreeClassifier(max_depth=num, random_state=123)

    # Fit the model
    tree = tree.fit(x_train, y_train)

    # Use the model
    y_model = tree.predict(x_train)

    class_report = classification_report(y_train, y_model, output_dict=True)
    print(f'Decision Tree Model with a max depth of {num}')
    print(pd.DataFrame(class_report).transpose())
    print(f'---------------')

Decision Tree Model with a max depth of 1
              precision    recall  f1-score     support
0              0.804954  0.849673  0.826709  306.000000
1              0.737143  0.671875  0.702997  192.000000
accuracy       0.781124  0.781124  0.781124    0.781124
macro avg      0.771048  0.760774  0.764853  498.000000
weighted avg   0.778810  0.781124  0.779013  498.000000
---------------
Decision Tree Model with a max depth of 2
              precision    recall  f1-score     support
0              0.755611  0.990196  0.857143  306.000000
1              0.969072  0.489583  0.650519  192.000000
accuracy       0.797189  0.797189  0.797189    0.797189
macro avg      0.862342  0.739890  0.753831  498.000000
weighted avg   0.837909  0.797189  0.777481  498.000000
---------------
Decision Tree Model with a max depth of 3
              precision    recall  f1-score     support
0              0.830816  0.898693  0.863422  306.000000
1              0.814371  0.708333  0.757660  192.000000
ac

### Which model performs better on your in-sample data?

In [20]:
# Accuracy seems to level out at around 85% startinmg at depth of 7.  
# I would go with a max depth of 9
# Going higher may lead to over-fitting the model

### Which model performs best on your out-of-sample data, the validate set?

In [21]:
performance = []

for num in range(1, 11):
    # Make the Model
    tree = DecisionTreeClassifier(max_depth=num, random_state=123)

    # Fit the model
    tree = tree.fit(x_train, y_train)
    
    # We're comparing how it performs on training data set vs validate
    train_accuracy = tree.score(x_train, y_train)
    
    validate_accuracy = tree.score(x_validate, y_validate)
    
    data = {
        'max_depth': num,
        'training_accuracy': round(train_accuracy*100, 2),
        'validate_accuracy': round(validate_accuracy*100, 2)
    }
    
    performance.append(data)
    
df = pd.DataFrame(performance)
df['difference'] = df.training_accuracy - df.validate_accuracy
df

Unnamed: 0,max_depth,training_accuracy,validate_accuracy,difference
0,1,78.11,79.91,-1.8
1,2,79.72,78.5,1.22
2,3,82.53,76.17,6.36
3,4,83.13,78.04,5.09
4,5,83.94,78.5,5.44
5,6,84.54,79.91,4.63
6,7,85.14,78.97,6.17
7,8,85.34,79.91,5.43
8,9,85.54,79.44,6.1
9,10,85.74,79.44,6.3


In [22]:
# Based on how the models perform on training and validate data sets
# I would conclude that keeping a max depth of 2 would create the better fitting model

# Work through these same exercises using the Telco dataset.

In [23]:
df = aq.get_telco_data()
df.head()

Reading from csv file...


Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,2,1,2,0002-ORFBO,Female,0,Yes,Yes,9,Yes,...,Yes,Yes,No,Yes,65.6,593.3,No,One year,DSL,Mailed check
1,2,1,1,0003-MKNFE,Male,0,No,No,9,Yes,...,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
2,1,2,1,0004-TLHLJ,Male,0,No,No,4,Yes,...,No,No,No,Yes,73.9,280.85,Yes,Month-to-month,Fiber optic,Electronic check
3,1,2,1,0011-IGKFF,Male,1,Yes,No,13,Yes,...,No,Yes,Yes,Yes,98.0,1237.85,Yes,Month-to-month,Fiber optic,Electronic check
4,2,2,1,0013-EXCHZ,Female,1,Yes,No,3,Yes,...,Yes,Yes,No,Yes,83.9,267.4,Yes,Month-to-month,Fiber optic,Mailed check


In [24]:
df = pp.prep_telco(df)
df.head()

Unnamed: 0,customer_id,senior_citizen,tenure,monthly_charges,total_charges,gender_Male,partner_Yes,dependents_Yes,phone_service_Yes,multiple_lines_No phone service,...,streaming_movies_Yes,paperless_billing_Yes,churn_Yes,contract_type_One year,contract_type_Two year,internet_service_type_Fiber optic,internet_service_type_None,payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check
0,0002-ORFBO,0,9,65.6,593.3,0,1,1,1,0,...,0,1,0,1,0,0,0,0,0,1
1,0003-MKNFE,0,9,59.9,542.4,1,0,0,1,0,...,1,0,0,0,0,0,0,0,0,1
2,0004-TLHLJ,0,4,73.9,280.85,1,0,0,1,0,...,0,1,1,0,0,1,0,0,1,0
3,0011-IGKFF,1,13,98.0,1237.85,1,1,0,1,0,...,1,1,1,0,0,1,0,0,1,0
4,0013-EXCHZ,1,3,83.9,267.4,0,1,0,1,0,...,0,1,1,0,0,1,0,0,0,1


In [25]:
df.churn_Yes.value_counts()

0    5174
1    1869
Name: churn_Yes, dtype: int64

In [26]:
df['baseline'] = 0
df = df.drop(columns=['customer_id'])
df.head()

Unnamed: 0,senior_citizen,tenure,monthly_charges,total_charges,gender_Male,partner_Yes,dependents_Yes,phone_service_Yes,multiple_lines_No phone service,multiple_lines_Yes,...,paperless_billing_Yes,churn_Yes,contract_type_One year,contract_type_Two year,internet_service_type_Fiber optic,internet_service_type_None,payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check,baseline
0,0,9,65.6,593.3,0,1,1,1,0,0,...,1,0,1,0,0,0,0,0,1,0
1,0,9,59.9,542.4,1,0,0,1,0,1,...,0,0,0,0,0,0,0,0,1,0
2,0,4,73.9,280.85,1,0,0,1,0,0,...,1,1,0,0,1,0,0,1,0,0
3,1,13,98.0,1237.85,1,1,0,1,0,0,...,1,1,0,0,1,0,0,1,0,0
4,1,3,83.9,267.4,0,1,0,1,0,0,...,1,1,0,0,1,0,0,0,1,0


In [27]:
baseline_accuracy = (df.baseline == df.churn_Yes).mean()
print(f'Our baseline accuracy is {baseline_accuracy: .2%}')


Our baseline accuracy is  73.46%


In [28]:
train, test = train_test_split(df, 
                               train_size = 0.8,
                               random_state=9337)
train.churn_Yes.value_counts()

0    4114
1    1520
Name: churn_Yes, dtype: int64

In [29]:
train, validate = train_test_split(train,
                                  train_size = 0.7,
                                  random_state=9337)

validate.churn_Yes.value_counts()

0    1216
1     475
Name: churn_Yes, dtype: int64

In [30]:
test.churn_Yes.value_counts()

0    1060
1     349
Name: churn_Yes, dtype: int64

In [31]:
# set as variables now for later use

x_train = train.drop(columns=['churn_Yes'])
y_train = train.churn_Yes

x_validate = validate.drop(columns=['churn_Yes'])
y_validate = validate.churn_Yes

x_test = test.drop(columns=['churn_Yes'])
y_test = test.churn_Yes

In [33]:
# Construct Model

tree = DecisionTreeClassifier(max_depth=2, random_state=9337)

# Fit Model

tree = tree.fit(x_train, y_train)

# Use Model to make Predictions

y_model = tree.predict(x_train)

In [35]:
print(export_text(tree, feature_names=x_train.columns.tolist()))

|--- tenure <= 16.50
|   |--- internet_service_type_Fiber optic <= 0.50
|   |   |--- class: 0
|   |--- internet_service_type_Fiber optic >  0.50
|   |   |--- class: 1
|--- tenure >  16.50
|   |--- internet_service_type_Fiber optic <= 0.50
|   |   |--- class: 0
|   |--- internet_service_type_Fiber optic >  0.50
|   |   |--- class: 0



In [34]:
# Model Score

m_score = tree.score(x_train, y_train)

print(f'The accuracy of the Decision Tree classifier on the training data set is {m_score: .02%}')

The accuracy of the Decision Tree classifier on the training data set is  79.46%


In [36]:
# Classification Report

class_report = classification_report(y_train, y_model, output_dict=True)

pd.DataFrame(class_report).transpose()

Unnamed: 0,precision,recall,f1-score,support
0,0.812201,0.937198,0.870234,2898.0
1,0.69616,0.399043,0.507299,1045.0
accuracy,0.794573,0.794573,0.794573,0.794573
macro avg,0.754181,0.668121,0.688767,3943.0
weighted avg,0.781447,0.794573,0.774047,3943.0


In [37]:
# Confusion Matrix

pd.DataFrame(confusion_matrix(y_train, y_model))

Unnamed: 0,0,1
0,2716,182
1,628,417


In [40]:
# positive = churn(1)
# negative = no churn(0)

tp = 417
fp = 628
fn = 182
tn = 2716


accuracy = (tp + tn) / (tp + tn + fp + fn)
recall = tp / (tp + fn)
precision = tp / (tp + fp)
f1_score = (2 * (precision * recall) / (precision + recall))
true_pos = recall
true_neg = tn / (tn + fp)
false_pos = fp / (tn + fp)
false_neg = fn / (tp + fn)
support_pos = tp + fn
support_neg = tn + fp

print(f'Accuracy: {accuracy: .2%}')
print(f'---------------')
print(f'Recall: {recall: .2%}')
print(f'---------------')
print(f'Precision: {precision: .2%}')
print(f'---------------')
print(f'F1 Score: {f1_score: .2%}')
print(f'---------------')
print(f'True Positive Rate: {true_pos: .2%}')
print(f'---------------')
print(f'True Negative Rate: {true_neg: .2%}')
print(f'---------------')
print(f'False Positive Rate: {false_pos: .2%}')
print(f'---------------')
print(f'False Negative Rate: {false_neg: .2%}')
print(f'---------------')
print(f'Support (Churn(1)): {support_pos}')
print(f'---------------')
print(f'Support (No Churn(0)): {support_neg}')

Accuracy:  79.46%
---------------
Recall:  69.62%
---------------
Precision:  39.90%
---------------
F1 Score:  50.73%
---------------
True Positive Rate:  69.62%
---------------
True Negative Rate:  81.22%
---------------
False Positive Rate:  18.78%
---------------
False Negative Rate:  30.38%
---------------
Support (Churn(1)): 599
---------------
Support (No Churn(0)): 3344


In [45]:
performance = []

for num in range(1, 21):
    
    # Make the model
    tree = DecisionTreeClassifier(max_depth=num, random_state=123)

    # Fit the model
    tree = tree.fit(x_train, y_train)

    # Use the model
    y_model = tree.score(x_train, y_train)

    data = {
        'max_depth': num,
        'training_accuracy': round(y_model*100, 2),
    }
    
    performance.append(data)
    
df = pd.DataFrame(performance)
df

Unnamed: 0,max_depth,training_accuracy
0,1,73.5
1,2,79.46
2,3,79.46
3,4,79.69
4,5,80.42
5,6,81.59
6,7,82.65
7,8,84.63
8,9,86.94
9,10,88.99


In [None]:
# Accuracy peaks at a max depth of 18
# to avaoid over fitting I would select a max depth of 15 or 16

In [47]:
performance = []

for num in range(1, 21):
    # Make the Model
    tree = DecisionTreeClassifier(max_depth=num, random_state=123)

    # Fit the model
    tree = tree.fit(x_train, y_train)
    
    # We're comparing how it performs on training data set vs validate
    train_accuracy = tree.score(x_train, y_train)
    
    validate_accuracy = tree.score(x_validate, y_validate)
    
    data = {
        'max_depth': num,
        'training_accuracy': round(train_accuracy*100, 2),
        'validate_accuracy': round(validate_accuracy*100, 2)
    }
    
    performance.append(data)
    
df = pd.DataFrame(performance)
df['difference'] = df.training_accuracy - df.validate_accuracy
df

Unnamed: 0,max_depth,training_accuracy,validate_accuracy,difference
0,1,73.5,71.91,1.59
1,2,79.46,78.0,1.46
2,3,79.46,78.0,1.46
3,4,79.69,77.65,2.04
4,5,80.42,77.76,2.66
5,6,81.59,76.88,4.71
6,7,82.65,77.05,5.6
7,8,84.63,76.11,8.52
8,9,86.94,75.16,11.78
9,10,88.99,73.74,15.25


In [48]:
# Based on the comparison of train vs validate it would seem having 
# a max depth of 3 - 4 would produce models that give the most consistent accuracy