In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
import sklearn.linear_model

import warnings
warnings.filterwarnings("ignore")

import acquire
import prepare

In [2]:
df = acquire.get_titanic_data()
df.head(2)

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0


In [None]:
# Reduce obvious noise
df = df.set_index("passenger_id")
df = df.drop(columns=["class", "embarked"])

In [None]:
# What about nulls?
df.isna().sum()

In [None]:
# Drop deck because there are far too many nulls
df = df.drop(columns=["deck"])

In [None]:
# Let's fill embark_town with the most common observation
df.embark_town = df.embark_town.fillna(value=df.embark_town.mode())

In [None]:
# Let's investigate the observations with missing age
# My first thought was empty age values might indicate children
# Looks like most of these individuals were traveling alone
no_age_info = df[df.age.isna()]
no_age_info.alone.value_counts()

In [None]:
df.fare.hist(), no_age_info.fare.hist()

In [None]:
# Let's see how similar this group is to the population
for column in df.drop(columns=["age", "fare"]).columns:
    print(column)
    print("Population:")
    print(df[column].value_counts(normalize=True))
    print("No age")
    print(no_age_info[column].value_counts(normalize=True))
    print()
    print()

In [None]:
# Looking at the distribution of values, it appears that no age subgroup is very close to the population
# If we needed to be more certain, we could perform hypothesis testing
# It looks like there's nothing wildly different about the no age group compared to the population
# So we'll impute using the median age
df.age = df.age.fillna(value=df.age.median())

In [None]:
# Time to encode the encodeable!
dummy_df = pd.get_dummies(df[['sex','embark_town']], dummy_na=False, drop_first=[True, True])

# Drop the original columns we encoded
df = df.drop(columns=["sex", "embark_town"])

# Stitch the df and the dummy_df together again
df = pd.concat([df, dummy_df], axis=1)
df.head()

In [None]:
# Time to split!
train, test = train_test_split(df, test_size=.2, random_state=123, stratify=df.survived)
train, validate = train_test_split(train, test_size=.3, random_state=123, stratify=train.survived)

In [None]:
X_train = train.drop(columns=["survived"])
y_train = train.survived

X_validate = validate.drop(columns=["survived"])
y_validate = validate.survived

X_test = test.drop(columns=["survived"])
y_test = test.survived

# 1
What is your baseline prediction? 

What is your baseline accuracy? 

remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). 

When you make those predictions, what is your accuracy? 

This is your baseline accuracy.

In [None]:
y_train[0:10]

In [None]:
train.survived.value_counts()

In [None]:
# The mode is a great baseline
baseline = y_train.mode()

# Produce a boolean array with True representing a match between the baseline prediction and reality
matches_baseline_prediction = (y_train == 0)

baseline_accuracy = matches_baseline_prediction.mean()
print(f"Baseline accuracy: {round(baseline_accuracy, 2)}")

In [None]:
# Make the model
tree1 = DecisionTreeClassifier(max_depth=1, random_state=123)

# Fit the model (on train and only train)
tree1 = tree1.fit(X_train, y_train)

# Use the model
# We'll evaluate the model's performance on train, first
y_predictions = tree1.predict(X_train)

In [None]:
plt.figure(figsize=(12,7))
plot_tree(tree1, feature_names = X)

In [None]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(tree1.score(X_train, y_train)))

In [None]:
# confusion matrix - actual on left, predicted on top
pd.DataFrame(confusion_matrix(y_train, y_predictions))

In [None]:
pd.crosstab(y_train, y_predictions)

In [None]:
print(classification_report(y_train, y_predictions))

In [None]:
# Produce the classification report on the actual y values and this model's predicted y values
report = classification_report(y_train, y_predictions, output_dict=True)
print("Tree of 1 depth")
pd.DataFrame(report)

In [None]:
# If not-survived is our positive case
TP = 265
FP = 58
FN = 42
TN = 133
ALL = TP + FP + FN + TN

accuracy = (TP + TN)/ALL
print(f"Accuracy: {accuracy}")

true_positive_rate = TP/(TP+FN)
print(f"True Positive Rate: {true_positive_rate}")

false_positive_rate = FP/(FP+TN)
print(f"False Positive Rate: {false_positive_rate}")

true_negative_rate = TN/(TN+FP)
print(f"True Negative Rate: {true_negative_rate}")

false_negative_rate = FN/(FN+TP)
print(f"False Negative Rate: {false_negative_rate}")

precision = TP/(TP+FP)
print(f"Precision: {precision}")

recall = TP/(TP+FN)
print(f"Recall: {recall}")

f1_score = 2*(precision*recall)/(precision+recall)
print(f"F1 Score: {f1_score}")

support_pos = TP + FN
print(f"Support (0): {support_pos}")

support_neg = FP + TN
print(f"Support (1): {support_neg}")

In [None]:

for i in range(2, 21):
    # Make the model
    tree = DecisionTreeClassifier(max_depth=i, random_state=123)

    # Fit the model (on train and only train)
    tree = tree.fit(X_train, y_train)

    # Use the model
    # We'll evaluate the model's performance on train, first
    y_predictions = tree.predict(X_train)

    # Produce the classification report on the actual y values and this model's predicted y values
    report = classification_report(y_train, y_predictions, output_dict=True)
    print(f"Tree with max depth of {i}")
    print(pd.DataFrame(report))
    print()

In [None]:
# Max depth of 15+ produces the highest accuracy


In [None]:
# Let's continue getting loopy, so we can compare in-sample to out-of-sample
metrics = []

for i in range(2, 25):
    # Make the model
    tree = DecisionTreeClassifier(max_depth=i, random_state=123)

    # Fit the model (on train and only train)
    tree = tree.fit(X_train, y_train)

    # Use the model
    # We'll evaluate the model's performance on train, first
    in_sample_accuracy = tree.score(X_train, y_train)
    
    out_of_sample_accuracy = tree.score(X_validate, y_validate)

    output = {
        "max_depth": i,
        "train_accuracy": in_sample_accuracy,
        "validate_accuracy": out_of_sample_accuracy
    }
    
    metrics.append(output)
    
df = pd.DataFrame(metrics)
df["difference"] = df.train_accuracy - df.validate_accuracy
df

In [None]:
df[df.difference <= 0.10].sort_values(by=['validate_accuracy', 'difference'], ascending = [False,True])



# RANDOM FOREST EXERCISES

In [None]:
#make model
forest1 = RandomForestClassifier(max_depth=3, random_state=123)

#fit model on train
forest1.fit(X_train, y_train)

#use
#evaluate
y_predictions = forest1.predict(X_train)

#produce classification report
report = classification_report(y_train, y_predictions, output_dict=True)
print("Tree of 1 depth")
pd.DataFrame(report)

In [None]:
#confusing matrix
pd.DataFrame(confusion_matrix(y_predictions, y_train))

In [None]:
TN, FP, FN, TP = confusion_matrix(y_train,y_predictions).ravel()
ALL = TP + TN + FP + FN

TP, TN, FP, FN

In [None]:
accuracy = (TP + TN)/ALL
print(f"Accuracy: {accuracy}")

true_positive_rate = TP/(TP+FN)
print(f"True Positive Rate: {true_positive_rate}")

false_positive_rate = FP/(FP+TN)
print(f"False Positive Rate: {false_positive_rate}")

true_negative_rate = TN/(TN+FP)
print(f"True Negative Rate: {true_negative_rate}")

false_negative_rate = FN/(FN+TP)
print(f"False Negative Rate: {false_negative_rate}")

precision = TP/(TP+FP)
print(f"Precision: {precision}")

recall = TP/(TP+FN)
print(f"Recall: {recall}")

f1_score = 2*(precision*recall)/(precision+recall)
print(f"F1 Score: {f1_score}")

support_pos = TP + FN
print(f"Support (0): {support_pos}")

support_neg = FP + TN
print(f"Support (1): {support_neg}")

In [None]:
#Loop
for i in range(2, 11):
    #make model
    forest = RandomForestClassifier(max_depth=i, random_state=123)

    #fit model on train
    forest = forest.fit(X_train, y_train)

    #use
    #evaluate
    y_predictions = forest.predict(X_train)

    #produce classification report
    report = classification_report(y_train, y_predictions, output_dict=True)
    print(f'Train with max depth of {i}. \n') 
    print(pd.DataFrame(report))
    print()

In [None]:
metric = []

for i in range(2, 25):
    #make model
    forest = RandomForestClassifier(max_depth=i, random_state=123)

    #fit model on train
    forest = forest.fit(X_train, y_train)

    #use
    #evaluate
    in_sample_accuracy = forest.score(X_train,y_train)
    
    out_sample_accuracy = forest.score(X_validate, y_validate)
    
    output = {
        'max_depth': i,
        'train_accuracy': in_sample_accuracy,
        'validate_accuracy': out_sample_accuracy
    }
    metrics.append(output)
    
df = pd.DataFrame(metrics)
df["difference"] = df.train_accuracy - df.validate_accuracy
df
    

In [None]:

df.set_index('max_depth').plot(figsize = (16,9))
plt.ylabel('Accuracy')
plt.xticks(np.arange(0,21,1))
plt.grid()


In [None]:
metrics =[]
max_depth = 20

for i in range(2, max_depth):
    #model
    depth = max_depth - i
    n_samples = i
    forest = RandomForestClassifier(max_depth = depth, min_samples_leaf= n_samples, random_state=123)
    
    # Fit the model (on train and only train)
    forest = forest.fit(X_train, y_train)
    
    #use
    #eval on train first
    in_sample_accuracy = forest.score(X_train, y_train)
    
    out_sample_accuracy = forest.score(X_validate, y_validate)
    
    output = {
        'min_samples_per_leaf': n_samples,
        'max_depth': depth,
        'train_accuracy': in_sample_accuracy,
        'validate_accuracy': out_sample_accuracy
        
    }
    
    metrics.append(output)
    
df = pd.DataFrame(metrics)
df['difference'] = df.train_accuracy - df.validate_accuracy
df

In [None]:
df.set_index('max_depth')[['train_accuracy', 'validate_accuracy','difference']].plot(figsize = (16,9))
plt.ylabel('Accuracy')
plt.xticks(np.arange(0,21,1))
plt.grid()

In [None]:
sns.scatterplot(x="max_depth", y="difference", data=df)

In [None]:
sns.scatterplot(x="min_samples_per_leaf", y="difference", data=df)

In [None]:
sns.scatterplot(x="difference", y="validate_accuracy", data=df)

# increase min samp per leaf AND max depth

In [None]:
# Let's continue getting loopy, so we can compare in-sample to out-of-sample
metrics = []
max_depth = 20

for i in range(2, max_depth):
    # Make the model
    depth = i
    n_samples = i
    forest = RandomForestClassifier(max_depth=depth, min_samples_leaf=n_samples, random_state=123)

    # Fit the model (on train and only train)
    forest = forest.fit(X_train, y_train)

    # Use the model
    # We'll evaluate the model's performance on train, first
    in_sample_accuracy = forest.score(X_train, y_train)
    
    out_of_sample_accuracy = forest.score(X_validate, y_validate)

    output = {
        "min_samples_per_leaf": n_samples,
        "max_depth": depth,
        "train_accuracy": in_sample_accuracy,
        "validate_accuracy": out_of_sample_accuracy
    }
    
    metrics.append(output)
    
df = pd.DataFrame(metrics)
df["difference"] = df.train_accuracy - df.validate_accuracy
df

In [None]:
df[['train_accuracy', 'validate_accuracy','difference']].plot()
plt.grid()

# fixed depth and increasing min samp per leaf

In [None]:
# Let's continue getting loopy, so we can compare in-sample to out-of-sample
metrics = []


for i in range(2, 50):
    # Make the model
    depth = 5
    n_samples = i
    forest = RandomForestClassifier(max_depth=depth, min_samples_leaf=n_samples, random_state=123)

    # Fit the model (on train and only train)
    forest = forest.fit(X_train, y_train)

    # Use the model
    # We'll evaluate the model's performance on train, first
    in_sample_accuracy = forest.score(X_train, y_train)
    
    out_of_sample_accuracy = forest.score(X_validate, y_validate)

    output = {
        "min_samples_per_leaf": n_samples,
        "max_depth": depth,
        "train_accuracy": in_sample_accuracy,
        "validate_accuracy": out_of_sample_accuracy
    }
    
    metrics.append(output)
    
df = pd.DataFrame(metrics)
df["difference"] = df.train_accuracy - df.validate_accuracy
df

In [None]:
df.set_index('min_samples_per_leaf')[['train_accuracy', 'validate_accuracy', 'difference']].plot(figsize = (16,9))
plt.ylabel('Accuracy')
plt.xticks(np.arange(0,50,5))
plt.grid()

# KNN Exercisses

In [None]:
train.shape, validate.shape, test.shape

In [None]:
x_cols = ['pclass', 'age', 'alone', 'fare']
y_col = 'survived'

X_train, y_train = train[x_cols], train[y_col]
X_validate, y_validate = validate[x_cols], validate[y_col]
X_test, y_test = test[x_cols], test[y_col]

In [None]:
X_train.head()

# 1

Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [None]:
# knn classifier with n_neighbour = 1

knn1 = KNeighborsClassifier(1)
knn1.fit(X_train, y_train)
# get_classification_metrics(knn)
y_pred = knn1.predict(X_train)

In [None]:
print(classification_report(y_train, y_pred))

In [None]:
report = classification_report(y_train, y_pred, output_dict = True)
print('n-neighbor = 1')
pd.DataFrame(report)

In [None]:
confusion_matrix(y_train, y_pred)

In [None]:
print('Actual on Left, Predicted on Top')
pd.crosstab(y_train, y_pred)

In [None]:
TN, FP, FN, TP = confusion_matrix(y_train,y_pred).ravel()
ALL = TP + TN + FP + FN

TN, FP, FN, TP 

In [None]:
def show_scores(TN, FP, FN, TP):
    
    ALL = TP + TN + FP + FN
    
    accuracy = (TP + TN)/ALL # How often did the model get it right?
    precision = TP/(TP+FP) # What is the quality of a positive prediction made by the model?
    recall = TP/(TP+FN) # How many of the true positives were found?   
    
    true_positive_rate = TP/(TP+FN) # Same as recall, actually
    true_negative_rate = TN/(TN+FP) # How many of the true negatives were found?
    false_positive_rate = FP/(FP+TN) # How often did we miss the negative and accidentally call it positive?
    false_negative_rate = FN/(FN+TP) # How often did we miss the positive and accidentally call it negative?
    
    f1_score = 2*(precision*recall)/(precision+recall) # Harmonic mean, good for imbalanced data sets
    support_pos = TP + FN # Number of actual positives in the sample
    support_neg = FP + TN # Number of actual negatives in the sample
    
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"True Positive Rate: {true_positive_rate}")
    print(f"True Negative Rate: {true_negative_rate}")
    print(f"False Positive Rate: {false_positive_rate}")
    print(f"False Negative Rate: {false_negative_rate}")
    print(f"F1 Score: {f1_score}")
    print(f"Support (0): {support_pos}")
    print(f"Support (1): {support_neg}")

In [None]:
show_scores(TN, FP, FN, TP)

In [None]:
knn2 = KNeighborsClassifier(10)
knn2.fit(X_train, y_train)
y_pred = knn2.predict(X_train)

report = classification_report(y_train, y_pred, output_dict = True)
print('n-neighbor = 10')
pd.DataFrame(report)

In [None]:
print('Actual on Left, Predicted on Top')
pd.crosstab(y_train, y_pred)

In [None]:
knn3 = KNeighborsClassifier(20)
knn3.fit(X_train, y_train)
y_pred = knn3.predict(X_train)

report = classification_report(y_train, y_pred, output_dict=True)
print("n_neighbour = 20")
pd.DataFrame(report)

In [None]:
# confusion_matrix
print('Actual on Left, Predicted on Top')
pd.crosstab(y_train, y_pred)

In [None]:
metrics=[]

for k in range(1, 21):
    
    #DEFINE
    knn = KNeighborsClassifier(n_neighbors= k)
    
    knn.fit(X_train, y_train)
    
    train_accuracy = knn.score(X_train, y_train)
    validate_accuracy = knn.score(X_validate, y_validate)
    
    output = {
        'k': k,
        "train_accuracy": train_accuracy,
        'validate_accuracy': validate_accuracy
    }
    
    metrics.append(output)
    
results = pd.DataFrame(metrics)

results.set_index('k').plot(figsize = (16,9))
plt.ylabel('Accuracy')
plt.xticks(np.arange(0,21,1))
plt.grid()

# Logistic regression exercises

# 1
Create a model that includes age in addition to fare and pclass. Does this model perform better than your baseline?

In [5]:
avg_age = df.age.mean()
df.age = df.age.fillna(avg_age)

df['is_female'] = (df.sex == 'female').astype('int')

dummy_df = pd.get_dummies(df[['embark_town']],dummy_na=False, drop_first=True)
df = pd.concat([df,dummy_df], axis =1)

df = df.drop(columns=["passenger_id", "deck", "class", "embarked", "sex", "embark_town"])

df.head(5)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,is_female,embark_town_Queenstown,embark_town_Southampton,embark_town_Queenstown.1,embark_town_Southampton.1
0,0,3,22.0,1,0,7.25,0,0,0,1,0,1
1,1,1,38.0,1,0,71.2833,0,1,0,0,0,0
2,1,3,26.0,0,0,7.925,1,1,0,1,0,1
3,1,1,35.0,1,0,53.1,0,1,0,1,0,1
4,0,3,35.0,0,0,8.05,1,0,0,1,0,1


In [6]:
df.isna().sum()

survived                   0
pclass                     0
age                        0
sibsp                      0
parch                      0
fare                       0
alone                      0
is_female                  0
embark_town_Queenstown     0
embark_town_Southampton    0
embark_town_Queenstown     0
embark_town_Southampton    0
dtype: int64

In [8]:
# Time to split!
train, test = train_test_split(df, test_size=.2, random_state=123, stratify=df.survived)
train, validate = train_test_split(train, test_size=.3, random_state=123, stratify=train.survived)

In [9]:
X_train = train.drop(columns=["survived"])
y_train = train.survived

X_validate = validate.drop(columns=["survived"])
y_validate = validate.survived

X_test = test.drop(columns=["survived"])
y_test = test.survived

In [10]:
train.survived.value_counts()

0    307
1    191
Name: survived, dtype: int64

In [11]:
baseline_accuracy = (train.survived == 0).mean()
round(baseline_accuracy, 2)

0.62

In [13]:
#create LOG regression
logit = LogisticRegression(random_state=123)

#Specify features
features = ['age','pclass','fare']

# FIT model with specified features
logit.fit(X_train[features], y_train)

# predict on same subset
y_pred = logit.predict(X_train[features])

print("Baseline is", round(baseline_accuracy, 2))
print("Logistic Regression using age, pclass, and fare features")
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train[features], y_train)))

Baseline is 0.62
Logistic Regression using age, pclass, and fare features
Accuracy of Logistic Regression classifier on training set: 0.70


# 2
nclude sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [14]:
#create LOG regression
logit1 = LogisticRegression(random_state=123)

#Specify features
features = ['age','pclass','fare','is_female']

# FIT model with specified features
logit1.fit(X_train[features], y_train)

# predict on same subset
y_pred = logit1.predict(X_train[features])


print("Logistic Regression using age, pclass, and fare features")
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit1.score(X_train[features], y_train)))

Logistic Regression using age, pclass, and fare features
Accuracy of Logistic Regression classifier on training set: 0.81


# 3
Try out other combinations of features and models.



In [17]:
#ALL FEATURES

#create LOG regression
logit2 = LogisticRegression(random_state=123)

#Specify features
#features = ['age','pclass','fare']

# FIT model with specified features
logit2.fit(X_train, y_train)

# predict on same subset
y_pred = logit2.predict(X_train)


print("Logistic Regression using age, pclass, and fare features")
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit2.score(X_train, y_train)))

Logistic Regression using age, pclass, and fare features
Accuracy of Logistic Regression classifier on training set: 0.82


In [18]:
#CLASS WEIGHT BALANCED

#create LOG regression
logit3 = LogisticRegression(random_state=123, class_weight='balanced')

#Specify features
#features = ['age','pclass','fare']

# FIT model with specified features
logit3.fit(X_train, y_train)

# predict on same subset
y_pred = logit3.predict(X_train)


print("Logistic Regression using age, pclass, and fare features")
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit3.score(X_train, y_train)))

Logistic Regression using age, pclass, and fare features
Accuracy of Logistic Regression classifier on training set: 0.81


In [19]:
#only AGE

#create LOG regression
logit4 = LogisticRegression(random_state=123)

#Specify features
features = ['age']

# FIT model with specified features
logit4.fit(X_train[features], y_train)

# predict on same subset
y_pred = logit4.predict(X_train[features])


print("Logistic Regression using age, pclass, and fare features")
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit4.score(X_train[features], y_train)))

Logistic Regression using age, pclass, and fare features
Accuracy of Logistic Regression classifier on training set: 0.62


In [21]:
#only sex

#create LOG regression
logit41 = LogisticRegression(random_state=123)

#Specify features
features = ['is_female']

# FIT model with specified features
logit41.fit(X_train[features], y_train)

# predict on same subset
y_pred = logit41.predict(X_train[features])


print("Logistic Regression using age, pclass, and fare features")
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit41.score(X_train[features], y_train)))

Logistic Regression using age, pclass, and fare features
Accuracy of Logistic Regression classifier on training set: 0.80


In [22]:
#only PCLASS

#create LOG regression
logit5 = LogisticRegression(random_state=123)

#Specify features
features = ['pclass']

# FIT model with specified features
logit5.fit(X_train[features], y_train)

# predict on same subset
y_pred = logit5.predict(X_train[features])


print("Logistic Regression using age, pclass, and fare features")
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit5.score(X_train[features], y_train)))

Logistic Regression using age, pclass, and fare features
Accuracy of Logistic Regression classifier on training set: 0.67


In [32]:
# All Features, C ~ 0
# All features, but we'll use the class_weights to hold the actual ratios
logit6 = LogisticRegression(random_state=123, C=0.00000000000000001)

logit6.fit(X_train, y_train)

y_pred = logit6.predict(X_train)
accuracy = logit6.score(X_train, y_train)

print("All Features, C hyperparameter approaching 0")
print("Baseline is", round(baseline_accuracy, 2))
print(f'Accuracy of this Logistic Regression on training set: {accuracy:.2}')

All Features, C hyperparameter approaching 0
Baseline is 0.62
Accuracy of this Logistic Regression on training set: 0.65


# 4
Use you best 3 models to predict and evaluate on your validate sample.

In [33]:
features = ["age", "pclass", "fare", "is_female"]

y_pred = logit1.predict(X_validate[features])

print('Logit1 model using age, pclass, fare, and is_female as the features')
print(classification_report(y_validate, y_pred))

Logit1 model using age, pclass, fare, and is_female as the features
              precision    recall  f1-score   support

           0       0.80      0.84      0.82       132
           1       0.72      0.67      0.70        82

    accuracy                           0.78       214
   macro avg       0.76      0.76      0.76       214
weighted avg       0.77      0.78      0.77       214



In [34]:
# Logit2 uses all features
y_pred = logit2.predict(X_validate)

print("Logit2 model using all features and all model defaults")
print(classification_report(y_validate, y_pred))

Logit2 model using all features and all model defaults
              precision    recall  f1-score   support

           0       0.80      0.86      0.83       132
           1       0.75      0.66      0.70        82

    accuracy                           0.79       214
   macro avg       0.78      0.76      0.77       214
weighted avg       0.78      0.79      0.78       214



In [35]:
# Logit3 uses all features and class_weight='balanced'
y_pred = logit3.predict(X_validate)

print("Logit3 model using all features, class_weight='balanced', and all other hyperparameters as default")
print(classification_report(y_validate, y_pred))

Logit3 model using all features, class_weight='balanced', and all other hyperparameters as default
              precision    recall  f1-score   support

           0       0.82      0.82      0.82       132
           1       0.71      0.72      0.72        82

    accuracy                           0.78       214
   macro avg       0.77      0.77      0.77       214
weighted avg       0.78      0.78      0.78       214



In [37]:
# Logit41 uses is_female
features = ['is_female']
y_pred = logit41.predict(X_validate[features])

print("Logit41 model using all features and all model defaults")
print(classification_report(y_validate, y_pred))

Logit41 model using all features and all model defaults
              precision    recall  f1-score   support

           0       0.80      0.83      0.81       132
           1       0.70      0.66      0.68        82

    accuracy                           0.76       214
   macro avg       0.75      0.74      0.74       214
weighted avg       0.76      0.76      0.76       214



# 5
Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?


In [38]:
# Logit2 uses all features
y_pred = logit2.predict(X_test)

print("Logit2 model using all features and all model defaults")
print(classification_report(y_test, y_pred))

Logit2 model using all features and all model defaults
              precision    recall  f1-score   support

           0       0.83      0.85      0.84       110
           1       0.76      0.72      0.74        69

    accuracy                           0.80       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179



# BONUS 1

How do different strategies for handling the missing values in the age column affect model performance?

In [None]:
y_pred_proba = logit3.predict_proba(X_train)


y_pred_proba = pd.DataFrame(y_pred_proba, columns = ['not-survived', 'survived'])
y_pred_proba.head()

# BONUS 2

How do different strategies for encoding sex affect model performance?

# BONUS 3

scikit-learn's LogisticRegression classifier is actually applying a regularization penalty to the coefficients by default. This penalty causes the magnitude of the coefficients in the resulting model to be smaller than they otherwise would be. This value can be modified with the C hyper parameter. Small values of C correspond to a larger penalty, and large values of C correspond to a smaller penalty.
Try out the following values for C and note how the coefficients and the model's performance on both the dataset it was trained on and on the validate split are affected. 


C
\=
.01
,
.1
,
1
,
10
,
100
,
1000

# telco

In [None]:
df = acquire.get_telco_data()
df.head(2)

In [None]:
df.drop(columns=['payment_type_id', 'internet_service_type_id', 'contract_type_id', 'customer_id'], inplace=True)

In [None]:
# Drop object columns for ddecision tree

In [None]:
df['total_charges'] = df['total_charges'].str.strip()
df = df[df.total_charges != '']

In [None]:
df.info()

In [None]:
df['total_charges'] = df.total_charges.astype(float)

In [None]:
# Convert binary categorical variables to numeric
df['gender_encoded'] = df.gender.map({'Female': 1, 'Male': 0})
df['partner_encoded'] = df.partner.map({'Yes': 1, 'No': 0})
df['dependents_encoded'] = df.dependents.map({'Yes': 1, 'No': 0})
df['phone_service_encoded'] = df.phone_service.map({'Yes': 1, 'No': 0})
df['paperless_billing_encoded'] = df.paperless_billing.map({'Yes': 1, 'No': 0})
df['churn_encoded'] = df.churn.map({'Yes': 1, 'No': 0})

In [None]:
# Get dummies for non-binary categorical variables
dummy_df = pd.get_dummies(df[['multiple_lines', \
                              'online_security', \
                              'online_backup', \
                              'device_protection', \
                              'tech_support', \
                              'streaming_tv', \
                              'streaming_movies', \
                              'contract_type', \
                              'internet_service_type', \
                              'payment_type']], dummy_na=False, \
                              drop_first=True)

In [None]:
df = pd.concat([df, dummy_df], axis=1)

In [None]:
# Drop object columns for ddecision tree

In [None]:
df.info()

In [None]:
train_validate, test = train_test_split(df, test_size=.2, 
                                        random_state=123, 
                                        stratify=df.churn)
train, validate = train_test_split(train_validate, test_size=.3, 
                                   random_state=123, 
                                   stratify=train_validate.churn)

In [None]:
train.shape, validate.shape, test.shape

In [None]:
X_train = train.drop(columns=["churn_encoded"])
y_train = train.churn_encoded

X_validate = validate.drop(columns=["churn_encoded"])
y_validate = validate.churn_encoded

X_test = test.drop(columns=["churn_encoded"])
y_test = test.churn_encoded

In [None]:
df.churn.head()

In [None]:
y_train[0:10]

In [None]:
train.churn_encoded.value_counts()

In [None]:
# The mode is a great baseline
baseline = y_train.mode()

# Produce a boolean array with True representing a match between the baseline prediction and reality
matches_baseline_prediction = (y_train == 0)

baseline_accuracy = matches_baseline_prediction.mean()
print(f"Baseline accuracy: {round(baseline_accuracy, 2)}")

# Question 2: Fit-Transform

Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [None]:
# Make the model
tree1 = DecisionTreeClassifier(max_depth=1, random_state=123)

# Fit the model (on train and only train)
tree1 = tree1.fit(X_train, y_train)

# Use the model
# We'll evaluate the model's performance on train, first
y_predictions = tree1.predict(X_train)

Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [None]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(tree1.score(X_train, y_train)))