In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from pydataset import data
import acquire
import prepare
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

# Using the titanic data, in your classification-exercises repository, create a notebook, decision_tree.ipynb where you will do the following:

- ### What is your baseline prediction? What is your baseline accuracy? Remember: your baseline prediction for a classification problem is predicting the most prevalent class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [None]:
# Read the Titanic data from the local CSV file into a DataFrame
df = pd.read_csv('titanic_df.csv')

In [None]:
# Preview the first rows
df.head()

In [None]:
# Column summary of the loaded frame
df.info()

In [None]:
# Dropping the 'Unnamed: 0' column because it is useless.
# Rebinding df (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second run no longer raises KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
# Preview only; the full frame is not needed to confirm the column is gone
df.head()

In [None]:
# '0' means they didnt survive, '1' means that they did survive
# the baseline will be '0' because it is the most common; the mode
df.survived.value_counts()

In [None]:
df['baseline'] = df.survived.value_counts().idxmax()
df

In [None]:
(df.survived == df.baseline).mean()

- ### Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [None]:
# Run the project's Titanic prep step on df (see prepare.prep_titanic for what it changes)
df = prepare.prep_titanic(df)

In [None]:
# Remove the 'sex' and 'embark_town' columns from df in place.
# NOTE(review): presumably these are raw string columns that prep_titanic has already encoded — confirm.
# NOTE(review): because of inplace=True, re-running this cell raises KeyError once the columns are gone.
df.drop(columns= ['sex', 'embark_town'], inplace = True)

In [None]:
# Preview the prepared frame
df.head()

def split_data(df, target='', random_state=42):
    """Split a DataFrame into stratified train, validate and test subsets.

    Two successive splits are made: 80% / 20% into (train + validate) / test,
    then 70% / 30% of the first part into train / validate. Overall that is
    roughly 56% / 24% / 20% of the rows. Both splits stratify on ``target``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split; must contain the ``target`` column.
    target : str
        Name of the column to stratify on. The default ``''`` is only a
        placeholder; a real column name has to be passed.
    random_state : int, default 42
        Seed forwarded to both ``train_test_split`` calls.

    Returns
    -------
    tuple
        ``(train, val, test)``, in that order.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df``.
    """
    # Fail early with a readable message instead of the bare KeyError('')
    # that the column lookup produces for a missing/placeholder target.
    if target not in df.columns:
        raise KeyError(
            f"target column {target!r} is missing from df; pass target=<column name>"
        )

    # First cut: hold out 20% of the rows as the test set
    train_val, test = train_test_split(
        df,
        train_size=0.8,
        random_state=random_state,
        stratify=df[target],
    )
    # Second cut: 70% / 30% of the remainder into train / validate
    train, val = train_test_split(
        train_val,
        train_size=0.7,
        random_state=random_state,
        stratify=train_val[target],
    )
    return train, val, test

In [None]:
train, val, test= split_data(df, target='survived')

In [None]:
train.shape, val.shape, test.shape

In [None]:
train

In [None]:
df

In [None]:
# Separate the feature columns (X_*) from the 'survived' target (y_*) for each subset
X_train, y_train = train.drop(columns=['survived']), train['survived']
X_val, y_val = val.drop(columns=['survived']), val['survived']
X_test, y_test = test.drop(columns=['survived']), test['survived']

In [None]:
seed = 42
tree1 = DecisionTreeClassifier(max_depth = 3, random_state= 42)

In [None]:
tree1.fit(X_train, y_train)

In [None]:
tree1.score(X_train, y_train)

In [None]:
# Draw the fitted depth-3 tree on an explicit Axes.
# feature_names / class_names are passed as plain lists of str: some
# scikit-learn releases validate these arguments as lists and reject a
# pandas Index.
fig, ax = plt.subplots(figsize=(20, 14))
plot_tree(
    tree1,
    feature_names=X_train.columns.tolist(),
    class_names=np.array(tree1.classes_).astype('str').tolist(),
    rounded=True,
    ax=ax,
)
plt.show()

- ### Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [None]:
# In-sample model score for tree1 (computed on the training data)
tree1.score(X_train, y_train)

In [None]:
# Actual training labels next to the baseline and tree1's in-sample predictions.
y_preds = pd.DataFrame({
    'y_act': y_train,
    # Baseline = most frequent class among the training labels (replaces the literal 0)
    'baseline': y_train.mode().iloc[0],
    # Was `clf.predict(...)`: no `clf` is defined in the visible cells, so the
    # cell raised NameError. The model fitted above is `tree1`.
    'model_1': tree1.predict(X_train)
})

In [None]:
# Show the actual / baseline / model_1 columns side by side
y_preds

In [None]:
# Confusion matrix: actual labels (first argument) against model_1 predictions (second argument)
confusion_matrix(y_preds.y_act, y_preds.model_1)

- ### Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [None]:
# Classification report for model_1 on the training labels (actual first, predicted second)
print(classification_report(y_preds.y_act, y_preds.model_1))

- ### Run through steps 2-4 using a different max_depth value.

In [None]:
seed = 42
tree2 = DecisionTreeClassifier(max_depth = 10, random_state= 42)

In [None]:
tree2.fit(X_train, y_train)

In [None]:
# Draw the fitted depth-10 tree on an explicit Axes.
# feature_names / class_names are passed as plain lists of str: some
# scikit-learn releases validate these arguments as lists and reject a
# pandas Index.
fig, ax = plt.subplots(figsize=(20, 14))
plot_tree(
    tree2,
    feature_names=X_train.columns.tolist(),
    class_names=np.array(tree2.classes_).astype('str').tolist(),
    rounded=True,
    ax=ax,
)
plt.show()

In [None]:
tree2.score(X_train, y_train)

In [None]:
y_pre = pd.DataFrame({
    'y_act': y_train,
    'baseline': 0,
    'model_2': tree2.predict(X_train)
    
})

In [None]:
# Share of training rows where model_1's prediction equals the actual label
m1 = (y_preds.y_act == y_preds.model_1).mean()
m1

In [None]:
# Same measure for model_2
m2 = (y_pre.y_act == y_pre.model_2).mean()
m2

- ### Which model performs better on your in-sample data?

In [None]:
m1 = (y_preds.y_act == y_preds.model_1).mean()
m1

In [None]:
m2 = (y_pre.y_act == y_pre.model_2).mean()
m2

- ### Which model performs best on your out-of-sample data, the validate set?

# Work through these same exercises using the Telco dataset

In [None]:
# Read the Telco data from a local CSV file.
# NOTE(review): this rebinds `df`, which held the Titanic data in the cells above — confirm that is intended.
df = pd.read_csv('telco.csv')

In [2]:
telco_df = acquire.get_telco_data()

In [3]:
import prepare

In [5]:
telco_df = prepare.prep_telco_data(telco_df)

In [6]:
telco_df

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,...,streaming_tv_Yes,streaming_movies_No internet service,streaming_movies_Yes,contract_type_One year,contract_type_Two year,internet_service_type_Fiber optic,internet_service_type_None,payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check
0,Female,0,Yes,Yes,9,Yes,No,No,Yes,No,...,1,0,0,1,0,0,0,0,0,1
1,Male,0,No,No,9,Yes,Yes,No,No,No,...,0,0,1,0,0,0,0,0,0,1
2,Male,0,No,No,4,Yes,No,No,No,Yes,...,0,0,0,0,0,1,0,0,1,0
3,Male,1,Yes,No,13,Yes,No,No,Yes,Yes,...,1,0,1,0,0,1,0,0,1,0
4,Female,1,Yes,No,3,Yes,No,No,No,No,...,1,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Female,0,No,No,13,Yes,No,Yes,No,No,...,0,0,0,1,0,0,0,0,0,1
7039,Male,0,Yes,No,22,Yes,Yes,No,No,No,...,0,0,1,0,0,1,0,0,1,0
7040,Male,0,No,No,2,Yes,No,No,Yes,No,...,0,0,0,0,0,0,0,0,0,1
7041,Male,0,Yes,Yes,67,Yes,No,Yes,No,Yes,...,0,0,1,0,1,0,0,0,0,1


In [None]:
# Run the Telco prep step on `df` and display the result.
# NOTE(review): `df` is presumably the frame read from 'telco.csv' above; `telco_df` is already
# produced by the same prep function from acquire.get_telco_data() — confirm both are needed.
telco = prepare.prep_telco_data(df)
telco