In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import warnings
warnings.filterwarnings("ignore")

import acquire as aq
import prepare as pp

# Exercises

### Using the titanic data, in your classification-exercises repository, create a notebook, model.ipynb where you will do the following:

   #### What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.


In [2]:
df = aq.get_titanic_data()
df.head()

Reading from csv file...


Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [3]:
df = pp.prep_titanic(df)
df.head()

Unnamed: 0,survived,passenger_class,sex,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,male,1,0,7.25,Southampton,0,1,0,1
1,1,1,female,1,0,71.2833,Cherbourg,0,0,0,0
2,1,3,female,0,0,7.925,Southampton,1,0,0,1
3,1,1,female,1,0,53.1,Southampton,0,0,0,1
4,0,3,male,0,0,8.05,Southampton,1,1,0,1


In [4]:
df.survived.value_counts()

0    549
1    342
Name: survived, dtype: int64

In [5]:
# Our baseline prediction will be that everyone does not survive(0)

df['baseline'] = 0
df = df.drop(columns= ['fare', 'sex', 'embark_town'])
df.head()

Unnamed: 0,survived,passenger_class,sibsp,parch,alone,sex_male,embark_town_Queenstown,embark_town_Southampton,baseline
0,0,3,1,0,0,1,0,1,0
1,1,1,1,0,0,0,0,0,0
2,1,3,0,0,1,0,0,1,0
3,1,1,1,0,0,0,0,1,0
4,0,3,0,0,1,1,0,1,0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   survived                 891 non-null    int64
 1   passenger_class          891 non-null    int64
 2   sibsp                    891 non-null    int64
 3   parch                    891 non-null    int64
 4   alone                    891 non-null    int64
 5   sex_male                 891 non-null    uint8
 6   embark_town_Queenstown   891 non-null    uint8
 7   embark_town_Southampton  891 non-null    uint8
 8   baseline                 891 non-null    int64
dtypes: int64(6), uint8(3)
memory usage: 44.5 KB


In [7]:
# determine baseline accuracy

baseline_accuracy = (df.baseline == df.survived).mean()
print(f'Our baseline accuracy is {baseline_accuracy: .2%}')


Our baseline accuracy is  61.62%


In [8]:
# split data into train, validate, test

train, test = train_test_split(df, 
                               train_size = 0.8,
                               random_state=9337)
train.survived.value_counts()

0    441
1    271
Name: survived, dtype: int64

In [9]:
train, validate = train_test_split(train,
                                  train_size = 0.7,
                                  random_state=9337)

validate.survived.value_counts()

0    135
1     79
Name: survived, dtype: int64

In [10]:
test.survived.value_counts()

0    108
1     71
Name: survived, dtype: int64

In [11]:
# set as variables now for later use

x_train = train.drop(columns=['survived'])
y_train = train.survived

x_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

x_test = test.drop(columns=['survived'])
y_test = test.survived

## Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [12]:
# Construct Model

tree = DecisionTreeClassifier(max_depth=2, random_state=9337)

# Fit Model

tree = tree.fit(x_train, y_train)

# Use Model to make Predictions

y_model_1 = tree.predict(x_train)

In [17]:
print(export_text(tree, feature_names=x_train.columns.tolist()))

|--- sex_male <= 0.50
|   |--- passenger_class <= 2.50
|   |   |--- class: 1
|   |--- passenger_class >  2.50
|   |   |--- class: 0
|--- sex_male >  0.50
|   |--- passenger_class <= 1.50
|   |   |--- class: 0
|   |--- passenger_class >  1.50
|   |   |--- class: 0



## Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [18]:
print(classification_report(y_train, y_model_1))

              precision    recall  f1-score   support

           0       0.76      0.99      0.86       306
           1       0.97      0.49      0.65       192

    accuracy                           0.80       498
   macro avg       0.86      0.74      0.75       498
weighted avg       0.84      0.80      0.78       498



In [19]:
pd.DataFrame(confusion_matrix(y_train, y_model_1))

Unnamed: 0,0,1
0,303,3
1,98,94
