In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import graphviz
from graphviz import Graph

import warnings
warnings.filterwarnings("ignore")

import acquire
import prepare

### Step One: Acquire

In [4]:
# Load the Titanic passenger table through the project's acquire module
titanic = acquire.get_titanic_data()
# Preview the first five rows (the default for head) as the cell output
titanic.head(5)

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


### Step Two: Prepare

#### Goals for Prepare
- Drop duplicate columns
- Set index
- Address nulls
- Split data into train, validate, test
- Encode sex & embark_town columns

In [5]:
# "embarked" and "class" repeat the information in embark_town and pclass, so remove them
titanic = titanic.drop(["embarked", "class"], axis="columns")

In [6]:
# Use passenger_id as the row index instead of keeping it as a column
titanic = titanic.set_index(keys="passenger_id")
# Preview the re-indexed frame
titanic.head(5)

Unnamed: 0_level_0,survived,pclass,sex,age,sibsp,parch,fare,deck,embark_town,alone
passenger_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,3,male,22.0,1,0,7.25,,Southampton,0
1,1,1,female,38.0,1,0,71.2833,C,Cherbourg,0
2,1,3,female,26.0,0,0,7.925,,Southampton,1
3,1,1,female,35.0,1,0,53.1,C,Southampton,0
4,0,3,male,35.0,0,0,8.05,,Southampton,1


In [7]:
# Count the missing values in every column
titanic.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
deck           688
embark_town      2
alone            0
dtype: int64

In [8]:
# deck is missing for 688 rows, which is too many to impute, so drop the column
titanic = titanic.drop("deck", axis="columns")
titanic.head(5)

Unnamed: 0_level_0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone
passenger_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0,3,male,22.0,1,0,7.25,Southampton,0
1,1,1,female,38.0,1,0,71.2833,Cherbourg,0
2,1,3,female,26.0,0,0,7.925,Southampton,1
3,1,1,female,35.0,1,0,53.1,Southampton,0
4,0,3,male,35.0,0,0,8.05,Southampton,1


In [9]:
# Code from review - compares a subgroup's distribution to the whole population.
# Here the subgroup is every passenger whose age is missing.
no_age_info = titanic.loc[titanic["age"].isna()]

# age and fare are skipped; every other column is compared
for column in titanic.columns.drop(["age", "fare"]):
    population_share = titanic[column].value_counts(normalize=True)
    no_age_share = no_age_info[column].value_counts(normalize=True)
    # One print call; the two trailing empty strings reproduce the blank separator lines
    print(column, "Population:", population_share, "No age", no_age_share, "", "", sep="\n")

survived
Population:
0    0.616162
1    0.383838
Name: survived, dtype: float64
No age
0    0.706215
1    0.293785
Name: survived, dtype: float64


pclass
Population:
3    0.551066
1    0.242424
2    0.206510
Name: pclass, dtype: float64
No age
3    0.768362
1    0.169492
2    0.062147
Name: pclass, dtype: float64


sex
Population:
male      0.647587
female    0.352413
Name: sex, dtype: float64
No age
male      0.700565
female    0.299435
Name: sex, dtype: float64


sibsp
Population:
0    0.682379
1    0.234568
2    0.031425
4    0.020202
3    0.017957
8    0.007856
5    0.005612
Name: sibsp, dtype: float64
No age
0    0.774011
1    0.146893
8    0.039548
3    0.022599
2    0.016949
Name: sibsp, dtype: float64


parch
Population:
0    0.760943
1    0.132435
2    0.089787
3    0.005612
5    0.005612
4    0.004489
6    0.001122
Name: parch, dtype: float64
No age
0    0.887006
2    0.067797
1    0.045198
Name: parch, dtype: float64


embark_town
Population:
Southampton    0.724409
Cherbou

In [11]:
# Replace missing ages with the median age.
# NOTE(review): the median is computed on the full dataset, before the
# train/validate/test split further down the notebook.
median_age = titanic["age"].median()
titanic["age"] = titanic["age"].fillna(median_age)

In [12]:
# Most frequent embarkation town (returned as a Series because ties are possible)
titanic["embark_town"].mode()

0    Southampton
dtype: object

In [15]:
#fill in embark_town with mode
# Take the fill value from the data rather than hard-coding "Southampton",
# so the cell stays correct if the underlying data changes.
embark_town_mode = titanic.embark_town.mode().iloc[0]
titanic.embark_town = titanic.embark_town.fillna(value=embark_town_mode)

In [17]:
#verify all nulls have been replaced in embark_town column
# value_counts() drops NaN by default; dropna=False makes any remaining
# null show up as its own row, so this cell actually verifies the fill.
titanic.embark_town.value_counts(dropna=False)

Southampton    646
Cherbourg      168
Queenstown      77
Name: embark_town, dtype: int64

In [18]:
# Confirm that no column has any missing values left
titanic.isnull().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embark_town    0
alone          0
dtype: int64

##### Encode

In [20]:
#encode sex and embark_town columns
# drop_first is a single boolean applied to every encoded column; the earlier
# list value only worked because a non-empty list is truthy.
dummy_df = pd.get_dummies(titanic[['sex', 'embark_town']], dummy_na=False, drop_first=True)

In [21]:
# Swap the original text columns for their encoded counterparts
titanic_without_text = titanic.drop(["sex", "embark_town"], axis="columns")
titanic = pd.concat([titanic_without_text, dummy_df], axis="columns")
titanic.head(5)

Unnamed: 0_level_0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
passenger_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,3,22.0,1,0,7.25,0,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,0,0
2,1,3,26.0,0,0,7.925,1,0,0,1
3,1,1,35.0,1,0,53.1,0,0,0,1
4,0,3,35.0,0,0,8.05,1,1,0,1


Helping to learn Recall and Precision
- Recall that "recall" is all about real positives in the denominator. Recall = TP / (TP + FN)

- Precision is about positive predictions, so all positive predictions go in the denominator. Precision = TP / (TP + FP)

#### Split the data

In [27]:
### Function from Curriculum

def train_validate_test_split(df, target, seed=123):
    '''
    Split a dataframe into train, validate and test sets, stratifying both
    splits on the target column.

    df:     dataframe to split
    target: name of the target column (used for stratification)
    seed:   integer random_state passed to both splits

    Test is 20% of the original data. The remaining 80% is split 70/30,
    so validate is .30*.80 = 24% and train is .70*.80 = 56% of the original.

    Returns train, validate and test dataframes, in that order.
    '''
    # First cut: hold out 20% of all rows as the test set
    remainder, test = train_test_split(
        df,
        test_size=0.2,
        random_state=seed,
        stratify=df[target],
    )
    # Second cut: 30% of the remaining rows become validate, the rest train
    train, validate = train_test_split(
        remainder,
        test_size=0.3,
        random_state=seed,
        stratify=remainder[target],
    )
    return train, validate, test

In [32]:
# Split the prepared data into train/validate/test, stratified on the survived target
train, validate, test = train_validate_test_split(df=titanic, target='survived', seed=123)

#### Create x & y version of train, validate, and test
- where y is a series with just the target variables & 
- x are all the features.

In [33]:
# For each split: X holds every feature column, y holds only the survived target

# train
X_train, y_train = train.drop(columns='survived'), train['survived']

# validate
X_validate, y_validate = validate.drop(columns='survived'), validate['survived']

# test
X_test, y_test = test.drop(columns='survived'), test['survived']

### Step 3. Model

#### Exercise 1. What is your baseline prediction? What is your baseline accuracy?

Remember: your baseline prediction for a classification problem is predicting the most prevalent class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [34]:
# find the baseline using mode (mode() returns a Series because ties are possible)
baseline = y_train.mode()
# The baseline prediction is the single most common class in the training target
baseline_prediction = baseline.iloc[0]

# Produce a boolean array with True representing a match between the baseline prediction and reality.
# Compare against the computed mode rather than a hard-coded 0, so the result
# stays correct if the majority class changes.
matches_baseline_prediction = y_train == baseline_prediction

# The mean of the boolean matches is the share of rows the baseline gets right
baseline_accuracy = matches_baseline_prediction.mean()
print(f"Baseline accuracy: {round(baseline_accuracy, 2)}")

Baseline accuracy: 0.62


#### Exercise 2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [57]:
# Build a depth-1 decision tree and fit it on the training split only
# (fit returns the estimator itself, so the calls can be chained)
clf = DecisionTreeClassifier(max_depth=1, random_state=123).fit(X_train, y_train)

# In-sample predictions, used by the evaluation cells below
y_pred = clf.predict(X_train)

#### Exercise 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [67]:
# Build the classification report as a dict so it can be shown as a table
report = classification_report(y_true=y_train, y_pred=y_pred, output_dict=True)
pd.DataFrame(data=report)

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.820433,0.76,0.799197,0.790217,0.797255
recall,0.863192,0.696335,0.799197,0.779764,0.799197
f1-score,0.84127,0.726776,0.799197,0.784023,0.797358
support,307.0,191.0,0.799197,498.0,498.0


In [59]:
# Render the fitted tree with graphviz (graphviz is already imported in the first cell).
# class_names are matched to the target classes in ascending order, so the
# first name labels survived == 0 and the second labels survived == 1.
# feature_names and class_names are passed as plain lists because some
# scikit-learn releases reject a pandas Index for these arguments.
dot_data = export_graphviz(
    clf,
    feature_names=list(X_train.columns),
    class_names=['No', 'Yes'],
    rounded=True,
    filled=True,
    out_file=None,
)
graph = graphviz.Source(dot_data)
# Writes titanic_decision_tree.pdf and opens it in the system viewer
graph.render('titanic_decision_tree', view=True)

'titanic_decision_tree.pdf'

In [60]:
# Per-row class probabilities on train: one column per class, in class order
y_pred_proba = clf.predict_proba(X=X_train)
y_pred_proba

array([[0.82043344, 0.17956656],
       [0.82043344, 0.17956656],
       [0.82043344, 0.17956656],
       [0.24      , 0.76      ],
       [0.24      , 0.76      ],
       [0.82043344, 0.17956656],
       [0.82043344, 0.17956656],
       [0.82043344, 0.17956656],
       [0.82043344, 0.17956656],
       [0.24      , 0.76      ],
       [0.24      , 0.76      ],
       [0.82043344, 0.17956656],
       [0.24      , 0.76      ],
       [0.82043344, 0.17956656],
       [0.82043344, 0.17956656],
       [0.82043344, 0.17956656],
       [0.82043344, 0.17956656],
       [0.24      , 0.76      ],
       [0.82043344, 0.17956656],
       [0.24      , 0.76      ],
       [0.24      , 0.76      ],
       [0.82043344, 0.17956656],
       [0.24      , 0.76      ],
       [0.24      , 0.76      ],
       [0.82043344, 0.17956656],
       [0.82043344, 0.17956656],
       [0.24      , 0.76      ],
       [0.24      , 0.76      ],
       [0.82043344, 0.17956656],
       [0.82043344, 0.17956656],
       [0.

In [63]:
# Raw count of each class in the training target
y_train.value_counts(normalize=False)

0    307
1    191
Name: survived, dtype: int64

In [61]:
# Mean accuracy of the fitted tree on the training split
train_accuracy = clf.score(X_train, y_train)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'.format(train_accuracy))

Accuracy of Decision Tree classifier on training set: 0.80


In [62]:
# Confusion matrix on train: rows are actual classes, columns are predicted classes
confusion_matrix(y_true=y_train, y_pred=y_pred)

array([[265,  42],
       [ 58, 133]])

In [64]:
# Show the confusion matrix as a DataFrame labelled with the sorted class values
labels = sorted(y_train.unique())
train_confusion = confusion_matrix(y_train, y_pred)

pd.DataFrame(train_confusion, index=labels, columns=labels)

Unnamed: 0,0,1
0,265,42
1,58,133


#### Exercise 5.  Run through steps 2-4 using a different max_depth value.

In [68]:
# Fit one tree per max_depth from 2 through 10 and print its in-sample report
for depth in range(2, 11):
    # Build and fit the model (on train and only train)
    tree = DecisionTreeClassifier(max_depth=depth, random_state=123).fit(X_train, y_train)

    # Predict on the same rows the model was trained on
    y_predictions = tree.predict(X_train)

    # Compare the actual y values with this model's predictions
    report = classification_report(y_train, y_predictions, output_dict=True)

    # Heading, report table, then a blank separator line
    print(f"Tree with max depth of {depth}", pd.DataFrame(report), "", sep="\n")

Tree with max depth of 2
                    0           1  accuracy   macro avg  weighted avg
precision    0.820433    0.760000  0.799197    0.790217      0.797255
recall       0.863192    0.696335  0.799197    0.779764      0.799197
f1-score     0.841270    0.726776  0.799197    0.784023      0.797358
support    307.000000  191.000000  0.799197  498.000000    498.000000

Tree with max depth of 3
                    0           1  accuracy   macro avg  weighted avg
precision    0.829341    0.817073  0.825301    0.823207      0.824636
recall       0.902280    0.701571  0.825301    0.801925      0.825301
f1-score     0.864275    0.754930  0.825301    0.809602      0.822337
support    307.000000  191.000000  0.825301  498.000000    498.000000

Tree with max depth of 4
                    0           1  accuracy   macro avg  weighted avg
precision    0.831858    0.842767  0.835341    0.837313      0.836042
recall       0.918567    0.701571  0.835341    0.810069      0.835341
f1-score     

In [71]:
# compare in-sample to out-of-sample accuracy for max_depth 2 through 24
metrics = []

for depth in range(2, 25):
    # Build and fit the model (on train and only train)
    tree = DecisionTreeClassifier(max_depth=depth, random_state=123).fit(X_train, y_train)

    # Record accuracy on the rows the tree has seen (train) and on unseen rows (validate)
    metrics.append({
        "max_depth": depth,
        "train_accuracy": tree.score(X_train, y_train),
        "validate_accuracy": tree.score(X_validate, y_validate),
    })

# One row per depth; difference is the gap between train and validate accuracy
df = pd.DataFrame(metrics).assign(
    difference=lambda scores: scores.train_accuracy - scores.validate_accuracy
)
df

Unnamed: 0,max_depth,train_accuracy,validate_accuracy,difference
0,2,0.799197,0.761682,0.037515
1,3,0.825301,0.799065,0.026236
2,4,0.835341,0.794393,0.040949
3,5,0.853414,0.799065,0.054348
4,6,0.865462,0.78972,0.075742
5,7,0.883534,0.780374,0.10316
6,8,0.899598,0.78972,0.109879
7,9,0.917671,0.799065,0.118605
8,10,0.937751,0.785047,0.152704
9,11,0.955823,0.813084,0.142739


#### Exercise 6.  Which model performs better on your in-sample data?
- Model 22 performs best on train data, with a max depth of 24.

#### Exercise 7.  Which model performs best on your out-of-sample data, the validate set?
- Model 11 performs best on validate data, with a max depth of 13.

# Random Forest Exercises

In [73]:
from sklearn.ensemble import RandomForestClassifier

#### Exercise 1:
Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [75]:
# Configure the random forest: 100 trees, depth capped at 10, leaves may hold a
# single sample, and a fixed seed so the fit is repeatable
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_leaf=1,
    criterion='gini',
    bootstrap=True,
    class_weight=None,
    random_state=123,
)

In [76]:
# Train the forest on the training split only
rf.fit(X=X_train, y=y_train)

RandomForestClassifier(max_depth=10, random_state=123)