# Decision Tree Exercises

**Using the titanic data, in your classification-exercises repository, create a notebook, model.ipynb where you will do the following:**



In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

import graphviz
from graphviz import Graph

In [2]:
def split(df, stratify_by=None):
    """
    Crude train, validate, test split
    To stratify, send in a column name for the stratify_by argument
    """

    if stratify_by == None:
        train, test = train_test_split(df, test_size=.2, random_state=123)
        train, validate = train_test_split(train, test_size=.3, random_state=123)
    else:
        train, test = train_test_split(df, test_size=.2, random_state=123, stratify=df[stratify_by])
        train, validate = train_test_split(train, test_size=.3, random_state=123, stratify=train[stratify_by])

    return train, validate, test
# generic split code found in prepare.py

In [3]:
df = pd.read_csv('titanic_df.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


1. **What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.**



In [4]:
df["did_survive"] = df.survived == 1
df["is_female"] = df.sex == "female"
df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone,did_survive,is_female
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0,False,False
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0,True,True
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1,True,True
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0,True,True
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1,False,False


In [5]:
dummy_df = pd.get_dummies(df[["class"]], drop_first=True)
dummy_df = pd.get_dummies(df[["embark_town"]], drop_first=True)
dummy_df = pd.get_dummies(df[["embarked"]], drop_first=True)
dummy_df

Unnamed: 0,embarked_Q,embarked_S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1
...,...,...
886,0,1
887,0,1
888,0,1
889,0,0


In [6]:
df = pd.concat([df, dummy_df], axis=1)

# drop the old columns
df = df.drop(columns=["survived", 'class', 'embark_town', 'sex', 'deck', 'embarked', 'age'])
df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,pclass,sibsp,parch,fare,alone,did_survive,is_female,embarked_Q,embarked_S
0,0,0,3,1,0,7.25,0,False,False,0,1
1,1,1,1,1,0,71.2833,0,True,True,0,0
2,2,2,3,0,0,7.925,1,True,True,0,1
3,3,3,1,1,0,53.1,0,True,True,0,1
4,4,4,3,0,0,8.05,1,False,False,0,1


In [7]:
train, validate, test = split(df, stratify_by = "did_survive")
train.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,pclass,sibsp,parch,fare,alone,did_survive,is_female,embarked_Q,embarked_S
583,583,583,1,0,0,40.125,1,False,False,0,0
165,165,165,3,0,2,20.525,0,True,False,0,1
50,50,50,3,4,1,39.6875,0,False,False,0,1
259,259,259,2,0,1,26.0,0,True,True,0,1
306,306,306,1,0,0,110.8833,1,True,True,0,0


In [8]:
x_train = train.drop(columns = ['did_survive'])
y_train = train.did_survive

x_validate = validate.drop(columns=['did_survive'])
y_validate = validate.did_survive

x_test = test.drop(columns=['did_survive'])
y_test = test.did_survive

#x is feature
# y is traget variable

In [9]:
train.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,pclass,sibsp,parch,fare,alone,did_survive,is_female,embarked_Q,embarked_S
583,583,583,1,0,0,40.125,1,False,False,0,0
165,165,165,3,0,2,20.525,0,True,False,0,1
50,50,50,3,4,1,39.6875,0,False,False,0,1
259,259,259,2,0,1,26.0,0,True,True,0,1
306,306,306,1,0,0,110.8833,1,True,True,0,0


In [10]:
y_train.head()

583    False
165     True
50     False
259     True
306     True
Name: did_survive, dtype: bool

In [11]:
clf = DecisionTreeClassifier(max_depth = 3, random_state = 123)

In [12]:
df = df.dropna()

In [13]:
clf = clf.fit(x_train, y_train)

In [14]:
dot_data = export_graphviz(clf, feature_names= x_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('survival_decision_tree', view=True, format="pdf")

'survival_decision_tree.pdf'

In [15]:
y_pred = clf.predict(x_train)
y_pred[0:5]

array([False, False, False,  True,  True])

In [16]:
y_pred_proba = clf.predict_proba(x_train)
y_pred_proba[0:5]

array([[0.62222222, 0.37777778],
       [0.62222222, 0.37777778],
       [0.89285714, 0.10714286],
       [0.14814815, 0.85185185],
       [0.        , 1.        ]])

In [17]:
y_train.head(3)

583    False
165     True
50     False
Name: did_survive, dtype: bool

In [18]:
clf.score(x_train, y_train)

0.821285140562249

In [19]:
train.did_survive.value_counts()

False    307
True     191
Name: did_survive, dtype: int64

In [20]:
train['most_frequent'] = False
baseline_accuracy = (train.did_survive == train.most_frequent).mean()
baseline_accuracy.round(3)

0.616

2. **Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)**



In [21]:
clf = clf.fit(x_train, y_train)

In [22]:
dot_data = export_graphviz(clf, feature_names= x_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('survival_decision_tree', view=True, format="pdf")

'survival_decision_tree.pdf'

In [23]:
y_pred = clf.predict(x_train)
y_pred[0:5]

array([False, False, False,  True,  True])

In [24]:
y_pred_proba = clf.predict_proba(x_train)
y_pred_proba[0:5]

array([[0.62222222, 0.37777778],
       [0.62222222, 0.37777778],
       [0.89285714, 0.10714286],
       [0.14814815, 0.85185185],
       [0.        , 1.        ]])

In [25]:
y_train.head(3)

583    False
165     True
50     False
Name: did_survive, dtype: bool

In [26]:
clf.score(x_train, y_train)

0.821285140562249

In [27]:
train.did_survive.value_counts()

False    307
True     191
Name: did_survive, dtype: int64

In [28]:
train['most_frequent'] = False
baseline_accuracy = (train.did_survive == train.most_frequent).mean()
baseline_accuracy.round(3)

0.616

3. **Evaluate your in-sample results using the model score, confusion matrix, and classification report.**



In [29]:
# model score
clf.score(x_validate, y_validate)

0.7757009345794392

In [30]:
# confusion matrix
confusion_matrix(y_train, y_pred)

array([[274,  33],
       [ 56, 135]])

In [31]:
# confusion matrix
y_train.value_counts()

False    307
True     191
Name: did_survive, dtype: int64

In [32]:
# confusion matrix
labels = sorted(y_train.unique())
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,False,True
False,274,33
True,56,135


In [33]:
# Classification Report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

       False       0.83      0.89      0.86       307
        True       0.80      0.71      0.75       191

    accuracy                           0.82       498
   macro avg       0.82      0.80      0.81       498
weighted avg       0.82      0.82      0.82       498



4. **Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.**



In [34]:
tn, fp, fn, tp = confusion_matrix(y_train, y_pred).ravel()
tn, fp, fn, tp

(274, 33, 56, 135)

In [35]:
sorted(('didnt_survive', 'survived'))

['didnt_survive', 'survived']

In [36]:
print("True Positives", tp)
print("False Positives", fp)
print("False Negatives", fn)
print("True Negatives", tn)

print("-------------")

accuracy = (tp + tn) / (tp + tn + fp + fn)
recall = tp / (tp + fn)
precision = tp / (tp + fp)

print("Accuracy is", accuracy)
print("Recall is", recall)
print("Precision is", precision)

True Positives 135
False Positives 33
False Negatives 56
True Negatives 274
-------------
Accuracy is 0.821285140562249
Recall is 0.7068062827225131
Precision is 0.8035714285714286


5. **Run through steps 2-4 using a different max_depth value.**



In [37]:
clf = DecisionTreeClassifier(max_depth = 5, random_state = 123)

In [38]:
df = df.dropna()

In [39]:
clf = clf.fit(x_train, y_train)

In [40]:
dot_data = export_graphviz(clf, feature_names= x_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('survival_decision_tree', view=True, format="pdf")

'survival_decision_tree.pdf'

In [41]:
y_pred = clf.predict(x_train)
y_pred[0:5]

array([False, False, False,  True,  True])

In [42]:
y_pred_proba = clf.predict_proba(x_train)
y_pred_proba[0:5]

array([[0.5942029 , 0.4057971 ],
       [0.78947368, 0.21052632],
       [1.        , 0.        ],
       [0.2       , 0.8       ],
       [0.        , 1.        ]])

In [43]:
clf.score(x_train, y_train)

0.8493975903614458

In [44]:
train.did_survive.value_counts()

False    307
True     191
Name: did_survive, dtype: int64

In [45]:
train['most_frequent'] = False
baseline_accuracy = (train.did_survive == train.most_frequent).mean()
baseline_accuracy.round(3)

0.616

In [46]:
# model score
clf.score(x_validate, y_validate)

0.7663551401869159

In [47]:
# confusion matrix
confusion_matrix(y_train, y_pred)

array([[282,  25],
       [ 50, 141]])

In [48]:
# confusion matrix
y_train.value_counts()

False    307
True     191
Name: did_survive, dtype: int64

In [49]:
# confusion matrix
labels = sorted(y_train.unique())
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,False,True
False,282,25
True,50,141


In [50]:
# Classification Report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

       False       0.85      0.92      0.88       307
        True       0.85      0.74      0.79       191

    accuracy                           0.85       498
   macro avg       0.85      0.83      0.84       498
weighted avg       0.85      0.85      0.85       498



In [51]:
tn, fp, fn, tp = confusion_matrix(y_train, y_pred).ravel()
tn, fp, fn, tp

(282, 25, 50, 141)

In [52]:
sorted(('didnt_survive', 'survived'))

['didnt_survive', 'survived']

In [53]:
print("True Positives", tp)
print("False Positives", fp)
print("False Negatives", fn)
print("True Negatives", tn)

print("-------------")

accuracy = (tp + tn) / (tp + tn + fp + fn)
recall = tp / (tp + fn)
precision = tp / (tp + fp)

print("Accuracy is", accuracy)
print("Recall is", recall)
print("Precision is", precision)

True Positives 141
False Positives 25
False Negatives 50
True Negatives 282
-------------
Accuracy is 0.8493975903614458
Recall is 0.7382198952879581
Precision is 0.8493975903614458


6. **Which model performs better on your in-sample data?**



One used in questions 1-4

7. **Which model performs best on your out-of-sample data, the validate set?**

One used in questions 1-4

# Random Forest Exercises

In [54]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier # this is a whole library containing a bunch random forest is just one of many
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from pydataset import data # grabbing iris dataset

df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,pclass,sibsp,parch,fare,alone,did_survive,is_female,embarked_Q,embarked_S
0,0,0,3,1,0,7.25,0,False,False,0,1
1,1,1,1,1,0,71.2833,0,True,True,0,0
2,2,2,3,0,0,7.925,1,True,True,0,1
3,3,3,1,1,0,53.1,0,True,True,0,1
4,4,4,3,0,0,8.05,1,False,False,0,1


In [55]:
from sklearn.model_selection import train_test_split

def train_validate_test_split(df, target, seed=123):
    '''
    This function takes in a dataframe, the name of the target variable
    (for stratification purposes), and an integer for a setting a seed
    and splits the data into train, validate and test. 
    Test is 20% of the original dataset, validate is .30*.80= 24% of the 
    original dataset, and train is .70*.80= 56% of the original dataset. 
    The function returns, in this order, train, validate and test dataframes. 
    '''
    train_validate, test = train_test_split(df, test_size=0.2, 
                                            random_state=seed, 
                                            stratify=df[target])
    train, validate = train_test_split(train_validate, test_size=0.3, 
                                       random_state=seed,
                                       stratify=train_validate[target])
    return train, validate, test

In [56]:
# split into train, validate, test
train, validate, test = train_validate_test_split(df, target='did_survive', seed=123)

# create X & y version of train, where y is a series with just the target variable and X are all the features. 

X_train = train.drop(columns=['did_survive'])
y_train = train.did_survive

X_validate = validate.drop(columns=['did_survive'])
y_validate = validate.did_survive

X_test = test.drop(columns=['did_survive'])
y_test = test.did_survive

1. **Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.**



In [57]:
rf = RandomForestClassifier(min_samples_leaf=1,
                            max_depth=10, 
                            random_state=123)

In [58]:
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, random_state=123)

In [59]:
print(rf.feature_importances_)

[0.15236521 0.15136192 0.08397854 0.04800557 0.02988227 0.18566693
 0.01819858 0.29792658 0.01159102 0.02102339]


In [60]:
y_pred = rf.predict(x_train)
y_pred

array([False,  True, False,  True,  True, False, False, False, False,
       False, False, False,  True, False, False, False, False,  True,
       False, False,  True, False,  True,  True, False, False,  True,
       False, False, False, False, False, False,  True,  True, False,
       False,  True, False,  True, False, False, False,  True, False,
       False, False, False,  True, False, False, False,  True,  True,
       False,  True, False,  True,  True, False, False, False, False,
       False, False, False, False, False,  True,  True, False, False,
        True,  True, False, False, False, False, False,  True, False,
        True, False, False, False, False,  True,  True,  True, False,
       False, False,  True,  True, False, False,  True, False, False,
        True, False, False,  True, False, False,  True, False,  True,
       False, False,  True,  True, False, False, False,  True, False,
       False,  True,  True, False, False, False, False,  True,  True,
        True, False,

In [61]:
y_pred_proba = rf.predict_proba(x_train)
y_pred_proba

array([[0.76216391, 0.23783609],
       [0.34876278, 0.65123722],
       [0.98888889, 0.01111111],
       [0.03791408, 0.96208592],
       [0.00615385, 0.99384615],
       [0.87111082, 0.12888918],
       [0.74752952, 0.25247048],
       [0.94055796, 0.05944204],
       [0.95050827, 0.04949173],
       [0.9375    , 0.0625    ],
       [0.64356736, 0.35643264],
       [0.50914161, 0.49085839],
       [0.01999741, 0.98000259],
       [0.87520503, 0.12479497],
       [0.74790871, 0.25209129],
       [0.66921037, 0.33078963],
       [0.98802437, 0.01197563],
       [0.00529412, 0.99470588],
       [0.95766021, 0.04233979],
       [0.77685165, 0.22314835],
       [0.1807803 , 0.8192197 ],
       [0.9499577 , 0.0500423 ],
       [0.00615385, 0.99384615],
       [0.10155556, 0.89844444],
       [0.68250288, 0.31749712],
       [0.91983515, 0.08016485],
       [0.071     , 0.929     ],
       [0.73463596, 0.26536404],
       [0.91975022, 0.08024978],
       [0.89629637, 0.10370363],
       [0.

2. **Evaluate your results using the model score, confusion matrix, and classification report.**



In [62]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.97


In [63]:
print(confusion_matrix(y_train, y_pred))

[[307   0]
 [ 16 175]]


In [64]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

       False       0.95      1.00      0.97       307
        True       1.00      0.92      0.96       191

    accuracy                           0.97       498
   macro avg       0.98      0.96      0.97       498
weighted avg       0.97      0.97      0.97       498



3. **Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.**



In [65]:
tn, fp, fn, tp = confusion_matrix(y_train, y_pred).ravel()
tn, fp, fn, tp

(307, 0, 16, 175)

In [66]:
print("True Positives", tp)
print("False Positives", fp)
print("False Negatives", fn)
print("True Negatives", tn)

print("-------------")

accuracy = (tp + tn) / (tp + tn + fp + fn)
recall = tp / (tp + fn)
precision = tp / (tp + fp)
f1_score = (2 * precision * recall) / (precision + recall)

print("Accuracy is", accuracy.round(3))
print("Recall is", recall.round(3))
print("Precision is", precision.round(3))
print("F1 score is", f1_score.round(3))

True Positives 175
False Positives 0
False Negatives 16
True Negatives 307
-------------
Accuracy is 0.968
Recall is 0.916
Precision is 1.0
F1 score is 0.956


4. **Run through steps increasing your min_samples_leaf and decreasing your max_depth.**



In [67]:
rf = RandomForestClassifier(min_samples_leaf=5,
                            max_depth=5, 
                            random_state=123)

In [68]:
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=5, min_samples_leaf=5, random_state=123)

In [69]:
print(rf.feature_importances_)

[0.07355376 0.06278925 0.13126008 0.04440475 0.02532411 0.15201251
 0.01756746 0.45957497 0.01462573 0.01888739]


In [70]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

       False       0.95      1.00      0.97       307
        True       1.00      0.92      0.96       191

    accuracy                           0.97       498
   macro avg       0.98      0.96      0.97       498
weighted avg       0.97      0.97      0.97       498



In [71]:
tn, fp, fn, tp = confusion_matrix(y_train, y_pred).ravel()
tn, fp, fn, tp

(307, 0, 16, 175)

In [72]:
print("True Positives", tp)
print("False Positives", fp)
print("False Negatives", fn)
print("True Negatives", tn)

print("-------------")

accuracy = (tp + tn) / (tp + tn + fp + fn)
recall = tp / (tp + fn)
precision = tp / (tp + fp)
f1_score = (2 * precision * recall) / (precision + recall)

print("Accuracy is", accuracy.round(3))
print("Recall is", recall.round(3))
print("Precision is", precision.round(3))
print("F1 score is", f1_score.round(3))

True Positives 175
False Positives 0
False Negatives 16
True Negatives 307
-------------
Accuracy is 0.968
Recall is 0.916
Precision is 1.0
F1 score is 0.956


5. **What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?**



In [73]:
print('Accuracy of in sample data set of Questions 1-4: {:.2f}'
     .format(rf.score(x_train, y_train)))

Accuracy of in sample data set of Questions 1-4: 0.84


In [74]:
print('Classification Report of in sample data set of Question 4: {:.2f}'
     .format(rf.score(x_train, y_train)))

Classification Report of in sample data set of Question 4: 0.84


6. **After making a few models, which one has the best performance (or closest metrics) on both train and validate?**

    - Second model worked out the best when it came down to accuracy

## Another way to Complete Random Forest Exercises as Given by Instructors:

In [75]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from acquire import get_titanic_data
from prepare import prep_titanic_data, train_validate_test_split

In [76]:
df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,pclass,sibsp,parch,fare,alone,did_survive,is_female,embarked_Q,embarked_S
0,0,0,3,1,0,7.25,0,False,False,0,1
1,1,1,1,1,0,71.2833,0,True,True,0,0
2,2,2,3,0,0,7.925,1,True,True,0,1
3,3,3,1,1,0,53.1,0,True,True,0,1
4,4,4,3,0,0,8.05,1,False,False,0,1


In [77]:
# take a look to make sure everything has acquired successfully
train.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,pclass,sibsp,parch,fare,alone,did_survive,is_female,embarked_Q,embarked_S
583,583,583,1,0,0,40.125,1,False,False,0,0
165,165,165,3,0,2,20.525,0,True,False,0,1
50,50,50,3,4,1,39.6875,0,False,False,0,1
259,259,259,2,0,1,26.0,0,True,True,0,1
306,306,306,1,0,0,110.8833,1,True,True,0,0


In [78]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 498 entries, 583 to 744
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    498 non-null    int64  
 1   passenger_id  498 non-null    int64  
 2   pclass        498 non-null    int64  
 3   sibsp         498 non-null    int64  
 4   parch         498 non-null    int64  
 5   fare          498 non-null    float64
 6   alone         498 non-null    int64  
 7   did_survive   498 non-null    bool   
 8   is_female     498 non-null    bool   
 9   embarked_Q    498 non-null    uint8  
 10  embarked_S    498 non-null    uint8  
dtypes: bool(2), float64(1), int64(6), uint8(2)
memory usage: 33.1 KB


In [79]:
# we will do our extra sklearn imports here since we have defined our function in-line
from sklearn.metrics import confusion_matrix, classification_report

In [80]:
# taken from eval.py, where we have modified our function for applicability

def get_metrics_bin(clf, X, y):
    '''
    get_metrics_bin will take in a sklearn classifier model, an X and a y variable and utilize
    the model to make a prediction and then gather accuracy, class report evaluations

    return:  a classification report as a pandas DataFrame
    '''
    y_pred = clf.predict(X)
    accuracy = clf.score(X, y)
    conf = confusion_matrix(y, y_pred)
    class_report = pd.DataFrame(classification_report(y, y_pred, output_dict=True)).T
    tpr = conf[1][1] / conf[1].sum()
    fpr = conf[0][1] / conf[0].sum()
    tnr = conf[0][0] / conf[0].sum()
    fnr = conf[1][0] / conf[1].sum()
    print(f'''
    The accuracy for our model is {accuracy:.4}
    The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
    The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
    ''')
    return class_report

In [81]:
# create the Random Forest Model
clf = RandomForestClassifier(min_samples_leaf=1, max_depth=10, random_state=1349)

In [82]:
# fit the model after splitting our X and y
X_train, y_train = train.drop(columns='did_survive'), train.did_survive

In [83]:
clf.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, random_state=1349)

In [84]:
y_pred = clf.predict(X_train)

In [85]:
# call our function to get our metrics!
# 
class_report = get_metrics_bin(clf, X_train, y_train)


    The accuracy for our model is 0.9699
    The True Positive Rate is 0.921, The False Positive Rate is 0.0,
    The True Negative Rate is 1.0, and the False Negative Rate is 0.0785
    


In [86]:
class_report
# target is survival, with a binary 0 representing a passenger that did not surivive the titanic wreck
# and 1 representing a survivor

Unnamed: 0,precision,recall,f1-score,support
False,0.953416,1.0,0.976153,307.0
True,1.0,0.921466,0.959128,191.0
accuracy,0.96988,0.96988,0.96988,0.96988
macro avg,0.976708,0.960733,0.96764,498.0
weighted avg,0.971283,0.96988,0.969623,498.0


In [87]:
# create the Random Forest Model
clf1 = RandomForestClassifier(min_samples_leaf=3, max_depth=3, random_state=1349)

In [88]:
# fit the model
clf1.fit(X_train, y_train)

RandomForestClassifier(max_depth=3, min_samples_leaf=3, random_state=1349)

In [89]:
class_report1 = get_metrics_bin(clf1, X_train, y_train)


    The accuracy for our model is 0.8273
    The True Positive Rate is 0.649, The False Positive Rate is 0.0619,
    The True Negative Rate is 0.938, and the False Negative Rate is 0.351
    


In [90]:
X_val, y_val = validate.drop(columns='did_survive'), validate.did_survive

In [91]:
print('Model #1: min samples 1, max depth 10')
class_report_val = get_metrics_bin(clf, X_val, y_val)
print('-------------------------------------------\n Model #2: min samples 3, max_depth 3\n')
class_report_val1 = get_metrics_bin(clf1, X_val, y_val)

Model #1: min samples 1, max depth 10

    The accuracy for our model is 0.743
    The True Positive Rate is 0.573, The False Positive Rate is 0.152,
    The True Negative Rate is 0.848, and the False Negative Rate is 0.427
    
-------------------------------------------
 Model #2: min samples 3, max_depth 3


    The accuracy for our model is 0.7897
    The True Positive Rate is 0.61, The False Positive Rate is 0.0985,
    The True Negative Rate is 0.902, and the False Negative Rate is 0.39
    


In [92]:
conf = confusion_matrix(y_train, y_pred)
conf

array([[307,   0],
       [ 15, 176]])

In [93]:
# make a key for reference
rubric_df = pd.DataFrame([['true negative', 'false positive'],['false negative', 'true positive']], columns=['predict_death', 'predict_survive'], index=['actual_death', 'actual_survive'])

In [94]:
rubric_df

Unnamed: 0,predict_death,predict_survive
actual_death,true negative,false positive
actual_survive,false negative,true positive


In [95]:
# accuracy:
# accuracy = (true positives + true negatives) / (true positives + true negatives + false positives + false negatives)

# True Positive Rate: Sensitivity
# RECALL for the positive class --> out of those that actually survived, how many did we predict would survive?
# TPR = true positives / (true positives + false negatives)
#  If we wanted to calculate PRECISION, it would be true positives / (true positives + false postives)
# Recall being true positives over the sum of the row, precision being the true positive over the sum of the column
# i.e, out of the values we predicted survived, how many were actual survivors?

# False Positive Rate: 
# FPR = false positives / (false positive + true negatives)

# True Negative Rate: Specificity
# Recall for the negative class --> out of those that perished, how many did we predict would not make it?
# TNR = true negatives / (true negatives + false positives)

# False negative rate:
# FNR = false negatives / (false negatives + true positives)

In [96]:
clf.predict_proba(X_train)

array([[0.87891384, 0.12108616],
       [0.38427991, 0.61572009],
       [0.99      , 0.01      ],
       [0.0445    , 0.9555    ],
       [0.01342105, 0.98657895],
       [0.86103992, 0.13896008],
       [0.78387782, 0.21612218],
       [0.97138913, 0.02861087],
       [0.95827289, 0.04172711],
       [0.93666667, 0.06333333],
       [0.61293687, 0.38706313],
       [0.50849842, 0.49150158],
       [0.02116667, 0.97883333],
       [0.8921534 , 0.1078466 ],
       [0.82997824, 0.17002176],
       [0.65049517, 0.34950483],
       [0.98407752, 0.01592248],
       [0.        , 1.        ],
       [0.96664065, 0.03335935],
       [0.79077778, 0.20922222],
       [0.21762338, 0.78237662],
       [0.94985071, 0.05014929],
       [0.01096618, 0.98903382],
       [0.08      , 0.92      ],
       [0.63185825, 0.36814175],
       [0.91691129, 0.08308871],
       [0.14      , 0.86      ],
       [0.64958772, 0.35041228],
       [0.92190762, 0.07809238],
       [0.88642954, 0.11357046],
       [0.

In [97]:
my_preds = clf.predict_proba(X_train)[:,1]

In [98]:
my_preds < .7

array([ True,  True,  True, False, False,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True, False,
        True,  True, False,  True, False, False,  True,  True, False,
        True,  True,  True,  True,  True,  True, False, False,  True,
        True, False,  True, False,  True,  True,  True, False,  True,
        True,  True,  True, False,  True,  True,  True, False, False,
        True, False,  True, False, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
       False, False,  True,  True,  True,  True,  True, False,  True,
       False,  True,  True,  True,  True, False, False, False,  True,
        True,  True, False, False,  True,  True, False,  True,  True,
       False,  True,  True, False,  True,  True, False,  True, False,
        True,  True, False, False,  True,  True,  True, False,  True,
        True, False, False,  True,  True,  True,  True,  True, False,
       False,  True,

# KNN Exercises

In [99]:
df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,pclass,sibsp,parch,fare,alone,did_survive,is_female,embarked_Q,embarked_S
0,0,0,3,1,0,7.25,0,False,False,0,1
1,1,1,1,1,0,71.2833,0,True,True,0,0
2,2,2,3,0,0,7.925,1,True,True,0,1
3,3,3,1,1,0,53.1,0,True,True,0,1
4,4,4,3,0,0,8.05,1,False,False,0,1


In [100]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from pydataset import data

In [101]:
from sklearn.model_selection import train_test_split

def train_validate_test_split(df, target, seed=123):
    '''
    This function takes in a dataframe, the name of the target variable
    (for stratification purposes), and an integer for a setting a seed
    and splits the data into train, validate and test. 
    Test is 20% of the original dataset, validate is .30*.80= 24% of the 
    original dataset, and train is .70*.80= 56% of the original dataset. 
    The function returns, in this order, train, validate and test dataframes. 
    '''
    train_validate, test = train_test_split(df, test_size=0.2, 
                                            random_state=seed, 
                                            stratify=df[target])
    train, validate = train_test_split(train_validate, test_size=0.3, 
                                       random_state=seed,
                                       stratify=train_validate[target])
    return train, validate, test

In [102]:
train, validate, test = train_validate_test_split(df, target='did_survive', seed=123)

X_train = train.drop(columns=['did_survive'])
y_train = train.did_survive

X_validate = validate.drop(columns=['did_survive'])
y_validate = validate.did_survive

X_test = test.drop(columns=['did_survive'])
y_test = test.did_survive

In [103]:
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')

1. **Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)**



In [104]:
knn.fit(X_train, y_train)

KNeighborsClassifier()

In [105]:
y_pred = knn.predict(X_train)

In [106]:
y_pred_proba = knn.predict_proba(X_train)

2. **Evaluate your results using the model score, confusion matrix, and classification report.**



In [107]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))

Accuracy of KNN classifier on training set: 0.74


In [108]:
print(confusion_matrix(y_train, y_pred))

[[269  38]
 [ 91 100]]


In [109]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

       False       0.75      0.88      0.81       307
        True       0.72      0.52      0.61       191

    accuracy                           0.74       498
   macro avg       0.74      0.70      0.71       498
weighted avg       0.74      0.74      0.73       498



3. **Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.**



In [110]:
tn, fp, fn, tp = confusion_matrix(y_train, y_pred).ravel()
accuracy = (tp + tn) / (tp + tn + fp + fn)
recall = tp / (tp + fn)
precision = tp / (tp + fp)
f1_score = (2 * precision * recall) / (precision + recall)
conf = confusion_matrix(y_train, y_pred)
class_report = pd.DataFrame(classification_report(y_train, y_pred, output_dict=True)).T


print(f'The True Negative Rate is: ', tn)
print(f'The False Positive Rate is: ', fp)
print(f'The False Negative Rate is: ', fn)
print(f'The True Positive Rate is:', tp)
print("Accuracy is", accuracy.round(3))
print("Recall is", recall.round(3))
print("Precision is", precision.round(3))
print("F1 score is", f1_score.round(3))

The True Negative Rate is:  269
The False Positive Rate is:  38
The False Negative Rate is:  91
The True Positive Rate is: 100
Accuracy is 0.741
Recall is 0.524
Precision is 0.725
F1 score is 0.608


4. **Run through steps 2-4 setting k to 10**



In [111]:
knn = KNeighborsClassifier(n_neighbors=10, weights='uniform')

In [112]:
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=10)

In [113]:
y_pred = knn.predict(X_train)

In [114]:
y_pred_proba = knn.predict_proba(X_train)

In [115]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))

Accuracy of KNN classifier on training set: 0.70


In [116]:
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=10)

In [117]:
print(confusion_matrix(y_train, y_pred))

[[286  21]
 [126  65]]


In [118]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

       False       0.69      0.93      0.80       307
        True       0.76      0.34      0.47       191

    accuracy                           0.70       498
   macro avg       0.72      0.64      0.63       498
weighted avg       0.72      0.70      0.67       498



In [119]:
tn, fp, fn, tp = confusion_matrix(y_train, y_pred).ravel()
accuracy = (tp + tn) / (tp + tn + fp + fn)
recall = tp / (tp + fn)
precision = tp / (tp + fp)
f1_score = (2 * precision * recall) / (precision + recall)
conf = confusion_matrix(y_train, y_pred)
class_report = pd.DataFrame(classification_report(y_train, y_pred, output_dict=True)).T


print(f'The True Negative Rate is: ', tn)
print(f'The False Positive Rate is: ', fp)
print(f'The False Negative Rate is: ', fn)
print(f'The True Positive Rate is:', tp)
print("Accuracy is", accuracy.round(3))
print("Recall is", recall.round(3))
print("Precision is", precision.round(3))
print("F1 score is", f1_score.round(3))

The True Negative Rate is:  286
The False Positive Rate is:  21
The False Negative Rate is:  126
The True Positive Rate is: 65
Accuracy is 0.705
Recall is 0.34
Precision is 0.756
F1 score is 0.469


5. **Run through setps 2-4 setting k to 20**



In [120]:
knn = KNeighborsClassifier(n_neighbors=20, weights='uniform')

In [121]:
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=20)

In [122]:
y_pred = knn.predict(X_train)

In [123]:
y_pred_proba = knn.predict_proba(X_train)

In [124]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))

Accuracy of KNN classifier on training set: 0.68


In [125]:
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=20)

In [126]:
print(confusion_matrix(y_train, y_pred))

[[286  21]
 [137  54]]


In [127]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

       False       0.68      0.93      0.78       307
        True       0.72      0.28      0.41       191

    accuracy                           0.68       498
   macro avg       0.70      0.61      0.59       498
weighted avg       0.69      0.68      0.64       498



In [128]:
tn, fp, fn, tp = confusion_matrix(y_train, y_pred).ravel()
accuracy = (tp + tn) / (tp + tn + fp + fn)
recall = tp / (tp + fn)
precision = tp / (tp + fp)
f1_score = (2 * precision * recall) / (precision + recall)
conf = confusion_matrix(y_train, y_pred)
class_report = pd.DataFrame(classification_report(y_train, y_pred, output_dict=True)).T


print(f'The True Negative Rate is: ', tn)
print(f'The False Positive Rate is: ', fp)
print(f'The False Negative Rate is: ', fn)
print(f'The True Positive Rate is:', tp)
print("Accuracy is", accuracy.round(3))
print("Recall is", recall.round(3))
print("Precision is", precision.round(3))
print("F1 score is", f1_score.round(3))

The True Negative Rate is:  286
The False Positive Rate is:  21
The False Negative Rate is:  137
The True Positive Rate is: 54
Accuracy is 0.683
Recall is 0.283
Precision is 0.72
F1 score is 0.406


6. **What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?**



7. **Which model performs best on our out-of-sample data from validate?**

## Another way to do it

In [129]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
import acquire
import matplotlib.pyplot as plt

df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,pclass,sibsp,parch,fare,alone,did_survive,is_female,embarked_Q,embarked_S
0,0,0,3,1,0,7.25,0,False,False,0,1
1,1,1,1,1,0,71.2833,0,True,True,0,0
2,2,2,3,0,0,7.925,1,True,True,0,1
3,3,3,1,1,0,53.1,0,True,True,0,1
4,4,4,3,0,0,8.05,1,False,False,0,1


In [130]:
from sklearn.model_selection import train_test_split

def train_validate_test_split(df, target, seed=123):
    '''
    This function takes in a dataframe, the name of the target variable
    (for stratification purposes), and an integer for a setting a seed
    and splits the data into train, validate and test. 
    Test is 20% of the original dataset, validate is .30*.80= 24% of the 
    original dataset, and train is .70*.80= 56% of the original dataset. 
    The function returns, in this order, train, validate and test dataframes. 
    '''
    train_validate, test = train_test_split(df, test_size=0.2, 
                                            random_state=seed, 
                                            stratify=df[target])
    train, validate = train_test_split(train_validate, test_size=0.3, 
                                       random_state=seed,
                                       stratify=train_validate[target])
    return train, validate, test

In [131]:
train.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,pclass,sibsp,parch,fare,alone,did_survive,is_female,embarked_Q,embarked_S
583,583,583,1,0,0,40.125,1,False,False,0,0
165,165,165,3,0,2,20.525,0,True,False,0,1
50,50,50,3,4,1,39.6875,0,False,False,0,1
259,259,259,2,0,1,26.0,0,True,True,0,1
306,306,306,1,0,0,110.8833,1,True,True,0,0


In [132]:
validate.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,pclass,sibsp,parch,fare,alone,did_survive,is_female,embarked_Q,embarked_S
610,610,610,3,1,5,31.275,0,False,True,0,1
424,424,424,3,1,1,20.2125,0,False,False,0,1
568,568,568,3,0,0,7.2292,1,False,False,0,0
334,334,334,1,1,0,133.65,0,True,True,0,1
101,101,101,3,0,0,7.8958,1,False,False,0,1


In [133]:
test.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,pclass,sibsp,parch,fare,alone,did_survive,is_female,embarked_Q,embarked_S
561,561,561,3,0,0,7.8958,1,False,False,0,1
641,641,641,1,0,0,69.3,1,True,True,0,0
400,400,400,3,0,0,7.925,1,True,False,0,1
498,498,498,1,1,2,151.55,0,False,True,0,1
875,875,875,3,0,0,7.225,1,True,True,0,0


In [134]:
# split our X and y's:
X_train, y_train = train.drop(columns='did_survive'), train['did_survive']
X_validate, y_validate = validate.drop(columns='did_survive'), train['did_survive']

In [135]:
X_train.info()
# still has 8 features

<class 'pandas.core.frame.DataFrame'>
Int64Index: 498 entries, 583 to 744
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    498 non-null    int64  
 1   passenger_id  498 non-null    int64  
 2   pclass        498 non-null    int64  
 3   sibsp         498 non-null    int64  
 4   parch         498 non-null    int64  
 5   fare          498 non-null    float64
 6   alone         498 non-null    int64  
 7   is_female     498 non-null    bool   
 8   embarked_Q    498 non-null    uint8  
 9   embarked_S    498 non-null    uint8  
dtypes: bool(1), float64(1), int64(6), uint8(2)
memory usage: 32.6 KB


In [136]:
# make our model
clf = KNeighborsClassifier()

In [137]:
X_train.shape

(498, 10)

In [138]:
y_train.shape

(498,)

In [139]:
# fit the model
clf.fit(X_train[['is_female', 'pclass']], y_train)

KNeighborsClassifier()

In [140]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 498 entries, 583 to 744
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    498 non-null    int64  
 1   passenger_id  498 non-null    int64  
 2   pclass        498 non-null    int64  
 3   sibsp         498 non-null    int64  
 4   parch         498 non-null    int64  
 5   fare          498 non-null    float64
 6   alone         498 non-null    int64  
 7   is_female     498 non-null    bool   
 8   embarked_Q    498 non-null    uint8  
 9   embarked_S    498 non-null    uint8  
dtypes: bool(1), float64(1), int64(6), uint8(2)
memory usage: 32.6 KB


In [141]:
X_train.columns

Index(['Unnamed: 0', 'passenger_id', 'pclass', 'sibsp', 'parch', 'fare',
       'alone', 'is_female', 'embarked_Q', 'embarked_S'],
      dtype='object')

In [142]:
# make our predications from the model:
y_pred = clf.predict(X_train[['is_female', 'pclass']])

In [143]:
from eval import get_metric_bin

ImportError: cannot import name 'get_metric_bin' from 'eval' (/Users/caitlyncarney/codeup-data-science/classification-exercises/eval.py)

In [None]:
# Evaluation phase
def get_metrics_bin(clf, X, y):
    '''
    get_metrics_bin will take in a sklearn classifier model, an X and a y variable and utilize
    the model to make a prediction and then gather accuracy, class report evaluations

    return:  a classification report as a pandas DataFrame
    '''
    y_pred = clf.predict(X)
    accuracy = clf.score(X, y)
    conf = confusion_matrix(y, y_pred)
    class_report = pd.DataFrame(classification_report(y, y_pred, output_dict=True)).T
    tpr = conf[1][1] / conf[1].sum()
    fpr = conf[0][1] / conf[0].sum()
    tnr = conf[0][0] / conf[0].sum()
    fnr = conf[1][0] / conf[1].sum()
    print(f'''
    The accuracy for our model is {accuracy:.4}
    The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
    The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
    ''')
    return class_report

In [None]:
class_report = get_metrics_bin(clf, X_train[['is_female', 'pclass']], y_train)

In [None]:
# Label time
class_report

In [None]:
# Changing k to 10
clf_1 = KNeighborsClassifier(n_neighbors=10)

In [None]:
clf_1.fit(X_train, y_train)

In [None]:
y_pred_1 = clf.predict(X_train)

In [None]:
class_report_1 = get_metrics_bin(clf, X_train, y_train)

In [None]:
class_report_1

In [None]:
# change k to 20
clf_2 = KNeighborsClassifier(n_neighbors=10)

In [None]:
clf_2.fit(X_train, y_train)

In [None]:
y_pred_2 = clf.predict(X_train)

In [None]:
class_report_2 = get_metrics_bin(clf, X_train, y_train)

In [None]:
class_report_2

In [None]:
# which performed better on in sample data?
rubric_df = pd.DataFrame([['true negative', 'false positive'],['false negative', 'true positive']], columns=['predict_death', 'predict_survive'], index=['actual_death', 'actual_survive'])
rubric_df

In [None]:
# how about out of data
pred_val = clf.predict(X_validate[['is_female', 'pclass']])
pred_val1 = clf_1.predict(X_validate)
pred_val2 = clf_2.predict(X_validate)

In [None]:
class_report_val = get_metrics_bin(clf, X_validate[['is_female', 'pclass']])
class_report_val1 = get_metrics_bin(clf_1, X_validate, y_validate)
class_report_val2 = get_metrics_bin(clf_2, X_validate, y_validate)

# Logistic Regression Exercises

In [3]:
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import logistic_regression_util 
        # made by zach and v. useful for making some graphs

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [145]:
def train_validate_test_split(df, target, seed=123):
    '''
    This function takes in a dataframe, the name of the target variable
    (for stratification purposes), and an integer for a setting a seed
    and splits the data into train, validate and test. 
    Test is 20% of the original dataset, validate is .30*.80= 24% of the 
    original dataset, and train is .70*.80= 56% of the original dataset. 
    The function returns, in this order, train, validate and test dataframes. 
    '''
    train_validate, test = train_test_split(df, test_size=0.2, 
                                            random_state=seed, 
                                            stratify=df[target])
    train, validate = train_test_split(train_validate, test_size=0.3, 
                                       random_state=seed,
                                       stratify=train_validate[target])
    return train, validate, test

In [146]:
df2 = pd.read_csv('titanic_df.csv')
df2.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


#### 1. Create a model that includes age in addition to fare and pclass. Does this model perform better than your baseline?



In [147]:
df2["did_survive"] = df2.survived == 1
df2.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone,did_survive
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0,False
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0,True
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1,True
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0,True
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1,False


In [148]:
dummy_df2 = pd.get_dummies(df2[["class"]], drop_first=True)
dummy_df2 = pd.get_dummies(df2[["embark_town"]], drop_first=True)
dummy_df2 = pd.get_dummies(df2[["embarked"]], drop_first=True)
dummy_df2.head()

Unnamed: 0,embarked_Q,embarked_S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1


In [149]:
df2 = pd.concat([df2, dummy_df2], axis=1)

# drop the old columns
new_df2 = df2.drop(columns=["survived", 'class', 'embark_town', 
                        'sex', 'deck', 'embarked', 'Unnamed: 0', 
                       'passenger_id', 'sibsp', 'parch', 'alone', 
                        'embarked_Q', 'embarked_S', 'sex'])
new_df2.head()

Unnamed: 0,pclass,age,fare,did_survive
0,3,22.0,7.25,False
1,1,38.0,71.2833,True
2,3,26.0,7.925,True
3,1,35.0,53.1,True
4,3,35.0,8.05,False


In [150]:
new_df2['did_survive'] = (new_df2['did_survive'] == True ).astype(int)

In [151]:
new_df2.head()

Unnamed: 0,pclass,age,fare,did_survive
0,3,22.0,7.25,0
1,1,38.0,71.2833,1
2,3,26.0,7.925,1
3,1,35.0,53.1,1
4,3,35.0,8.05,0


In [152]:
new_df2 = new_df2.dropna()

In [153]:
train, validate, test = train_validate_test_split(new_df2,
                                                  target = 'did_survive',
                                                  seed=123)
train.head()

Unnamed: 0,pclass,age,fare,did_survive
652,3,21.0,8.4333,0
813,3,6.0,31.275,0
194,1,44.0,27.7208,1
417,2,18.0,13.0,1
460,1,48.0,26.55,1


In [154]:
x_train = train.drop(columns = ['did_survive'])
y_train = train.did_survive

x_validate = validate.drop(columns=['did_survive'])
y_validate = validate.did_survive

x_test = test.drop(columns=['did_survive'])
y_test = test.did_survive

#x is feature
# y is traget variable

In [155]:
train.did_survive.value_counts()
# most didnt survive

0    237
1    162
Name: did_survive, dtype: int64

In [156]:
x_train.shape, y_train.shape
# perfect they match

((399, 3), (399,))

In [157]:
train.head()

Unnamed: 0,pclass,age,fare,did_survive
652,3,21.0,8.4333,0
813,3,6.0,31.275,0
194,1,44.0,27.7208,1
417,2,18.0,13.0,1
460,1,48.0,26.55,1


In [158]:
features1 = ["pclass", "age", "fare"]

In [159]:
logit1 = LogisticRegression(C=1, class_weight={0:1, 1:99},
                           random_state=123)

In [162]:
logit1.fit(x_train[features1], y_train)

LogisticRegression(C=1, class_weight={0: 1, 1: 99}, random_state=123)

In [163]:
y_pred1 = logit1.predict(x_train)
y_pred1

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [164]:
y_pred_proba =logit1.predict_proba(x_train)
y_pred_proba = pd.DataFrame(y_pred_proba, columns = ['did_not_survive', 'survived'])
y_pred_proba.head()

Unnamed: 0,did_not_survive,survived
0,0.02803,0.97197
1,0.013733,0.986267
2,0.0059,0.9941
3,0.00671,0.99329
4,0.0071,0.9929


In [166]:
print(classification_report(y_train, y_pred))


ValueError: Found input variables with inconsistent numbers of samples: [399, 498]

#### 2. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.



In [4]:
df3 = pd.read_csv('titanic_df.csv')
df3.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [5]:
df3["did_survive"] = df3.survived == 1
df3["is_female"] = df3.sex == 'female'
df3.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone,did_survive,is_female
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0,False,False
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0,True,True
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1,True,True
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0,True,True
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1,False,False


In [6]:
dummy_df3 = pd.get_dummies(df3[["class"]], drop_first=True)
dummy_df3 = pd.get_dummies(df3[["embark_town"]], drop_first=True)
dummy_df3 = pd.get_dummies(df3[["embarked"]], drop_first=True)
dummy_df3.head()

Unnamed: 0,embarked_Q,embarked_S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1


In [7]:
df3 = pd.concat([df3, dummy_df3], axis=1)

# drop the old columns
new_df3 = df3.drop(columns=["survived", 'class', 'embark_town', 
                        'sex', 'deck', 'embarked', 'Unnamed: 0', 
                       'passenger_id', 'sibsp', 'parch', 'alone', 
                        'embarked_Q', 'embarked_S'])
new_df3.head()

Unnamed: 0,pclass,age,fare,did_survive,is_female
0,3,22.0,7.25,False,False
1,1,38.0,71.2833,True,True
2,3,26.0,7.925,True,True
3,1,35.0,53.1,True,True
4,3,35.0,8.05,False,False


In [9]:
new_df3['is_female'] = (new_df3['is_female'] == True ).astype(int)
new_df3['did_survive'] = (new_df3['did_survive'] == True ).astype(int)

In [10]:
new_df3.head()

Unnamed: 0,pclass,age,fare,did_survive,is_female
0,3,22.0,7.25,0,0
1,1,38.0,71.2833,1,1
2,3,26.0,7.925,1,1
3,1,35.0,53.1,1,1
4,3,35.0,8.05,0,0


In [None]:
new_df3 = new_df3.dropna()

In [None]:
train, validate, test = train_validate_test_split(new_df3,
                                                  target = 'did_survive',
                                                  seed=123)
train.head()

In [None]:
x_train = train.drop(columns = ['did_survive'])
y_train = train.did_survive

x_validate = validate.drop(columns=['did_survive'])
y_validate = validate.did_survive

x_test = test.drop(columns=['did_survive'])
y_test = test.did_survive

#x is feature
# y is traget variable

In [None]:
train.did_survive.value_counts()
# most didnt survive

In [None]:
x_train.shape, y_train.shape
# perfect they match

In [None]:
logit2 = LogisticRegression(C=1, class_weight={0:1, 1:99},
                           random_state=123)

In [None]:
features2 = ["pclass", "age", "fare", "is_female"]
logit2.fit(x_train[features2], y_train)

In [None]:
y_pred = logit2.predict(x_train)
y_pred

In [None]:
y_pred_proba =logit2.predict_proba(x_train)
y_pred_proba = pd.DataFrame(y_pred_proba, columns = ['did_not_survive', 'survived'])
y_pred_proba.head()

In [None]:
print(classification_report(y_train, y_pred))

#### 3. Try out other combinations of features and models.



In [None]:
df4 = pd.read_csv('titanic_df.csv')
df4.head()

In [None]:
df4["did_survive"] = df4.survived == 1

In [None]:
df4.embark_town.unique()

In [None]:
dummy_df4 = pd.get_dummies(df4[["embark_town"]], drop_first=False)
dummy_df4.head()

In [None]:
df4 = pd.concat([df4, dummy_df4], axis=1)

# drop the old columns
new_df4 = df4.drop(columns=["survived", 'class', 'embark_town', 
                        'sex', 'deck', 'embarked', 'Unnamed: 0', 
                       'passenger_id', 'sibsp', 'parch', 'alone', 
                        'sex'])
new_df4.head()

In [None]:
new_df4['did_survive'] = (new_df4['did_survive'] == True ).astype(int)

In [None]:
new_df4.head()

In [None]:
new_df4 = new_df4.dropna()

In [None]:
train, validate, test = train_validate_test_split(new_df4,
                                                  target = 'did_survive',
                                                  seed=123)
train.head()

In [None]:
x_train = train.drop(columns = ['did_survive'])
y_train = train.did_survive

x_validate = validate.drop(columns=['did_survive'])
y_validate = validate.did_survive

x_test = test.drop(columns=['did_survive'])
y_test = test.did_survive

#x is feature
# y is traget variable

In [None]:
train.did_survive.value_counts()
# most didnt survive

In [None]:
x_train.shape, y_train.shape
# perfect they match

In [None]:
logit3 = LogisticRegression(C=1, class_weight={0:1, 1:99},
                           random_state=123)

In [None]:
train.head()

In [None]:
features3 = ["pclass", "age", "fare", "embark_town_Cherbourg",
             "embark_town_Queenstown", "embark_town_Southampton"]
logit2.fit(x_train[features3], y_train)

In [None]:
y_pred = logit3.predict(x_train)
y_pred

In [None]:
y_pred_proba =logit3.predict_proba(x_train)
y_pred_proba = pd.DataFrame(y_pred_proba, columns = ['did_not_survive', 'survived'])
y_pred_proba.head()

In [None]:
class_report3 = print(classification_report(y_train, y_pred))
class_report3

#### 4. Use you best 3 models to predict and evaluate on your validate sample.



In [None]:
y_pred_validate1 = logit1.predict(x_train[features])

In [None]:
y_pred_validate2 = logit2.predict(x_validate)

In [None]:
y_pred_validate3 = logit3.predict(x_validate)

In [None]:
print("Model 1: solver = lbfgs, c = 1")
print('Accuracy: {:.2f}'.format(logit1.score(x_validate[features], y_validate)))

print(classification_report(y_validate1, y_pred_validate1))

In [None]:
print("Model 1:")

print('Accuracy: {:.2f}'.format(logit1.score(x_validate[features], y_validate)))

print(classification_report(y_train, y_pred))

In [None]:
print("Model 2:")

print('Accuracy: {:.2f}'.format(logit2.score(x_validate[features], y_validate)))

print(classification_report(y_train, y_pred))

In [None]:
print("Model 3:")

print('Accuracy: {:.2f}'.format(logit3.score(x_validate, y_validate)))

print(classification_report(y_train, y_pred))

#### 5. Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?



#### Bonus1 How do different strategies for handling the missing values in the age column affect model performance?



#### Bonus2: How do different strategies for encoding sex affect model performance?



#### Bonus3: scikit-learn's LogisticRegression classifier is actually applying a regularization penalty to the coefficients by default. This penalty causes the magnitude of the coefficients in the resulting model to be smaller than they otherwise would be. This value can be modified with the C hyper parameter. Small values of C correspond to a larger penalty, and large values of C correspond to a smaller penalty.



#### Try out the following values for C and note how the coefficients and the model's performance on both the dataset it was trained on and on the validate split are affected.
####  `C=.01,.1,1,10,100,1000`

#### Bonus Bonus: how does scaling the data interact with your choice of C?