In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression

import warnings
warnings.filterwarnings("ignore")

from acquire import get_titanic_data
from prepare import prep_titanic

# Logistic Regression

1. Create another model that includes age in addition to fare and pclass. Does this model perform better than your previous one?

In [3]:
# prepped titanic data: train, validate, test samples

train, validate, test = prep_titanic()
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,Q,S
583,583,0,1,male,36.0,0,0,40.125,C,First,Cherbourg,1,0,0
337,337,1,1,female,41.0,0,0,134.5,C,First,Cherbourg,1,0,0
50,50,0,3,male,7.0,4,1,39.6875,S,Third,Southampton,0,0,1
218,218,1,1,female,32.0,0,0,76.2917,C,First,Cherbourg,1,0,0
31,31,1,1,female,29.916875,1,0,146.5208,C,First,Cherbourg,0,0,0


In [4]:
# baseline model: not survived is most common
survived = train.survived.value_counts()
print(survived, '\n')

# died/total passengers (accuracy)
baseline_accuracy = survived[0]/survived.sum()
print('Baseline Acccuracy:', baseline_accuracy)

0    307
1    190
Name: survived, dtype: int64 

Baseline Acccuracy: 0.6177062374245473


In [5]:
# MODEL 1: includes age, pclass, fare

# A. create model object
logit = LogisticRegression(C=1, class_weight=None, random_state=123)

# B. split each sample into an X DataFrame and a Y Series
X_train = train[['age', 'fare', 'pclass']]
Y_train = train.survived

X_validate = validate[['age', 'fare', 'pclass']]
Y_validate = validate.survived

X_test = test[['age', 'fare', 'pclass']]
Y_test = test.survived

# C. Fit to X and Y train
logit = logit.fit(X_train, Y_train)

print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[-0.03051881  0.00266519 -0.97983178]]
Intercept: 
 [2.52970125]


In [6]:
#D. Predict Values on X_train
y_pred = logit.predict(X_train)
y_pred_proba = logit.predict_proba(X_train)

#accuracy
print('Accuracy: ', logit.score(X_train, Y_train), '\n')

#confusion matrix
print(confusion_matrix(Y_train, y_pred))

#classification report
print(classification_report(Y_train, y_pred))

Accuracy:  0.716297786720322 

[[265  42]
 [ 99  91]]
              precision    recall  f1-score   support

           0       0.73      0.86      0.79       307
           1       0.68      0.48      0.56       190

    accuracy                           0.72       497
   macro avg       0.71      0.67      0.68       497
weighted avg       0.71      0.72      0.70       497



2. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [7]:
# create dummy variable for sex as is_male
train_dummies = pd.get_dummies(train[['sex']], drop_first=True)
train = pd.concat([train, train_dummies], axis=1)

validate_dummies = pd.get_dummies(validate[['sex']], drop_first=True)
validate = pd.concat([validate, validate_dummies], axis=1)

test_dummies = pd.get_dummies(test[['sex']], drop_first=True)
test = pd.concat([test, test_dummies], axis=1)

train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,Q,S,sex_male
583,583,0,1,male,36.0,0,0,40.125,C,First,Cherbourg,1,0,0,1
337,337,1,1,female,41.0,0,0,134.5,C,First,Cherbourg,1,0,0,0
50,50,0,3,male,7.0,4,1,39.6875,S,Third,Southampton,0,0,1,1
218,218,1,1,female,32.0,0,0,76.2917,C,First,Cherbourg,1,0,0,0
31,31,1,1,female,29.916875,1,0,146.5208,C,First,Cherbourg,0,0,0,0


In [8]:
# MODEL 2: includes age, pclass, fare, sex

# A. create model object
logit2 = LogisticRegression(C=1, class_weight=None, random_state=123)

# B. split each sample into an X DataFrame and a Y Series
X_train2 = train[['age', 'fare', 'pclass', 'sex_male']]
Y_train2 = train.survived

X_validate2 = validate[['age', 'fare', 'pclass', 'sex_male']]
Y_validate2 = validate.survived

X_test2 = test[['age', 'fare', 'pclass', 'sex_male']]
Y_test2 = test.survived

# C. Fit to X and Y train
logit2 = logit2.fit(X_train2, Y_train2)

print('Coefficient: \n', logit2.coef_)
print('Intercept: \n', logit2.intercept_)

Coefficient: 
 [[-2.66594879e-02  9.02716903e-04 -1.11402368e+00 -2.45878213e+00]]
Intercept: 
 [4.30664987]


In [9]:
#D. Predict Values on X_train
y_pred2 = logit2.predict(X_train2)
y_pred_proba2 = logit2.predict_proba(X_train2)

#accuracy
print('Accuracy: ', logit2.score(X_train2, Y_train2), '\n')

#confusion matrix
print(confusion_matrix(Y_train2, y_pred2))

#classification report
print(classification_report(Y_train2, y_pred2))

Accuracy:  0.7987927565392354 

[[263  44]
 [ 56 134]]
              precision    recall  f1-score   support

           0       0.82      0.86      0.84       307
           1       0.75      0.71      0.73       190

    accuracy                           0.80       497
   macro avg       0.79      0.78      0.78       497
weighted avg       0.80      0.80      0.80       497



3. Try out other combinations of features and models.

In [10]:
# MODEL 3: includes age, pclass, fare, sex, and alone

# A. create model object
logit3 = LogisticRegression(C=1, class_weight=None, random_state=123)

# B. split each sample into an X DataFrame and a Y Series
X_train3 = train[['age', 'fare', 'pclass', 'sex_male', 'alone']]
Y_train3 = train.survived

X_validate3 = validate[['age', 'fare', 'pclass', 'sex_male', 'alone']]
Y_validate3 = validate.survived

X_test3 = test[['age', 'fare', 'pclass', 'sex_male', 'alone']]
Y_test3 = test.survived

# C. Fit to X and Y train
logit3 = logit3.fit(X_train3, Y_train3)

print('Coefficient: \n', logit3.coef_)
print('Intercept: \n', logit3.intercept_)

Coefficient: 
 [[-2.55633270e-02  5.53491876e-04 -1.10859199e+00 -2.41546062e+00
  -1.58214588e-01]]
Intercept: 
 [4.33705386]


In [11]:
#D. Predict Values on X_train
y_pred3 = logit3.predict(X_train3)
y_pred_proba3 = logit3.predict_proba(X_train3)

#accuracy
print('Accuracy: ', logit3.score(X_train3, Y_train3), '\n')

#confusion matrix
print(confusion_matrix(Y_train3, y_pred3))

#classification report
print(classification_report(Y_train3, y_pred3))

Accuracy:  0.7967806841046278 

[[263  44]
 [ 57 133]]
              precision    recall  f1-score   support

           0       0.82      0.86      0.84       307
           1       0.75      0.70      0.72       190

    accuracy                           0.80       497
   macro avg       0.79      0.78      0.78       497
weighted avg       0.79      0.80      0.80       497



4. Use you best 3 models to predict and evaluate on your validate sample.

In [12]:
# Validate Accuracy
y_pred = logit.predict(X_validate)
y_pred2 = logit2.predict(X_validate2)
y_pred3 = logit3.predict(X_validate3)

print("model 1\n", logit.score(X_validate, Y_validate))
print("model 2\n", logit2.score(X_validate2, Y_validate2))
print("model 3\n", logit3.score(X_validate3, Y_validate3))

model 1
 0.7289719626168224
model 2
 0.7850467289719626
model 3
 0.7850467289719626


In [13]:
# Validate Confusion Matrix
print("model 1\n", confusion_matrix(Y_validate, y_pred))
print("model 2\n", confusion_matrix(Y_validate2, y_pred2))
print("model 3\n", confusion_matrix(Y_validate3, y_pred3))

model 1
 [[116  16]
 [ 42  40]]
model 2
 [[111  21]
 [ 25  57]]
model 3
 [[110  22]
 [ 24  58]]


In [14]:
# Validate Classification Report
print("model 1\n", classification_report(Y_validate, y_pred))
print("model 2\n", classification_report(Y_validate2, y_pred2))
print("model 3\n", classification_report(Y_validate3, y_pred3))

model 1
               precision    recall  f1-score   support

           0       0.73      0.88      0.80       132
           1       0.71      0.49      0.58        82

    accuracy                           0.73       214
   macro avg       0.72      0.68      0.69       214
weighted avg       0.73      0.73      0.72       214

model 2
               precision    recall  f1-score   support

           0       0.82      0.84      0.83       132
           1       0.73      0.70      0.71        82

    accuracy                           0.79       214
   macro avg       0.77      0.77      0.77       214
weighted avg       0.78      0.79      0.78       214

model 3
               precision    recall  f1-score   support

           0       0.82      0.83      0.83       132
           1       0.72      0.71      0.72        82

    accuracy                           0.79       214
   macro avg       0.77      0.77      0.77       214
weighted avg       0.78      0.79      0.78    

5. Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?

In [15]:
y_pred3 = logit3.predict(X_test3)
y_pred_proba3 = logit3.predict_proba(X_test3)

print("Model 3: solver = lbfgs, c = 1")

print('Accuracy: {:.2f}'.format(logit3.score(X_test3, Y_test3)))

print(confusion_matrix(Y_test3, y_pred3))

print(classification_report(Y_test, y_pred3))

Model 3: solver = lbfgs, c = 1
Accuracy: 0.81
[[92 18]
 [16 52]]
              precision    recall  f1-score   support

           0       0.85      0.84      0.84       110
           1       0.74      0.76      0.75        68

    accuracy                           0.81       178
   macro avg       0.80      0.80      0.80       178
weighted avg       0.81      0.81      0.81       178



# Decision Trees

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

1. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [45]:
X_train = train.drop(columns=['passenger_id', 'sex', 'embark_town', 'embarked', 'class', 'survived'])
y_train = train[['survived']]

In [62]:
# Decision Tree Object
clf = DecisionTreeClassifier(max_depth=3, random_state=123)

# Fit model to training data
clf = clf.fit(X_train, y_train)

#Estimate Survived
y_pred = clf.predict(X_train)

# Estimate probability of survived
y_pred_proba = clf.predict_proba(X_train)

clf, y_pred[:5], y_pred_proba[:5]

(DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=3, max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=123, splitter='best'),
 array([0, 1, 0, 1, 1]),
 array([[0.51666667, 0.48333333],
        [0.0326087 , 0.9673913 ],
        [0.88      , 0.12      ],
        [0.0326087 , 0.9673913 ],
        [0.0326087 , 0.9673913 ]]))

2. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [103]:
# Compute Accuracy
acc = round(clf.score(X_train, y_train),2)

# Create Confustion Matrix
labels = sorted(y_train.survived.unique())

cm = pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

In [117]:
TP = cm[0][0]
FN = cm[0][1]
FP = cm[1][0]
TN = cm[1][1]

TPR = TP/(TP+FN)
FPR = FP/(FP+TN)
TNR = TN/(TN+FP)
FNR = FN/(FN+TP)

# Create Classification Report
cr = classification_report(y_train, y_pred)

3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [118]:
print('Accuracy: ', acc, '\n')
print('True Positive Rate: ', TPR,  '\n')
print('False Positive Rate: ', FPR, '\n')
print('True Negative Rate: ', TNR, '\n')
print('False Negative Rate: ', FNR, '\n')
print('Classification Report: \n', cr)
print('Confustion Matrix: \n', cm)

Accuracy:  0.82 

True Positive Rate:  0.8181818181818182 

False Positive Rate:  0.1794871794871795 

True Negative Rate:  0.8205128205128205 

False Negative Rate:  0.18181818181818182 

Classification Report: 
               precision    recall  f1-score   support

           0       0.82      0.91      0.86       307
           1       0.82      0.67      0.74       190

    accuracy                           0.82       497
   macro avg       0.82      0.79      0.80       497
weighted avg       0.82      0.82      0.81       497

Confustion Matrix: 
      0    1
0  279   28
1   62  128


4. Run through steps 2-4 using a different max_depth value.

In [127]:
# Decision Tree Object
clf2 = DecisionTreeClassifier(max_depth=5, random_state=123)

# Fit model to training data
clf2 = clf2.fit(X_train, y_train)

#Estimate Survived
y_pred2 = clf2.predict(X_train)

# Estimate probability of survived
y_pred_proba2 = clf2.predict_proba(X_train)

clf2, y_pred2[:5], y_pred_proba2[:5]

(DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=5, max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=123, splitter='best'),
 array([0, 1, 0, 1, 1]),
 array([[0.60416667, 0.39583333],
        [0.        , 1.        ],
        [0.87777778, 0.12222222],
        [0.        , 1.        ],
        [0.        , 1.        ]]))

In [128]:
# Compute Accuracy
acc2 = round(clf2.score(X_train, y_train),2)

# Create Confustion Matrix
labels2 = sorted(y_train.survived.unique())

cm2 = pd.DataFrame(confusion_matrix(y_train, y_pred2), index=labels, columns=labels)
cm2

Unnamed: 0,0,1
0,285,22
1,49,141


In [129]:
TP2 = cm2[0][0]
FN2 = cm2[0][1]
FP2 = cm2[1][0]
TN2 = cm2[1][1]

TPR2 = TP2/(TP2+FN2)
FPR2 = FP2/(FP2+TN2)
TNR2 = TN2/(TN2+FP2)
FNR2 = FN2/(FN2+TP2)

# Create Classification Report
cr2 = classification_report(y_train, y_pred2)

In [130]:
print('Accuracy 2: ', acc2, '\n'),
print('True Positive Rate 2: ', TPR2,  '\n')
print('False Positive Rate 2: ', FPR2, '\n')
print('True Negative Rate 2: ', TNR2, '\n')
print('False Negative Rate 2: ', FNR2, '\n')
print('Classification Report 2: \n', cr2),
print('Confustion Matrix 2: \n', cm2)

Accuracy 2:  0.86 

True Positive Rate 2:  0.8532934131736527 

False Positive Rate 2:  0.13496932515337423 

True Negative Rate 2:  0.8650306748466258 

False Negative Rate 2:  0.1467065868263473 

Classification Report 2: 
               precision    recall  f1-score   support

           0       0.85      0.93      0.89       307
           1       0.87      0.74      0.80       190

    accuracy                           0.86       497
   macro avg       0.86      0.84      0.84       497
weighted avg       0.86      0.86      0.85       497

Confustion Matrix 2: 
      0    1
0  285   22
1   49  141


5. Which performs better on your in-sample data?

In [167]:
d = {'Results': ['Accuracy', 
                 'True Postive Rate', 
                 'False Positive Rate', 
                 'True Negative Rate', 
                 'False Negative Rate'], 
     'Model 1': [acc, TPR, FPR, TNR, FNR], 
     'Model 2':[acc2, TPR2, FPR2, TNR2, FNR2]
    }

best_model = pd.DataFrame(data=d)
best_model

Unnamed: 0,Results,Model 1,Model 2
0,Accuracy,0.82,0.86
1,True Postive Rate,0.818182,0.853293
2,False Positive Rate,0.179487,0.134969
3,True Negative Rate,0.820513,0.865031
4,False Negative Rate,0.181818,0.146707


In [168]:
print('Model 1: \n', cr, '\n', 'Model 2: \n', cr2)

Model 1: 
               precision    recall  f1-score   support

           0       0.82      0.91      0.86       307
           1       0.82      0.67      0.74       190

    accuracy                           0.82       497
   macro avg       0.82      0.79      0.80       497
weighted avg       0.82      0.82      0.81       497
 
 Model 2: 
               precision    recall  f1-score   support

           0       0.85      0.93      0.89       307
           1       0.87      0.74      0.80       190

    accuracy                           0.86       497
   macro avg       0.86      0.84      0.84       497
weighted avg       0.86      0.86      0.85       497

