In [38]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression

import warnings
warnings.filterwarnings("ignore")

from acquire import get_titanic_data
from prepare import prep_titanic

1. Create another model that includes age in addition to fare and pclass. Does this model perform better than your previous one?

In [39]:
# Load the prepared Titanic data: prep_titanic() is unpacked into the
# three modelling samples, in train / validate / test order.
train, validate, test = prep_titanic()

# Preview the first five rows of the training sample.
train.head(n=5)

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,Q,S
583,583,0,1,male,36.0,0,0,40.125,C,First,Cherbourg,1,0,0
337,337,1,1,female,41.0,0,0,134.5,C,First,Cherbourg,1,0,0
50,50,0,3,male,7.0,4,1,39.6875,S,Third,Southampton,0,0,1
218,218,1,1,female,32.0,0,0,76.2917,C,First,Cherbourg,1,0,0
31,31,1,1,female,29.916875,1,0,146.5208,C,First,Cherbourg,0,0,0


In [40]:
# Baseline model: always predict the most frequent class in the training
# sample (here that is 0, i.e. did not survive).
survived = train.survived.value_counts()
print(survived, '\n')

# Baseline accuracy = majority-class count / total passengers.
# Taking the largest count instead of hard-coding label 0 keeps the baseline
# correct even if the class balance of the training sample changes.
baseline_accuracy = survived.max() / survived.sum()
print('Baseline Acccuracy:', baseline_accuracy)

0    307
1    190
Name: survived, dtype: int64 

Baseline Acccuracy: 0.6177062374245473


In [22]:
# MODEL 1: logistic regression on age, fare and pclass

# Split each sample into a feature DataFrame (X) and a target Series (Y).
X_train, Y_train = train[['age', 'fare', 'pclass']], train['survived']
X_validate, Y_validate = validate[['age', 'fare', 'pclass']], validate['survived']
X_test, Y_test = test[['age', 'fare', 'pclass']], test['survived']

# Build the classifier and fit it on the training sample in one expression;
# fit() returns the fitted estimator itself.
logit = LogisticRegression(C=1, class_weight=None, random_state=123).fit(X_train, Y_train)

# Show the fitted parameters.
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[-0.03051881  0.00266519 -0.97983178]]
Intercept: 
 [2.52970125]


In [23]:
# D. Evaluate model 1 on the training sample.
y_pred = logit.predict(X_train)              # predicted class labels
y_pred_proba = logit.predict_proba(X_train)  # predicted class probabilities

# Accuracy of the predictions above (the same quantity logit.score reports).
print('Accuracy: ', accuracy_score(Y_train, y_pred), '\n')

# Confusion matrix: actual classes in rows, predicted classes in columns.
print(confusion_matrix(Y_train, y_pred))

# Per-class precision / recall / f1 summary.
print(classification_report(Y_train, y_pred))

Accuracy:  0.716297786720322 

[[265  42]
 [ 99  91]]
              precision    recall  f1-score   support

           0       0.73      0.86      0.79       307
           1       0.68      0.48      0.56       190

    accuracy                           0.72       497
   macro avg       0.71      0.67      0.68       497
weighted avg       0.71      0.72      0.70       497



2. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [24]:
# Encode sex as a 0/1 indicator column named sex_male (1 = male, 0 otherwise).
#
# The indicator is derived directly from the sex column with assign()
# instead of get_dummies() + concat() so that:
#   * re-running this cell overwrites sex_male rather than appending a
#     duplicate sex_male column on every run, and
#   * every sample always gets the same sex_male column, even if a sample
#     happened to contain only one sex (get_dummies(drop_first=True) would
#     produce no column at all in that case).
train = train.assign(sex_male=train['sex'].eq('male').astype(int))
validate = validate.assign(sex_male=validate['sex'].eq('male').astype(int))
test = test.assign(sex_male=test['sex'].eq('male').astype(int))

train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,Q,S,sex_male
583,583,0,1,male,36.0,0,0,40.125,C,First,Cherbourg,1,0,0,1
337,337,1,1,female,41.0,0,0,134.5,C,First,Cherbourg,1,0,0,0
50,50,0,3,male,7.0,4,1,39.6875,S,Third,Southampton,0,0,1,1
218,218,1,1,female,32.0,0,0,76.2917,C,First,Cherbourg,1,0,0,0
31,31,1,1,female,29.916875,1,0,146.5208,C,First,Cherbourg,0,0,0,0


In [25]:
# MODEL 2: logistic regression on age, fare, pclass and the sex_male indicator

# Split each sample into a feature DataFrame (X) and a target Series (Y).
X_train2, Y_train2 = train[['age', 'fare', 'pclass', 'sex_male']], train['survived']
X_validate2, Y_validate2 = validate[['age', 'fare', 'pclass', 'sex_male']], validate['survived']
X_test2, Y_test2 = test[['age', 'fare', 'pclass', 'sex_male']], test['survived']

# Build the classifier and fit it on the training sample in one expression;
# fit() returns the fitted estimator itself.
logit2 = LogisticRegression(C=1, class_weight=None, random_state=123).fit(X_train2, Y_train2)

# Show the fitted parameters.
print('Coefficient: \n', logit2.coef_)
print('Intercept: \n', logit2.intercept_)

Coefficient: 
 [[-2.66594879e-02  9.02716903e-04 -1.11402368e+00 -2.45878213e+00]]
Intercept: 
 [4.30664987]


In [26]:
# D. Evaluate model 2 on the training sample.
y_pred2 = logit2.predict(X_train2)              # predicted class labels
y_pred_proba2 = logit2.predict_proba(X_train2)  # predicted class probabilities

# Accuracy of the predictions above (the same quantity logit2.score reports).
print('Accuracy: ', accuracy_score(Y_train2, y_pred2), '\n')

# Confusion matrix: actual classes in rows, predicted classes in columns.
print(confusion_matrix(Y_train2, y_pred2))

# Per-class precision / recall / f1 summary.
print(classification_report(Y_train2, y_pred2))

Accuracy:  0.7987927565392354 

[[263  44]
 [ 56 134]]
              precision    recall  f1-score   support

           0       0.82      0.86      0.84       307
           1       0.75      0.71      0.73       190

    accuracy                           0.80       497
   macro avg       0.79      0.78      0.78       497
weighted avg       0.80      0.80      0.80       497



3. Try out other combinations of features and models.

In [27]:
# MODEL 3: logistic regression on age, fare, pclass, sex_male and alone

# Split each sample into a feature DataFrame (X) and a target Series (Y).
X_train3, Y_train3 = train[['age', 'fare', 'pclass', 'sex_male', 'alone']], train['survived']
X_validate3, Y_validate3 = validate[['age', 'fare', 'pclass', 'sex_male', 'alone']], validate['survived']
X_test3, Y_test3 = test[['age', 'fare', 'pclass', 'sex_male', 'alone']], test['survived']

# Build the classifier and fit it on the training sample in one expression;
# fit() returns the fitted estimator itself.
logit3 = LogisticRegression(C=1, class_weight=None, random_state=123).fit(X_train3, Y_train3)

# Show the fitted parameters.
print('Coefficient: \n', logit3.coef_)
print('Intercept: \n', logit3.intercept_)

Coefficient: 
 [[-2.55633270e-02  5.53491876e-04 -1.10859199e+00 -2.41546062e+00
  -1.58214588e-01]]
Intercept: 
 [4.33705386]


In [28]:
# D. Evaluate model 3 on the training sample.
y_pred3 = logit3.predict(X_train3)              # predicted class labels
y_pred_proba3 = logit3.predict_proba(X_train3)  # predicted class probabilities

# Accuracy of the predictions above (the same quantity logit3.score reports).
print('Accuracy: ', accuracy_score(Y_train3, y_pred3), '\n')

# Confusion matrix: actual classes in rows, predicted classes in columns.
print(confusion_matrix(Y_train3, y_pred3))

# Per-class precision / recall / f1 summary.
print(classification_report(Y_train3, y_pred3))

Accuracy:  0.7967806841046278 

[[263  44]
 [ 57 133]]
              precision    recall  f1-score   support

           0       0.82      0.86      0.84       307
           1       0.75      0.70      0.72       190

    accuracy                           0.80       497
   macro avg       0.79      0.78      0.78       497
weighted avg       0.79      0.80      0.80       497



4. Use your best 3 models to predict and evaluate on your validate sample.

In [41]:
# Validate accuracy: predict on the validate sample with each fitted model.
# The prediction arrays are kept because the confusion-matrix and
# classification-report cells reuse them.
y_pred = logit.predict(X_validate)
y_pred2 = logit2.predict(X_validate2)
y_pred3 = logit3.predict(X_validate3)

# Accuracy of each model's validate predictions (what model.score reports).
print("model 1\n", accuracy_score(Y_validate, y_pred))
print("model 2\n", accuracy_score(Y_validate2, y_pred2))
print("model 3\n", accuracy_score(Y_validate3, y_pred3))

model 1
 0.7289719626168224
model 2
 0.7850467289719626
model 3
 0.7850467289719626


In [43]:
# Validate confusion matrices: one per model, built from the validate
# predictions computed in the accuracy cell.
for heading, actual, predicted in (
    ("model 1\n", Y_validate, y_pred),
    ("model 2\n", Y_validate2, y_pred2),
    ("model 3\n", Y_validate3, y_pred3),
):
    print(heading, confusion_matrix(actual, predicted))

model 1
 [[116  16]
 [ 42  40]]
model 2
 [[111  21]
 [ 25  57]]
model 3
 [[110  22]
 [ 24  58]]


In [45]:
# Validate classification reports: per-class precision / recall / f1 for
# each model, using the validate predictions computed in the accuracy cell.
for heading, actual, predicted in (
    ("model 1\n", Y_validate, y_pred),
    ("model 2\n", Y_validate2, y_pred2),
    ("model 3\n", Y_validate3, y_pred3),
):
    print(heading, classification_report(actual, predicted))

model 1
               precision    recall  f1-score   support

           0       0.73      0.88      0.80       132
           1       0.71      0.49      0.58        82

    accuracy                           0.73       214
   macro avg       0.72      0.68      0.69       214
weighted avg       0.73      0.73      0.72       214

model 2
               precision    recall  f1-score   support

           0       0.82      0.84      0.83       132
           1       0.73      0.70      0.71        82

    accuracy                           0.79       214
   macro avg       0.77      0.77      0.77       214
weighted avg       0.78      0.79      0.78       214

model 3
               precision    recall  f1-score   support

           0       0.82      0.83      0.83       132
           1       0.72      0.71      0.72        82

    accuracy                           0.79       214
   macro avg       0.77      0.77      0.77       214
weighted avg       0.78      0.79      0.78       214

5. Choose your best model from the validation performance, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?

In [47]:
# Final evaluation: score model 3 on the test sample.
y_pred3 = logit3.predict(X_test3)              # predicted class labels
y_pred_proba3 = logit3.predict_proba(X_test3)  # predicted class probabilities

print("Model 3: solver = lbfgs, c = 1")

# Accuracy on the test sample, rounded to two decimals.
print('Accuracy: {:.2f}'.format(logit3.score(X_test3, Y_test3)))

# Confusion matrix: actual classes in rows, predicted classes in columns.
print(confusion_matrix(Y_test3, y_pred3))

# Use model 3's own target (Y_test3), matching the accuracy and confusion
# matrix lines in this cell, instead of model 1's Y_test.
print(classification_report(Y_test3, y_pred3))

Model 3: solver = lbfgs, c = 1
Accuracy: 0.81
[[92 18]
 [16 52]]
              precision    recall  f1-score   support

           0       0.85      0.84      0.84       110
           1       0.74      0.76      0.75        68

    accuracy                           0.81       178
   macro avg       0.80      0.80      0.80       178
weighted avg       0.81      0.81      0.81       178

