In [1]:
from pydataset import data
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from acquire import get_titanic_data
from sklearn.model_selection import train_test_split
from sklearn.metrics import \
accuracy_score,\
recall_score,\
precision_score,\
confusion_matrix,\
classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
df = get_titanic_data('titanic_db')
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [3]:
def train_val_test(df, strat, seed = 123):
    train, val_test = train_test_split(df, train_size = 0.8, random_state = seed, stratify = df[strat])
    val, test = train_test_split(val_test, train_size = 0.5, random_state = seed, stratify = val_test[strat])
    return train, val, test
train, validate, test = train_val_test(df, 'survived')
train = train.drop(columns=['embark_town', 'class', 'deck', 'age'])
validate = validate.drop(columns=['embark_town', 'class', 'deck', 'age'])
test = test.drop(columns=['embark_town', 'class', 'deck', 'age'])
target = 'survived'

X_train = train.drop(columns=['survived', 'sex', 'embarked', 'passenger_id', 'sibsp', 'parch', 'alone'])
y_train = train[target]

X_validate = validate.drop(columns=['survived', 'sex', 'embarked', 'passenger_id', 'sibsp', 'parch', 'alone'])
X_test = test.drop(columns=['survived', 'sex', 'embarked', 'passenger_id', 'sibsp', 'parch', 'alone'])


y_validate = validate[target]
y_test = test[target]

In [4]:
train['baseline_prediction'] = 0
baseline_accuracy = (train.baseline_prediction == train.survived).mean()
print(f'baseline accuracy: {baseline_accuracy:.2%}')

baseline accuracy: 61.66%


In [5]:
# Create a model that includes only age, fare, and pclass. Does this model perform better than your baseline?
# I got rid of age due to too many null values that might be skewed by imputation
# yes; baseline is 61%, train is 67%
logit = LogisticRegression(C=1, random_state=123)
logit.fit(X_train, y_train)


In [6]:
y_pred = logit.predict(X_train)

In [7]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.69      0.86      0.76       439
           1       0.62      0.37      0.46       273

    accuracy                           0.67       712
   macro avg       0.66      0.62      0.61       712
weighted avg       0.66      0.67      0.65       712



In [8]:

dummy_df = pd.get_dummies(data=train[['sex']], drop_first=True)
train = pd.concat([train, dummy_df], axis=1)

In [9]:

#train['sex_male'] = np.where(train.sex_male == 'True', '1', '0')

In [10]:
# Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.
X_train = train.drop(columns=['survived', 'embarked', 'passenger_id', 'sibsp', 'parch', 'alone', 'sex'])
y_train = train[target]

X_validate = validate.drop(columns=['survived', 'embarked', 'passenger_id', 'sibsp', 'parch', 'alone', 'sex'])
X_test = test.drop(columns=['survived', 'embarked', 'passenger_id', 'sibsp', 'parch', 'alone', 'sex'])


y_validate = validate[target]
y_test = test[target]


In [11]:
logit = LogisticRegression(C=1, random_state=123)
logit.fit(X_train, y_train)

In [12]:
y_pred = logit.predict(X_train)

In [13]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.85      0.83       439
           1       0.74      0.69      0.71       273

    accuracy                           0.79       712
   macro avg       0.78      0.77      0.77       712
weighted avg       0.78      0.79      0.79       712



In [14]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,sibsp,parch,fare,embarked,alone,baseline_prediction,sex_male
222,222,0,3,male,0,0,8.05,S,1,0,True
610,610,0,3,female,1,5,31.275,S,0,0,False
249,249,0,2,male,1,0,26.0,S,0,0,True
814,814,0,3,male,0,0,8.05,S,1,0,True
118,118,0,1,male,0,1,247.5208,C,0,0,True


In [15]:
# Try out other combinations of features and models.
# trying with sibsp and c=.5
X_train = train.drop(columns=['survived', 'embarked', 'passenger_id', 'parch', 'alone', 'sex'])
y_train = train[target]

X_validate = validate.drop(columns=['survived', 'embarked', 'passenger_id', 'parch', 'alone', 'sex'])
X_test = test.drop(columns=['survived', 'embarked', 'passenger_id', 'parch', 'alone', 'sex'])


y_validate = validate[target]
y_test = test[target]



In [16]:
logit = LogisticRegression(C=.5, random_state=123)
logit.fit(X_train, y_train)

In [17]:
y_pred = logit.predict(X_train)

In [18]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.87      0.84       439
           1       0.76      0.68      0.72       273

    accuracy                           0.80       712
   macro avg       0.79      0.77      0.78       712
weighted avg       0.79      0.80      0.79       712



In [19]:
# Use you best 3 models to predict and evaluate on your validate sample.
logit = LogisticRegression(C=1, random_state=123)
logit.fit(X_validate, y_validate)


In [20]:
y_val_pred = logit.predict(X_validate)

In [21]:
print(classification_report(y_validate, y_val_pred))

              precision    recall  f1-score   support

           0       0.72      0.80      0.76        55
           1       0.61      0.50      0.55        34

    accuracy                           0.69        89
   macro avg       0.66      0.65      0.65        89
weighted avg       0.68      0.69      0.68        89



In [28]:
# without sibsp
X_validate2 = validate.drop(columns=['survived', 'embarked', 'passenger_id', 'sibsp', 'parch', 'alone', 'sex'])
logit = LogisticRegression(C=1, random_state=123)
logit.fit(X_validate2, y_validate)

In [29]:
y_val_pred = logit.predict(X_validate2)


In [30]:
print(classification_report(y_validate, y_val_pred))

              precision    recall  f1-score   support

           0       0.72      0.80      0.76        55
           1       0.61      0.50      0.55        34

    accuracy                           0.69        89
   macro avg       0.66      0.65      0.65        89
weighted avg       0.68      0.69      0.68        89



In [24]:
# without sex
X_validate3 = validate.drop(columns=['survived', 'sex', 'embarked', 'passenger_id', 'sibsp', 'parch', 'alone'])
logit = LogisticRegression(C=1, random_state=123)
logit.fit(X_validate3, y_validate)

In [25]:
y_val_pred = logit.predict(X_validate3)
print(classification_report(y_validate, y_val_pred))

              precision    recall  f1-score   support

           0       0.72      0.80      0.76        55
           1       0.61      0.50      0.55        34

    accuracy                           0.69        89
   macro avg       0.66      0.65      0.65        89
weighted avg       0.68      0.69      0.68        89



In [33]:
# Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?
# better accuracy than my validate and slightly worse than my train
X_train = train.drop(columns=['survived', 'embarked', 'passenger_id', 'parch', 'alone', 'sex'])
y_train = train[target]

X_validate = validate.drop(columns=['survived', 'embarked', 'passenger_id', 'parch', 'alone', 'sex'])
X_test = test.drop(columns=['survived', 'embarked', 'passenger_id', 'parch', 'alone', 'sex'])


y_validate = validate[target]
y_test = test[target]




In [34]:
logit = LogisticRegression(C=.5, random_state=123)
logit.fit(X_test, y_test)

In [35]:
y_pred = logit.predict(X_test)

In [36]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.87      0.80        55
           1       0.72      0.51      0.60        35

    accuracy                           0.73        90
   macro avg       0.73      0.69      0.70        90
weighted avg       0.73      0.73      0.72        90



In [None]:
# Bonus1 How do different strategies for handling the missing values in the age column affect model performance?

