1. Create a model that includes age in addition to fare and pclass. Does this model perform better than your baseline?

2. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

3. Try out other combinations of features and models.

4. Use your best 3 models to predict and evaluate on your validate sample.

5. Choose your best model based on validation performance, and evaluate it on the test dataset. How do the performance metrics compare to validate? To train?

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings("ignore")

import acquire
import prepare

In [7]:
# Load the Titanic passenger data through the project-local `acquire` module and preview it.
df = acquire.get_titanic_data()
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [8]:
# Run the project-local preparation step on the raw frame and preview the result.
# NOTE(review): `df` is rebound to the prepared frame, so re-running this cell on its own
# feeds already-prepared data back into prep_titanic_data — re-run the acquire cell first.
df = prepare.prep_titanic_data(df)
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,embarked_encode
0,0,0,3,male,22.0,1,0,7.25,S,Third,Southampton,0,3
1,1,1,1,female,38.0,1,0,71.2833,C,First,Cherbourg,0,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,Southampton,1,3
3,3,1,1,female,35.0,1,0,53.1,S,First,Southampton,0,3
4,4,0,3,male,35.0,0,0,8.05,S,Third,Southampton,1,3


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   passenger_id     891 non-null    int64  
 1   survived         891 non-null    int64  
 2   pclass           891 non-null    int64  
 3   sex              891 non-null    object 
 4   age              714 non-null    float64
 5   sibsp            891 non-null    int64  
 6   parch            891 non-null    int64  
 7   fare             891 non-null    float64
 8   embarked         891 non-null    object 
 9   class            891 non-null    object 
 10  embark_town      891 non-null    object 
 11  alone            891 non-null    int64  
 12  embarked_encode  891 non-null    int64  
dtypes: float64(2), int64(7), object(4)
memory usage: 97.5+ KB


In [10]:
# Fill the missing ages with the mean age so the column can be used as a model feature.
# NOTE(review): the mean is taken over the full dataset, before the train/validate/test
# split further down, so held-out rows influence the fill value — consider imputing after
# the split using the train mean only.
df['age'] = df['age'].fillna(df['age'].mean())

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   passenger_id     891 non-null    int64  
 1   survived         891 non-null    int64  
 2   pclass           891 non-null    int64  
 3   sex              891 non-null    object 
 4   age              891 non-null    float64
 5   sibsp            891 non-null    int64  
 6   parch            891 non-null    int64  
 7   fare             891 non-null    float64
 8   embarked         891 non-null    object 
 9   class            891 non-null    object 
 10  embark_town      891 non-null    object 
 11  alone            891 non-null    int64  
 12  embarked_encode  891 non-null    int64  
dtypes: float64(2), int64(7), object(4)
memory usage: 97.5+ KB


In [12]:
# Drop the row identifier and the columns that appear to duplicate information kept
# elsewhere ('embarked'/'embarked_encode' vs 'embark_town', 'pclass' vs 'class').
df = df.drop(columns=['embarked', 'pclass', 'embarked_encode', 'passenger_id'])

# One-hot encode the categorical columns, dropping the first level of each.
# The list must be passed as `columns=`: the second positional parameter of
# pd.get_dummies is `prefix`, so the positional form only worked because exactly
# three object columns happened to remain in the frame.
df = pd.get_dummies(df, columns=['sex', 'class', 'embark_town'], drop_first=True)
df.head()

Unnamed: 0,survived,age,sibsp,parch,fare,alone,sex_male,class_Second,class_Third,embark_town_Other,embark_town_Queenstown,embark_town_Southampton
0,0,22.0,1,0,7.25,0,1,0,1,0,0,1
1,1,38.0,1,0,71.2833,0,0,0,0,0,0,0
2,1,26.0,0,0,7.925,1,0,0,1,0,0,1
3,1,35.0,1,0,53.1,0,0,0,0,0,0,1
4,0,35.0,0,0,8.05,1,1,0,1,0,0,1


In [13]:
# Split the encoded frame into train / validate / test, stratifying on the target column.
# NOTE(review): split proportions and any random seed live inside prepare.split and are
# not visible from this notebook — confirm there for reproducibility.
train, validate, test = prepare.split(df, stratify_by='survived')

In [14]:
train.head()

Unnamed: 0,survived,age,sibsp,parch,fare,alone,sex_male,class_Second,class_Third,embark_town_Other,embark_town_Queenstown,embark_town_Southampton
583,0,36.0,0,0,40.125,1,1,0,0,0,0,0
165,1,9.0,0,2,20.525,0,1,0,1,0,0,1
50,0,7.0,4,1,39.6875,0,1,0,1,0,0,1
259,1,50.0,0,1,26.0,0,0,1,0,0,0,1
306,1,29.699118,0,0,110.8833,1,0,0,0,0,0,0


### 1. Create a model that includes age in addition to fare and pclass. Does this model perform better than your baseline?

In [15]:
# baseline

train.survived.value_counts()

0    307
1    191
Name: survived, dtype: int64

In [16]:
# Baseline model: always predict the most frequent class observed in train.
# Deriving the value from the data (instead of hard-coding 0) keeps the baseline
# correct if the data or the split ever changes; with the counts shown above it is 0.
train['baseline'] = train.survived.mode()[0]
train.head()

Unnamed: 0,survived,age,sibsp,parch,fare,alone,sex_male,class_Second,class_Third,embark_town_Other,embark_town_Queenstown,embark_town_Southampton,baseline
583,0,36.0,0,0,40.125,1,1,0,0,0,0,0,0
165,1,9.0,0,2,20.525,0,1,0,1,0,0,1,0
50,0,7.0,4,1,39.6875,0,1,0,1,0,0,1,0
259,1,50.0,0,1,26.0,0,0,1,0,0,0,1,0
306,1,29.699118,0,0,110.8833,1,0,0,0,0,0,0,0


In [17]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 498 entries, 583 to 744
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   survived                 498 non-null    int64  
 1   age                      498 non-null    float64
 2   sibsp                    498 non-null    int64  
 3   parch                    498 non-null    int64  
 4   fare                     498 non-null    float64
 5   alone                    498 non-null    int64  
 6   sex_male                 498 non-null    uint8  
 7   class_Second             498 non-null    uint8  
 8   class_Third              498 non-null    uint8  
 9   embark_town_Other        498 non-null    uint8  
 10  embark_town_Queenstown   498 non-null    uint8  
 11  embark_town_Southampton  498 non-null    uint8  
 12  baseline                 498 non-null    int64  
dtypes: float64(2), int64(5), uint8(6)
memory usage: 34.0 KB


In [19]:
# Share of train rows where the constant baseline prediction equals the actual outcome.
baseline_accuracy = train.baseline.eq(train.survived).mean()

print(f"Baseline accuracy: {round(baseline_accuracy, 2)}")

Baseline accuracy: 0.62


In [20]:
# build a logistic regression model with "pclass", "fare", "age"
train.columns

Index(['survived', 'age', 'sibsp', 'parch', 'fare', 'alone', 'sex_male',
       'class_Second', 'class_Third', 'embark_town_Other',
       'embark_town_Queenstown', 'embark_town_Southampton', 'baseline'],
      dtype='object')

In [21]:
# Model 1 features: age, fare and the one-hot encoded passenger class.
# Select the wanted columns by name rather than dropping everything else: the three
# hand-maintained drop lists had to differ (only train carries 'baseline') and would
# silently admit any newly added column as a feature. Column order matches the
# previous drop-based frames.
features = ['age', 'fare', 'class_Second', 'class_Third']

x_train, y_train = train[features], train.survived
x_validate, y_validate = validate[features], validate.survived
x_test, y_test = test[features], test.survived

In [22]:
x_train.head()

Unnamed: 0,age,fare,class_Second,class_Third
583,36.0,40.125,0,0
165,9.0,20.525,0,1
50,7.0,39.6875,0,1
259,50.0,26.0,1,0
306,29.699118,110.8833,0,0


In [23]:
y_train[:5]

583    0
165    1
50     0
259    1
306    1
Name: survived, dtype: int64

In [24]:
# Build a logistic regression (C=1, fixed random_state) and fit it on the train data.
logit = LogisticRegression(C=1, random_state=123).fit(x_train, y_train)

# In-sample predictions: the model is scored on the same rows it was fitted on.
y_pred = logit.predict(x_train)

# Show the predicted class labels.
y_pred

array([1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,

In [25]:
# look at probability of being in certain class for each observation
logit.predict_proba(x_train)[:10]

array([[0.43261276, 0.56738724],
       [0.65654432, 0.34345568],
       [0.62860162, 0.37139838],
       [0.63051811, 0.36948189],
       [0.33380344, 0.66619656],
       [0.49892652, 0.50107348],
       [0.58481101, 0.41518899],
       [0.4971073 , 0.5028927 ],
       [0.77859667, 0.22140333],
       [0.73904127, 0.26095873]])

In [26]:
# look at classes attribute
logit.classes_

array([0, 1])

In [27]:
# Per-class probabilities for every train row, wrapped in a DataFrame for readability.
# predict_proba orders its columns like logit.classes_ (0 then 1).
# NOTE(review): the 'survided' label looks like a typo for 'survived'; kept unchanged here.
y_pred_proba = pd.DataFrame(
    logit.predict_proba(x_train),
    columns=['lose', 'survided'],
)
y_pred_proba.head()

Unnamed: 0,lose,survided
0,0.432613,0.567387
1,0.656544,0.343456
2,0.628602,0.371398
3,0.630518,0.369482
4,0.333803,0.666197


In [28]:
# classification report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.81      0.76       307
           1       0.62      0.49      0.54       191

    accuracy                           0.69       498
   macro avg       0.67      0.65      0.65       498
weighted avg       0.68      0.69      0.68       498



In [42]:
# Put the majority-class baseline next to the model's in-sample accuracy.
print(f"Baseline accuracy on train set: {round(baseline_accuracy, 2)}")

logit_train_accuracy = logit.score(x_train, y_train)
print('Accuracy of Logistic Regression (c=1) on train set: {:.2f}'.format(logit_train_accuracy))

Baseline accuracy on train set: 0.62
Accuracy of Logistic Regression (c=1) on train set: 0.69


### 2. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [31]:
# Model 2 features: model 1's columns plus the encoded sex dummy.
# An explicit feature list replaces three separately maintained drop lists (which had
# to differ because only train carries 'baseline'), so all three splits are guaranteed
# to expose the same columns in the same order as before.
features1 = ['age', 'fare', 'sex_male', 'class_Second', 'class_Third']

x_train1, y_train1 = train[features1], train.survived
x_validate1, y_validate1 = validate[features1], validate.survived
x_test1, y_test1 = test[features1], test.survived

In [32]:
x_train1.head()

Unnamed: 0,age,fare,sex_male,class_Second,class_Third
583,36.0,40.125,1,0,0
165,9.0,20.525,1,0,1
50,7.0,39.6875,1,0,1
259,50.0,26.0,0,1,0
306,29.699118,110.8833,0,0,0


In [33]:
y_train1[:5]

583    0
165    1
50     0
259    1
306    1
Name: survived, dtype: int64

In [35]:
# Same estimator settings as before (C=1, fixed random_state), fitted on the feature
# set that includes sex.
logit1 = LogisticRegression(C=1, random_state=123).fit(x_train1, y_train1)

# In-sample predictions on the train rows; displayed as the cell output.
y_pred1 = logit1.predict(x_train1)
y_pred1

array([0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,

In [38]:
logit1.predict_proba(x_train1)[:10]

array([[0.61855714, 0.38144286],
       [0.86328429, 0.13671571],
       [0.85741497, 0.14258503],
       [0.24256963, 0.75743037],
       [0.08842572, 0.91157428],
       [0.73373193, 0.26626807],
       [0.79350825, 0.20649175],
       [0.72330622, 0.27669378],
       [0.91454423, 0.08545577],
       [0.42898047, 0.57101953]])

In [39]:
logit1.classes_

array([0, 1])

In [40]:
# Per-class probabilities for the train rows; columns follow logit1.classes_ (0 then 1).
# NOTE(review): the 'survided' label looks like a typo for 'survived'; kept unchanged here.
y_pred_proba1 = pd.DataFrame(
    logit1.predict_proba(x_train1),
    columns=['lose', 'survided'],
)
y_pred_proba1.head()

Unnamed: 0,lose,survided
0,0.618557,0.381443
1,0.863284,0.136716
2,0.857415,0.142585
3,0.24257,0.75743
4,0.088426,0.911574


In [41]:
print(classification_report(y_train1, y_pred1))

              precision    recall  f1-score   support

           0       0.82      0.87      0.85       307
           1       0.77      0.69      0.73       191

    accuracy                           0.80       498
   macro avg       0.80      0.78      0.79       498
weighted avg       0.80      0.80      0.80       498



In [44]:
# In-sample accuracy of the model that includes sex.
logit1_train_accuracy = logit1.score(x_train1, y_train1)
print('Accuracy of Logistic Regression (c=1) on train set: {:.2f}'.format(logit1_train_accuracy))

Accuracy of Logistic Regression (c=1) on train set: 0.80


### 3. Try out other combinations of features and models.

In [45]:
x_train1.columns

Index(['age', 'fare', 'sex_male', 'class_Second', 'class_Third'], dtype='object')

In [67]:
# Model 3 features: age, fare, alone, sex and class.
# An explicit feature list replaces three separately maintained drop lists (which had
# to differ because only train carries 'baseline'), so all three splits are guaranteed
# to expose the same columns in the same order as before.
features2 = ['age', 'fare', 'alone', 'sex_male', 'class_Second', 'class_Third']

x_train2, y_train2 = train[features2], train.survived
x_validate2, y_validate2 = validate[features2], validate.survived
x_test2, y_test2 = test[features2], test.survived

In [68]:
x_train2.head()

Unnamed: 0,age,fare,alone,sex_male,class_Second,class_Third
583,36.0,40.125,1,1,0,0
165,9.0,20.525,0,1,0,1
50,7.0,39.6875,0,1,0,1
259,50.0,26.0,0,0,1,0
306,29.699118,110.8833,1,0,0,0


In [69]:
# Same estimator settings (C=1, fixed random_state), fitted on the feature set that
# adds the `alone` flag.
logit2 = LogisticRegression(C=1, random_state=123).fit(x_train2, y_train2)

# In-sample predictions on the train rows; displayed as the cell output.
y_pred2 = logit2.predict(x_train2)
y_pred2

array([0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,

In [70]:
logit2.classes_

array([0, 1])

In [71]:
# Per-class probabilities for the train rows; columns follow logit2.classes_ (0 then 1).
# NOTE(review): the 'survided' label looks like a typo for 'survived'; kept unchanged here.
y_pred_proba2 = pd.DataFrame(
    logit2.predict_proba(x_train2),
    columns=['lose', 'survided'],
)
y_pred_proba2.head()

Unnamed: 0,lose,survided
0,0.613958,0.386042
1,0.868159,0.131841
2,0.861782,0.138218
3,0.250413,0.749587
4,0.084021,0.915979


In [72]:
print(classification_report(y_train2, y_pred2))

              precision    recall  f1-score   support

           0       0.82      0.88      0.85       307
           1       0.79      0.69      0.74       191

    accuracy                           0.81       498
   macro avg       0.80      0.79      0.79       498
weighted avg       0.81      0.81      0.81       498



In [73]:
# In-sample accuracy of the model that adds the `alone` flag.
logit2_train_accuracy = logit2.score(x_train2, y_train2)
print('Accuracy of Logistic Regression (c=1) on train set: {:.2f}'.format(logit2_train_accuracy))

Accuracy of Logistic Regression (c=1) on train set: 0.81


In [74]:
# Model 4 features: only class, age and sex.
# An explicit feature list replaces three separately maintained drop lists (which had
# to differ because only train carries 'baseline'), so all three splits are guaranteed
# to expose the same columns in the same order as before.
features3 = ['age', 'sex_male', 'class_Second', 'class_Third']

x_train3, y_train3 = train[features3], train.survived
x_validate3, y_validate3 = validate[features3], validate.survived
x_test3, y_test3 = test[features3], test.survived

In [75]:
# Same estimator settings (C=1, fixed random_state), fitted on class, age and sex only.
logit3 = LogisticRegression(C=1, random_state=123).fit(x_train3, y_train3)

# In-sample predictions on the train rows; displayed as the cell output.
y_pred3 = logit3.predict(x_train3)
y_pred3

array([0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,

In [76]:
# Per-class probabilities for the train rows, one column per class.
# NOTE(review): the 'survided' label looks like a typo for 'survived'; kept unchanged here.
y_pred_proba3 = pd.DataFrame(
    logit3.predict_proba(x_train3),
    columns=['lose', 'survided'],
)
y_pred_proba3.head()

Unnamed: 0,lose,survided
0,0.619397,0.380603
1,0.86326,0.13674
2,0.857124,0.142876
3,0.242431,0.757569
4,0.088127,0.911873


In [77]:
print(classification_report(y_train3, y_pred3))

              precision    recall  f1-score   support

           0       0.82      0.87      0.85       307
           1       0.77      0.69      0.73       191

    accuracy                           0.80       498
   macro avg       0.80      0.78      0.79       498
weighted avg       0.80      0.80      0.80       498



In [80]:
# In-sample accuracy of the baseline and of every fitted model, side by side.
# The label numbering is offset from the variable names: "Regression1" is `logit`,
# "Regression2" is `logit1`, and so on.
print(f"Baseline accuracy on train set: {round(baseline_accuracy, 2)}")

train_results = [
    ('Accuracy of Logistic Regression1 (age, fare, class) on train set: {:.2f}',
     logit, x_train, y_train),
    ('Accuracy of Logistic Regression2 (age, fare, class, sex) on train set: {:.2f}',
     logit1, x_train1, y_train1),
    ('Accuracy of Logistic Regression3 (age, fare, class, sex, alone) on train set: {:.2f}',
     logit2, x_train2, y_train2),
    ('Accuracy of Logistic Regression4 (age, class, sex) on train set: {:.2f}',
     logit3, x_train3, y_train3),
]
for message, fitted, X, y in train_results:
    print(message.format(fitted.score(X, y)))

Baseline accuracy on train set: 0.62
Accuracy of Logistic Regression1 (age, fare, class) on train set: 0.69
Accuracy of Logistic Regression2 (age, fare, class, sex) on train set: 0.80
Accuracy of Logistic Regression3 (age, fare, class, sex, alone) on train set: 0.81
Accuracy of Logistic Regression4 (age, class, sex) on train set: 0.80


### 4. Use your best 3 models to predict and evaluate on your validate sample.

In [81]:
# Out-of-sample check: score the three best train performers (labelled 2, 3 and 4,
# i.e. logit1, logit2 and logit3) on the validate split.
validate_results = [
    ('Accuracy of Logistic Regression2 (age, fare, class, sex) on validate set: {:.2f}',
     logit1, x_validate1, y_validate1),
    ('Accuracy of Logistic Regression3 (age, fare, class, sex, alone) on validate set: {:.2f}',
     logit2, x_validate2, y_validate2),
    ('Accuracy of Logistic Regression4 (age, class, sex) on validate set: {:.2f}',
     logit3, x_validate3, y_validate3),
]
for message, fitted, X, y in validate_results:
    print(message.format(fitted.score(X, y)))

Accuracy of Logistic Regression2 (age, fare, class, sex) on validate set: 0.78
Accuracy of Logistic Regression3 (age, fare, class, sex, alone) on validate set: 0.78
Accuracy of Logistic Regression4 (age, class, sex) on validate set: 0.78


### 5. Choose your best model based on validation performance, and evaluate it on the test dataset. How do the performance metrics compare to validate? To train?

In [82]:
# Final check: score the chosen model ("Regression3", i.e. logit2) once on the test split.
logit2_test_accuracy = logit2.score(x_test2, y_test2)

print('Accuracy of Logistic Regression3 (age, fare, class, sex, alone) on test set: {:.2f}'
      .format(logit2_test_accuracy))

Accuracy of Logistic Regression3 (age, fare, class, sex, alone) on test set: 0.80


In [83]:
train.shape

(498, 13)