Create another model that includes age in addition to fare and pclass. Does this model perform better than your previous one?

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import matplotlib.pyplot as plt

import seaborn as sns
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

from acquire import get_titanic_data
from prepare import prep_titanic

df = get_titanic_data()
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [2]:
# Handle missing values in the `age` column.
df.dropna(inplace=True)

X = df[['pclass','age','fare','sibsp','parch']]
y = df[['survived']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)

X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 127 entries, 123 to 540
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   pclass  127 non-null    int64  
 1   age     127 non-null    float64
 2   fare    127 non-null    float64
 3   sibsp   127 non-null    int64  
 4   parch   127 non-null    int64  
dtypes: float64(2), int64(3)
memory usage: 6.0 KB


In [3]:
# from sklearn.linear_model import LogisticRegression

logit = LogisticRegression(C=1, class_weight={1:2}, random_state = 123, solver='saga')
logit.fit(X_train, y_train)

LogisticRegression(C=1, class_weight={1: 2}, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=123, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [4]:
# Print the coefficients and intercept of the model
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[1.30411374e-02 8.72240193e-05 1.53779647e-02 5.48610411e-03
  1.65371660e-03]]
Intercept: 
 [0.00655794]


In [5]:
# Estimate whether or not a passenger would survive, using the training data
y_pred = logit.predict(X_train)

# Estimate the probability of a passenger surviving, using the training data
y_pred_proba = logit.predict_proba(X_train)

# Compute the accuracy
print('Accuracy:',format(logit.score(X_train, y_train)))

Accuracy: 0.6377952755905512


In [6]:
# confusion matrix
print(confusion_matrix(y_train, y_pred))

[[ 0 46]
 [ 0 81]]


In [7]:
# Precision, Recall, F1-score, and Support
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        46
           1       0.64      1.00      0.78        81

    accuracy                           0.64       127
   macro avg       0.32      0.50      0.39       127
weighted avg       0.41      0.64      0.50       127



In [8]:
### 1.  Base Model
#split df
tdf = get_titanic_data()
tdf.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [10]:
train, validate, test = prep_titanic(tdf)

In [11]:
print(train.shape, validate.shape, test.shape)

(497, 10) (214, 10) (178, 10)


In [12]:
train.survived.mean()

0.3822937625754527

In [13]:
train.survived.value_counts()

0    307
1    190
Name: survived, dtype: int64

In [14]:
# died is the majority response - requires human intervention, but gives same result as Ryan's
# positive case = died
my_baseline_accuracy = 307/(307+190)
my_baseline_accuracy

0.6177062374245473

In [15]:
# instructors method
train['baseline_prediction'] = 0
pd.crosstab(train.baseline_prediction, train.survived)

survived,0,1
baseline_prediction,Unnamed: 1_level_1,Unnamed: 2_level_1
0,307,190


In [16]:
baseline_accuracy = (train.baseline_prediction == train.survived).mean()
baseline_accuracy

0.6177062374245473

In [23]:
#model included for baseline
logit1 = LogisticRegression()

X_train1 = train.drop(columns=['survived'])
y_train = train.survived

logit1 = logit1.fit(X_train1, y_train)
print(logit1.coef_)
print(X_train1.columns)

[[-1.12191387e+00 -3.26573519e-02 -5.93398655e-01 -2.11718189e-01
   1.16953841e-03 -1.02705201e+00 -2.41661090e+00  7.33505485e-01
   2.15341921e-01  0.00000000e+00]]
Index(['pclass', 'age', 'sibsp', 'parch', 'fare', 'alone', 'sex_male',
       'embark_town_Queenstown', 'embark_town_Southampton',
       'baseline_prediction'],
      dtype='object')


In [25]:
y_pred1 = logit1.predict(X_train1)

logit1.score(X_train1, y_train)

print('better accuracy @', logit1.score(X_train1, y_train))

better accuracy @ 0.8088531187122736


In [21]:
#2. Create another model that includes age in addition to fare and pclass. 
#Does this model perform better than your baseline?

X_train2 = train[['age', 'fare', 'pclass']]
y_train = train.survived

logit2 = LogisticRegression()
logit2 = logit2.fit(X_train2, y_train)

print(logit2.coef_)
print(X_train2.columns)

[[-0.03012701  0.00269178 -0.98002535]]
Index(['age', 'fare', 'pclass'], dtype='object')


In [22]:
logit2.score(X_train2, y_train)

0.7142857142857143

In [26]:
#this model didnt do better 

In [27]:
#3 Include sex in your model as well. Note that you'll need to encode 
#or create a dummy variable of this feature before including it in a model.

X_train3 = train[['age', 'fare', 'pclass', 'sex_male']]
y_train = train.survived

logit3 = LogisticRegression()
logit3 = logit3.fit(X_train3, y_train)

print(logit3.coef_)
print(X_train3.columns)
print(logit3.score(X_train3, y_train))

[[-2.63670192e-02  9.26636404e-04 -1.11442057e+00 -2.45962126e+00]]
Index(['age', 'fare', 'pclass', 'sex_male'], dtype='object')
0.7927565392354124


In [29]:
#4 Try out other combinations of features and models.
X_train4 = train[['sex_male']]
y_train = train.survived

def my_logit(X_train):
    my_logit = LogisticRegression()
    my_logit = my_logit.fit(X_train, y_train)
    return my_logit, my_logit.coef_, my_logit.score(X_train, y_train)

logit4, coefs, accuracy = my_logit(X_train4)
print(coefs, accuracy)

[[-2.37681345]] 0.7847082494969819


In [31]:
train.columns

Index(['survived', 'pclass', 'age', 'sibsp', 'parch', 'fare', 'alone',
       'sex_male', 'embark_town_Queenstown', 'embark_town_Southampton',
       'baseline_prediction'],
      dtype='object')

In [32]:
X_train5 = train[['sibsp', 'parch', 'alone', 'embark_town_Queenstown', 'embark_town_Southampton']]

logit5, coefs, accuracy = my_logit(X_train5)
print(coefs, accuracy)


[[-0.58064188 -0.08586553 -1.81001298  0.11658434 -0.35790445]] 0.6941649899396378


In [33]:
X_train6 = train[['alone', 'sex_male', 'pclass']]

logit6, coefs, accuracy = my_logit(X_train6)
print(coefs, accuracy)


[[-0.30828946 -2.40744024 -0.95701015]] 0.7847082494969819


In [34]:
#5 Use you best 3 models to predict and evaluate on your validate sample.

X_validate1 = validate.drop(columns=['survived'])
X_validate4 = validate[['sex_male']]
X_validate6 = validate[['alone', 'pclass', 'sex_male']]

y_validate = validate.survived

acc1 = logit1.score(X_validate1, y_validate)
acc4 = logit4.score(X_validate4, y_validate)
acc6 = logit6.score(X_validate6, y_validate)

print(acc1, acc4, acc6)

ValueError: X has 9 features per sample; expecting 10

In [35]:
#6 Choose you best model from the validation performation, and evaluate 
#it on the test dataset. How do the performance metrics compare to 
#validate? to train?

X_test1 = test.drop(columns=['survived'])
y_test = test.survived

test_acc = logit1.score(X_test1, y_test)
y1_pred = logit1.predict(X_test1)

print(test_acc)
print("test report:\n", classification_report(y_test, y1_pred))

ValueError: X has 9 features per sample; expecting 10

In [None]:
#######
# Decision Tree

#Fit the decision tree classifier to your training sample and transform 
#(i.e. make predictions on the training sample)

