In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

import acquire
from prepare import tts

# Create a model that includes only age, fare, and pclass. Does this model perform better than your baseline?



In [2]:
# function to get the titanic data from my acquire file

def prep_titanic():
    titan=acquire.get_titanic()
    titan.drop(columns=['passenger_id', 'embarked', 'deck', 'class'], inplace=True)
    dummy_var=pd.get_dummies(titan[['embark_town', 'sex']], drop_first=True)
    titan=pd.concat([titan, dummy_var], axis=1)
    return titan

#assigning the titanic data to a dataframe

df=prep_titanic()

In [3]:
#dropping columns that have been split into dummy variables

df = df.drop(columns=['embark_town', 'sex'])
df.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,embark_town_Queenstown,embark_town_Southampton,sex_male
0,0,3,22.0,1,0,7.25,0,0,1,1
1,1,1,38.0,1,0,71.2833,0,0,0,0
2,1,3,26.0,0,0,7.925,1,0,1,0
3,1,1,35.0,1,0,53.1,0,0,1,0
4,0,3,35.0,0,0,8.05,1,0,1,1


In [4]:
df=df.dropna()

In [5]:
df.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,embark_town_Queenstown,embark_town_Southampton,sex_male
0,0,3,22.0,1,0,7.25,0,0,1,1
1,1,1,38.0,1,0,71.2833,0,0,0,0
2,1,3,26.0,0,0,7.925,1,0,1,0
3,1,1,35.0,1,0,53.1,0,0,1,0
4,0,3,35.0,0,0,8.05,1,0,1,1


In [6]:
#establishing basline
df['baseline']= 0

In [7]:
print(classification_report(df['survived'], df['baseline']))

              precision    recall  f1-score   support

           0       0.59      1.00      0.75       424
           1       0.00      0.00      0.00       290

    accuracy                           0.59       714
   macro avg       0.30      0.50      0.37       714
weighted avg       0.35      0.59      0.44       714



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
t_train, t_val, t_test = tts(df)

stratify=survived


### Getting the columns we want from the split and making them dataframes

In [9]:
t_train1= t_train['survived'], t_train['age'], t_train['fare'], t_train['pclass']

In [10]:
t_train1=pd.DataFrame(t_train1).T

In [11]:
t_val1=t_val['survived'], t_val['age'], t_val['fare'], t_val['pclass']

In [12]:
t_val1= pd.DataFrame(t_val1).T

In [13]:
t_test1=t_test['survived'], t_test['age'], t_test['fare'], t_test['pclass']

In [14]:
t_test1=pd.DataFrame(t_test1).T

In [27]:
t_train1.head()

Unnamed: 0,survived,age,fare,pclass
809,1.0,33.0,53.1,1.0
817,0.0,31.0,37.0042,2.0
369,1.0,24.0,69.3,1.0
840,0.0,20.0,7.925,3.0
120,0.0,21.0,73.5,2.0


In [15]:
#Assigning the x and y for the model

X_train = t_train1.drop(columns=['survived'])
y_train = t_train1['survived']

X_val = t_val1.drop(columns=['survived'])
y_val = t_val1['survived']

X_test = t_test1.drop(columns=['survived'])
y_test = t_test1['survived']

In [28]:
X_train.head()

Unnamed: 0,age,fare,pclass
809,33.0,53.1,1.0
817,31.0,37.0042,2.0
369,24.0,69.3,1.0
840,20.0,7.925,3.0
120,21.0,73.5,2.0


In [36]:
#making the model
logit = LogisticRegression(C=1, 
                           # class_weight={0:1, 1:99}, 
                           random_state=8675309, 
                          intercept_scaling=1, solver='lbfgs')

In [37]:
#fitting the model
logit.fit(X_train, y_train)

In [38]:
#generating predictions
y_pred1=logit.predict(X_train)

In [39]:
#getting accuracy
logit.score(X_train, y_train)

#this accuracy is much better than the baseline (.59)

0.7268170426065163

# Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.



In [47]:
#assigning train, validate, test for second model

t_train2= t_train['survived'], t_train['age'], t_train['fare'], t_train['pclass'], t_train['sex_male']
t_train2=pd.DataFrame(t_train2).T

t_val2=t_val['survived'], t_val['age'], t_val['fare'], t_val['pclass'], t_val['sex_male']
t_val2= pd.DataFrame(t_val2).T

t_test2=t_test['survived'], t_test['age'], t_test['fare'], t_test['pclass'], t_test['sex_male']
t_test2=pd.DataFrame(t_test2).T

In [48]:
t_train2.head()

Unnamed: 0,survived,age,fare,pclass,sex_male
809,1.0,33.0,53.1,1.0,0.0
817,0.0,31.0,37.0042,2.0,1.0
369,1.0,24.0,69.3,1.0,0.0
840,0.0,20.0,7.925,3.0,1.0
120,0.0,21.0,73.5,2.0,1.0


In [49]:
X_train2 = t_train2.drop(columns=['survived'])
y_train2 = t_train2['survived']

X_val2 = t_val2.drop(columns=['survived'])
y_val2 = t_val2['survived']

X_test2 = t_test2.drop(columns=['survived'])
y_test2 = t_test2['survived']

In [50]:
#making the model
logit2 = LogisticRegression(C=1, 
                            # class_weight={0:1, 1:99}, 
                            random_state=8675309, 
                          intercept_scaling=1, solver='lbfgs')

In [51]:
#fitting the model
logit2.fit(X_train2, y_train2)

In [52]:
#making the predictions
y_pred2=logit2.predict(X_train2)

In [53]:
#accuracy
logit2.score(X_train2, y_train2)

0.7944862155388471

# Try out other combinations of features and models.



# sibsp and alone

In [54]:
#assigning train, validate, test for second model

t_train3= t_train['survived'], t_train['sibsp'], t_train['alone']
t_train3=pd.DataFrame(t_train3).T

t_val3=t_val['survived'], t_val['sibsp'], t_val['alone']
t_val3= pd.DataFrame(t_val3).T

t_test3=t_test['survived'], t_test['sibsp'], t_test['alone']
t_test3=pd.DataFrame(t_test3).T

In [55]:
X_train3 = t_train3.drop(columns=['survived'])
y_train3 = t_train3['survived']

X_val3 = t_val3.drop(columns=['survived'])
y_val3 = t_val3['survived']

X_test3 = t_test3.drop(columns=['survived'])
y_test3 = t_test3['survived']

In [56]:
#making the model
logit3 = LogisticRegression(C=1, 
                            # class_weight={0:1, 1:99}, 
                            random_state=8675309, 
                          intercept_scaling=1, solver='lbfgs')

In [57]:
#fitting the model
logit3.fit(X_train3, y_train3)

In [58]:
#making the predictions
y_pred3=logit3.predict(X_train3)

In [59]:
#accuracy
logit3.score(X_train3, y_train3)

0.6666666666666666

# age, alone, fare

In [60]:
#assigning train, validate, test for second model

t_train4= t_train['survived'], t_train['age'], t_train['alone'], t_train['fare']
t_train4=pd.DataFrame(t_train4).T

t_val4=t_val['survived'], t_val['age'], t_val['alone'], t_val['fare']
t_val4= pd.DataFrame(t_val4).T

t_test4=t_test['survived'], t_test['age'], t_test['alone'], t_test['fare']
t_test4=pd.DataFrame(t_test4).T

In [61]:
X_train4 = t_train4.drop(columns=['survived'])
y_train4 = t_train4['survived']

X_val4 = t_val4.drop(columns=['survived'])
y_val4 = t_val4['survived']

X_test4 = t_test4.drop(columns=['survived'])
y_test4 = t_test4['survived']

In [62]:
#making the model
logit4 = LogisticRegression(C=1, 
                            # class_weight={0:1, 1:99}, 
                            random_state=8675309, 
                          intercept_scaling=1, solver='lbfgs')

In [63]:
#fitting the model
logit4.fit(X_train4, y_train4)

In [64]:
#making the predictions
y_pred4=logit4.predict(X_train4)

In [65]:
#accuracy
logit4.score(X_train4, y_train4)

0.6591478696741855

# sex, age, alone

In [88]:
#assigning train, validate, test for second model

t_train5= t_train['survived'], t_train['age'], t_train['alone'], t_train['sex_male']
t_train5=pd.DataFrame(t_train5).T

t_val5=t_val['survived'], t_val['age'], t_val['alone'], t_val['sex_male']
t_val5= pd.DataFrame(t_val5).T

t_test5=t_test['survived'], t_test['age'], t_test['alone'], t_test['sex_male']
t_test5=pd.DataFrame(t_test5).T

In [89]:
X_train5 = t_train5.drop(columns=['survived']).dropna()
y_train5 = t_train5['survived']

X_val5 = t_val5.drop(columns=['survived']).dropna()
y_val5 = t_val5['survived'].dropna()

X_test5 = t_test5.drop(columns=['survived']).dropna()
y_test5 = t_test5['survived']

In [90]:
#making the model
logit5 = LogisticRegression(C=1, 
                            # class_weight={0:1, 1:99}, 
                            random_state=8675309, 
                          intercept_scaling=1, solver='lbfgs')

In [91]:
#fitting the model
logit5.fit(X_train5, y_train5)

In [92]:
#making the predictions
y_pred5=logit5.predict(X_train5)

In [93]:
#accuracy
logit5.score(X_train5, y_train5)

0.7719298245614035

# Use you best 3 models to predict and evaluate on your validate sample.



In [94]:
# make predictions on validate sets
y_pred = logit.predict(X_val)
y_pred2 = logit2.predict(X_val2)
y_pred5 = logit5.predict(X_val5)

In [95]:
print(f'accuracy of first model: {logit.score(X_val, y_val)}')
print(f'accuracy of second model: {logit2.score(X_val2, y_val2)}')
print(f'accuracy of fifth model: {logit5.score(X_val5, y_val5)}')

accuracy of first model: 0.7034883720930233
accuracy of second model: 0.813953488372093
accuracy of fifth model: 0.7616279069767442


In [96]:
print(classification_report(y_pred, y_val))

              precision    recall  f1-score   support

         0.0       0.79      0.73      0.76       111
         1.0       0.57      0.66      0.61        61

    accuracy                           0.70       172
   macro avg       0.68      0.69      0.69       172
weighted avg       0.72      0.70      0.71       172



In [97]:
print(classification_report(y_pred2, y_val2))

              precision    recall  f1-score   support

         0.0       0.84      0.84      0.84       102
         1.0       0.77      0.77      0.77        70

    accuracy                           0.81       172
   macro avg       0.81      0.81      0.81       172
weighted avg       0.81      0.81      0.81       172



In [98]:
print(classification_report(y_pred5, y_val5))

              precision    recall  f1-score   support

         0.0       0.81      0.79      0.80       105
         1.0       0.69      0.72      0.70        67

    accuracy                           0.76       172
   macro avg       0.75      0.75      0.75       172
weighted avg       0.76      0.76      0.76       172



# Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?



In [99]:
y_pred2= logit2.predict(X_test2)

In [102]:
logit2.score(X_test2, y_test2)

0.7902097902097902

In [110]:
pd.DataFrame(classification_report(y_test2, y_pred2, output_dict=True, 
             target_names=['died', 'survived']))

Unnamed: 0,died,survived,accuracy,macro avg,weighted avg
precision,0.839506,0.725806,0.79021,0.782656,0.79339
recall,0.8,0.775862,0.79021,0.787931,0.79021
f1-score,0.819277,0.75,0.79021,0.784639,0.791179
support,85.0,58.0,0.79021,143.0,143.0


In [111]:
# the metrics are very close to the train and validate sets
# in some catagories the values are the same as the train or the validate (not exact)