In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from pydataset import data
import acquire
import prepare

In [30]:

def prep_titanic():
    titanic_db = acquire.get_titanic_data()
    titanic_db = titanic_db.drop(columns=['embarked', 'class', 'deck'])
    dummy_titanic_db = pd.get_dummies(titanic_db[['sex', 'embark_town']], dummy_na=False, drop_first = [True])#, True])
    titanic_db = pd.concat([titanic_db, dummy_titanic_db], axis=1)
    titanic_db = titanic_db.drop(columns=['Unnamed: 0', 'sex', 'embark_town'])
    return titanic_db

titanic = prep_titanic()
titanic.describe()

Unnamed: 0,passenger_id,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,445.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208,0.602694,0.647587,0.08642,0.722783
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429,0.489615,0.47799,0.281141,0.447876
min,0.0,0.0,1.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,222.5,0.0,2.0,20.125,0.0,0.0,7.9104,0.0,0.0,0.0,0.0
50%,445.0,0.0,3.0,28.0,0.0,0.0,14.4542,1.0,1.0,0.0,1.0
75%,667.5,1.0,3.0,38.0,1.0,0.0,31.0,1.0,1.0,0.0,1.0
max,890.0,1.0,3.0,80.0,8.0,6.0,512.3292,1.0,1.0,1.0,1.0


Create a model that includes only age, fare, and pclass. Does this model perform better than your baseline?

In [31]:
titanic = titanic.drop(columns=['passenger_id', 'sibsp', 'parch', 'alone', 'sex_male', 'embark_town_Queenstown', 'embark_town_Southampton'])

In [32]:
titanic = titanic.dropna()

In [33]:
titanic.describe()

Unnamed: 0,survived,pclass,age,fare
count,714.0,714.0,714.0,714.0
mean,0.406162,2.236695,29.699118,34.694514
std,0.49146,0.83825,14.526497,52.91893
min,0.0,1.0,0.42,0.0
25%,0.0,1.0,20.125,8.05
50%,0.0,2.0,28.0,15.7417
75%,1.0,3.0,38.0,33.375
max,1.0,3.0,80.0,512.3292


In [34]:
train, validate, test = prepare.split_data(titanic, 'survived')

In [35]:
X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [48]:
logit = LogisticRegression(C=1, random_state=823, intercept_scaling=1, solver='lbfgs')

In [49]:
logit.fit(X_train, y_train)

In [50]:
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[-1.03653957e+00 -3.24465365e-02  7.12689413e-04]]
Intercept: 
 [2.82618957]


In [51]:
odds = np.exp(logit.coef_)
odds

array([[0.35467991, 0.96807421, 1.00071294]])

In [52]:
y_pred = logit.predict(X_train)

In [53]:
y_pred_proba = logit.predict_proba(X_train)

In [54]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.68


In [55]:
print(confusion_matrix(y_train, y_pred))

[[208  46]
 [ 91  83]]


In [56]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.70      0.82      0.75       254
           1       0.64      0.48      0.55       174

    accuracy                           0.68       428
   macro avg       0.67      0.65      0.65       428
weighted avg       0.67      0.68      0.67       428



Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.


In [83]:
titanic = prep_titanic()
titanic = titanic.dropna()
titanic.describe()

Unnamed: 0,passenger_id,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
count,714.0,714.0,714.0,714.0,714.0,714.0,714.0,714.0,714.0,714.0,714.0
mean,447.582633,0.406162,2.236695,29.699118,0.512605,0.431373,34.694514,0.565826,0.634454,0.039216,0.77591
std,259.119524,0.49146,0.83825,14.526497,0.929783,0.853289,52.91893,0.495995,0.481921,0.194244,0.417274
min,0.0,0.0,1.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,221.25,0.0,1.0,20.125,0.0,0.0,8.05,0.0,0.0,0.0,1.0
50%,444.0,0.0,2.0,28.0,0.0,0.0,15.7417,1.0,1.0,0.0,1.0
75%,676.75,1.0,3.0,38.0,1.0,1.0,33.375,1.0,1.0,0.0,1.0
max,890.0,1.0,3.0,80.0,5.0,6.0,512.3292,1.0,1.0,1.0,1.0


In [84]:
titanic = titanic.drop(columns=['passenger_id', 'sibsp', 'parch', 'alone', 'embark_town_Queenstown', 'embark_town_Southampton'])

In [85]:
train, validate, test = prepare.split_data(titanic, 'survived')

In [86]:
X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [124]:
logit = LogisticRegression(C=10, random_state=823, intercept_scaling=1, solver='lbfgs')

In [125]:
logit.fit(X_train, y_train)

In [126]:
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[-1.18092589e+00 -2.81775488e-02 -1.15520026e-03 -2.90455383e+00]]
Intercept: 
 [4.81081415]


In [127]:
odds = np.exp(logit.coef_)
odds

array([[0.30699436, 0.97221574, 0.99884547, 0.05477322]])

In [128]:
y_pred = logit.predict(X_train)

In [129]:
y_pred_proba = logit.predict_proba(X_train)

In [130]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.82


In [131]:
print(confusion_matrix(y_train, y_pred))

[[222  32]
 [ 46 128]]


In [132]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.87      0.85       254
           1       0.80      0.74      0.77       174

    accuracy                           0.82       428
   macro avg       0.81      0.80      0.81       428
weighted avg       0.82      0.82      0.82       428




Try out other combinations of features and models.

In [152]:
#all the stuff from prepare
titanic = prep_titanic()
titanic = titanic.dropna()
titanic = titanic.drop(columns=['passenger_id'])
titanic.describe()

train, validate, test = prepare.split_data(titanic, 'survived')

X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

logit = LogisticRegression(C=10, random_state=823, intercept_scaling=1, solver='lbfgs')

logit.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [153]:
titanic

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,22.0,1,0,7.2500,0,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,0,0
2,1,3,26.0,0,0,7.9250,1,0,0,1
3,1,1,35.0,1,0,53.1000,0,0,0,1
4,0,3,35.0,0,0,8.0500,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...
885,0,3,39.0,0,5,29.1250,0,0,1,0
886,0,2,27.0,0,0,13.0000,1,1,0,1
887,1,1,19.0,0,0,30.0000,1,0,0,1
889,1,1,26.0,0,0,30.0000,1,1,0,0


In [154]:
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)
odds = np.exp(logit.coef_)
print('Odds: \n', odds)

Coefficient: 
 [[-1.12507181e+00 -3.11187679e-02 -5.93575156e-01  2.43720245e-02
   1.30290637e-03 -3.43366456e-01 -2.83117226e+00  3.90838799e-01
   5.94576068e-01]]
Intercept: 
 [4.61997987]
Odds: 
 [[0.32462915 0.96936044 0.55234902 1.02467145 1.00130376 0.70937821
  0.05894372 1.4782202  1.81226251]]


In [155]:
y_pred = logit.predict(X_train)
y_pred[0:5]

array([1, 0, 0, 0, 1])

In [156]:
y_pred_proba = logit.predict_proba(X_train)
y_pred_proba

array([[0.07804646, 0.92195354],
       [0.68733779, 0.31266221],
       [0.93616095, 0.06383905],
       [0.88164334, 0.11835666],
       [0.33245121, 0.66754879],
       [0.8997226 , 0.1002774 ],
       [0.89414501, 0.10585499],
       [0.88802034, 0.11197966],
       [0.91056151, 0.08943849],
       [0.81623015, 0.18376985],
       [0.80935474, 0.19064526],
       [0.73202277, 0.26797723],
       [0.12118754, 0.87881246],
       [0.37109478, 0.62890522],
       [0.75523893, 0.24476107],
       [0.08841139, 0.91158861],
       [0.10854334, 0.89145666],
       [0.8506704 , 0.1493296 ],
       [0.62360778, 0.37639222],
       [0.47907751, 0.52092249],
       [0.39868961, 0.60131039],
       [0.98417053, 0.01582947],
       [0.76153756, 0.23846244],
       [0.69535825, 0.30464175],
       [0.2501665 , 0.7498335 ],
       [0.05224907, 0.94775093],
       [0.32567085, 0.67432915],
       [0.74342801, 0.25657199],
       [0.89296557, 0.10703443],
       [0.86821167, 0.13178833],
       [0.

In [157]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.82


In [158]:
print(confusion_matrix(y_train, y_pred))

[[221  33]
 [ 46 128]]


In [159]:
pd.DataFrame(confusion_matrix(y_train, y_pred))

Unnamed: 0,0,1
0,221,33
1,46,128


In [160]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.87      0.85       254
           1       0.80      0.74      0.76       174

    accuracy                           0.82       428
   macro avg       0.81      0.80      0.81       428
weighted avg       0.81      0.82      0.81       428



Drop embark info

In [177]:
titanic = prep_titanic()
titanic = titanic.dropna()
titanic = titanic.drop(columns=['passenger_id', 'embark_town_Queenstown', 'embark_town_Southampton'])
titanic.describe()

train, validate, test = prepare.split_data(titanic, 'survived')

X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

logit = LogisticRegression(C=10, random_state=823, intercept_scaling=1, solver='lbfgs')

logit.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [169]:
titanic

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male
0,0,3,22.0,1,0,7.2500,0,1
1,1,1,38.0,1,0,71.2833,0,0
2,1,3,26.0,0,0,7.9250,1,0
3,1,1,35.0,1,0,53.1000,0,0
4,0,3,35.0,0,0,8.0500,1,1
...,...,...,...,...,...,...,...,...
885,0,3,39.0,0,5,29.1250,0,0
886,0,2,27.0,0,0,13.0000,1,1
887,1,1,19.0,0,0,30.0000,1,0
889,1,1,26.0,0,0,30.0000,1,1


In [170]:
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)
odds = np.exp(logit.coef_)
print('Odds: \n', odds)

Coefficient: 
 [[-1.22087939e+00 -3.38843366e-02 -4.96454992e-01  5.30201805e-04
  -1.11118922e-03 -4.33834390e-01 -2.95292872e+00]]
Intercept: 
 [5.61222555]
Odds: 
 [[0.29497066 0.96668331 0.60868463 1.00053034 0.99888943 0.64801957
  0.05218664]]


In [171]:
y_pred = logit.predict(X_train)
y_pred[0:5]

array([1, 0, 0, 0, 1])

In [172]:
y_pred_proba = logit.predict_proba(X_train)
y_pred_proba

array([[0.04355165, 0.95644835],
       [0.70511308, 0.29488692],
       [0.93916989, 0.06083011],
       [0.89958589, 0.10041411],
       [0.34841065, 0.65158935],
       [0.91655952, 0.08344048],
       [0.91109782, 0.08890218],
       [0.91536275, 0.08463725],
       [0.92624702, 0.07375298],
       [0.8106623 , 0.1893377 ],
       [0.72982025, 0.27017975],
       [0.6502213 , 0.3497787 ],
       [0.11780381, 0.88219619],
       [0.39246738, 0.60753262],
       [0.77682726, 0.22317274],
       [0.04789421, 0.95210579],
       [0.10443114, 0.89556886],
       [0.85378259, 0.14621741],
       [0.59889776, 0.40110224],
       [0.28691907, 0.71308093],
       [0.27867149, 0.72132851],
       [0.9789922 , 0.0210078 ],
       [0.78217359, 0.21782641],
       [0.71259054, 0.28740946],
       [0.22758842, 0.77241158],
       [0.04413217, 0.95586783],
       [0.21013063, 0.78986937],
       [0.76495838, 0.23504162],
       [0.88977831, 0.11022169],
       [0.88652286, 0.11347714],
       [0.

In [173]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train, y_train)))
print(confusion_matrix(y_train, y_pred))
pd.DataFrame(confusion_matrix(y_train, y_pred))

Accuracy of Logistic Regression classifier on training set: 0.81
[[218  36]
 [ 45 129]]


Unnamed: 0,0,1
0,218,36
1,45,129


In [174]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.86      0.84       254
           1       0.78      0.74      0.76       174

    accuracy                           0.81       428
   macro avg       0.81      0.80      0.80       428
weighted avg       0.81      0.81      0.81       428



In [106]:
Use you best 3 models to predict and evaluate on your validate sample.

SyntaxError: invalid syntax (4199409991.py, line 1)

In [None]:
Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?