In [1]:
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import acquire, split_scale

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier, export_graphviz

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = acquire.get_titanic_data()

In [3]:
df.drop(columns=['deck'], inplace=True)
df.embark_town = df.embark_town.fillna('Southampton')
df.embarked = df.embarked.fillna('S')

imputer = SimpleImputer(strategy='mean')
imputer.fit(df[['age']])
df.age = imputer.transform(df[['age']])
df.age = df.age.round(1)

In [4]:
encoder = LabelEncoder()
encoder.fit(df.sex)
df['sex_encoded'] = encoder.transform(df.sex)

In [5]:
encoder = OneHotEncoder(sparse=False)
encoder.fit_transform(df[['embarked']])
cols = [c for c in encoder.categories_[0]]
m_train = encoder.transform(df[['embarked']])
encoded_train = pd.DataFrame(m_train, columns=cols, index=df.index)
df = df.join(encoded_train).drop(columns='embarked')

In [6]:
train, test = split_scale.split_my_data(df, .8)
train, validate = split_scale.split_my_data(train, .8)

In [7]:
train.shape, validate.shape, test.shape

((569, 15), (143, 15), (179, 15))

In [8]:
X_train_2 = train[['pclass','fare']]
y_train = train[['survived']]
model2 = LogisticRegression(random_state=42).fit(X_train_2, y_train)
# y_pred = model2.predict(X_train_2)
# y_pred_prob = model2.predict_proba(X_train_2)
model2.score(X_train_2, y_train)

0.687170474516696

In [9]:
X_val = validate[['pclass','fare']]
y_val = validate[['survived']]
model2.score(X_val, y_val)

0.6153846153846154

In [10]:
X_train_3 = train[['pclass','fare','age']]
model3 = LogisticRegression(random_state=42).fit(X_train_3, y_train)
model3.score(X_train_3, y_train)

0.7012302284710018

In [11]:
X_val = validate[['pclass','fare','age']]
model3.score(X_val, y_val)

0.6643356643356644

In [12]:
X_train_4 = train[['pclass','fare','age','sex_encoded']]
model4 = LogisticRegression(random_state=42).fit(X_train_4, y_train)
model4.score(X_train_4, y_train)

0.7855887521968365

In [13]:
X_val = validate[['pclass','fare','age','sex_encoded']]
model4.score(X_val, y_val)

0.7972027972027972

In [14]:
X_train_5 = train[['pclass','fare','age','sex_encoded',
                  'C','Q','S']]
model5 = LogisticRegression(random_state=42).fit(X_train_5, y_train)
model5.score(X_train_5, y_train)

0.789103690685413

In [15]:
X_val = validate[['pclass','fare','age','sex_encoded',
                 'C','Q','S']]
model5.score(X_val, y_val)

0.8111888111888111

In [16]:
X_test = test[['pclass','fare','age','sex_encoded',
                  'C','Q','S']]
y_test = test.survived
model5.score(X_test, y_test)

0.7877094972067039

I do not believe my data is overfit, because the score is not significantly lower than it was for validate and train

If the missing values in age were dropped rather than imputed, it drastically lowers the numnber of observations available to work with, making the validate data nearly useless

Encoding sex with OneHot shouldn't affect the score much more than with LabelEncoder, because it is a binary choice.

In [17]:
model5 = LogisticRegression(random_state=42)\
    .fit(X_train_5, y_train)
print(f'{model5.score(X_train_5, y_train):.3f}, ' 
      f'{model5.score(X_val, y_val):.3f}')

0.789, 0.811


In [18]:
model5 = LogisticRegression(C = .01, random_state=42)\
    .fit(X_train_5, y_train)
print(f'{model5.score(X_train_5, y_train):.3f}, ' 
      f'{model5.score(X_val, y_val):.3f}')

0.715, 0.685


In [19]:
model5 = LogisticRegression(C = .1, random_state=42)\
    .fit(X_train_5, y_train)
print(f'{model5.score(X_train_5, y_train):.3f}, ' 
      f'{model5.score(X_val, y_val):.3f}')

0.773, 0.811


In [20]:
model5 = LogisticRegression(C = 1, random_state=42)\
    .fit(X_train_5, y_train)
print(f'{model5.score(X_train_5, y_train):.3f}, ' 
      f'{model5.score(X_val, y_val):.3f}')

0.789, 0.811


In [21]:
model5 = LogisticRegression(C = 10, random_state=42)\
    .fit(X_train_5, y_train)
print(f'{model5.score(X_train_5, y_train):.3f}, ' 
      f'{model5.score(X_val, y_val):.3f}')

0.782, 0.811


In [22]:
model5 = LogisticRegression(C = 100, random_state=42)\
    .fit(X_train_5, y_train)
print(f'{model5.score(X_train_5, y_train):.3f}, ' 
      f'{model5.score(X_val, y_val):.3f}')

0.780, 0.811


In [23]:
model5 = LogisticRegression(C = 1000, random_state=42)\
    .fit(X_train_5, y_train)
print(f'{model5.score(X_train_5, y_train):.3f}, ' 
      f'{model5.score(X_val, y_val):.3f}')

0.780, 0.811


# Decision Tree

In [24]:
train.shape, validate.shape, test.shape

((569, 15), (143, 15), (179, 15))

In [25]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,class,embark_town,alone,sex_encoded,C,Q,S
517,517,0,3,male,29.7,0,0,24.15,Third,Queenstown,1,1,0.0,1.0,0.0
792,792,0,3,female,29.7,8,2,69.55,Third,Southampton,0,0,0.0,0.0,1.0
472,472,1,2,female,33.0,1,2,27.75,Second,Southampton,0,0,0.0,0.0,1.0
483,483,1,3,female,63.0,0,0,9.5875,Third,Southampton,1,0,0.0,0.0,1.0
9,9,1,2,female,14.0,1,0,30.0708,Second,Cherbourg,0,0,1.0,0.0,0.0


In [26]:
features = ['pclass','sex_encoded','age','fare','C','Q','S']
X_train = train[features]
y_train = train.survived
X_validate = validate[features]
y_validate = validate.survived
X_test = test[features]
y_test = test.survived

In [27]:
clf = DecisionTreeClassifier(max_depth=5, random_state=42)\
      .fit(X_train, y_train)

In [28]:
y_pred = clf.predict(X_train)

In [29]:
clf.score(X_train, y_train), clf.score(X_validate, y_validate),\
clf.score(X_test, y_test)

(0.8488576449912126, 0.8461538461538461, 0.7988826815642458)

In [30]:
report = pd.DataFrame(classification_report(y_train, y_pred, 
                                   output_dict=True)).T
report

Unnamed: 0,precision,recall,f1-score,support
0,0.821853,0.969188,0.88946,357.0
1,0.925676,0.646226,0.761111,212.0
accuracy,0.848858,0.848858,0.848858,0.848858
macro avg,0.873764,0.807707,0.825286,569.0
weighted avg,0.860535,0.848858,0.841639,569.0


In [31]:
labels = sorted(y_train.unique().astype(str))
predicted_labels = [name + " predicted" for name in labels]
conf = pd.DataFrame(confusion_matrix(y_train, y_pred), 
                    index=labels, columns=[predicted_labels])
conf.index.name = "actual"
conf

Unnamed: 0_level_0,0 predicted,1 predicted
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,346,11
1,75,137


In [32]:
print(report.iloc[1])
print(f'\nAccuracy is {(report.loc["accuracy"][0] * 100):.1f}%')
C = confusion_matrix(y_train, y_pred)
C = pd.DataFrame(C / C.astype(float).sum())
C.columns = ['predicted 0', 'predicted 1']
C.index = ['actual 0', 'actual 1']
print(f'True negative: {(C.iloc[0][0] * 100):.1f}%, '
      f'false negative: {(C.iloc[0][1] * 100):.1f}%, '
      f'true positive: {(C.iloc[1][1] * 100):.1f}%,  '
      f'false positive: {(C.iloc[1][0] * 100):.1f}%')

precision      0.925676
recall         0.646226
f1-score       0.761111
support      212.000000
Name: 1, dtype: float64

Accuracy is 84.9%
True negative: 60.8%, false negative: 1.9%, true positive: 24.1%,  false positive: 13.2%


In [33]:
clf = DecisionTreeClassifier(criterion='entropy', max_depth=5,\
                             random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_train)
report = pd.DataFrame(classification_report(y_train, y_pred, 
                                   output_dict=True)).T
labels = sorted(y_train.unique().astype(str))
predicted_labels = [name + " predicted" for name in labels]
conf = pd.DataFrame(confusion_matrix(y_train, y_pred), 
                    index=labels, columns=[predicted_labels])
conf.index.name = "actual"

print(report.iloc[1])
print(f'\nAccuracy is {(report.loc["accuracy"][0] * 100):.1f}%')
C = confusion_matrix(y_train, y_pred)
C = pd.DataFrame(C / C.astype(float).sum())
C.columns = ['predicted 0', 'predicted 1']
C.index = ['actual 0', 'actual 1']
print(f'True negative: {(C.iloc[0][0] * 100):.1f}%, '
      f'false negative: {(C.iloc[0][1] * 100):.1f}%, '
      f'true positive: {(C.iloc[1][1] * 100):.1f}%,  '
      f'false positive: {(C.iloc[1][0] * 100):.1f}%')

precision      0.925676
recall         0.646226
f1-score       0.761111
support      212.000000
Name: 1, dtype: float64

Accuracy is 84.9%
True negative: 60.8%, false negative: 1.9%, true positive: 24.1%,  false positive: 13.2%
