In [1]:
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import acquire, split_scale
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the Titanic passenger data through the project-local `acquire` module.
# NOTE(review): presumably returns a pandas DataFrame (later cells call
# .drop/.fillna/.join on it) -- confirm the data source in acquire.py.
df = acquire.get_titanic_data()

In [3]:
# Clean the raw frame: drop `deck`, fill the missing embarkation values with
# Southampton / 'S', and mean-impute missing ages.
#
# The frame is rebound instead of mutated with inplace=True, and the drop
# tolerates an already-removed `deck` column, so this cell can be re-run
# without raising KeyError.
df = (
    df.drop(columns=['deck'], errors='ignore')
      .assign(
          embark_town=lambda d: d['embark_town'].fillna('Southampton'),
          embarked=lambda d: d['embarked'].fillna('S'),
      )
)

# NOTE(review): the imputer is fit on the full data set, before the
# train/validate/test split performed later in the notebook, so the mean age
# includes validate/test rows (mild data leakage). Consider fitting on the
# training split only.
imputer = SimpleImputer(strategy='mean')
# fit_transform returns an (n, 1) array; flatten it so a 1-D column is assigned.
df['age'] = imputer.fit_transform(df[['age']]).ravel().round(1)

In [4]:
# Add `sex_encoded`: integer codes for the `sex` column, learned and applied
# in a single fit_transform call.
encoder = LabelEncoder()
df['sex_encoded'] = encoder.fit_transform(df['sex'])

In [5]:
# One-hot encode `embarked` into one indicator column per category and
# replace the original column with them.
#
# The encoder is created with its default (sparse) output and densified with
# .toarray(): the `sparse=` keyword was renamed `sparse_output` in
# scikit-learn 1.2 and removed in 1.4, whereas this form works on every
# version. The fit_transform result is used directly instead of being
# discarded and recomputed with a second transform() call.
encoder = OneHotEncoder()
m_train = encoder.fit_transform(df[['embarked']]).toarray()
# Category labels in the same order as the encoder's output columns.
cols = list(encoder.categories_[0])
encoded_train = pd.DataFrame(m_train, columns=cols, index=df.index)
df = df.join(encoded_train).drop(columns='embarked')

In [6]:
# Two-stage split: first carve `test` off the full frame, then carve
# `validate` off what remains in `train`.
# NOTE(review): the second argument (.8) is presumably the training fraction;
# the recorded shapes (569 / 143 / 179 rows) are consistent with 80/20 twice.
# Confirm in split_scale.py, and confirm the split is seeded so the scores
# below are reproducible.
train, test = split_scale.split_my_data(df, .8)
train, validate = split_scale.split_my_data(train, .8)

In [7]:
# Row/column counts of the three splits, in train, validate, test order.
tuple(part.shape for part in (train, validate, test))

((569, 15), (143, 15), (179, 15))

In [8]:
# Baseline model: logistic regression on passenger class and fare only.
X_train_2 = train[['pclass', 'fare']]
# Use a 1-D Series as the target. A one-column DataFrame makes scikit-learn
# emit a column-vector DataConversionWarning (hidden here by the global
# warnings filter) and differs from how y_test is built later in the notebook.
y_train = train['survived']
model2 = LogisticRegression(random_state=42).fit(X_train_2, y_train)
# Mean accuracy on the training split.
model2.score(X_train_2, y_train)

0.687170474516696

In [9]:
# Score the two-feature model on the validation split.
X_val = validate[['pclass', 'fare']]
# 1-D Series target (not a one-column DataFrame), which is the shape
# scikit-learn expects for y.
y_val = validate['survived']
model2.score(X_val, y_val)

0.6153846153846154

In [10]:
# Second model: add `age` to the class/fare features.
X_train_3 = train.loc[:, ['pclass', 'fare', 'age']]
model3 = LogisticRegression(random_state=42)
model3.fit(X_train_3, y_train)
# Mean accuracy on the training split.
model3.score(X_train_3, y_train)

0.7012302284710018

In [11]:
# Validation accuracy of the three-feature model.
X_val = validate.loc[:, ['pclass', 'fare', 'age']]
model3.score(X_val, y_val)

0.6643356643356644

In [12]:
# Third model: class, fare, age plus the label-encoded sex column.
X_train_4 = train.loc[:, ['pclass', 'fare', 'age', 'sex_encoded']]
model4 = LogisticRegression(random_state=42)
model4.fit(X_train_4, y_train)
# Mean accuracy on the training split.
model4.score(X_train_4, y_train)

0.7855887521968365

In [13]:
# Validation accuracy of the four-feature model.
X_val = validate.loc[:, ['pclass', 'fare', 'age', 'sex_encoded']]
model4.score(X_val, y_val)

0.7972027972027972

In [14]:
# Fourth model: the previous features plus the one-hot embarkation
# indicators (C, Q, S).
X_train_5 = train.loc[
    :, ['pclass', 'fare', 'age', 'sex_encoded', 'C', 'Q', 'S']
]
model5 = LogisticRegression(random_state=42)
model5.fit(X_train_5, y_train)
# Mean accuracy on the training split.
model5.score(X_train_5, y_train)

0.789103690685413

In [15]:
# Validation accuracy of the seven-feature model. X_val keeps these seven
# columns for the cells that follow.
X_val = validate.loc[
    :, ['pclass', 'fare', 'age', 'sex_encoded', 'C', 'Q', 'S']
]
model5.score(X_val, y_val)

0.8111888111888111

In [16]:
# Final check: accuracy of the seven-feature model on the held-out test split.
X_test = test.loc[
    :, ['pclass', 'fare', 'age', 'sex_encoded', 'C', 'Q', 'S']
]
y_test = test['survived']
model5.score(X_test, y_test)

0.7877094972067039

I do not believe my model is overfit, because the test score (0.788) is not significantly lower than the train (0.789) and validate (0.811) scores.

If the missing values in age were dropped rather than imputed, it would drastically lower the number of observations available to work with, making the validate data nearly useless.

Encoding sex with OneHotEncoder instead of LabelEncoder shouldn't change the score much, because sex is a binary feature.

In [17]:
# Regularization sweep, step 1: refit the seven-feature model with the
# default C and print train / validate accuracy.
model5 = LogisticRegression(random_state=42)
model5.fit(X_train_5, y_train)
print(f'{model5.score(X_train_5, y_train):.3f}, {model5.score(X_val, y_val):.3f}')

0.789, 0.811


In [18]:
# Regularization sweep: C = .01 (C is the inverse regularization strength,
# so this is the most strongly regularized fit). Prints train / validate accuracy.
model5 = LogisticRegression(C=.01, random_state=42)
model5.fit(X_train_5, y_train)
print(f'{model5.score(X_train_5, y_train):.3f}, {model5.score(X_val, y_val):.3f}')

0.715, 0.685


In [19]:
# Regularization sweep: C = .1. Prints train / validate accuracy.
model5 = LogisticRegression(C=.1, random_state=42)
model5.fit(X_train_5, y_train)
print(f'{model5.score(X_train_5, y_train):.3f}, {model5.score(X_val, y_val):.3f}')

0.773, 0.811


In [20]:
# Regularization sweep: C = 1. Prints train / validate accuracy.
model5 = LogisticRegression(C=1, random_state=42)
model5.fit(X_train_5, y_train)
print(f'{model5.score(X_train_5, y_train):.3f}, {model5.score(X_val, y_val):.3f}')

0.789, 0.811


In [21]:
# Regularization sweep: C = 10. Prints train / validate accuracy.
model5 = LogisticRegression(C=10, random_state=42)
model5.fit(X_train_5, y_train)
print(f'{model5.score(X_train_5, y_train):.3f}, {model5.score(X_val, y_val):.3f}')

0.782, 0.811


In [22]:
# Regularization sweep: C = 100. Prints train / validate accuracy.
model5 = LogisticRegression(C=100, random_state=42)
model5.fit(X_train_5, y_train)
print(f'{model5.score(X_train_5, y_train):.3f}, {model5.score(X_val, y_val):.3f}')

0.780, 0.811


In [23]:
# Regularization sweep: C = 1000 (weakest regularization tried). Prints
# train / validate accuracy. `model5` is left bound to this last fit.
model5 = LogisticRegression(C=1000, random_state=42)
model5.fit(X_train_5, y_train)
print(f'{model5.score(X_train_5, y_train):.3f}, {model5.score(X_val, y_val):.3f}')

0.780, 0.811
