In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import acquire, split_scale
from pydataset import data

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, \
            MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = acquire.get_titanic_data()

In [3]:
df.drop(columns=['deck'], inplace=True)
df.embark_town = df.embark_town.fillna('Southampton')
df.embarked = df.embarked.fillna('S')

imputer = SimpleImputer(strategy='mean')
imputer.fit(df[['age']])
df.age = imputer.transform(df[['age']])
df.age = df.age.round(1)

In [4]:
encoder = LabelEncoder()
encoder.fit(df.sex)
df['sex_encoded'] = encoder.transform(df.sex)

In [5]:
encoder = OneHotEncoder(sparse=False)
encoder.fit_transform(df[['embarked']])
cols = [c for c in encoder.categories_[0]]
m_train = encoder.transform(df[['embarked']])
encoded_train = pd.DataFrame(m_train, columns=cols, index=df.index)
df = df.join(encoded_train).drop(columns='embarked')

In [6]:
train, test = split_scale.split_my_data(df, .8)
train, validate = split_scale.split_my_data(train, .8)

In [7]:
train.shape, validate.shape, test.shape

((569, 15), (143, 15), (179, 15))

In [8]:
X_train_2 = train[['pclass','fare']]
y_train = train[['survived']]
model2 = LogisticRegression(random_state=42).fit(X_train_2, y_train)
# y_pred = model2.predict(X_train_2)
# y_pred_prob = model2.predict_proba(X_train_2)
model2.score(X_train_2, y_train)

0.687170474516696

In [9]:
X_val = validate[['pclass','fare']]
y_val = validate[['survived']]
model2.score(X_val, y_val)

0.6153846153846154

In [10]:
X_train_3 = train[['pclass','fare','age']]
model3 = LogisticRegression(random_state=42).fit(X_train_3, y_train)
model3.score(X_train_3, y_train)

0.7012302284710018

In [11]:
X_val = validate[['pclass','fare','age']]
model3.score(X_val, y_val)

0.6643356643356644

In [12]:
X_train_4 = train[['pclass','fare','age','sex_encoded']]
model4 = LogisticRegression(random_state=42).fit(X_train_4, y_train)
model4.score(X_train_4, y_train)

0.7855887521968365

In [13]:
X_val = validate[['pclass','fare','age','sex_encoded']]
model4.score(X_val, y_val)

0.7972027972027972

In [14]:
X_train_5 = train[['pclass','fare','age','sex_encoded',
                  'C','Q','S']]
model5 = LogisticRegression(random_state=42).fit(X_train_5, y_train)
model5.score(X_train_5, y_train)

0.789103690685413

In [15]:
X_val = validate[['pclass','fare','age','sex_encoded',
                 'C','Q','S']]
model5.score(X_val, y_val)

0.8111888111888111

In [16]:
X_test = test[['pclass','fare','age','sex_encoded',
                  'C','Q','S']]
y_test = test.survived
model5.score(X_test, y_test)

0.7877094972067039

I do not believe my data is overfit, because the score is not significantly lower than it was for validate and train

If the missing values in age were dropped rather than imputed, it drastically lowers the numnber of observations available to work with, making the validate data nearly useless

Encoding sex with OneHot shouldn't affect the score much more than with LabelEncoder, because it is a binary choice.

In [17]:
model5 = LogisticRegression(random_state=42)\
    .fit(X_train_5, y_train)
print(f'{model5.score(X_train_5, y_train):.3f}, ' 
      f'{model5.score(X_val, y_val):.3f}')

0.789, 0.811


In [18]:
model5 = LogisticRegression(C = .01, random_state=42)\
    .fit(X_train_5, y_train)
print(f'{model5.score(X_train_5, y_train):.3f}, ' 
      f'{model5.score(X_val, y_val):.3f}')

0.715, 0.685


In [19]:
model5 = LogisticRegression(C = .1, random_state=42)\
    .fit(X_train_5, y_train)
print(f'{model5.score(X_train_5, y_train):.3f}, ' 
      f'{model5.score(X_val, y_val):.3f}')

0.773, 0.811


In [20]:
model5 = LogisticRegression(C = 1, random_state=42)\
    .fit(X_train_5, y_train)
print(f'{model5.score(X_train_5, y_train):.3f}, ' 
      f'{model5.score(X_val, y_val):.3f}')

0.789, 0.811


In [21]:
model5 = LogisticRegression(C = 10, random_state=42)\
    .fit(X_train_5, y_train)
print(f'{model5.score(X_train_5, y_train):.3f}, ' 
      f'{model5.score(X_val, y_val):.3f}')

0.782, 0.811


In [22]:
model5 = LogisticRegression(C = 100, random_state=42)\
    .fit(X_train_5, y_train)
print(f'{model5.score(X_train_5, y_train):.3f}, ' 
      f'{model5.score(X_val, y_val):.3f}')

0.780, 0.811


In [23]:
model5 = LogisticRegression(C = 1000, random_state=42)\
    .fit(X_train_5, y_train)
print(f'{model5.score(X_train_5, y_train):.3f}, ' 
      f'{model5.score(X_val, y_val):.3f}')

0.780, 0.811


# Decision Tree

In [24]:
train.shape, validate.shape, test.shape

((569, 15), (143, 15), (179, 15))

In [25]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,class,embark_town,alone,sex_encoded,C,Q,S
517,517,0,3,male,29.7,0,0,24.15,Third,Queenstown,1,1,0.0,1.0,0.0
792,792,0,3,female,29.7,8,2,69.55,Third,Southampton,0,0,0.0,0.0,1.0
472,472,1,2,female,33.0,1,2,27.75,Second,Southampton,0,0,0.0,0.0,1.0
483,483,1,3,female,63.0,0,0,9.5875,Third,Southampton,1,0,0.0,0.0,1.0
9,9,1,2,female,14.0,1,0,30.0708,Second,Cherbourg,0,0,1.0,0.0,0.0


In [26]:
features = ['pclass','sex_encoded','age','fare','C','Q','S']
X_train = train[features]
y_train = train.survived
X_validate = validate[features]
y_validate = validate.survived
X_test = test[features]
y_test = test.survived

In [27]:
clf = DecisionTreeClassifier(max_depth=5, random_state=42)\
      .fit(X_train, y_train)

y_pred = clf.predict(X_train)

clf.score(X_train, y_train), clf.score(X_validate, y_validate),\
clf.score(X_test, y_test)

(0.8488576449912126, 0.8461538461538461, 0.7988826815642458)

In [28]:
report = pd.DataFrame(classification_report(y_train, y_pred, 
                                   output_dict=True)).T
report

Unnamed: 0,precision,recall,f1-score,support
0,0.821853,0.969188,0.88946,357.0
1,0.925676,0.646226,0.761111,212.0
accuracy,0.848858,0.848858,0.848858,0.848858
macro avg,0.873764,0.807707,0.825286,569.0
weighted avg,0.860535,0.848858,0.841639,569.0


In [29]:
labels = sorted(y_train.unique().astype(str))
predicted_labels = [name + " predicted" for name in labels]
conf = pd.DataFrame(confusion_matrix(y_train, y_pred), 
                    index=labels, columns=[predicted_labels])
conf.index.name = "actual"
conf

Unnamed: 0_level_0,0 predicted,1 predicted
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,346,11
1,75,137


In [30]:
print(report.iloc[1])
print(f'\nAccuracy is {(report.loc["accuracy"][0] * 100):.1f}%')
C = confusion_matrix(y_train, y_pred)
C = pd.DataFrame(C / C.astype(float).sum())
C.columns = ['predicted 0', 'predicted 1']
C.index = ['actual 0', 'actual 1']
print(f'True negative: {(C.iloc[0][0] * 100):.1f}%, '
      f'false negative: {(C.iloc[0][1] * 100):.1f}%, '
      f'true positive: {(C.iloc[1][1] * 100):.1f}%,  '
      f'false positive: {(C.iloc[1][0] * 100):.1f}%')

precision      0.925676
recall         0.646226
f1-score       0.761111
support      212.000000
Name: 1, dtype: float64

Accuracy is 84.9%
True negative: 60.8%, false negative: 1.9%, true positive: 24.1%,  false positive: 13.2%


In [31]:
def evaluate_results(y_pred, scaled=False):
    report = pd.DataFrame(classification_report(y_train, y_pred, 
                                   output_dict=True)).T
    labels = sorted(y_train.unique().astype(str))
    predicted_labels = [name + " predicted" for name in labels]
    conf = pd.DataFrame(confusion_matrix(y_train, y_pred), 
                        index=labels, columns=[predicted_labels])
    conf.index.name = "actual"

    print(report.iloc[1])
    C = confusion_matrix(y_train, y_pred)
    C = pd.DataFrame(C / C.astype(float).sum())
    C.columns = ['predicted 0', 'predicted 1']
    C.index = ['actual 0', 'actual 1']
    print(f'True negative: {(C.iloc[0][0] * 100):.1f}%, '
          f'false negative: {(C.iloc[0][1] * 100):.1f}%, '
          f'true positive: {(C.iloc[1][1] * 100):.1f}%,  '
          f'false positive: {(C.iloc[1][0] * 100):.1f}%')
    print(f'\nTrain accuracy is '\
          f'{(report.loc["accuracy"][0]*100):.3f}%')
    if scaled == False:
          print(f'Val accuracy: '\
                f'{(rf.score(X_val, y_val)*100):.3f}%')
    else:
          print(f'Val accuracy: '\
                f'{(rf.score(X_val_scaled, y_val)*100):.3f}%')
#     print(f'Test accuracy: '\
#                 f'{(rf.score(X_test, y_test) * 100):.3f}%')

In [32]:
rf = DecisionTreeClassifier(criterion='entropy', max_depth=5,\
                             random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_train)
evaluate_results(y_pred)

precision      0.925676
recall         0.646226
f1-score       0.761111
support      212.000000
Name: 1, dtype: float64
True negative: 60.8%, false negative: 1.9%, true positive: 24.1%,  false positive: 13.2%

Train accuracy is 84.886%
Val accuracy: 59.441%


# Random Forest

In [33]:
rf = RandomForestClassifier(random_state=42, min_samples_leaf = 1,
                            max_depth = 20).fit(X_train, y_train)
y_pred = rf.predict(X_train)
evaluate_results(y_pred)

precision      0.985149
recall         0.938679
f1-score       0.961353
support      212.000000
Name: 1, dtype: float64
True negative: 62.2%, false negative: 0.5%, true positive: 35.0%,  false positive: 2.3%

Train accuracy is 97.188%
Val accuracy: 59.441%


In [34]:
rf = RandomForestClassifier(random_state=42, min_samples_leaf = 5,
                            max_depth = 3).fit(X_train, y_train)
y_pred = rf.predict(X_train)
evaluate_results(y_pred)

precision      0.859873
recall         0.636792
f1-score       0.731707
support      212.000000
Name: 1, dtype: float64
True negative: 58.9%, false negative: 3.9%, true positive: 23.7%,  false positive: 13.5%

Train accuracy is 82.601%
Val accuracy: 58.741%


# K Nearest Neighbors

In [35]:
rf = KNeighborsClassifier().fit(X_train, y_train)
y_pred = rf.predict(X_train)
evaluate_results(y_pred)

precision      0.741935
recall         0.650943
f1-score       0.693467
support      212.000000
Name: 1, dtype: float64
True negative: 54.3%, false negative: 8.4%, true positive: 24.3%,  false positive: 13.0%

Train accuracy is 78.559%
Val accuracy: 60.839%


In [36]:
rf = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)
y_pred = rf.predict(X_train)
evaluate_results(y_pred)

precision      0.751724
recall         0.514151
f1-score       0.610644
support      212.000000
Name: 1, dtype: float64
True negative: 56.4%, false negative: 6.3%, true positive: 19.2%,  false positive: 18.1%

Train accuracy is 75.571%
Val accuracy: 60.839%


In [37]:
rf = KNeighborsClassifier(n_neighbors=20).fit(X_train, y_train)
y_pred = rf.predict(X_train)
evaluate_results(y_pred)

precision      0.769912
recall         0.410377
f1-score       0.535385
support      212.000000
Name: 1, dtype: float64
True negative: 58.2%, false negative: 4.6%, true positive: 15.3%,  false positive: 22.0%

Train accuracy is 73.462%
Val accuracy: 60.140%


In [38]:
X_train.head()

Unnamed: 0,pclass,sex_encoded,age,fare,C,Q,S
517,3,1,29.7,24.15,0.0,1.0,0.0
792,3,0,29.7,69.55,0.0,0.0,1.0
472,2,0,33.0,27.75,0.0,0.0,1.0
483,3,0,63.0,9.5875,0.0,0.0,1.0
9,2,0,14.0,30.0708,1.0,0.0,0.0


In [39]:
scaler = MinMaxScaler()
X_train_scaled = X_train.copy()
X_train_scaled[['age','fare']] = scaler.fit_transform\
                              (X_train[['age','fare']])
X_val_scaled = X_val.copy()
X_val_scaled[['age','fare']] = scaler.transform\
                            (X_val[['age','fare']])
X_test_scaled = X_test.copy()
X_test_scaled[['age','fare']] = scaler.transform\
                            (X_test[['age','fare']])

In [40]:
rf = KNeighborsClassifier().fit(X_train_scaled, y_train)
y_pred = rf.predict(X_train_scaled)
evaluate_results(y_pred, scaled=True)

precision      0.818653
recall         0.745283
f1-score       0.780247
support      212.000000
Name: 1, dtype: float64
True negative: 56.6%, false negative: 6.2%, true positive: 27.8%,  false positive: 9.5%

Train accuracy is 84.359%
Val accuracy: 62.238%


In [41]:
rf = KNeighborsClassifier(n_neighbors=10)\
        .fit(X_train_scaled, y_train)
y_pred = rf.predict(X_train_scaled)
evaluate_results(y_pred, scaled=True)

precision      0.892086
recall         0.584906
f1-score       0.706553
support      212.000000
Name: 1, dtype: float64
True negative: 60.1%, false negative: 2.6%, true positive: 21.8%,  false positive: 15.5%

Train accuracy is 81.898%
Val accuracy: 58.741%


In [42]:
rf = KNeighborsClassifier(n_neighbors=20)\
        .fit(X_train_scaled, y_train)
y_pred = rf.predict(X_train_scaled)
evaluate_results(y_pred, scaled=True)

precision      0.919355
recall         0.537736
f1-score       0.678571
support      212.000000
Name: 1, dtype: float64
True negative: 61.0%, false negative: 1.8%, true positive: 20.0%,  false positive: 17.2%

Train accuracy is 81.019%
Val accuracy: 61.538%


More neighbors makes the model worse, likely because the data is close together, so more points only gets further and further data

# Testing

In [43]:
model = LogisticRegression(random_state=42).fit(X_train, y_train)
model.score(X_train, y_train), model.score(X_val, y_val)

(0.789103690685413, 0.5944055944055944)

In [44]:
model = LogisticRegression(C = 10, random_state=42)\
            .fit(X_train, y_train)
model.score(X_train, y_train), model.score(X_val, y_val)

(0.7820738137082601, 0.5944055944055944)

In [45]:
rf = DecisionTreeClassifier(max_depth=15, random_state=42)\
      .fit(X_train, y_train)
rf.score(X_train, y_train), rf.score(X_val, y_val)

(0.9630931458699473, 0.5944055944055944)

In [46]:
rf = RandomForestClassifier(random_state=42, min_samples_leaf = 1,
                            max_depth = 15).fit(X_train, y_train)
rf.score(X_train, y_train), rf.score(X_val, y_val)

(0.9701230228471002, 0.5944055944055944)

In [47]:
rf = KNeighborsClassifier().fit(X_train_scaled, y_train)
rf.score(X_train_scaled, y_train), rf.score(X_val_scaled, y_val)

(0.843585237258348, 0.6223776223776224)

### Of all of these models, I got the best results with Random Forest.

In [52]:
features = ['pclass','sex_encoded','age','fare']
X_train = train[features]
y_train = train.survived
X_val = validate[features]
y_val = validate.survived
X_test = test[features]
y_test = test.survived

In [54]:
rf = RandomForestClassifier(random_state=42, min_samples_leaf = 1,
                            max_depth = 15).fit(X_train, y_train)
rf.score(X_train, y_train), rf.score(X_val, y_val)

(0.9701230228471002, 0.8391608391608392)

In [None]:
df = data('iris')
train, test = split_scale.split_my_data(df, .8)
train, validate = split_scale.split_my_data(train, .8)
train.head()