In [2]:
# Load IPython's autoreload extension.
%load_ext autoreload
# Mode 2 reloads imported modules before each cell runs, so edits to local
# modules (e.g. feature_extraction) are picked up without restarting the kernel.
%autoreload 2

In [3]:
import numpy as np
import pandas as pd
from feature_extraction import Featurizer
# Never truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Turn off pandas' chained-assignment warning for the whole session.
# NOTE(review): this is a global switch and hides a warning that often points
# at real bugs -- confirm it is still required by the code in this notebook.
pd.options.mode.chained_assignment = None  # default='warn'

In [4]:
# Read the raw CSVs from the current working directory.
train_df = pd.read_csv("./train.csv")
# NOTE(review): test_df is not referenced by any cell visible in this section.
test_df = pd.read_csv("./test.csv")

# Column list handed to Featurizer; this first pass includes "Age".
# Presumably these are the columns Featurizer removes -- confirm in feature_extraction.
drop_cols = ["Name", "Ticket", "Cabin", "Embarked", "PassengerId", "Age"]
featurizer = Featurizer(drop_cols)
# get_train_features hands back two frames (bound here as train/validation).
# They are stacked back into a single frame, overwriting the raw train_df,
# because the cells that follow build their own cross-validation folds.
# The concat keeps each frame's original index labels (no ignore_index).
train_df, val_df = featurizer.get_train_features(train_df)
train_df = pd.concat([train_df, val_df])

In [5]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression


# Baseline: logistic regression with default settings, evaluated with
# 5-fold cross-validation and the estimator's default scorer.
model = LogisticRegression()
cross_val_score(
    estimator=model,
    X=train_df.drop(columns="Survived"),
    y=train_df["Survived"],
    cv=5,
)

array([0.7877095 , 0.78651685, 0.79775281, 0.75842697, 0.84269663])

Valid values for the `scoring` argument are listed in the [scikit-learn model evaluation guide](https://scikit-learn.org/stable/modules/model_evaluation.html). The default for a classifier, accuracy, is a reasonable starting point, but it is usually not the only metric you want to look at:

In [5]:
# Same 5-fold evaluation with the metric named explicitly.
cross_val_score(
    estimator=model,
    X=train_df.drop(columns="Survived"),
    y=train_df["Survived"],
    scoring="accuracy",
    cv=5,
)

array([0.7877095 , 0.78651685, 0.79775281, 0.75842697, 0.84269663])

In [6]:
# Same 5-fold evaluation, scored by area under the ROC curve instead of accuracy.
cross_val_score(
    estimator=model,
    X=train_df.drop(columns="Survived"),
    y=train_df["Survived"],
    scoring="roc_auc",
    cv=5,
)

array([0.85289855, 0.83273501, 0.81336898, 0.77372995, 0.8967246 ])

In [7]:
from sklearn.metrics import roc_auc_score

def model_score(model, train_df, val_df, y_col, metric):
    """Fit ``model`` on the training split and score it on both splits.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``. It is fitted
        in place on ``train_df``.
    train_df, val_df : pandas.DataFrame
        Training and validation frames. Both must contain ``y_col``; every
        other column is passed to the model as a feature.
    y_col : str
        Name of the target column.
    metric : callable
        Called as ``metric(y_true, y_score)`` and expected to return a number.
        ``y_score`` is column 1 of ``predict_proba`` (for a binary classifier
        this is presumably the positive-class probability).

    Returns
    -------
    tuple
        ``(train_metric, val_metric)``, so callers can collect per-fold
        scores instead of only reading the printed line.

    Side effects
    ------------
    Prints the mean of the training target together with both metric values.
    """
    # Split features and target once instead of re-dropping the target column
    # for every fit/predict call.
    X_train, y_train = train_df.drop(columns=y_col), train_df[y_col]
    X_val, y_val = val_df.drop(columns=y_col), val_df[y_col]

    model.fit(X_train, y_train)

    # Mean of the training target (the positive rate for a 0/1 label).
    dist = y_train.mean()

    # ravel() keeps both arrays 1-D even when a split has a single row;
    # squeeze() would collapse that case to a 0-d array.
    train_scores = model.predict_proba(X_train)[:, 1].ravel()
    train_metric = metric(y_train.to_numpy().ravel(), train_scores)

    val_scores = model.predict_proba(X_val)[:, 1].ravel()
    val_metric = metric(y_val.to_numpy().ravel(), val_scores)

    # Label the line with the actual target column rather than a fixed name.
    print(f"{y_col}: {dist:.4f} Train metric: {train_metric:.4f} Validation metric: {val_metric:.4f}")
    return train_metric, val_metric

In [8]:
from sklearn.model_selection import StratifiedKFold

# Shuffled, stratified 5-fold split; the fixed seed makes the folds repeatable.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# `scores` is initialised here but nothing in this cell appends to it.
scores, val_idxs = [], []

for tr_idx, val_idx in skf.split(train_df.drop(columns="Survived"), train_df["Survived"]):
    # A fresh estimator per fold, so no fitted state carries over between folds.
    model = LogisticRegression()
    model_score(
        model=model,
        train_df=train_df.iloc[tr_idx],
        val_df=train_df.iloc[val_idx],
        y_col="Survived",
        metric=roc_auc_score,
    )
    # Keep the positional indices held out in this fold for the coverage check.
    val_idxs += list(val_idx)

Survived: 0.3834 Train metric: 0.8490 Validation metric: 0.7866
Survived: 0.3829 Train metric: 0.8417 Validation metric: 0.8131
Survived: 0.3843 Train metric: 0.8334 Validation metric: 0.8451
Survived: 0.3843 Train metric: 0.8362 Validation metric: 0.8318
Survived: 0.3843 Train metric: 0.8241 Validation metric: 0.8791


In [13]:
# Number of distinct validation indices gathered across the folds; if every
# row was held out exactly once, this equals len(train_df).
np.unique(val_idxs).size

891

In [9]:
# Start again from the raw CSVs, replacing the frames built earlier.
train_df = pd.read_csv("./train.csv")
# NOTE(review): test_df is not referenced by any cell visible in this section.
test_df = pd.read_csv("./test.csv")

# Same column list as the first pass except that "Age" is no longer in it;
# the next cell reads an "Age" column and imputes it per fold.
drop_cols = ["Name", "Ticket", "Cabin", "Embarked", "PassengerId"]
featurizer = Featurizer(drop_cols)
# get_train_features hands back two frames; they are stacked back into one
# frame because the cross-validation loop below builds its own folds.
train_df, val_df = featurizer.get_train_features(train_df)
train_df = pd.concat([train_df, val_df])

In [10]:
from sklearn.model_selection import StratifiedKFold

# Shuffled, stratified 5-fold split; the fixed seed makes the folds repeatable.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# `scores` is initialised here but nothing in this cell appends to it.
scores = []

for tr_idx, val_idx in skf.split(train_df.drop(columns="Survived"), train_df["Survived"]):
    model = LogisticRegression(max_iter=200)

    # The Age fill value comes from the training fold only and is reused for the
    # validation fold, so no validation rows influence the imputation.
    age_fill = {"Age": train_df.iloc[tr_idx]["Age"].median()}
    tr_df = train_df.iloc[tr_idx].fillna(age_fill)
    val_df = train_df.iloc[val_idx].fillna(age_fill)

    model_score(
        model=model,
        train_df=tr_df,
        val_df=val_df,
        y_col="Survived",
        metric=roc_auc_score,
    )

Survived: 0.3834 Train metric: 0.8586 Validation metric: 0.8277
Survived: 0.3829 Train metric: 0.8657 Validation metric: 0.8005
Survived: 0.3843 Train metric: 0.8545 Validation metric: 0.8517
Survived: 0.3843 Train metric: 0.8528 Validation metric: 0.8635
Survived: 0.3843 Train metric: 0.8456 Validation metric: 0.8893
