In [2]:
%load_ext autoreload
%autoreload 2

In [1]:
import pandas as pd
from feature_extraction import Featurizer
pd.set_option('display.max_columns', None)

In [3]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

drop_cols = ["Name", "Age", "Ticket", "Cabin", "Embarked", "PassengerId"]
featurizer = Featurizer(drop_cols)
train_df, val_df = featurizer.get_train_features(train_df)

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [5]:
model = LogisticRegression()
model.fit(train_df.drop("Survived", axis=1), train_df["Survived"])
y_pred = model.predict(train_df.drop("Survived", axis=1))



In [6]:
model.predict_proba(train_df.drop("Survived", axis=1))



array([[0.89191599, 0.10808401],
       [0.62885907, 0.37114093],
       [0.61664604, 0.38335396],
       ...,
       [0.92476544, 0.07523456],
       [0.11030946, 0.88969054],
       [0.58867042, 0.41132958]])

In [7]:
import numpy as np
prob = model.predict_proba(train_df.drop("Survived", axis=1))
(prob[:,1] > 0.5).astype(np.int32)



array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1,

In [8]:
accuracy_score(train_df["Survived"].values, y_pred)

0.7952559300873908

In [9]:
(train_df["Survived"].values == y_pred).mean()

0.7952559300873908

In [10]:
y_pred = model.predict(val_df.drop("Survived", axis=1))
accuracy_score(val_df["Survived"].values.squeeze(), y_pred.squeeze())



0.8222222222222222

In [11]:
from typing import Callable, Dict, List, Tuple

class Model():
    def __init__(
        self, 
        y_col: str,
        model: Callable, 
        metrics:Dict[str, Tuple[str, Callable]]=accuracy_score,
                ) -> None:
        self.model = model
        self.metrics = metrics
        self.y_col = y_col
        
    def print_metric(self, 
                     metric:Callable, 
                     metric_name: str,
                     metric_y_type: str,
                     df:pd.DataFrame, 
                     data_type:str) -> None:
        y_class = self.model.predict(df.drop(self.y_col, axis=1))
        y_prob = self.model.predict_proba(df.drop(self.y_col, axis=1))
        if metric_y_type == "prob":
            y_pred= y_prob
        else:
            y_pred = y_class
        metric_val = metric(df[self.y_col].values.squeeze(), y_pred.squeeze())
        print(f"{data_type} {metric_name}: {metric_val:.4f}")
        
    def fit(self, train_df: pd.DataFrame, val_df: pd.DataFrame) -> None:
        self.model.fit(train_df.drop(self.y_col, axis=1), train_df[self.y_col])
        
        for name, (metric_y_type, metric) in self.metrics.items():
            self.print_metric(metric, name, metric_y_type, train_df, "Training")
            self.print_metric(metric, name, metric_y_type, val_df, "Validation")

In [12]:
metric_list = {"Accuracy": ("class", accuracy_score)}

In [13]:
logistic = LogisticRegression()
model = Model("Survived", logistic, metric_list)
model.fit(train_df, val_df)

Training Accuracy: 0.7953
Validation Accuracy: 0.8222




In [14]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(n_estimators=50, max_depth=3)
model = Model("Survived", random_forest, metric_list)
model.fit(train_df, val_df)

Training Accuracy: 0.7915
Validation Accuracy: 0.7667




In [15]:
from sklearn.gaussian_process import GaussianProcessClassifier

gp = GaussianProcessClassifier()
model = Model("Survived", gp, metric_list)
model.fit(train_df, val_df)



Training Accuracy: 0.8764




Validation Accuracy: 0.7778


In [16]:
from sklearn import metrics

def auc(y, pred):
    fpr, tpr, _ = metrics.roc_curve(y, pred[:, 1])
    return metrics.auc(fpr, tpr)

In [17]:
metric_list["AUC"] = ("prob", auc)

In [18]:
for model_ in [logistic, random_forest, gp]:
    model = Model("Survived", model_, metric_list)
    model.fit(train_df, val_df)
    print("="*30)



Training Accuracy: 0.7953
Validation Accuracy: 0.8222
Training AUC: 0.8306
Validation AUC: 0.8840
Training Accuracy: 0.7903
Validation Accuracy: 0.7778
Training AUC: 0.8652
Validation AUC: 0.8840




Training Accuracy: 0.8764




Validation Accuracy: 0.7778




Training AUC: 0.8938




Validation AUC: 0.8228


In [19]:
train_df["Survived"].mean()

0.38202247191011235