In [None]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, FunctionTransformer
from src.gen import train_test_from_null, get_xy_from_dataframe
from src.kaggle_api import get_dataset

# Switch for the model-fitting cells further down:
#   True  -> run a GridSearchCV over `pipe_hyperparams` with 10-fold CV (slow)
#   False -> fit each pipeline once with the classifier's preset parameters (fast)
intensive = True

In [None]:
# get_dataset is a project helper; its result is used as a path-like object
# supporting the `/` operator (presumably a pathlib.Path to the dataset folder).
data_path = get_dataset("titanic")
raw_train_data = pd.read_csv(data_path / "train.csv")
raw_test_data = pd.read_csv(data_path / "test.csv")

# Stack train and test rows into one frame with a fresh 0..n-1 index so that
# feature engineering can be applied to both at once.
raw_comb_data = pd.concat([raw_train_data, raw_test_data], ignore_index=True)

# DataFrame.info() prints its summary itself and returns None; wrapping it in
# print() only adds a stray "None" line, so it is called directly.
raw_train_data.info()
raw_test_data.info()

In [None]:
def get_title_old(df):
    """Derive one integer-coded ``Title`` feature from passenger names.

    Earlier variant of the title feature; the pipelines in this notebook use
    ``get_title`` instead.

    The honorific is extracted from ``Name`` and collapsed into four groups:

    * males whose title is not Mr/Master become "Mr", or "Master" if under 18;
    * females whose title is not Miss/Mrs become "Miss", or "Mrs" if they are
      over 18 and travel with at least one sibling/spouse or parent/child.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Name``, ``Sex``, ``Age``, ``SibSp`` and
        ``Parch``. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A single column ``Title`` holding ``pd.factorize`` codes, i.e. integers
        numbered by order of first appearance in ``df`` (not a fixed mapping).
    """
    # Work on a copy so the caller's frame does not gain a "Title" column.
    df = df.copy()
    df["Title"] = df["Name"].str.extract(r",\s?(\w*).{1}", expand=False)

    replace_male = (df["Sex"] == "male") & (~df["Title"].isin(["Mr", "Master"]))
    df.loc[replace_male, "Title"] = "Mr"
    df.loc[replace_male & (df["Age"] < 18), "Title"] = "Master"

    replace_female = (df["Sex"] == "female") & (~df["Title"].isin(["Miss", "Mrs"]))
    df.loc[replace_female, "Title"] = "Miss"

    # Compare the counts to zero before combining them. A raw bitwise
    # `SibSp | Parch` is an integer, and `True & 2` evaluates to 0, which would
    # silently drop every passenger with an even family count.
    has_family = (df["SibSp"] > 0) | (df["Parch"] > 0)
    df.loc[replace_female & (df["Age"] > 18) & has_family, "Title"] = "Mrs"

    df["Title"] = pd.factorize(df["Title"])[0]

    return df[["Title"]]

def get_title(df):
    """One-hot encode a simplified passenger title (Master / Miss / Mr / Mrs).

    The honorific is extracted from ``Name`` and then normalised:

    * males: any title other than Mr/Master becomes "Mr"; where ``Age`` is
      known it overrides the name (>= 18 -> "Mr", < 18 -> "Master");
    * females: any title other than Miss/Mrs becomes "Mrs"; where ``Age`` is
      known it overrides the name (>= 18 -> "Mrs", < 18 -> "Miss").

    Rows with a missing ``Age`` keep the name-based title, because comparisons
    against NaN are False.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Name``, ``Sex`` and ``Age``. The frame is
        not modified.

    Returns
    -------
    pandas.DataFrame
        Indicator columns ``Miss``, ``Mr``, ``Mrs`` (in that order) indexed
        like ``df``. "Master" is the dropped reference level, so it is encoded
        as all zeros. The three columns are always present, even if a title
        does not occur in ``df``.
    """
    # Work on a copy so the caller's frame does not gain a "Title" column.
    df = df.copy()
    df["Title"] = df["Name"].str.extract(r",\s?(\w*).{1}", expand=False)

    is_male = df["Sex"] == "male"
    is_female = df["Sex"] == "female"
    is_adult = df["Age"] >= 18
    is_minor = df["Age"] < 18

    # Males: fold rare titles into Mr, then let a known age decide.
    df.loc[is_male & ~df["Title"].isin(["Mr", "Master"]), "Title"] = "Mr"
    df.loc[is_male & is_adult, "Title"] = "Mr"
    df.loc[is_male & is_minor, "Title"] = "Master"

    # Females: fold rare titles into Mrs, then let a known age decide.
    df.loc[is_female & ~df["Title"].isin(["Miss", "Mrs"]), "Title"] = "Mrs"
    df.loc[is_female & is_adult, "Title"] = "Mrs"
    df.loc[is_female & is_minor, "Title"] = "Miss"

    # Fix the category set up front. With plain strings, get_dummies derives
    # the columns from whatever titles happen to be present, so a CV fold or
    # prediction batch lacking one title would yield fewer (and shifted)
    # columns than the data the model was fitted on.
    title_dtype = pd.CategoricalDtype(categories=["Master", "Miss", "Mr", "Mrs"])
    out = pd.get_dummies(df["Title"].astype(title_dtype), drop_first=True)
    # Use plain string column labels rather than a CategoricalIndex.
    out.columns = [str(col) for col in out.columns]

    return out

age_target = "age_bin"
age_bins = 5  # number of quantile bins for the pd.qcut alternative noted below

comb_data = raw_comb_data.copy()
# Bin Age into four ordinal classes with right-closed edges:
# (-1, 17], (17, 30], (30, 50], (50, inf). A missing Age yields a NaN bin.
# Alternative (equal-frequency bins):
#   pd.qcut(comb_data["Age"], q=age_bins, precision=0, labels=False)
comb_data = comb_data.assign(age_bin=pd.cut(comb_data["Age"], bins=[-1, 17, 30, 50, np.inf], precision=0, labels=False, retbins=False).values)

# NOTE(review): presumably splits rows on whether `age_target` is null
# (known bin -> train, missing bin -> test) — confirm in src.gen.
age_train_data, age_test_data = train_test_from_null(comb_data, age_target)
y_age_train = age_train_data[age_target]


def clip_with_kwargs(x, kwargs):
    """Clip ``x`` using the keyword arguments in ``kwargs`` (e.g. ``{"upper": 3}``).

    The bounds travel inside one ``kwargs`` dict so that they can be swapped as
    a unit through ``FunctionTransformer(kw_args=...)`` in a parameter search.
    """
    return x.clip(**kwargs)


def get_title_without_age(df):
    """Title dummies based on Name and Sex only, with Age blanked out.

    ``age_bin`` is derived from Age, and ``get_title`` rewrites titles from Age
    whenever Age is known. Feeding the real Age here would leak the target
    into the features on the training rows, while the rows to predict (Age
    missing) could only ever get name-based titles. Blanking Age makes the
    feature mean the same thing at fit and predict time.
    """
    return get_title(df.assign(Age=np.nan))


clip_sibsp = FunctionTransformer(clip_with_kwargs, kw_args={"kwargs": {"upper": 3}})
clip_parch = FunctionTransformer(clip_with_kwargs, kw_args={"kwargs": {"upper": 2}})

age_preprocessor = ColumnTransformer(
    transformers=[
        ("pclass", OrdinalEncoder(), ["Pclass"]),
        ("sibsp", clip_sibsp, ["SibSp"]),
        ("parch", clip_parch, ["Parch"]),
        # Age is deliberately not passed in: see get_title_without_age.
        ("title", FunctionTransformer(get_title_without_age), ["Name", "Sex", "SibSp", "Parch"]),
    ]
)

trans_train = age_preprocessor.fit_transform(age_train_data)
trans_train

This approach also lets us tune the preprocessing steps by adding their parameters to the hyperparameter search:

In [None]:
# Search space for GridSearchCV. Keys follow the "<step>__<param>" convention of
# the pipelines below ("preprocess" and "classifier" steps).
pipe_hyperparams = {
    # Preprocessing parameters can be searched as well; uncomment to tune the
    # clipping bounds of the "parch" / "sibsp" transformers.
#    "preprocess__parch__kw_args": [
#        {"kwargs": {"upper": 1}},
#        {"kwargs": {"upper": 2}},
#        {"kwargs": {"upper": 3}},
#    ],
#    "preprocess__sibsp__kw_args": [
#        {"kwargs": {"upper": 2}},
#        {"kwargs": {"upper": 3}},
#        {"kwargs": {"upper": 4}},
#    ],
    "classifier__learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1],
    # Floats are interpreted by scikit-learn as fractions of the sample count.
    "classifier__min_samples_split": np.linspace(0.1, 0.5, 4),
    "classifier__min_samples_leaf": np.linspace(0.1, 0.5, 4),
    "classifier__max_depth": [5, 8],
    "classifier__subsample": [0.6, 0.8, 0.95, 1.0],
}

age_pipe = Pipeline(steps=[
    ("preprocess", age_preprocessor),
    # random_state pins the row subsampling used when subsample < 1.0, so that
    # cross-validation scores and predictions are reproducible between runs.
    ("classifier", GradientBoostingClassifier(
        loss="log_loss",
        criterion="friedman_mse",
        n_estimators=50,
        random_state=0,
    )),
])

if intensive:
    # Exhaustive search over pipe_hyperparams with 10-fold CV on all cores.
    age_clf = GridSearchCV(age_pipe, param_grid=pipe_hyperparams, cv=10, n_jobs=-1, verbose=2)
    age_clf.fit(age_train_data, y_age_train)
    # best_score_ is the mean cross-validated score of the best parameter set.
    print(f"model score: {age_clf.best_score_:.3f}")
    y_pred = age_clf.predict(age_test_data)
    disp = age_clf
else:
    # Single fit with the classifier's preset parameters.
    age_pipe.fit(age_train_data, y_age_train)
    y_pred = age_pipe.predict(age_test_data)
    disp = age_pipe

# Show the fitted estimator as the cell output.
disp

In [None]:
# Fill the missing age bins with the model's predictions. assign() returns a new
# frame, so age_test_data (which comes from a splitting helper and may be a
# slice of comb_data) is not written to in place, and re-running this cell
# never depends on an earlier mutation.
age_test_filled = age_test_data.assign(**{age_target: y_pred})

# Re-assemble the full dataset in its original row order.
comb_data = pd.concat([age_train_data, age_test_filled]).sort_index()
comb_data.info()

In [None]:
target = "Survived"

# NOTE(review): presumably rows with a known target become the training set
# and rows with a null target the test set — confirm in src.gen.
train_data, test_data = train_test_from_null(comb_data, target)
_, y_train = get_xy_from_dataframe(train_data, target)

# Embarked: fill gaps with the most frequent port, then one-hot encode.
embarked_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(),
)

# One (name, transformer, columns) entry per feature group. The names double as
# prefixes for hyperparameter keys such as "preprocess__sibsp__kw_args".
# Possible extension: select columns by dtype instead of listing them by name.
survival_transformers = [
    ("fare", SimpleImputer(strategy="mean"), ["Fare"]),
    ("Embarked", embarked_transformer, ["Embarked"]),
    ("pclass_age", OrdinalEncoder(), ["Pclass", "age_bin"]),
    ("sibsp", clip_sibsp, ["SibSp"]),
    ("parch", clip_parch, ["Parch"]),
    ("title", FunctionTransformer(get_title), ["Name", "Sex", "Age", "SibSp", "Parch"]),
]

preprocessor = ColumnTransformer(transformers=survival_transformers)

# Fit once outside the pipeline purely to inspect the transformed matrix.
X_pro = preprocessor.fit_transform(train_data)
X_pro

In [None]:
pipe = Pipeline(steps=[
    ("preprocess", preprocessor),
    # random_state pins the row subsampling used when subsample < 1.0, so that
    # cross-validation scores and predictions are reproducible between runs.
    ("classifier", GradientBoostingClassifier(
        loss="log_loss",
        criterion="friedman_mse",
        n_estimators=50,
        random_state=0,
    )),
])

if intensive:
    # Same search space as the age model (pipe_hyperparams), 10-fold CV on all cores.
    clf = GridSearchCV(pipe, param_grid=pipe_hyperparams, cv=10, n_jobs=-1, verbose=2)
    clf.fit(train_data, y_train)
    # best_score_ is the mean cross-validated score of the best parameter set.
    print(f"model score: {clf.best_score_:.3f}")
    y_pred = clf.predict(test_data)
    disp = clf
else:
    # Single fit with the classifier's preset parameters.
    pipe.fit(train_data, y_train)
    y_pred = pipe.predict(test_data)
    disp = pipe

# Show the fitted estimator as the cell output.
disp

In [None]:
# Attach the predicted labels to a copy of the test rows; assign() copies the
# frame before setting the column, so the frame returned by the split helper
# is not modified.
test_data = test_data.assign(**{target: y_pred})
test_data

In [None]:
# Cast the predicted labels to integers for the output file
test_data = test_data.astype({target: int})

# Write only the id and the prediction, without the row index
submission_path = data_path / "pipeline_prediction.csv"
test_data.to_csv(submission_path, columns=["PassengerId", target], index=False)

This is a good start, but we would like to predict only one thing in the dataset (survival) and encapsulate all preprocessing and classification within a single pipeline. To do this we need a smarter imputation method to fill the rows where Age is null. Options to move forward are:
- Nest the current method within the pipeline (only if we can improve the current method)
- Use the experimental iterative imputer (`IterativeImputer`) provided by scikit-learn