In [84]:
# install fastkaggle if not available
try: import fastkaggle
except ModuleNotFoundError:
    !pip install -Uq fastkaggle

from fastkaggle import *
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from numpy import random
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.model_selection import *
import graphviz
import re
import numpy as np


In [85]:
#@title Helper functions
def proc_data(df, dropPassenger = True):
    """Clean and feature-engineer a Spaceship Titanic frame **in place**.

    Relies on the module-level lists ``spends`` (spending columns) and ``cats``
    (categorical columns), which must be defined before this is called.

    Columns added: FirstName, LastName, TravellingAlone, FamilySize, GroupId,
    IndividualId, Deck, CabinNum, CabinSide.
    Columns removed: Cabin, Name and (when ``dropPassenger`` is true) PassengerId.

    Args:
        df: raw competition DataFrame (train or test); it is mutated.
        dropPassenger: drop ``PassengerId`` after deriving the id features.
            Pass False for the frame that is later used to write a submission.

    Returns:
        None -- the frame is modified in place.

    Note:
        Not idempotent: ``Name`` and ``Cabin`` are dropped at the end, so calling
        this twice on the same frame fails with a KeyError.
    """
    # --- Name-derived features -------------------------------------------------
    df["Name"] = df["Name"].fillna("unknownFirstName unknownLastName").astype("string")
    split_names = df["Name"].str.split(" ", expand=True)
    df["FirstName"] = split_names[0].astype("string")
    df["LastName"] = split_names[1].astype("string")

    # How many rows share each surname (computed once, used for both features).
    surname_counts = df["LastName"].value_counts()
    df["TravellingAlone"] = df["LastName"].map((surname_counts == 1).to_dict()).astype("int8")
    # int16 rather than int8: every missing name receives the same placeholder
    # surname, so that one "family" can hold more than 127 rows, which int8
    # cannot represent.
    df["FamilySize"] = df["LastName"].map(surname_counts.to_dict()).astype("int16")

    # --- PassengerId "<group>_<individual>" ------------------------------------
    split_ids = df["PassengerId"].str.split("_", expand=True)
    df["GroupId"] = split_ids[0].astype("int32")
    df["IndividualId"] = split_ids[1].astype("int32")

    # --- Cabin "<deck>/<num>/<side>"; missing cabins get a sentinel ------------
    df["Cabin"] = df["Cabin"].fillna("unknown/9999/unknown")
    split_cabins = df["Cabin"].str.split("/", expand=True)
    df["Deck"] = split_cabins[0].astype("category")
    df["CabinNum"] = split_cabins[1].astype("int32")
    df["CabinSide"] = split_cabins[2].astype("category")

    df["HomePlanet"] = df["HomePlanet"].fillna("unknown").astype("category")
    df["Destination"] = df["Destination"].fillna("unknown").astype("category")
    # Median is taken from the frame being processed, not from a shared reference.
    df["Age"] = df["Age"].fillna(df["Age"].median()).astype("int32")

    # Missing spend is treated as "spent nothing".
    for col in spends:
        df[col] = df[col].fillna(0).astype("int32")

    # --- Boolean flags: KNN-impute, then snap back to 0/1 ----------------------
    flag_cols = ['VIP', 'CryoSleep']
    imputer = KNNImputer(n_neighbors=5, weights='uniform')
    df[flag_cols] = imputer.fit_transform(df[flag_cols])
    for col in flag_cols:
        # Imputed values are neighbour averages (e.g. 0.8). Round to the nearest
        # flag value; a bare integer cast would truncate every fraction to 0.
        df[col] = df[col].astype("float64").round().astype("int8")

    if "Transported" in df:
        df["Transported"] = df["Transported"].astype("int8")

    # NOTE(review): codes come from the categories present in *this* frame, so the
    # value -> code mapping of train and test only agrees if both frames contain
    # the same set of values in each categorical column -- verify.
    df[cats] = df[cats].apply(lambda x: x.cat.codes)

    if dropPassenger:
        df.drop(columns=["PassengerId"], inplace=True)
    df.drop(columns=["Cabin", "Name"], inplace=True)


def xs_y(df):
    """Split a processed frame into model inputs and, when present, the target.

    The feature columns are the module-level lists ``cats + conts + bools``.

    Returns:
        ``(xs, y)`` where ``xs`` is a copy of the feature columns and ``y`` is
        ``df[dep]`` if the dependent column exists in ``df``, otherwise ``None``.
    """
    feature_cols = cats + conts + bools
    xs = df[feature_cols].copy()
    if dep in df:
        return xs, df[dep]
    return xs, None

def draw_tree(t, df, size=10, ratio=0.6, precision=2, **kwargs):
    """Render a fitted tree as a ``graphviz.Source`` object.

    Args:
        t: the tree passed to ``export_graphviz``.
        df: frame whose column names are used as the feature names.
        size, ratio: values injected into the graph header as ``size=``/``ratio=``.
        precision: forwarded to ``export_graphviz``.
        **kwargs: any further ``export_graphviz`` options.
    """
    dot_src = export_graphviz(
        t,
        out_file=None,
        feature_names=df.columns,
        filled=True,
        rounded=True,
        special_characters=True,
        rotate=False,
        precision=precision,
        **kwargs,
    )
    # Patch the graph header so the rendered size and aspect ratio are set.
    header = f'Tree {{ size={size}; ratio={ratio}'
    return graphviz.Source(re.sub('Tree {', header, dot_src))

def subm(preds, suff):
    """Write predictions to ``sub-<suff>.csv`` as a submission file.

    Side effect: stores the predictions (cast to bool) on the global ``tst_df``
    as its ``Transported`` column.

    Args:
        preds: predictions for the rows of ``tst_df``; must support ``.astype``.
        suff: suffix used in the output file name.
    """
    tst_df["Transported"] = preds.astype("bool")
    out_name = f"sub-{suff}.csv"
    tst_df[["PassengerId", "Transported"]].to_csv(out_name, index = False)
    
def get_tree(prop=0.80):
    """Fit a decision tree on a random sample of the global training data.

    Args:
        prop: sample size as a fraction of the number of training rows.

    Returns:
        A fitted ``DecisionTreeClassifier`` (``min_samples_leaf=50``).
    """
    total = len(trn_y)
    sample_size = int(total*prop)
    # Row positions drawn with numpy's random.choice (default settings).
    picked = random.choice(total, sample_size)
    tree = DecisionTreeClassifier(min_samples_leaf=50)
    return tree.fit(trn_xs.iloc[picked], trn_y.iloc[picked])


In [86]:
# Competition slug; reused at the end of the notebook when submitting.
comp = "spaceship-titanic"
# fastkaggle helper. Its result is used as a directory path below (path/"train.csv").
# NOTE(review): the `install` string names fastai and timm, which none of the visible
# cells import -- confirm whether this install is still needed.
path = setup_comp(comp, install='fastai "timm>=0.6.2.dev0"')


In [87]:
# Column groups used by proc_data() and xs_y(). Several of these columns
# (GroupId, IndividualId, CabinNum, FamilySize, Deck, CabinSide, TravellingAlone)
# only exist after proc_data() has run.
dep = "Transported"
spends = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
conts = [
    "Age",
    "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck",
    "GroupId", "IndividualId", "CabinNum", "FamilySize",
]
cats = ["HomePlanet", "Destination", "Deck", "CabinSide"]
bools = ["VIP", "CryoSleep", "TravellingAlone"]

# Raw competition data.
df = pd.read_csv(path/"train.csv")
tst_df = pd.read_csv(path/"test.csv")
columns = df.columns

print(f"All columns: {columns}")
print(f"Continuous columns: {conts}")
print(f"Spending columns: {spends}")
print(f"Categorical columns: {cats}")
print(f"Dependent columns: {dep}")



All columns: Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')
Continuous columns: ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'GroupId', 'IndividualId', 'CabinNum']
Spending columns: ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
Categorical columns: ['HomePlanet', 'Destination', 'Deck', 'CabinSide']
Dependent columns: Transported


In [88]:
# Feature-engineer both frames in place (proc_data returns nothing).
# Re-running this cell without reloading the CSVs fails, because proc_data
# drops the Name and Cabin columns it reads.
proc_data(df)
# PassengerId is kept on the test frame: subm() writes it to the submission file.
proc_data(tst_df, dropPassenger=False)
# Training features and target.
trn_xs, trn_y = xs_y(df)


In [90]:
import scipy
from scipy.stats import uniform, randint

# Fixed seed so the sampled candidates -- and therefore the reported best
# parameters and score -- are the same on every Restart & Run All.
SEARCH_SEED = 42

# Search space. scipy's uniform(loc, scale) covers [loc, loc + scale], so
# learning_rate is drawn from [0.01, 0.21]; randint(low, high) excludes `high`.
param_dist = {
    'n_estimators': randint(100, 400),
    'max_depth': randint(3, 8),
    'learning_rate': uniform(0.01, 0.2),
    'min_samples_split': randint(30, 51),
}

random_search = RandomizedSearchCV(
    GradientBoostingClassifier(random_state=SEARCH_SEED),
    param_dist,
    n_iter=20,
    cv=5,
    random_state=SEARCH_SEED,
    n_jobs=-1,  # 20 candidates x 5 folds = 100 fits; spread them over all cores
)
random_search.fit(trn_xs, trn_y)

print(f"Best parameters: {random_search.best_params_}")
print(f"Best score: {random_search.best_score_}")

KeyboardInterrupt: 

In [None]:
# Repeats the fit already issued in the search cell above, whose captured
# output is a KeyboardInterrupt. This cell has no execution count (In [None]).
random_search.fit(trn_xs, trn_y)

In [79]:
# Final model with hand-entered hyper-parameters -- presumably copied from an
# earlier search run (TODO confirm). min_samples_leaf is set here although it is
# not one of the keys of param_dist.
m = GradientBoostingClassifier(learning_rate=0.10017887035360057, max_depth = 5, min_samples_leaf=20, min_samples_split=49, n_estimators=117)
m.fit(trn_xs, trn_y)

In [82]:
# Test-set features; the second return value of xs_y is discarded.
tst_xs, _ = xs_y(tst_df)
# Predict on the test set and write sub-RandomCVGradient.csv.
subm(m.predict(tst_xs), "RandomCVGradient")

In [83]:
# Upload the submission through the Kaggle API only when `iskaggle` is falsy.
# NOTE(review): `iskaggle` is not defined in the visible cells; presumably it comes
# from the fastkaggle star import and flags "running on Kaggle" -- verify.
if not iskaggle:
    from kaggle import api
    # Arguments as passed: submission file, submission message, competition slug.
    api.competition_submit_cli('sub-RandomCVGradient.csv', 'RandomCVGradient model', comp)

100%|██████████| 56.4k/56.4k [00:00<00:00, 94.2kB/s]
