In [62]:
# install fastkaggle if not available
try: import fastkaggle
except ModuleNotFoundError:
    !pip install -Uq fastkaggle

from fastkaggle import *
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from numpy import random
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.model_selection import *
import graphviz
import re
import numpy as np


In [63]:
#@title Helper functions
def proc_data(df, dropPassenger = True):
    
    split_ids = df["PassengerId"].str.split("_", expand=True)
    df["GroupId"] = split_ids[0].astype("int32")
    df["IndividualId"] = split_ids[1].astype("int32")
    
    df["Cabin"] = df["Cabin"].fillna("unknown/9999/unknown")
    split_cabins = df["Cabin"].str.split("/", expand=True)
    df["Deck"] = split_cabins[0].astype("category")
    df["CabinNum"] = split_cabins[1].astype("int32")
    df["CabinSide"] = split_cabins[2].astype("category")
    
    df["HomePlanet"] = df["HomePlanet"].fillna("unknown").astype("category")
    df["Destination"] = df["Destination"].fillna("unknown").astype("category")
    df["Name"] = df["Name"].fillna("unknown").astype("string")
    df["Age"] = df["Age"].fillna(df["Age"].median()).astype("int32")

    for col in spends:
        df[col] = df[col].fillna(0).astype("int32")
        
    imputer = KNNImputer(n_neighbors=5, weights='uniform')
    df[['VIP', 'CryoSleep']] = imputer.fit_transform(df[['VIP', 'CryoSleep']])
    df['VIP'] = df['VIP'].astype("int8")
    df['CryoSleep'] = df['CryoSleep'].astype("int8")
    if "Transported" in df:
        df["Transported"] = df["Transported"].astype("int8")

    df[cats] = df[cats].apply(lambda x: x.cat.codes)
    if dropPassenger:
        df.drop(columns=["PassengerId"], inplace=True)
    df.drop(columns=["Cabin"], inplace=True)

def xs_y(df):
    xs = df[cats+conts+bools].copy()
    return xs, df[dep] if dep in df else None

def draw_tree(t, df, size=10, ratio=0.6, precision=2, **kwargs):
    s=export_graphviz(t, out_file=None, feature_names=df.columns, filled=True, rounded=True,
                      special_characters=True, rotate=False, precision=precision, **kwargs)
    return graphviz.Source(re.sub('Tree {', f'Tree {{ size={size}; ratio={ratio}', s))

def subm(preds, suff):
    tst_df["Transported"] = preds.astype("bool")
    sub_df = tst_df[["PassengerId", "Transported"]]
    sub_df.to_csv(f"sub-{suff}.csv", index = False)
    
def get_tree(prop=0.80):
    n = len(trn_y)
    idxs = random.choice(n, int(n*prop))
    return DecisionTreeClassifier(min_samples_leaf=50).fit(trn_xs.iloc[idxs], trn_y.iloc[idxs])


In [64]:
comp = "spaceship-titanic"
path = setup_comp(comp, install='fastai "timm>=0.6.2.dev0"')


In [65]:
df = pd.read_csv(path/"train.csv")
tst_df = pd.read_csv(path/"test.csv")
columns = df.columns
conts = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "GroupId", "IndividualId", "CabinNum"]
spends = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
cats = ["HomePlanet", "Destination", "Deck", "CabinSide"]
bools = ["VIP", "CryoSleep"]
dep = "Transported"
print(f"All columns: {columns}")
print(f"Continuous columns: {conts}")
print(f"Spending columns: {spends}")
print(f"Categorical columns: {cats}")
print(f"Dependent columns: {dep}")



All columns: Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')
Continuous columns: ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'GroupId', 'IndividualId', 'CabinNum']
Spending columns: ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
Categorical columns: ['HomePlanet', 'Destination', 'Deck', 'CabinSide']
Dependent columns: Transported


In [66]:
proc_data(df)
proc_data(tst_df, dropPassenger=False)
trn_xs, trn_y = xs_y(df)


In [67]:
trn_xs

Unnamed: 0,HomePlanet,Destination,Deck,CabinSide,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,GroupId,IndividualId,CabinNum,VIP,CryoSleep
0,1,2,1,0,39,0,0,0,0,0,1,1,0,0,0
1,0,2,5,1,24,109,9,25,549,44,2,1,0,0,0
2,1,2,0,1,58,43,3576,0,6715,49,3,1,0,1,0
3,1,2,0,1,33,0,1283,371,3329,193,3,2,0,0,0
4,0,2,5,1,16,303,70,151,565,2,4,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,1,0,0,0,41,0,6819,0,1643,74,9276,1,98,1,0
8689,0,1,6,1,18,0,0,0,0,0,9278,1,1499,0,1
8690,0,2,6,1,26,0,0,1872,1,0,9279,1,1500,0,0
8691,1,0,4,1,32,0,1049,0,353,3235,9280,1,608,0,0


In [76]:
GB = GradientBoostingClassifier()
GB.fit(trn_xs, trn_y)
scores = cross_validate(GB, trn_xs, trn_y, cv=10)
scores


{'fit_time': array([1.15291309, 1.11447906, 1.12478232, 1.07192945, 1.16066861,
        1.0883677 , 1.11098623, 2.022856  , 1.68014956, 1.36443496]),
 'score_time': array([0.00307727, 0.00269294, 0.00246692, 0.00273609, 0.00319028,
        0.00278568, 0.0031805 , 0.00617528, 0.00445175, 0.00275779]),
 'test_score': array([0.79655172, 0.75747126, 0.74252874, 0.69620253, 0.75143843,
        0.80552359, 0.81703107, 0.81472957, 0.80552359, 0.70080552])}

In [77]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.2, 0.3],
    'min_samples_split': [30, 40, 50],
    'min_samples_leaf': [20, 25, 30]
}

grid_search = GridSearchCV(GradientBoostingClassifier(), param_grid, cv=5)
grid_search.fit(trn_xs, trn_y)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")

KeyboardInterrupt: 

In [78]:
import scipy
from scipy.stats import uniform, randint

param_dist = {
    'n_estimators': randint(100, 400),
    'max_depth': randint(3, 8),
    'learning_rate': uniform(0.1, 0.3),
    'min_samples_split': randint(30, 51),
    'min_samples_leaf': randint(20, 31)
}

random_search = RandomizedSearchCV(GradientBoostingClassifier(), param_dist, n_iter=50, cv=5)
random_search.fit(trn_xs, trn_y)

print(f"Best parameters: {random_search.best_params_}")
print(f"Best score: {random_search.best_score_}")

Best parameters: {'learning_rate': 0.10017887035360057, 'max_depth': 5, 'min_samples_leaf': 20, 'min_samples_split': 49, 'n_estimators': 117}
Best score: 0.6713545805923937


In [36]:
tst_xs, _ = xs_y(tst_df)
subm(GB.predict(tst_xs), "histgradientboosting")

In [37]:
if not iskaggle:
    from kaggle import api
    api.competition_submit_cli('sub-histgradientboosting.csv', 'initial histgradientboosting model', comp)

100%|██████████| 56.3k/56.3k [00:00<00:00, 91.4kB/s]
