In [None]:
#pip install kaggle --upgrade

In [None]:
# Re-import edited modules automatically before each cell is executed.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
from fastai import *
from fastai.tabular import *
from pathlib import Path

In [None]:
# Working directory for the dataset files (relative to the notebook);
# created on first run, left alone if it already exists.
path = Path('data')
path.mkdir(parents=True, exist_ok=True)

# Load data # 
The preprocessing was done in the [Titanic_TabularLearner](Titanic_TabularLearner.ipynb) notebook.

In [None]:
# Preprocessed Titanic table: train and test rows in one frame,
# told apart by the boolean 'is_test' column used in the split below.
df = pd.read_csv(path /'titanic_preproc.csv')

Split the preprocessed dataframe into training and test sets using the 'is_test' column.

In [None]:
# Training rows: everything not flagged as test; the flag column itself is dropped.
df_train = df.loc[df['is_test'].eq(False)].drop(columns='is_test')
# Test rows: flagged rows, without the flag and without the 'Survived' column.
df_test = df.loc[df['is_test'].eq(True)].drop(columns=['is_test', 'Survived'])

In [None]:
# Cast the label column to int.
# NOTE(review): presumably 'Survived' is read as float because the test rows
# carry no label, and fastai chooses classification vs. regression labels
# from the dtype -- confirm.
df_train['Survived'] = df_train['Survived'].astype(int)

# Prepare Dataframe #

In [None]:
# Column roles for the tabular model.
dep_var = 'Survived'  # target variable
cat_names = [
    'LName', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Cabin', 'Title',
]
cont_names = ['Age', 'Fare']

# Preprocessing steps for the training list, applied in this order.
procs = [FillMissing, Categorify, Normalize]
# The same steps, each bound with test=True, for the test list.
t_procs = [partial(proc, test=True) for proc in procs]

In [None]:
# Unlabelled test items, attached to the training DataBunch with add_test further down.
# NOTE(review): fastai's add_test appears to re-process test items with the
# training set's processors, so `procs=t_procs` (and `path='.'`, which differs
# from the training list's `path`) are presumably not used -- confirm.
test = TabularList.from_df(df_test, path='.', cat_names=cat_names, cont_names=cont_names, procs = t_procs)

In [None]:
# Data block pipeline: items -> train/valid split -> labels -> test set -> DataBunch.
data = (TabularList.from_df(df_train, path = path, cat_names=cat_names, cont_names=cont_names, procs = procs)
        .split_by_rand_pct(0.2, seed = 41) # Hold out a random 20% of the rows for validation (fixed seed, so the split is repeatable). Hyperparameters come from a random search.
        .label_from_df(cols = dep_var)
        .add_test(test)
        .databunch())

In [None]:
# Sanity check: display 15 rows of a batch.
data.show_batch(rows = 15)

# Learner #

Perform a random search to find 'the best' hyperparameters.

In [None]:
# Random search over tabular-learner hyperparameters.
#
# Every finished trial appends a `(setup, result)` tuple to `record`, where
#   setup  = [wd, emb_drop, layers, ps, opt_func, lr]
#   result = [t_loss, v_loss, acc, err]
# so the outcomes can be tabulated in a later cell.
n_trials = 20  # search budget; the loop is bounded so the notebook survives "Run All"
record = []
y_range = [0, 1.2]

for trial in range(n_trials):
    # Regularisation: weight decay and embedding dropout, each in [0.005, 0.1).
    wd = random.randrange(5,100)/1000
    emb_drop = random.randrange(5,100)/1000

    # Learning rate: uniform in [0, 1), scaled down by a random power of ten.
    exp = random.choice([0,1,2,3,4])
    lr = random.random() / (10**exp)

    # Architecture, hidden-layer dropout in {0.0, 0.1, ..., 0.6}, and optimiser.
    layers = random.choice([[100,50],[200,100],[100,200,50],[1000,500],[2000,1000]])
    ps = random.randrange(0,7)/10
    opt_func = random.choice([optim.SGD,optim.Adam])

    learn = tabular_learner(data, layers = layers, metrics = (accuracy,error_rate),
                        ps = ps, emb_drop = emb_drop, y_range = y_range,
                        opt_func = opt_func)

    print('Learning on: layout {}, lr {}, dropout {}, emb_drop {}, wd {} with {}.'.format(layers,lr,ps,emb_drop,wd,opt_func))
    # `wd` has to be passed by keyword: the third positional parameter of
    # fit_one_cycle is `moms`, so `fit_one_cycle(40, lr, wd)` would set the
    # momentum to the sampled value and leave the weight decay at its default.
    learn.fit_one_cycle(40, lr, wd=wd)

    # Statistics at the end of training. `recorder.metrics` holds one
    # [accuracy, error_rate] entry per epoch, so take the last epoch's entry.
    t_loss = learn.recorder.losses[-1]
    v_loss = learn.recorder.val_losses[-1]
    acc, err = learn.recorder.metrics[-1]

    setup = [wd, emb_drop, layers, ps, opt_func, lr]
    result = [t_loss,v_loss,acc,err]

    record.append((setup,result))

Build a DataFrame from the recorded hyperparameters and results.

In [None]:
# Column-wise accumulators, one entry per recorded trial.
wd, emb_drop, layers, dropout, opt_func, lr = [], [], [], [], [], []
t_loss, v_loss, acc, err = [], [], [], []

for setup, result in record:
    # setup  = [wd, emb_drop, layers, ps, opt_func, lr]
    wd.append(setup[0])
    emb_drop.append(setup[1])
    layers.append(str(setup[2]))      # stringified so the layer list fits in one cell
    dropout.append(setup[3])
    opt_func.append(str(setup[4]))    # stringified optimiser class
    lr.append(setup[5])

    # result = [t_loss, v_loss, acc, err]; .item() unwraps the scalar values
    t_loss.append(result[0].item())
    v_loss.append(result[1])
    acc.append(result[2].item())
    err.append(result[3].item())

# Column order of the resulting table is fixed by the order of these pairs.
dic = dict(zip(
    ['accuracy', 'error', 'learning rate', 'wd', 'emb_drop',
     'layers', 'dropout', 'opt_func', 't_loss', 'v_loss'],
    [acc, err, lr, wd, emb_drop,
     layers, dropout, opt_func, t_loss, v_loss],
))

hyper_df = pd.DataFrame(dic)

Sort for best accuracy.

In [None]:
# Rank the trials from highest to lowest accuracy (displayed, not stored).
hyper_df.sort_values(['accuracy'], ascending = False)

Best values:

|accuracy | error| 	learning rate |	wd |	emb_drop |	layers |	dropout |	opt_func | t_loss |	v_loss |
|----------|:-----------:|:-----------:|:-----------:|:-----------:|:-----------:|:-----------:|:-----------:|:-----------:|----------:|
| 	0.882023 |	0.117978  |	0.097620 |	0.014 |	0.055 |	[100, 200, 50] |	0.3 |	`<class 'torch.optim.adam.Adam'>` |	0.292863 |	0.406563 |