In [1]:
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
import pandas as pd
import json
import os

from pandas.io.json import json_normalize
from functools import partial
from pdb import set_trace

from datetime import datetime, timedelta
import urllib.request

from fastai.tabular.all import *

%matplotlib inline
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'retina'
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [47]:
# Stack the train and test feature files into one frame.
# `ignore_index=True` gives the result a fresh 0..n-1 index; without it each file keeps
# its own index, so the combined frame would carry duplicate labels.
df = pd.concat(
    [pd.read_csv('data/train_features.csv'), pd.read_csv('data/test_features.csv')],
    ignore_index=True,
)

In [107]:
# The two categorical inputs; every other column except the row identifier is continuous.
cat_names = ['cp_type', 'cp_dose']
cont_names = [col for col in df.columns if col not in cat_names]
cont_names.remove('sig_id')  # identifier column, not a feature

In [108]:
# This simply reads a batch, but instead of returning a dependent variable it returns the same thing as the input.
# Note that the continuous variables are normalized if we use the Normalize transform; I didn't find an easy way of
# outputting the non-normalized continuous variables.
class ReadTabBatchIdentity(ItemTransform):
    "Batch reader whose output is the input fields followed by the same fields again as targets"
    def __init__(self, to):
        self.to = to

    def encodes(self, to):
        def _fields():
            # Categorical codes as long tensors, plus continuous values as floats when present.
            cat_part = (tensor(to.cats).long(),)
            if not to.with_cont: return cat_part
            return cat_part + (tensor(to.conts).float(),)

        # Evaluated twice: once for the input half and once for the target half of the batch.
        batch = _fields() + _fields()
        if to.device is not None:
            batch = to_device(batch, to.device)
        return batch
    
class TabularPandasIdentity(TabularPandas):
    "`TabularPandas` subclass with no changes of its own; it exists so a custom `_dl_type` can be attached to it"

In [109]:
@delegates()
class TabDataLoaderIdentity(TabDataLoader):
    "Tabular `DataLoader` that reads batches with `ReadTabBatchIdentity` by default"
    do_item = noops

    def __init__(self, dataset, bs=16, shuffle=False, after_batch=None, num_workers=0, **kwargs):
        if after_batch is None:
            default_tfms = L(TransformBlock().batch_tfms)
            after_batch = default_tfms + ReadTabBatchIdentity(dataset)
        # `super(TabDataLoader, self)` starts the lookup *after* `TabDataLoader` in the MRO,
        # so `TabDataLoader.__init__` itself is bypassed and its parent's initialiser runs.
        super(TabDataLoader, self).__init__(
            dataset,
            bs=bs,
            shuffle=shuffle,
            after_batch=after_batch,
            num_workers=num_workers,
            **kwargs,
        )

    def create_batch(self, b):
        "Select the rows at positions `b` from the dataset in a single `iloc` call"
        return self.dataset.iloc[b]

# Attach the identity loader so `TabularPandasIdentity.dataloaders` builds `TabDataLoaderIdentity` objects.
TabularPandasIdentity._dl_type = TabDataLoaderIdentity

tab_procs = [Categorify, FillMissing, Normalize]
train_valid_idxs = RandomSplitter(seed=32)(df)
to = TabularPandasIdentity(df, tab_procs, cat_names, cont_names, splits=train_valid_idxs)

dls = to.dataloaders(bs=1024)
# A batch is (cats, conts, cats, conts): the first two items are inputs, the rest are targets.
dls.n_inp = 2

emb_szs = get_emb_sz(to)

In [110]:
# Number of possible values for each categorical variable.
# The loss function uses these widths to locate the slice of the output that belongs to each
# categorical variable before applying F.cross_entropy to it.
total_cats = {name: len(classes) for name, classes in dls.procs[1].classes.items()}
total_cats

{'cp_type': 3, 'cp_dose': 3}

In [111]:
# This is the number of outputs the model needs for the categorical variables
sum(total_cats.values())

6

In [112]:
# Grab a single batch from the loaders; a later cell uses it to smoke-test the model and the loss.
out = dls.one_batch()

In [113]:
# One-row frames (one column per continuous variable) holding the statistics stored on the
# third processing step; they are used later to move values to and from normalized space.
means = pd.DataFrame({col: [value] for col, value in dls.procs[2].means.items()})
stds = pd.DataFrame({col: [value] for col, value in dls.procs[2].stds.items()})

In [114]:
# To make life easier for the model, bound the continuous outputs with a SigmoidRange:
# `low` / `high` are the per-column min / max of the raw data, expressed in normalized units.
low = (df[cont_names].min().to_numpy()[None, :] - means.values) / stds.values
high = (df[cont_names].max().to_numpy()[None, :] - means.values) / stds.values

## Batch Swap Noise
Used in the winning solution for the Kaggle competition [Porto Seguro's Safe Driver Prediction](https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/discussion/44629#250927)

In [115]:
class BatchSwapNoise(nn.Module):
    """Swap-noise layer: randomly replaces values with the value found at the same
    feature position in another row of the batch.

    Args:
        p: probability that any individual element is replaced.

    In eval mode (or for an empty input) the input is returned unchanged. In training
    mode a new tensor of the same shape and dtype is returned.
    """

    def __init__(self, p):
        super().__init__()
        self.p = p

    def forward(self, x):
        if not self.training or x.numel() == 0:
            return x

        n_rows, total = x.size(0), x.numel()
        # Elements per row; equals `x.size(1)` for 2-D input and also covers other ranks.
        row_len = total // n_rows

        # All random tensors are created on the input's device so no host/device
        # transfer is needed when the batch lives on an accelerator.
        swap = torch.rand(x.shape, device=x.device) > (1 - self.p)
        row_shift = torch.randint(0, n_rows, x.shape, device=x.device)

        # A shift of k rows is k * row_len positions in the flattened tensor, which keeps
        # the feature position fixed; unselected elements get a shift of 0.
        offset = (row_shift * swap.long() * row_len).view(-1)
        # Wrap around the end of the flattened tensor.
        idx = (torch.arange(total, device=x.device) + offset) % total
        return x.flatten()[idx].view(x.size())

In [116]:
class TabularAE(TabularModel):
    """Tabular autoencoder built on `TabularModel`.

    The parent model (hidden layers 1024-512-256) acts as the encoder and ends in a
    `hidden_size`-wide bottleneck; a mirrored decoder (256-512-1024) feeds two heads,
    one for the categorical logits and one for the continuous values.

    Args:
        emb_szs: embedding sizes, forwarded to `TabularModel`.
        n_cont: number of continuous variables (also the width of the continuous head).
        hidden_size: width of the bottleneck.
        cats: dict mapping each categorical variable to its number of classes; the
            categorical head has `sum(cats.values())` outputs.
        low, high: bounds given to `SigmoidRange` on the continuous head.
        ps: dropout probability for the bottleneck layer, the decoder layers and both heads.
        embed_p: embedding dropout, forwarded to `TabularModel`.
        bswap: swap probability for `BatchSwapNoise`; `None` disables input noise.

    `forward` returns `(decoded_cats, decoded_conts)`.
    """
    def __init__(self, emb_szs, n_cont, hidden_size, cats, low, high, ps=0.2, embed_p=0.01, bswap=None):
        super().__init__(emb_szs, n_cont, layers=[1024, 512, 256], out_sz=hidden_size, embed_p=embed_p)

        self.bswap = bswap
        self.cats = cats
        self.activation_cats = sum(cats.values())

        # Drop the last child of the parent's layer stack (presumably its output layer;
        # confirm against `TabularModel`) and put a Linear/BN/Dropout/ReLU bottleneck in its place.
        self.layers = nn.Sequential(*L(self.layers.children())[:-1] + nn.Sequential(LinBnDrop(256, hidden_size, p=ps, act=nn.ReLU(inplace=True))))

        if bswap is not None: self.noise = BatchSwapNoise(bswap)
        self.decoder = nn.Sequential(
            LinBnDrop(hidden_size, 256, p=ps, act=nn.ReLU(inplace=True)),
            LinBnDrop(256, 512, p=ps, act=nn.ReLU(inplace=True)),
            LinBnDrop(512, 1024, p=ps, act=nn.ReLU(inplace=True))
        )

        # Continuous head: linear projection squashed into [low, high].
        self.decoder_cont = nn.Sequential(
            LinBnDrop(1024, n_cont, p=ps, bn=False, act=None),
            SigmoidRange(low=low, high=high)
        )

        # Categorical head: raw logits for all categorical variables, concatenated.
        self.decoder_cat = LinBnDrop(1024, self.activation_cats, p=ps, bn=False, act=None)

    def forward(self, x_cat, x_cont=None):
        if self.bswap is not None:
            x_cat = self.noise(x_cat)
            # `x_cont` defaults to None; only add noise when continuous inputs were given.
            if x_cont is not None: x_cont = self.noise(x_cont)

        encoded = super().forward(x_cat, x_cont)

        decoded_trunk = self.decoder(encoded)

        decoded_cats = self.decoder_cat(decoded_trunk)

        decoded_conts = self.decoder_cont(decoded_trunk)

        return decoded_cats, decoded_conts

def combined_loss(preds, cat_targs, cont_targs):
    """Reconstruction loss for mixed categorical / continuous outputs.

    `preds` is a `(cat_logits, cont_preds)` pair. The logits are sliced per categorical
    variable using the widths in the module-level `total_cats`, and column `i` of
    `cat_targs` is the target for the i-th variable. The result is

        (sum-MSE / n_continuous_columns + sum-CE / n_categorical_variables) / batch_size

    returned as a tensor of shape (1,).
    """
    cat_logits, cont_preds = preds

    # Running cross-entropy total, kept as a 1-element tensor with the logits' dtype and device.
    ce_total = cat_logits.new_zeros(1)
    start = 0
    for col, n_classes in enumerate(total_cats.values()):
        stop = start + n_classes
        ce_total += F.cross_entropy(cat_logits[:, start:stop], cat_targs[:, col], reduction='sum')
        start = stop

    n_rows = cat_logits.size(0)
    n_cat_vars = cat_logits.new_tensor([len(total_cats)])
    n_cont_cols = cont_preds.new_tensor([cont_preds.size(1)])

    squared_error = F.mse_loss(cont_preds, cont_targs, reduction='sum')
    return (squared_error / n_cont_cols + ce_total / n_cat_vars) / n_rows

In [117]:
# Smoke test: one forward pass and one loss evaluation on a single batch.
model = TabularAE(
    emb_szs,
    len(cont_names),
    128,
    total_cats,
    low=tensor(low).cuda(),
    high=tensor(high).cuda(),
).cuda()
r = model(out[0].cuda(), out[1].cuda())
loss = combined_loss(r, out[2], out[3])

In [118]:
import wandb
from fastai.callback.wandb import *
from fastai.callback.tracker import *
from sklearn.metrics import r2_score
from sklearn.metrics import balanced_accuracy_score, f1_score

class WandbCallbackCustom(WandbCallback):
    "`WandbCallback` whose prediction logging reports reconstruction metrics for the autoencoder"
    def log_predictions(self, preds):
        """Log per-column reconstruction metrics to Weights & Biases.

        `preds` unpacks into `(inp, (cat_preds, cont_preds), (cat_targs, cont_targs), out)`.
        Continuous values are de-normalized with the module-level `means` / `stds` before
        the absolute-error statistics and R2 are computed; categorical logits are reduced
        to class indices with an argmax over each variable's slice.
        """
        def labelled(frame, column, label):
            # Work on a copy so the metric frames stay purely numeric.
            tagged = frame.copy()
            tagged.insert(0, column, label)
            return tagged

        inp,(cat_preds, cont_preds),(cat_targs, cont_targs),out = preds

        cont_preds = pd.DataFrame(cont_preds, columns=cont_names)
        cont_targs = pd.DataFrame(cont_targs, columns=cont_names)

        # Back to the original units.
        preds = pd.DataFrame((cont_preds.values * stds.values) + means.values, columns=cont_preds.columns)
        targets = pd.DataFrame((cont_targs.values * stds.values) + means.values, columns=cont_targs.columns)

        abs_err = (targets - preds).abs()
        mi = abs_err.min().to_frame().T
        ma = abs_err.max().to_frame().T
        mean = abs_err.mean().to_frame().T
        median = abs_err.median().to_frame().T
        r2 = pd.DataFrame.from_dict({c:[r2_score(targets[c], preds[c])] for c in preds.columns})

        data = pd.concat([
            labelled(r2, 'GroupBy', 'R2'),
            labelled(mi, 'GroupBy', 'Min'),
            labelled(ma, 'GroupBy', 'Max'),
            labelled(mean, 'GroupBy', 'Mean'),
            labelled(median, 'GroupBy', 'Median'),
        ])

        # Reduce each variable's block of logits to a predicted class index.
        cat_reduced = torch.zeros_like(cat_targs)
        pos=0
        for i, (k,v) in enumerate(total_cats.items()):
            cat_reduced[:,i] = cat_preds[:,pos:pos+v].argmax(dim=1)
            pos += v

        cat_preds = pd.DataFrame(cat_reduced, columns=cat_names)
        cat_targs = pd.DataFrame(cat_targs, columns=cat_names)

        accuracy = pd.DataFrame.from_dict({c:[balanced_accuracy_score(cat_targs[c], cat_preds[c])] for c in cat_preds.columns})
        f1 = pd.DataFrame.from_dict({c:[f1_score(cat_targs[c], cat_preds[c], average='weighted')] for c in cat_preds.columns})

        tolog = {}

#         for c in preds.columns:
#             tolog[c + '_MAE'] = wandb.Histogram(np.abs(preds[c]-targets[c]))

        for c in preds.columns:
            tolog[c + '_R2'] = r2[c][0]

        for c in accuracy.columns:
            tolog[c + '_Accuracy'] = accuracy[c][0]

        for c in accuracy.columns:
            tolog[c + '_F1'] = f1[c][0]

        # Row-wise aggregates are taken on the purely numeric frames; only the copies
        # that go into the tables carry a label column.
        tolog['MeanR2'] = r2.mean(axis=1)[0]
        tolog['StdR2'] = r2.std(axis=1)[0]
        tolog['MeanAccuracy'] = accuracy.mean(axis=1)[0]
        tolog['MeanF1'] = f1.mean(axis=1)[0]

        tolog['continuous'] = wandb.Table(dataframe=data)
        tolog['categorical'] = wandb.Table(dataframe=pd.concat([
            labelled(accuracy, 'MetricName', 'Accuracy'),
            labelled(f1, 'MetricName', 'F1'),
        ]))

        wandb.log(tolog, step=self._wandb_step)
        
# Hyper-parameters for this run (also the dict handed to wandb when logging is enabled).
config = dict(
    hidden_size=128,
    dropout=0.1,
    embed_p=0.01,
    wd=0.01,
    bswap=0.1,
    lr=1e-3,
    epochs=100,
)

# A checkpoint is written every epoch under a name stamped with the current time.
run_stamp = datetime.now().strftime('%Y-%m-%d %Hh%M.%S')
cbs = [SaveModelCallback(fname='tabular' + run_stamp, every_epoch=True)]

# wandb.init(project='Kaggle-Lish-Moa', config=config)
# cbs += [WandbCallbackCustom(log_model=False)]

model = TabularAE(
    emb_szs,
    len(cont_names),
    config['hidden_size'],
    ps=config['dropout'],
    cats=total_cats,
    embed_p=config['embed_p'],
    bswap=config['bswap'],
    low=tensor(low).cuda(),
    high=tensor(high).cuda(),
)
learn = Learner(
    dls,
    model,
    lr=config['lr'],
    loss_func=combined_loss,
    wd=config['wd'],
    cbs=cbs,
).to_fp16()

In [119]:
# learn.lr_find()

In [120]:
# Train with the one-cycle schedule for the number of epochs set in `config`.
learn.fit_one_cycle(config['epochs'])

epoch,train_loss,valid_loss,time
0,4.797834,3.888723,00:02
1,3.906911,2.617576,00:02
2,3.122289,1.716328,00:02
3,2.48224,1.317152,00:02
4,1.993878,0.974741,00:02
5,1.619559,0.776361,00:02
6,1.3407,0.683106,00:02
7,1.142212,0.637574,00:02
8,0.99839,0.605221,00:02
9,0.895347,0.589779,00:02


# Getting the compressed representations

In [121]:
# Build a test DataLoader over the whole frame so every row can be passed through the trained model.
dl = learn.dls.test_dl(df)

In [122]:
class GetBottleNeckCallback(HookCallback):
    "Hook callback that stores, for every forward pass, the first input of the hooked module(s) in `self.preds`"
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.preds = []

    def hook(self, m, i, o):
        # `i` holds the module's inputs; keep a detached copy of the first one.
        captured = self.learn.to_detach(i[0])
        self.preds.append(captured)
    
# `model.decoder[0][0]` is the first layer of the decoder. We hook it and save its *input*,
# because that input is the compressed (bottleneck) representation produced by the encoder.
hook = GetBottleNeckCallback(modules=[learn.model.decoder[0][0]])
learn.add_cbs(hook)

In [123]:
# Run inference over `dl`; predictions and targets each unpack into a (categorical, continuous) pair.
# The hook registered above collects the bottleneck activations while this runs.
(cat_preds, cont_preds), (cat_targs, cont_targs) = learn.get_preds(dl=dl)

In [124]:
# Join the per-batch activations collected by the hook into a single array and show its shape.
compressed = np.concatenate(hook.preds)
compressed.shape

(27796, 128)

# Final reconstruction stats

## Continuous

In [125]:
from sklearn.metrics import r2_score

cont_preds = pd.DataFrame(cont_preds, columns=cont_names)
cont_targs = pd.DataFrame(cont_targs, columns=cont_names)

# De-normalize so the errors below are expressed in the original units of each column.
preds = pd.DataFrame((cont_preds.values * stds.values) + means.values, columns=cont_preds.columns)
targets = pd.DataFrame((cont_targs.values * stds.values) + means.values, columns=cont_targs.columns)

# Absolute reconstruction error, computed once and summarised per column.
abs_err = (targets - preds).abs()
mi = abs_err.min().to_frame().T
ma = abs_err.max().to_frame().T
mean = abs_err.mean().to_frame().T
median = abs_err.median().to_frame().T
r2 = pd.DataFrame.from_dict({c:[r2_score(targets[c], preds[c])] for c in preds.columns})

# Label *copies* of the metric frames: the originals stay purely numeric (so row-wise
# aggregates on `r2` keep working) and the cell can be re-run without an
# "already exists" error from `insert`.
labelled_frames = []
for frame, label in zip([mi, ma, mean, median, r2], ['Min', 'Max', 'Mean', 'Median', 'R2']):
    tagged = frame.copy()
    tagged.insert(0, 'GroupBy', label)
    labelled_frames.append(tagged)

data = pd.concat(labelled_frames)
data

Unnamed: 0,GroupBy,cp_time,g-0,g-1,g-2,g-3,g-4,g-5,g-6,g-7,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
0,Min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,Max,47.813,5.605,4.86,6.485,9.311,5.905,9.628,9.462,5.863,...,4.441,6.747,4.914,5.854,6.631,6.04,5.947,5.151,6.2,6.443
0,Mean,9.412,0.387,0.477,0.67,0.448,0.412,0.693,0.369,0.349,...,0.468,0.48,0.512,0.482,0.441,0.522,0.483,0.49,0.522,0.503
0,Median,8.262,0.299,0.389,0.528,0.345,0.274,0.504,0.295,0.258,...,0.374,0.379,0.407,0.381,0.346,0.42,0.377,0.402,0.399,0.416
0,R2,0.657,0.856,0.411,0.247,0.541,0.617,0.31,0.673,0.806,...,0.905,0.897,0.885,0.905,0.923,0.823,0.9,0.861,0.846,0.788


In [131]:
# Average R2 over all continuous columns. `numeric_only=True` skips any non-numeric
# label column present in `r2`, which would otherwise make the row-wise mean fail
# on recent pandas versions.
r2.mean(axis=1, numeric_only=True)

0   0.635
dtype: float64

## Categorical

In [126]:
# Turn each categorical variable's block of logits into a predicted class index.
cat_reduced = torch.zeros_like(cat_targs)
start = 0
for col, width in enumerate(total_cats.values()):
    stop = start + width
    cat_reduced[:, col] = cat_preds[:, start:stop].argmax(dim=1)
    start = stop

In [127]:
from sklearn.metrics import balanced_accuracy_score, f1_score

# Wrap predicted and true class indices in frames keyed by categorical column name.
cat_preds = pd.DataFrame(cat_reduced, columns=cat_names)
cat_targs = pd.DataFrame(cat_targs, columns=cat_names)

# Balanced accuracy per categorical column, as a one-row frame.
accuracy = pd.DataFrame({
    col: [balanced_accuracy_score(cat_targs[col], cat_preds[col])]
    for col in cat_preds.columns
})

In [128]:
# Weighted F1 per categorical column, as a one-row frame.
f1 = pd.DataFrame({
    col: [f1_score(cat_targs[col], cat_preds[col], average='weighted')]
    for col in cat_preds.columns
})

In [129]:
# Label *copies* of the metric frames: `accuracy` and `f1` stay purely numeric (so row-wise
# aggregates on them keep working) and the cell can be re-run without an
# "already exists" error from `insert`.
labelled_metrics = []
for frame, label in zip([accuracy, f1], ['Accuracy', 'F1']):
    tagged = frame.copy()
    tagged.insert(0, 'MetricName', label)
    labelled_metrics.append(tagged)
pd.concat(labelled_metrics)

Unnamed: 0,MetricName,cp_type,cp_dose
0,Accuracy,0.994,1.0
0,F1,0.997,1.0


In [132]:
# Average balanced accuracy over the categorical columns. `numeric_only=True` skips any
# non-numeric label column present in `accuracy`, which would otherwise make the
# row-wise mean fail on recent pandas versions.
accuracy.mean(axis=1, numeric_only=True)

0   0.997
dtype: float64