In [1]:
!pip show fastai fastcore

Name: fastai
Version: 2.0.13
Summary: fastai simplifies training fast and accurate neural nets using modern best practices
Home-page: https://github.com/fastai/fastai/tree/master/
Author: Jeremy Howard, Sylvain Gugger, and contributors
Author-email: info@fast.ai
License: Apache Software License 2.0
Location: c:\users\etienne-pc\miniconda3\envs\fastai\lib\site-packages
Requires: torch, pillow, packaging, pyyaml, requests, scipy, fastprogress, torchvision, matplotlib, fastcore, scikit-learn, pandas, pip, spacy
Required-by: 
---
Name: fastcore
Version: 1.0.13
Summary: Python supercharged for fastai development
Home-page: https://github.com/fastai/fastcore/tree/master/
Author: Jeremy Howard and Sylvain Gugger
Author-email: infos@fast.ai
License: Apache Software License 2.0
Location: c:\users\etienne-pc\miniconda3\envs\fastai\lib\site-packages
Requires: pip, packaging
Required-by: tsai, fastai2, fastai


In [2]:
from matplotlib import cm
from fastai.tabular.all import *

pd.set_option('display.float_format', lambda x: '%.3f' % x)

We'll use the `Adult Sample` dataset:

In [3]:
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')

And declare the relevent information:

In [4]:
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
y_names = 'salary'
y_block = CategoryBlock()
splits = RandomSplitter()(range_of(df))

## Baseline

In [5]:
# to = TabularPandas(df, procs = [Categorify, FillMissing, Normalize], cont_names=cont_names, cat_names=cat_names, splits=splits, y_names=['salary'], reduce_memory=False, 
#                    y_block=CategoryBlock())

In [6]:
# dls = to.dataloaders(bs=1024)

In [7]:
# def accuracy(inp, targ, axis=-1):
#     "Compute accuracy with `targ` when `pred` is bs * n_classes"
#     pred,targ = flatten_check(inp.argmax(dim=axis), targ)
#     return (pred == targ).float().mean()

In [8]:
# learn = tabular_learner(dls, layers=[200,100], config={'ps':.1}, metrics=[accuracy])

In [9]:
# learn.fit(10, 1e-3)

# VAE AutoEncoder

Next we need our own version of `ReadTabBatch` that will return our inputs

> The continous variables are still normalized if we used `Normalize`. Couldn't figure out an easy way to de-norm it, but it's okay that we do not

In [10]:
class ReadTabBatchIdentity(ItemTransform):
    "Read a batch of data and return the inputs as both `x` and `y`"
    def __init__(self, to): store_attr()

    def encodes(self, to):
        if not to.with_cont: res = (tensor(to.cats).long(),) + (tensor(to.cats).long(),)
        else: res = (tensor(to.cats).long(),tensor(to.conts).float()) + (tensor(to.cats).long(), tensor(to.conts).float())
        if to.device is not None: res = to_device(res, to.device)
        return res
    
class TabularPandasIdentity(TabularPandas): pass

Next we need to make a new `TabDataLoader` that uses our `RadTabBatchIdentity`:

In [11]:
@delegates()
class TabDataLoaderIdentity(TabDataLoader):
    "A transformed `DataLoader` for AutoEncoder problems with Tabular data"
    do_item = noops
    def __init__(self, dataset, bs=16, shuffle=False, after_batch=None, num_workers=0, **kwargs):
        if after_batch is None: after_batch = L(TransformBlock().batch_tfms)+ReadTabBatchIdentity(dataset)
        super().__init__(dataset, bs=bs, shuffle=shuffle, after_batch=after_batch, num_workers=num_workers, **kwargs)

    def create_batch(self, b): return self.dataset.iloc[b]

And make `TabularPandasIdentity`'s `dl_type` to `TabDataLoaderIdentity`

In [12]:
TabularPandasIdentity._dl_type = TabDataLoaderIdentity

To start we'll make a very basic `to` object using our new `TabularPandasIdentity`:

In [13]:
bs=1024

to = TabularPandasIdentity(df, [Categorify, FillMissing, Normalize], cat_names, cont_names, splits=RandomSplitter(seed=32)(df))
dls = to.dataloaders(bs=1024)

Set the `n_inp` to 2:

In [14]:
dls.n_inp = 2

And then we'll calculate the embedding sizes:

In [15]:
emb_szs = get_emb_sz(to.train)

For each categorical variable we need to know the total possible values it can have:

In [16]:
total_cats = {k:len(v) for k,v in to.classes.items()}
total_cats

{'workclass': 10,
 'education': 17,
 'marital-status': 8,
 'occupation': 16,
 'relationship': 7,
 'race': 6,
 'education-num_na': 3}

We will need this dictionary in our loss function to figure out where to apply our `CrossEntropyLossFlat` for each categorical variables

Next we need to know the total number ouf outputs possible for our categorical variables

In [17]:
sum([v for k,v in total_cats.items()])

67

And let's keep a batch of our data for later

In [18]:
batch = dls.one_batch()

Next we need to know the means and standard deviations:

In [19]:
to.means

{'age': 38.5793696495067,
 'fnlwgt': 190006.02011593536,
 'education-num': 10.079158782958984}

We can store them in a `DataFrame` for easy adjustments:

In [20]:
means = pd.DataFrame.from_dict({k:[v] for k,v in to.means.items()})
stds = pd.DataFrame.from_dict({k:[v] for k,v in to.stds.items()})

We'll also use a SigmoidRange based on the un-normalized data to reduce the range our values can be:

In [21]:
low = (df[cont_names].min().to_frame().T.values - means.values) / stds.values
high = (df[cont_names].max().to_frame().T.values - means.values) / stds.values

In [22]:
low, high

(array([[-1.57952443, -1.67843578, -3.55622464]]),
 array([[ 3.76378659, 12.22741736,  2.31914013]]))

## Batch Swap Noise
Used in the winning solution for the Kaggle competition [Puerto Seguro Safe Driver Prediction](https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/discussion/44629#250927)

In [23]:
class BatchSwapNoise(Module):
    "Swap Noise Module"
    def __init__(self, p): store_attr()

    def forward(self, x):
        if self.training:
            mask = torch.rand(x.size()) > (1 - self.p)
            l1 = torch.floor(torch.rand(x.size()) * x.size(0)).type(torch.LongTensor)
            l2 = (mask.type(torch.LongTensor) * x.size(1))
            res = (l1 * l2).view(-1)
            idx = torch.arange(x.nelement()) + res
            idx[idx>=x.nelement()] = idx[idx>=x.nelement()]-x.nelement()
            return x.flatten()[idx].view(x.size())
        else:
            return x

We'll make a custom `TabularVAE` model (Denoising Variational AutoEncoder) for us to use.

In [24]:
class TabularVAE(TabularModel):
    def __init__(self, emb_szs, n_cont, hidden_size, cats, low, high, ps=0.2, embed_p=0.01, bswap=None, act_cls=Swish()):
        super().__init__(emb_szs, n_cont, layers=[hidden_size*8, hidden_size*4, hidden_size*2], out_sz=hidden_size, embed_p=embed_p, act_cls=act_cls)
        
        self.bswap = bswap
        self.cats = cats
        self.activation_cats = sum([v for k,v in cats.items()])
        
        self.layers = nn.Sequential(*L(self.layers.children())[:-1] + nn.Sequential(LinBnDrop(hidden_size*2, hidden_size, p=ps, act=act_cls)))
        
        if self.bswap != None: self.noise = BatchSwapNoise(self.bswap)
        self.decoder = nn.Sequential(
            LinBnDrop(hidden_size, hidden_size*2, p=ps, act=act_cls),
            LinBnDrop(hidden_size*2, hidden_size*4, p=ps, act=act_cls),
            LinBnDrop(hidden_size*4, hidden_size*8, p=ps, act=act_cls)
        )
        
        self.decoder_cont = nn.Sequential(
            LinBnDrop(hidden_size*8, n_cont, p=ps, bn=False, act=None),
            SigmoidRange(low=low, high=high)
        )
        
        self.decoder_cat = LinBnDrop(hidden_size*8, self.activation_cats, p=ps, bn=False, act=None)
        
    def forward(self, x_cat, x_cont=None, encode=False):
        if(self.bswap != None):
            x_cat = self.noise(x_cat)
            x_cont = self.noise(x_cont)
        
        z = super().forward(x_cat, x_cont)
        if(encode): return z
        
        decoded_trunk = self.decoder(z)
        
        decoded_cats = self.decoder_cat(decoded_trunk)
        
        decoded_conts = self.decoder_cont(decoded_trunk)
        
        return decoded_cats, decoded_conts, z

We'll also need a loss function that can grade how well our features represent the original dataset. 

The categorical features will be graded on `CrossEntropyLossFlat` and the continous with `MSELossFlat`.

Since this is a Variationnal AutoEncoder, we have to worry about KL-Divergence too.  kl_weight is a special parameter controlled by a callback.  At the beginning this parameter will be zero (basically like a normal autoencoder) and we will gradually increase it to 1 so that the auto-encoder become variationnal.  This is a trick suggested in [Ladder Variational AutoEncoder](https://arxiv.org/abs/1602.02282) and also used in the [NVAE](https://arxiv.org/abs/2007.03898) paper.

In [25]:
def compute_kernel(x, y):
    x_size = x.shape[0]
    y_size = y.shape[0]
    dim = x.shape[1]

    tiled_x = x.view(x_size,1,dim).repeat(1, y_size,1)
    tiled_y = y.view(1,y_size,dim).repeat(x_size, 1,1)

    return torch.exp(-torch.mean((tiled_x - tiled_y)**2,dim=2)/dim*1.0)


def compute_mmd(x, y):
    x_kernel = compute_kernel(x, x)
    y_kernel = compute_kernel(y, y)
    xy_kernel = compute_kernel(x, y)
    return torch.mean(x_kernel) + torch.mean(y_kernel) - 2*torch.mean(xy_kernel)

In [26]:
config = {
    'hidden_size': 128,
    'dropout': 0.0,
    'embed_p': 0.0,
    'wd': 0.01,
    'bswap': 0.2,
    'lr': 1e-3,
    'epochs': 50
}

In [27]:
class VAERecreatedLoss(Module):
    "Measures how well we have created the original tabular inputs, plus the KL Divergence with the unit normal distribution"
    def __init__(self, cat_dict, dataset_size, bs, hidden_size, mmd_weight = 1000, reduction='mean'):
        ce = CrossEntropyLossFlat(reduction='none')
        mse = MSELossFlat(reduction='none')
        store_attr('cat_dict,ce,mse,dataset_size,bs,hidden_size,mmd_weight,reduction')
        
    def forward(self, preds, cat_targs, cont_targs):
        if(len(preds) == 4):
            cats,conts, z, kl_weight = preds
        else:
            cats,conts, z = preds
            kl_weight = 1
            
        true_samples = torch.randn((cats.shape[0],self.hidden_size))
        true_samples = nn.Parameter(true_samples).cuda()

        tot_ce, pos = [], 0
        for i, (k,v) in enumerate(self.cat_dict.items()):
            tot_ce += [self.ce(cats[:, pos:pos+v], cat_targs[:,i])]
            pos += v

        tot_ce = torch.stack(tot_ce, dim=1).mean(dim=1)
        cont_loss = self.mse(conts, cont_targs).view(conts.shape).mean(dim=1)
        recons_loss = (tot_ce + cont_loss)
        
        mmd_loss = compute_mmd(true_samples, z).repeat(cats.shape[0])
        
        total_loss = recons_loss + (mmd_loss * self.mmd_weight)
        
        if self.reduction == 'mean':
            return total_loss.mean()
        elif self.reduction == 'sum':
            return total_loss.sum()

        return total_loss

All we need to do is pass in our `total_cats` dictionary:

In [28]:
loss_func = VAERecreatedLoss(total_cats, df.shape[0], bs, config['hidden_size'], 1000)

Let's create some metrics for stuff we care about while fitting the model.  We have reconstruction metrics like MSE and CrossEntropy but we also have to worry about KLD.
Those metrics will help us see if the loss is dominated either by the KLD or by the reconstruction loss from MSE and CrossEntropy 

In [29]:
class MSEMetric(Metric):
    def __init__(self): self.preds = []
    def accumulate(self, learn):
        cats, conts, z = learn.pred
        cat_targs, cont_targs = learn.y
        norm_conts = conts.new([conts.size(1)])
        self.preds.append(to_detach(F.mse_loss(conts, cont_targs, reduction='sum') / norm_conts))
    @property
    def value(self):
        return np.array(self.preds).mean()
    
class CEMetric(Metric):
    def __init__(self): self.preds = []
    def accumulate(self, learn):
        cats, conts, z = learn.pred
        cat_targs, cont_targs = learn.y
        CE = cats.new([0])
        pos=0
        for i, (k,v) in enumerate(total_cats.items()):
            CE += F.cross_entropy(cats[:, pos:pos+v], cat_targs[:, i], reduction='sum')
            pos += v

        norm = cats.new([len(total_cats.keys())])
        self.preds.append(to_detach(CE/norm))
    @property
    def value(self):
        return np.array(self.preds).mean()
    
class MMDMetric(Metric):
    def __init__(self): self.preds = []
    def accumulate(self, learn):
        cats, conts, z = learn.pred
        true_samples = torch.randn((bs,config['hidden_size']))
        true_samples = nn.Parameter(true_samples).cuda()
        MMD = compute_mmd(true_samples, z)
        self.preds.append(to_detach(MMD))
    @property
    def value(self):
        return np.array(self.preds).mean()

We'll make an config dictionary for us to use as a list of all hyper parameters.  Also I would recommend against using early stopping because our AnnealedLossCallback will make the loss go worst once the KL divergence weight become larger than 0.

In [30]:
cbs = []
cbs += [EarlyStoppingCallback(patience=5)]
metrics = []
metrics += [MSEMetric(), CEMetric(), MMDMetric()]

And make our model & `Learner`

In [34]:
model = TabularVAE(emb_szs, len(cont_names), config['hidden_size'], ps=config['dropout'], cats=total_cats, embed_p=config['embed_p'], bswap=config['bswap'], low=tensor(low).cuda(), high=tensor(high).cuda())
learn = Learner(dls, model, lr=config['lr'], loss_func=loss_func, wd=config['wd'], opt_func=ranger, cbs=cbs, metrics=metrics).to_fp16()

Finally we'll fit for a few epochs:

In [35]:
learn.fit_flat_cos(config['epochs'], lr=0.0014)

epoch,train_loss,valid_loss,mse,ce,mmd,time
0,10.168085,9.563256,6804.4585,1954.3057,0.00014841557,00:07
1,5.327938,1.869316,3445.039,1688.6045,0.0001964484,00:07
2,3.342218,1.108065,2326.7554,1385.125,0.0001870337,00:07
3,2.32441,0.570403,1761.3877,1132.9485,0.00016434278,00:07
4,1.752037,0.440975,1424.0205,956.9856,0.0001496315,00:07
5,1.410149,0.371801,1197.2649,831.2296,0.00013893843,00:08
6,1.197507,0.338297,1037.1759,735.0688,0.00013073366,00:07
7,1.055543,0.297053,915.67053,660.0755,0.00012448643,00:07
8,0.966456,0.282033,821.8686,599.54987,0.000119693694,00:08
9,0.906478,0.287728,748.1394,550.2091,0.00011590719,00:08


No improvement since epoch 23: early stopping


# Getting the compressed representations

Next we're going to grade our compressed representations and then attempt to train on them.

In [36]:
dl = learn.dls.test_dl(df)

Let's predict over all the data manually using PyTorch:

In [37]:
outs = []
for batch in dl:
    with torch.no_grad():
        learn.model.eval()
        learn.model.cuda()
        out = learn.model(*batch[:2], True).cpu().numpy()
        outs.append(out)
outs = np.concatenate(outs)

In [38]:
outs.shape

(32561, 128)

As well as get the actual preds and targs:

In [39]:
(cat_preds, cont_preds, z), (cat_targs, cont_targs) = learn.get_preds(dl=dl, reorder=False)

# Measuring accuracy

## Continuous

In [40]:
from sklearn.metrics import r2_score

cont_preds = pd.DataFrame(cont_preds, columns=cont_names)
cont_targs = pd.DataFrame(cont_targs, columns=cont_names)

preds = pd.DataFrame((cont_preds.values * stds.values) + means.values, columns=cont_preds.columns)
targets = pd.DataFrame((cont_targs.values * stds.values) + means.values, columns=cont_targs.columns)

mi = (np.abs(targets-preds)).min().to_frame().T
ma = (np.abs(targets-preds)).max().to_frame().T
mean = (np.abs(targets-preds)).mean().to_frame().T
median = (np.abs(targets-preds)).median().to_frame().T
r2 = pd.DataFrame.from_dict({c:[r2_score(targets[c], preds[c])] for c in preds.columns})


for d,name in zip([mi,ma,mean,median,r2], ['Min', 'Max', 'Mean', 'Median', 'R2']):
    d = d.insert(0, 'GroupBy', name)
    
data = pd.concat([mi,ma,mean,median,r2])
data

Unnamed: 0,GroupBy,age,fnlwgt,education-num
0,Min,0.0,0.461,0.0
0,Max,51.168,463768.594,3.768
0,Mean,2.555,27842.54,0.254
0,Median,1.815,21517.459,0.196
0,R2,0.929,0.87,0.982


We can also grab the R2:

In [41]:
r2.mean(axis=1)

0   0.927
dtype: float64

## Categorical

In [42]:
cat_reduced = torch.zeros_like(cat_targs)
pos=0
for i, (k,v) in enumerate(total_cats.items()):
    cat_reduced[:,i] = cat_preds[:,pos:pos+v].argmax(dim=1)
    pos += v

In [43]:
cat_preds = pd.DataFrame(cat_reduced, columns=cat_names)
cat_targs = pd.DataFrame(cat_targs, columns=cat_names)

from sklearn.metrics import balanced_accuracy_score, f1_score

accuracy = pd.DataFrame.from_dict({c:[balanced_accuracy_score(cat_targs[c], cat_preds[c])] for c in cat_preds.columns})

In [44]:
f1 = pd.DataFrame.from_dict({c:[f1_score(cat_targs[c], cat_preds[c], average='weighted')] for c in cat_preds.columns})

In [45]:
for d,name in zip([accuracy, f1], ['Accuracy', 'F1']):
    d = d.insert(0, 'MetricName', name)
pd.concat([accuracy, f1])

Unnamed: 0,MetricName,workclass,education,marital-status,occupation,relationship,race,education-num_na
0,Accuracy,0.816,0.997,0.873,0.988,0.984,0.96,0.963
0,F1,0.992,0.999,0.995,0.995,0.993,0.995,0.999


And check it's overall accuracy:

In [46]:
accuracy.mean(axis=1)

0   0.940
dtype: float64

## Predicting

Now that we have our compressed representations, let's use them to train a new model

In [47]:
ys = df['salary'].to_numpy()

In [48]:
test_eq(len(outs), len(ys))

In [49]:
df_outs = pd.DataFrame(columns=['salary'] + list(range(0,config['hidden_size'])))

In [50]:
df_outs['salary'] = ys

In [51]:
df_outs[list(range(0,config['hidden_size']))] = outs

In [52]:
pd.options.mode.chained_assignment=None

In [53]:
splits = RandomSplitter()(range_of(df))

In [54]:
df_outs[list(range(0,config['hidden_size']))] = df_outs[list(range(0,config['hidden_size']))].astype(np.float16)

In [55]:
cont_names_ = list(range(0,config['hidden_size']))
to2 = TabularPandas(df_outs, procs = [Normalize], cont_names=cont_names_, splits=splits, y_names=['salary'], reduce_memory=False, 
                   y_block=CategoryBlock())

In [56]:
dls2 = to2.dataloaders(bs=1024)

In [57]:
def accuracy(inp, targ, axis=-1):
    "Compute accuracy with `targ` when `pred` is bs * n_classes"
    pred,targ = flatten_check(inp.argmax(dim=axis), targ)
    return (pred == targ).float().mean()

In [58]:
learn2 = tabular_learner(dls2, layers=[200,100], config={'ps':0.05}, metrics=[accuracy])

In [59]:
learn2.fit(10, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,0.394843,0.351318,0.831849,00:00
1,0.369111,0.346518,0.835381,00:00
2,0.358863,0.344359,0.839988,00:00
3,0.352922,0.345499,0.841063,00:00
4,0.348409,0.345977,0.837224,00:00
5,0.344794,0.34404,0.840141,00:00
6,0.342773,0.344962,0.839373,00:00
7,0.340786,0.344012,0.839834,00:00
8,0.339176,0.347567,0.838298,00:00
9,0.337396,0.348554,0.83922,00:00
