In [2]:
!pip show fastai fastcore

Name: fastai
Version: 2.0.15
Summary: fastai simplifies training fast and accurate neural nets using modern best practices
Home-page: https://github.com/fastai/fastai/tree/master/
Author: Jeremy Howard, Sylvain Gugger, and contributors
Author-email: info@fast.ai
License: Apache Software License 2.0
Location: /usr/local/lib/python3.6/dist-packages
Requires: torchvision, scikit-learn, packaging, pip, pyyaml, pillow, torch, fastprogress, requests, spacy, scipy, pandas, fastcore, matplotlib
Required-by: 
---
Name: fastcore
Version: 1.0.16
Summary: Python supercharged for fastai development
Home-page: https://github.com/fastai/fastcore/tree/master/
Author: Jeremy Howard and Sylvain Gugger
Author-email: infos@fast.ai
License: Apache Software License 2.0
Location: /usr/local/lib/python3.6/dist-packages
Requires: packaging, pip
Required-by: fastai


In [3]:
from matplotlib import cm
from fastai.tabular.all import *

pd.set_option('display.float_format', lambda x: '%.3f' % x)

We'll use the `Adult Sample` dataset:

In [4]:
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')

And declare the relevant information:

In [5]:
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
y_names = 'salary'
y_block = CategoryBlock()
splits = RandomSplitter()(range_of(df))

Next we need our own version of `ReadTabBatch` that returns our inputs twice: once as the model inputs and once as the targets to reconstruct.

> The continuous variables are still normalized if we used `Normalize`. I couldn't figure out an easy way to de-normalize them here, but that is fine: the targets are simply reconstructed in normalized units.

In [6]:
class ReadTabBatchIdentity(ItemTransform):
    "Turn a tabular batch into `(*inputs, *inputs)` so that the inputs also serve as the targets"
    def __init__(self, to): store_attr()

    def encodes(self, to):
        # One pass over the frame: categoricals as long tensors, plus the
        # continuous columns as floats when the frame carries any.
        def _read():
            cats = tensor(to.cats).long()
            if not to.with_cont: return (cats,)
            return (cats, tensor(to.conts).float())

        # Read twice (inputs first, then targets), exactly as two separate conversions.
        batch = _read() + _read()
        return batch if to.device is None else to_device(batch, to.device)
    
# Marker subclass of `TabularPandas` that adds no behavior of its own; the notebook
# attaches a custom dataloader type to it (via `_dl_type`) instead of to `TabularPandas`.
class TabularPandasIdentity(TabularPandas): pass

Next we need to make a new `TabDataLoader` that uses our `ReadTabBatchIdentity`:

In [7]:
@delegates()
class TabDataLoaderIdentity(TabDataLoader):
    "A transformed `DataLoader` for AutoEncoder problems with Tabular data"
    # Per-item processing is replaced by `noops`; whole batches are built in `create_batch`.
    do_item = noops
    def __init__(self, dataset, bs=16, shuffle=False, after_batch=None, num_workers=0, **kwargs):
        # Unless the caller supplies its own `after_batch`, use the default `TransformBlock`
        # batch transforms followed by `ReadTabBatchIdentity`, which emits the inputs as targets too.
        if after_batch is None: after_batch = L(TransformBlock().batch_tfms)+ReadTabBatchIdentity(dataset)
        super().__init__(dataset, bs=bs, shuffle=shuffle, after_batch=after_batch, num_workers=num_workers, **kwargs)

    # `b` is used as positional row indices into the dataset (`iloc`), so a batch is a slice of rows.
    def create_batch(self, b): return self.dataset.iloc[b]

And set `TabularPandasIdentity`'s `_dl_type` to `TabDataLoaderIdentity`:

In [8]:
TabularPandasIdentity._dl_type = TabDataLoaderIdentity

To start we'll make a very basic `to` object using our new `TabularPandasIdentity`:

In [9]:
# The split is seeded (seed=32), so the train/valid partition is reproducible.
# NOTE(review): `cat_names` is passed by reference and later outputs list an extra
# 'education-num_na' categorical, so the list appears to be extended in place
# (presumably by `FillMissing`) — confirm before reusing `cat_names` elsewhere.
to = TabularPandasIdentity(df, [Categorify, FillMissing, Normalize], cat_names, cont_names, splits=RandomSplitter(seed=32)(df))
dls = to.dataloaders(bs=1024)

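Set `n_inp` to 2: each batch is `(cats, conts, cats, conts)`, so the first two elements are the model inputs and the last two are the targets: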

In [10]:
dls.n_inp = 2

And then we'll calculate the embedding sizes:

In [11]:
emb_szs = get_emb_sz(to.train)

For each categorical variable we need to know the total possible values it can have:

In [12]:
total_cats = {k:len(v) for k,v in to.classes.items()}
total_cats

{'education': 17,
 'education-num_na': 3,
 'marital-status': 8,
 'occupation': 16,
 'race': 6,
 'relationship': 7,
 'workclass': 10}

We will need this dictionary in our loss function to figure out where to apply `CrossEntropyLossFlat` for each categorical variable.

Next we need to know the total number of possible outputs across all of our categorical variables:

In [13]:
sum(total_cats.values())

67

And let's keep a batch of our data for later

In [14]:
batch = dls.one_batch()

Next we need to know the means and standard deviations:

In [15]:
to.means

{'age': 38.5793696495067,
 'education-num': 10.079158782958984,
 'fnlwgt': 190006.02011593536}

We can store them in a `DataFrame` for easy adjustments:

In [16]:
means = pd.DataFrame.from_dict({k:[v] for k,v in to.means.items()})
stds = pd.DataFrame.from_dict({k:[v] for k,v in to.stds.items()})

We'll also bound the continuous outputs with a `SigmoidRange` whose limits are each column's raw minimum and maximum, expressed in normalized units:

In [17]:
# Select the statistics in `cont_names` order explicitly so the column alignment
# does not depend on the key order of the `means` / `stds` frames.
cont_means = means[cont_names].to_numpy()
cont_stds = stds[cont_names].to_numpy()
# Normalized bounds of shape (1, n_cont): the smallest / largest raw value of each
# continuous column, mapped through the same (x - mean) / std transform.
low = (df[cont_names].min().to_numpy()[None] - cont_means) / cont_stds
high = (df[cont_names].max().to_numpy()[None] - cont_means) / cont_stds

In [18]:
low, high

(array([[-1.57952443, -1.67843578, -3.55613996]]),
 array([[ 3.76378659, 12.22741736,  2.3190849 ]]))

## Batch Swap Noise
Used in the winning solution for the Kaggle competition [Porto Seguro's Safe Driver Prediction](https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/discussion/44629#250927)

In [19]:
class BatchSwapNoise(Module):
    "Swap noise: while training, each element is replaced with probability `p` by the value in the same column of a randomly picked row"
    def __init__(self, p): store_attr()

    def forward(self, x):
        # Noise is applied in training mode only; otherwise `x` is returned untouched.
        if not self.training: return x
        # `x` is treated as 2D (rows, columns): both `x.size(0)` and `x.size(1)` are read below.
        n_elem = x.nelement()
        # Build every helper tensor on `x`'s device so no CPU tensors are created
        # (and implicitly transferred) when the batch lives on the GPU.
        swap = torch.rand(x.size(), device=x.device) > (1 - self.p)
        row_shift = torch.floor(torch.rand(x.size(), device=x.device) * x.size(0)).long()
        # In the flattened (row-major) tensor, moving `row_shift` rows down within the
        # same column is an offset of `row_shift * n_columns`; it is zero where no swap happens.
        offset = (row_shift * (swap.long() * x.size(1))).view(-1)
        idx = torch.arange(n_elem, device=x.device) + offset
        # Offsets are smaller than `n_elem`, so one subtraction wraps around the batch.
        idx = torch.where(idx >= n_elem, idx - n_elem, idx)
        return x.flatten()[idx].view(x.size())

We'll make a custom `TabularAE` model (AutoEncoder) for us to use.

In [21]:
class TabularAE(TabularModel):
    "A simple AutoEncoder: a `TabularModel` encoder down to `hidden_size`, a mirrored decoder, and separate categorical / continuous output heads"
    def __init__(self, emb_szs, n_cont, hidden_size, cats, low, high, ps=0.2, embed_p=0.01, bswap=None):
        # Encoder widths; the decoder mirrors them in reverse order.
        enc_sizes = [1024, 512, 256]
        super().__init__(emb_szs, n_cont, layers=enc_sizes, out_sz=hidden_size, embed_p=embed_p, act_cls=Mish())

        self.bswap = bswap
        # `cats` maps each categorical variable to its number of classes; the
        # categorical head emits one logit per class of every variable.
        self.cats = cats
        self.activation_cats = sum(cats.values())

        # Swap the parent's final layer for a bottleneck layer with dropout `ps` and a Mish activation.
        encoder = list(self.layers.children())[:-1]
        encoder.append(LinBnDrop(enc_sizes[-1], hidden_size, p=ps, act=Mish()))
        self.layers = nn.Sequential(*encoder)

        if bswap is not None: self.noise = BatchSwapNoise(bswap)

        # hidden_size -> 256 -> 512 -> 1024
        dec_sizes = [hidden_size] + enc_sizes[::-1]
        self.decoder = nn.Sequential(
            *[LinBnDrop(n_in, n_out, p=ps, act=Mish()) for n_in, n_out in zip(dec_sizes[:-1], dec_sizes[1:])]
        )

        # Continuous head: linear layer squashed into the [low, high] range.
        self.decoder_cont = nn.Sequential(
            LinBnDrop(enc_sizes[0], n_cont, p=ps, bn=False, act=None),
            SigmoidRange(low=low, high=high)
        )

        # Categorical head: raw logits, sliced per variable by the loss function.
        self.decoder_cat = LinBnDrop(enc_sizes[0], self.activation_cats, p=ps, bn=False, act=None)

    def forward(self, x_cat, x_cont=None, encode=False):
        "Return `(categorical logits, continuous outputs)`, or the encoded representation when `encode` is true"
        if self.bswap is not None:
            x_cat = self.noise(x_cat)
            # `x_cont` is optional, so only add noise when it was actually passed.
            if x_cont is not None: x_cont = self.noise(x_cont)
        encoded = super().forward(x_cat, x_cont)
        if encode: return encoded # return the representation
        decoded_trunk = self.decoder(encoded)
        decoded_cats = self.decoder_cat(decoded_trunk)
        decoded_conts = self.decoder_cont(decoded_trunk)
        return decoded_cats, decoded_conts

We'll also need a loss function that can grade how well our features represent the original dataset. 

The categorical features will be graded with `CrossEntropyLossFlat` and the continuous ones with `MSELossFlat`:

In [22]:
class RecreatedLoss(Module):
    "Reconstruction loss: summed cross-entropy per categorical variable plus summed MSE over the continuous ones, averaged per variable and per row"
    def __init__(self, cat_dict):
        ce = CrossEntropyLossFlat(reduction='sum')
        mse = MSELossFlat(reduction='sum')
        store_attr('cat_dict,ce,mse')

    def forward(self, preds, cat_targs, cont_targs):
        cat_logits, cont_preds = preds
        # Walk the logits in `cat_dict` order: variable number `col` owns the next
        # `width` columns of logits and column `col` of the categorical targets.
        ce_total, start = cat_logits.new_zeros(1), 0
        for col, width in enumerate(self.cat_dict.values()):
            stop = start + width
            ce_total = ce_total + self.ce(cat_logits[:, start:stop], cat_targs[:, col])
            start = stop

        # Average each part over its number of variables, then the sum over the batch rows.
        per_cat = ce_total / len(self.cat_dict)
        per_cont = self.mse(cont_preds, cont_targs) / cont_preds.size(1)
        return (per_cat + per_cont) / cat_logits.size(0)

All we need to do is pass in our `total_cats` dictionary:

In [23]:
loss_func = RecreatedLoss(total_cats)

We'll make a config dictionary for us to use:

In [24]:
config = {
    'hidden_size': 128,
    'dropout': 0.1,
    'embed_p': 0.01,
    'wd': 0.01,
    'bswap': 0.1,
    'lr': 1e-3,
    'epochs': 100
}

And make our model & `Learner`

In [25]:
# Autoencoder over the tabular inputs; the output bounds are moved to the GPU
# up front because they are handed to the model as plain tensors.
model = TabularAE(
    emb_szs,
    len(cont_names),
    config['hidden_size'],
    cats=total_cats,
    low=tensor(low).cuda(),
    high=tensor(high).cuda(),
    ps=config['dropout'],
    embed_p=config['embed_p'],
    bswap=config['bswap'],
)
# Mixed-precision learner using the reconstruction loss and the `ranger` optimizer.
learn = Learner(
    dls,
    model,
    loss_func=loss_func,
    opt_func=ranger,
    lr=config['lr'],
    wd=config['wd'],
).to_fp16()

Finally we'll fit for a few epochs:

In [26]:
# Train for up to `config['epochs']` epochs; `EarlyStoppingCallback()` (default settings)
# ends the run early, as the log below shows ("No improvement since epoch 5").
# NOTE(review): `lr=4e-3` here differs from `config['lr']` (1e-3) given to `Learner`;
# presumably the explicit value wins for this fit — confirm which one is intended.
learn.fit_flat_cos(config['epochs'], cbs=[EarlyStoppingCallback()], lr=4e-3)

epoch,train_loss,valid_loss,time
0,7.843971,3.538865,00:01
1,3.789461,1.069276,00:00
2,2.274931,0.483627,00:00
3,1.532943,0.230988,00:00
4,1.124401,0.181792,00:00
5,0.881739,0.130141,00:00
6,0.729499,0.137441,00:00


No improvement since epoch 5: early stopping


# Getting the compressed representations

Next we're going to grade our compressed representations and then attempt to train on them.

In [27]:
dl = learn.dls.test_dl(df)

Let's predict over all the data manually using PyTorch:

In [28]:
# Switch the model to inference mode on the GPU once, instead of on every batch.
learn.model.eval()
learn.model.cuda()

outs = []
with torch.no_grad():
    # The loop variable is `xb` (not `batch`) so the batch saved earlier with
    # `dls.one_batch()` is not overwritten.
    for xb in dl:
        # The first two batch elements are the categorical and continuous inputs;
        # the third positional argument (`encode=True`) returns the encoded representation.
        outs.append(learn.model(*xb[:2], True).cpu().numpy())
outs = np.concatenate(outs)

In [29]:
outs.shape

(32561, 128)

As well as get the actual preds and targs:

In [31]:
(cat_preds, cont_preds), (cat_targs, cont_targs) = learn.get_preds(dl=dl)

# Measuring accuracy

## Continuous

In [32]:
from sklearn.metrics import r2_score

cont_preds = pd.DataFrame(cont_preds, columns=cont_names)
cont_targs = pd.DataFrame(cont_targs, columns=cont_names)

# Undo the normalization so the errors are reported in the original units.
preds = pd.DataFrame((cont_preds.values * stds.values) + means.values, columns=cont_preds.columns)
targets = pd.DataFrame((cont_targs.values * stds.values) + means.values, columns=cont_targs.columns)

# Absolute reconstruction error per row and column, computed once and summarized below.
abs_err = (targets - preds).abs()
mi = abs_err.min().to_frame().T
ma = abs_err.max().to_frame().T
mean = abs_err.mean().to_frame().T
median = abs_err.median().to_frame().T
r2 = pd.DataFrame.from_dict({c:[r2_score(targets[c], preds[c])] for c in preds.columns})

# `DataFrame.insert` works in place and returns None, so label copies of the
# summary frames: this keeps `r2` (and the others) purely numeric for later use
# and lets the cell be re-run without a "column already exists" error.
labelled = []
for stat, name in zip([mi, ma, mean, median, r2], ['Min', 'Max', 'Mean', 'Median', 'R2']):
    stat = stat.copy()
    stat.insert(0, 'GroupBy', name)
    labelled.append(stat)

data = pd.concat(labelled)
data

Unnamed: 0,GroupBy,age,fnlwgt,education-num
0,Min,0.0,4.563,0.0
0,Max,33.577,221926.921,3.517
0,Mean,2.36,28601.768,0.309
0,Median,1.939,23106.977,0.251
0,R2,0.95,0.873,0.975


We can also compute the mean R2 across the continuous variables:

In [33]:
r2.mean(axis=1)

0   0.933
dtype: float64

## Categorical

In [34]:
# Collapse the concatenated logits to one predicted class per categorical variable:
# each variable owns a contiguous slice of `cat_preds`, in `total_cats` order.
cat_reduced = torch.zeros_like(cat_targs)
pos = 0
for col, width in enumerate(total_cats.values()):
    var_logits = cat_preds[:, pos:pos + width]
    cat_reduced[:, col] = var_logits.argmax(dim=1)
    pos += width

In [35]:
# Wrap predicted and target class indices in frames with one column per categorical variable.
# NOTE: this rebinds `cat_preds` / `cat_targs` from the raw prediction outputs to DataFrames.
cat_preds = pd.DataFrame(cat_reduced, columns=cat_names)
cat_targs = pd.DataFrame(cat_targs, columns=cat_names)

from sklearn.metrics import balanced_accuracy_score, f1_score

# One-row frame holding the balanced accuracy of every categorical column.
accuracy = pd.DataFrame.from_dict({c:[balanced_accuracy_score(cat_targs[c], cat_preds[c])] for c in cat_preds.columns})

In [36]:
f1 = pd.DataFrame.from_dict({c:[f1_score(cat_targs[c], cat_preds[c], average='weighted')] for c in cat_preds.columns})

In [37]:
# `DataFrame.insert` works in place and returns None, so label copies instead:
# `accuracy` and `f1` stay purely numeric for later aggregation and the cell can be re-run.
labelled = []
for scores, name in zip([accuracy, f1], ['Accuracy', 'F1']):
    scores = scores.copy()
    scores.insert(0, 'MetricName', name)
    labelled.append(scores)
pd.concat(labelled)

Unnamed: 0,MetricName,workclass,education,marital-status,occupation,relationship,race,education-num_na
0,Accuracy,0.767,0.876,0.756,0.882,0.988,0.736,0.957
0,F1,0.995,0.988,0.984,0.986,0.995,0.982,0.999


And check its overall accuracy:

In [39]:
accuracy.mean(axis=1)

0   0.852
dtype: float64

## Predicting

Now that we have our compressed representations, let's use them to train a new model

In [40]:
ys = df['salary'].to_numpy()

In [41]:
test_eq(len(outs), len(ys))

In [42]:
# One column per encoded feature; the count comes from `outs` rather than a hard-coded 128.
df_outs = pd.DataFrame(columns=['salary'] + list(range(outs.shape[1])))

In [43]:
df_outs['salary'] = ys

In [44]:
# Fill the encoded-feature columns; their count comes from `outs` rather than a hard-coded 128.
df_outs[list(range(outs.shape[1]))] = outs

In [45]:
pd.options.mode.chained_assignment=None

In [46]:
# Reuse the seed of the split the autoencoder was trained with (seed=32): the
# partition is then reproducible, and the rows validated here are rows the
# autoencoder did not train on.
splits = RandomSplitter(seed=32)(range_of(df))

In [47]:
# Cast the encoded-feature columns to a numeric dtype; their count comes from `outs`
# rather than a hard-coded 128.
# NOTE(review): float16 keeps the frame small but is low precision — confirm it is sufficient.
latent_cols = list(range(outs.shape[1]))
df_outs[latent_cols] = df_outs[latent_cols].astype(np.float16)

In [48]:
# From here on the continuous features are the encoded columns (named 0..n-1);
# the count comes from `outs` rather than a hard-coded 128.
cont_names = list(range(outs.shape[1]))
to = TabularPandas(df_outs, procs=[Normalize], cont_names=cont_names, splits=splits, y_names=['salary'],
                   reduce_memory=False, y_block=CategoryBlock())

In [49]:
dls = to.dataloaders(bs=1024)

In [50]:
def accuracy(inp, targ, axis=-1):
    "Share of samples whose highest-scoring class along `axis` of `inp` equals `targ`"
    # NOTE: this rebinds the name `accuracy`, which earlier held the balanced-accuracy DataFrame.
    top_class = inp.argmax(dim=axis)
    top_class, targ = flatten_check(top_class, targ)
    is_hit = top_class == targ
    return is_hit.float().mean()

In [51]:
learn = tabular_learner(dls, layers=[200,100], metrics=[accuracy])

In [52]:
learn.fit(5, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.38541,0.36107,0.834152,00:00
1,0.365105,0.351813,0.840602,00:00
2,0.3555,0.351886,0.837684,00:00
3,0.350068,0.35059,0.837377,00:00
4,0.346437,0.354474,0.840909,00:00
