In [None]:
# default_exp tab_ae

# Tabular AE boilerplate

This boilerplate code is adapted from `https://walkwithfastai.com/tab.ae`. I adapted and compiled it here for easier reuse in personal projects. 

In [None]:
#export
from fastai.tabular.all import *
from fastcore import *

In [None]:
# Fetch the Adult sample dataset and read its CSV into a DataFrame.
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')

In [None]:
# Column roles, preprocessing steps and target used for the baseline classifier.
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
y_names = 'salary'
y_block = CategoryBlock()
# Random train/validation split over the row indices of `df`.
# NOTE(review): no seed is passed here, so the split presumably differs between runs — confirm.
splits = RandomSplitter()(range_of(df))

In [None]:
# Baseline tabular object, built from the column roles, procs and target
# configured in the previous cell rather than repeating the literals.
to = TabularPandas(
    df,
    procs=procs,
    cat_names=cat_names,
    cont_names=cont_names,
    y_names=y_names,
    y_block=y_block,
    splits=splits,
)

In [None]:
# Train/validation DataLoaders with a batch size of 1024.
dls = to.dataloaders(bs=1024)

In [None]:
#hide

# train_dl = TabDataLoader(to.train, bs = 1280)
# valid_dl = TabDataLoader(to.valid, bs = 1280)
# dls = DataLoaders(train_dl, valid_dl)

In [None]:
# Baseline: a supervised classifier on the raw features, for comparison with the
# classifier trained on the autoencoder embeddings at the end of the notebook.
learn = tabular_learner(dls, layers=[200,100], metrics=[accuracy])
learn.fit_one_cycle(20, 1e-2, cbs=[EarlyStoppingCallback()] )

epoch,train_loss,valid_loss,accuracy,time
0,0.439749,0.543225,0.776106,00:00
1,0.39462,0.406356,0.823556,00:00
2,0.375502,0.34822,0.84183,00:00
3,0.365792,0.354564,0.836456,00:00


No improvement since epoch 2: early stopping


## Custom Transforms and Dataloaders

In [None]:
#export
class ReadTabBatchIdentity(ItemTransform):
    "Read a batch of data and return the inputs as both `x` and `y`"
    def __init__(self, to):
        # The previous body was a bare `store_attr` (the function was referenced but
        # never called), so nothing was stored. Keep the reference explicitly.
        self.to = to

    def encodes(self, to):
        """Turn a tabular batch `to` into `(cats[, conts], cats[, conts])`.

        The first half of the tuple is the model input, the second half the
        reconstruction target. Continuous values are included only when
        `to.with_cont` is truthy. Tensors are moved to `to.device` when it is set.
        """
        def _read():
            # Built separately for inputs and targets so the two never alias the same tensor.
            cats = tensor(to.cats).long()
            if not to.with_cont: return (cats,)
            return (cats, tensor(to.conts).float())

        res = _read() + _read()
        if to.device is not None: res = to_device(res, to.device)
        return res
    
class TabularPandasIdentity(TabularPandas):
    "`TabularPandas` subclass that exists so autoencoder data can be given its own `DataLoader` type"

In [None]:
#export

@delegates()
class TabDataLoaderIdentity(TabDataLoader):
    "`TabDataLoader` variant for autoencoders: each batch carries its inputs again as targets"
    do_item = noops

    def __init__(self, dataset, bs=16, shuffle=False, after_batch=None, num_workers=0, **kwargs):
        # Default batch pipeline: the stock `TransformBlock` batch transforms,
        # followed by the identity reader bound to this dataset.
        if after_batch is None:
            default_tfms = L(TransformBlock().batch_tfms)
            after_batch = default_tfms + ReadTabBatchIdentity(dataset)
        super().__init__(dataset, bs=bs, shuffle=shuffle, after_batch=after_batch,
                         num_workers=num_workers, **kwargs)

    def create_batch(self, b):
        "Select the rows at positions `b` from the underlying dataset"
        return self.dataset.iloc[b]

In [None]:
# Register the identity loader as this class's `_dl_type`.
# NOTE(review): presumably this is the loader class `to.dataloaders(...)` instantiates — confirm against fastai.
TabularPandasIdentity._dl_type = TabDataLoaderIdentity

In [None]:
# Tabular object for the autoencoder: no `y_names` are given because the identity
# loader emits the inputs as targets. The split is seeded (32) so it is repeatable.
to = TabularPandasIdentity(df, [Categorify, FillMissing, Normalize], cat_names, cont_names, splits=RandomSplitter(seed=32)(df))
dls = to.dataloaders(bs=1024)

In [None]:
# Each batch is (cats, conts, cats, conts).
# NOTE(review): presumably `n_inp = 2` marks the first two elements as model inputs and the rest as targets — confirm.
dls.n_inp = 2

In [None]:
import fastcore, fastai

In [None]:
# Map every categorical column to the size of its class vocabulary.
total_cats = {k:len(v) for k,v in to.classes.items()}
total_cats

{'workclass': 10,
 'education': 17,
 'marital-status': 8,
 'occupation': 16,
 'relationship': 7,
 'race': 6,
 'education-num_na': 3}

In [None]:
# Total number of classes across all categorical columns.
sum(total_cats.values())

67

In [None]:
# Per-column means of the continuous features held on `to`; used below to (de)normalize values.
to.means

{'age': 38.5793696495067,
 'fnlwgt': 190006.02011593536,
 'education-num': 10.079158508963875}

In [None]:
# One-row frames holding the per-column statistics (one column per continuous feature).
means = pd.DataFrame({col: [val] for col, val in to.means.items()})
stds = pd.DataFrame({col: [val] for col, val in to.stds.items()})

In [None]:
# Observed minimum / maximum of every continuous column, as 1 x n_cont arrays ...
raw_min = df[cont_names].min().to_frame().T.values
raw_max = df[cont_names].max().to_frame().T.values
# ... expressed in normalized units: (value - mean) / std.
low = (raw_min - means.values) / stds.values
high = (raw_max - means.values) / stds.values

In [None]:
#export
class RecreatedLoss(Module):
    "Reconstruction loss: cross-entropy over every categorical column plus MSE over the continuous ones"
    def __init__(self, cat_dict):
        # `cat_dict` maps each categorical column to its number of classes.
        # Both criteria sum over the batch; averaging is done by hand in `forward`.
        self.cat_dict = cat_dict
        self.ce = CrossEntropyLossFlat(reduction='sum')
        self.mse = MSELossFlat(reduction='sum')

    def forward(self, preds, cat_targs, cont_targs):
        """Return the per-sample loss for `preds = (cat_logits, cont_out)`.

        `cat_logits` holds the logits of all categorical columns side by side, in
        the order of `cat_dict`; column `i` of `cat_targs` is the target for the
        `i`-th slice. The summed cross-entropy is divided by the number of
        categorical columns, the summed squared error by the number of continuous
        columns, and their sum by the batch size.
        """
        cat_logits, cont_out = preds
        ce_sum = cat_logits.new([0])
        start = 0
        for col, n_classes in enumerate(self.cat_dict.values()):
            stop = start + n_classes
            ce_sum += self.ce(cat_logits[:, start:stop], cat_targs[:, col])
            start = stop

        n_cat_cols = cat_logits.new([len(self.cat_dict)])
        n_cont_cols = cont_out.new([cont_out.size(1)])
        per_column = ce_sum/n_cat_cols + self.mse(cont_out, cont_targs)/n_cont_cols

        return per_column / cat_logits.size(0)

In [None]:
# Reconstruction loss sized by the per-column class counts in `total_cats`.
loss_func = RecreatedLoss(total_cats)

## The model

In [None]:
#export
class BatchSwapNoise(Module):
    "Swap Noise Module"
    def __init__(self, p):
        # `p` is the per-element probability of swapping a value during training.
        self.p = p

    def forward(self, x):
        """Return `x` with roughly a fraction `p` of its entries swapped (training only).

        A selected entry is replaced by the entry of the same column taken from
        a randomly chosen row of the batch. In eval mode `x` is returned untouched.
        `x` is indexed as a 2-D `(rows, columns)` tensor.

        All helper tensors are created on `x.device`, so a GPU batch no longer
        bounces through CPU-side random / index tensors.
        """
        if not self.training: return x

        dev, n_elem = x.device, x.nelement()
        # Which entries get swapped.
        swap = torch.rand(x.size(), device=dev) > (1 - self.p)
        # Random row shift in [0, n_rows) for every entry.
        row_shift = torch.floor(torch.rand(x.size(), device=dev) * x.size(0)).long()
        # Shift expressed in flat-index units (one row == x.size(1) elements); 0 where not swapped.
        offset = (row_shift * (swap.long() * x.size(1))).view(-1)
        idx = torch.arange(n_elem, device=dev) + offset
        # Wrap indices that ran past the end back to the start of the batch.
        idx = torch.where(idx >= n_elem, idx - n_elem, idx)
        return x.flatten()[idx].view(x.size())

In [None]:
#export
class TabularAE(TabularModel):
    "A simple AutoEncoder model"
    def __init__(self, emb_szs, n_cont, hidden_size, cats, low, high, ps=0.2, embed_p=0.01, bswap=None):
        # emb_szs / n_cont / embed_p : forwarded to `TabularModel` (the encoder trunk).
        # hidden_size                : width of the encoded representation.
        # cats                       : dict of categorical column -> number of classes.
        # low / high                 : bounds given to `SigmoidRange` for the continuous outputs.
        # ps                         : dropout used in the bottleneck and decoder layers.
        # bswap                      : swap-noise probability, or None to disable the noise.
        super().__init__(emb_szs, n_cont, layers=[1024, 512, 256], out_sz=hidden_size, embed_p=embed_p)

        self.bswap = bswap
        self.cats = cats
        # Total width of the categorical output: one logit per class of every column.
        self.activation_cats = sum(cats.values())

        # Swap the last layer of the parent's stack for a Mish-activated bottleneck.
        self.layers = nn.Sequential(*L(self.layers.children())[:-1] + nn.Sequential(LinBnDrop(256, hidden_size, p=ps, act=Mish())))

        if bswap is not None: self.noise = BatchSwapNoise(bswap)
        self.decoder = nn.Sequential(
            LinBnDrop(hidden_size, 256, p=ps, act=Mish()),
            LinBnDrop(256, 512, p=ps, act=Mish()),
            LinBnDrop(512, 1024, p=ps, act=Mish())
        )

        # Continuous head: linear projection squashed into [low, high].
        self.decoder_cont = nn.Sequential(
            LinBnDrop(1024, n_cont, p=ps, bn=False, act=None),
            SigmoidRange(low=low, high=high)
        )

        # Categorical head: raw logits for all columns, concatenated.
        self.decoder_cat = LinBnDrop(1024, self.activation_cats, p=ps, bn=False, act=None)

    def forward(self, x_cat, x_cont=None, encode=False):
        """Encode the inputs and, unless `encode` is set, decode them again.

        Returns the encoded representation when `encode=True`, otherwise the
        pair `(decoded_cats, decoded_conts)`.
        """
        if self.bswap is not None:
            x_cat = self.noise(x_cat)
            # `x_cont` is optional in the signature; only add noise when it was passed.
            if x_cont is not None: x_cont = self.noise(x_cont)
        encoded = super().forward(x_cat, x_cont)
        if encode: return encoded # return the representation
        decoded_trunk = self.decoder(encoded)
        decoded_cats = self.decoder_cat(decoded_trunk)
        decoded_conts = self.decoder_cont(decoded_trunk)
        return decoded_cats, decoded_conts

In [None]:
# Embedding sizes for the categorical columns, computed by `get_emb_sz` from the training set.
emb_szs = get_emb_sz(to.train)

In [None]:
# Autoencoder with a 128-wide bottleneck, swap-noise probability 0.1, and continuous
# outputs bounded by the observed (normalized) min / max of each column.
model = TabularAE(emb_szs, len(cont_names), 128, ps=0.1, cats=total_cats, embed_p=0.01,
                  bswap=.1, low=tensor(low), high=tensor(high))

In [None]:
# Learner for the autoencoder: reconstruction loss, weight decay 0.01, `ranger` optimizer.
learn = Learner(dls, model, loss_func=loss_func, wd=0.01, opt_func=ranger)

In [None]:
# Train for up to 100 epochs; the early-stopping callback ends training once it sees no further improvement.
learn.fit_flat_cos(100, cbs=[EarlyStoppingCallback()], lr=4e-3)

epoch,train_loss,valid_loss,time
0,2.931388,1.604174,00:07
1,1.742198,1.117671,00:07
2,1.228289,0.416615,00:08
3,0.953491,0.23743,00:08
4,0.789248,0.159128,00:08
5,0.68652,0.135592,00:08
6,0.614851,0.134074,00:08
7,0.566391,0.113725,00:08
8,0.534278,0.126939,00:07


No improvement since epoch 7: early stopping


In [None]:
# Test DataLoader over the whole DataFrame, so every row can be encoded / reconstructed.
dl = learn.dls.test_dl(df)

In [None]:
# Encode every row: model in eval mode, gradients off, `encode=True` returns the
# bottleneck activations, which are collected as NumPy arrays and stacked.
learn.model.eval()
encoded_batches = []
with torch.no_grad():
    for batch in dl:
        encoded = learn.model(*batch[:2], encode=True)
        encoded_batches.append(encoded.cpu().numpy())
outs = np.concatenate(encoded_batches)

In [None]:
# Sanity check: one encoded vector per row.
outs.shape

(32561, 128)

In [None]:
# Reconstructions and targets for every row; each side is a (categorical, continuous) pair.
(cat_preds, cont_preds), (cat_targs, cont_targs) = learn.get_preds(dl=dl)

## Measuring accuracy of the outputs

In [None]:
# Wrap the continuous predictions / targets in labelled frames.
# NOTE: this rebinds `cont_preds` / `cont_targs`, so the cell is not meant to be re-run on its own.
cont_preds = pd.DataFrame(cont_preds, columns=cont_names)
cont_targs = pd.DataFrame(cont_targs, columns=cont_names)

In [None]:
# Undo the normalization (value * std + mean) so errors are in the original units.
preds = pd.DataFrame((cont_preds.values * stds.values) + means.values, columns=cont_preds.columns)
targets = pd.DataFrame((cont_targs.values * stds.values) + means.values, columns=cont_targs.columns)

In [None]:
from sklearn.metrics import r2_score

In [None]:
# Absolute reconstruction error per value, then one-row summaries per column.
abs_err = np.abs(targets - preds)
mi = abs_err.min().to_frame().T
ma = abs_err.max().to_frame().T
mean = abs_err.mean().to_frame().T
median = abs_err.median().to_frame().T
# R2 of every continuous column, also as a one-row frame.
r2 = pd.DataFrame({col: [r2_score(targets[col], preds[col])] for col in preds.columns})

In [None]:
# Label each one-row summary with the statistic it holds.
# `DataFrame.insert` mutates in place and returns None, so its result is not assigned;
# the membership check keeps the cell re-runnable (a second insert would raise).
for d,name in zip([mi,ma,mean,median,r2], ['Min', 'Max', 'Mean', 'Median', 'R2']):
    if 'GroupBy' not in d.columns: d.insert(0, 'GroupBy', name)

In [None]:
# Stack the labelled summaries into a single table.
data = pd.concat([mi,ma,mean,median,r2])
data

Unnamed: 0,GroupBy,age,fnlwgt,education-num
0,Min,6.6e-05,2.208931,1.6e-05
0,Max,22.355501,322066.885637,3.480927
0,Mean,2.255886,28005.661368,0.323721
0,Median,1.778438,21004.22326,0.254568
0,R2,0.952961,0.865969,0.971904


In [None]:
# Average R2 across the continuous columns. `numeric_only=True` skips the string
# `GroupBy` label column, which recent pandas versions refuse to average.
r2.mean(axis=1, numeric_only=True)

0    0.930278
dtype: float64

In [None]:
# Collapse each column's slice of logits into a predicted class index.
cat_reduced = torch.zeros_like(cat_targs)
pos=0
for i, (k,v) in enumerate(total_cats.items()):
    cat_reduced[:,i] = cat_preds[:,pos:pos+v].argmax(dim=1)
    pos += v

# Label the columns from `total_cats`, the same mapping that drove the slicing above,
# instead of relying on `cat_names` having been extended in place during preprocessing.
cat_cols = list(total_cats)
cat_preds = pd.DataFrame(cat_reduced, columns=cat_cols)
cat_targs = pd.DataFrame(cat_targs, columns=cat_cols)

In [None]:
from sklearn.metrics import balanced_accuracy_score, f1_score

In [None]:
# Balanced accuracy per categorical column, as a one-row frame.
# NOTE: this rebinds the name `accuracy`, which was used as a metric function earlier in
# the notebook; the metric is defined again further down before it is needed.
accuracy = pd.DataFrame.from_dict({c:[balanced_accuracy_score(cat_targs[c], cat_preds[c])] for c in cat_preds.columns})

In [None]:
# Weighted F1 per categorical column, as a one-row frame.
f1 = pd.DataFrame.from_dict({c:[f1_score(cat_targs[c], cat_preds[c], average='weighted')] for c in cat_preds.columns})

In [None]:
# Label each metric row. `DataFrame.insert` works in place and returns None, so its
# result is not assigned; the membership check keeps the cell re-runnable.
for d,name in zip([accuracy, f1], ['Accuracy', 'F1']):
    if 'MetricName' not in d.columns: d.insert(0, 'MetricName', name)
pd.concat([accuracy, f1])

Unnamed: 0,MetricName,workclass,education,marital-status,occupation,relationship,race,education-num_na
0,Accuracy,0.768378,0.971244,0.772691,0.924995,0.977052,0.850383,0.966026
0,F1,0.995304,0.996902,0.98528,0.994147,0.99343,0.991632,0.998785


In [None]:
# Average balanced accuracy across the categorical columns. `numeric_only=True` skips
# the string `MetricName` label column, which recent pandas versions refuse to average.
accuracy.mean(axis=1, numeric_only=True)

0    0.89011
dtype: float64

## Prediction

In [None]:
# Target labels for the downstream classifier, in the same row order as `df`.
ys = df['salary'].to_numpy()

In [None]:
# Frame of `salary` plus one column per embedding dimension. The width comes from
# `outs` itself, so changing the model's bottleneck size needs no edit here.
emb_cols = list(range(outs.shape[1]))
df_outs = pd.DataFrame(columns=['salary'] + emb_cols)
df_outs['salary'] = ys
df_outs[emb_cols] = outs
# Stored as half precision to keep the frame small.
df_outs[emb_cols] = df_outs[emb_cols].astype(np.float16)

In [None]:
# Every column except the target is a continuous embedding feature; deriving the list
# from `df_outs` avoids hard-coding the embedding width.
# NOTE: this rebinds `cont_names` and `splits` from the earlier cells.
cont_names = [c for c in df_outs.columns if c != 'salary']
# Unseeded random split over the rows of the embedding frame.
splits = RandomSplitter()(range_of(df_outs))
to = TabularPandas(df_outs, procs = [Normalize], cont_names=cont_names, splits=splits, y_names=['salary'], reduce_memory=False,
                   y_block=CategoryBlock())

In [None]:
# DataLoaders over the embedding features, batch size 1024.
dls = to.dataloaders(bs=1024)

In [None]:
def accuracy(inp, targ, axis=-1):
    "Fraction of items whose arg-max along `axis` of `inp` equals `targ`"
    # Defined here because the name `accuracy` was rebound to a DataFrame earlier in the notebook.
    hard_preds = inp.argmax(dim=axis)
    pred, targ = flatten_check(hard_preds, targ)
    hits = (pred == targ).float()
    return hits.mean()

In [None]:
# Same classifier architecture as the baseline, now fed with the autoencoder embeddings.
learn = tabular_learner(dls, layers=[200,100], metrics=[accuracy])

In [None]:
# Same training schedule as the baseline run (20 epochs max, lr 1e-2, early stopping).
learn.fit_one_cycle(20, 1e-2, cbs=[EarlyStoppingCallback()] )

epoch,train_loss,valid_loss,accuracy,time
0,0.345159,0.349436,0.836456,00:00
1,0.343819,0.354875,0.833538,00:00


No improvement since epoch 0: early stopping
