In [1]:
import pandas as pd

from fastai.tabular.all import *

In [2]:
DATA_PATH = '../data/preprocessed_training_data.pkl'

In [21]:
df = pd.read_pickle(DATA_PATH)[:400000]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400000 entries, 0 to 399999
Data columns (total 18 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   id                  400000 non-null  object        
 1   timestamp           400000 non-null  datetime64[ns]
 2   campaignId          400000 non-null  object        
 3   platform            400000 non-null  object        
 4   softwareVersion     400000 non-null  object        
 5   sourceGameId        400000 non-null  object        
 6   country             400000 non-null  object        
 7   startCount          400000 non-null  int64         
 8   viewCount           400000 non-null  int64         
 9   clickCount          400000 non-null  int64         
 10  installCount        400000 non-null  int64         
 11  lastStart           369470 non-null  datetime64[ns]
 12  startCount1d        400000 non-null  int64         
 13  startCount7d        400000 no

In [22]:
class_count_df = df.groupby('install').count()['installCount']

n_0, n_1 = class_count_df[0], class_count_df[1]

w_1 = (n_0 + n_1) / (2.0 * n_1)
w_0 = (n_0 + n_1) / (2.0 * n_0)

In [43]:
dls = TabularDataLoaders.from_df(df, y_names='install', 
                                 cat_names=['campaignId', 'platform', 'softwareVersion', 'sourceGameId', 
                                            'connectionType', 'deviceType', 'country'],
                                 cont_names=['startCount', 'viewCount', 'clickCount', 'installCount',
                                             'startCount1d', 'startCount7d', 'timeSinceLastStart'],
                                 procs=[Categorify, FillMissing, Normalize])

In [44]:
class FocalLoss(nn.Module):
    def __init__(self, gamma=2):
        super().__init__()
        self.gamma = gamma

    def forward(self, logit, target):
        target = target.float()
        max_val = (-logit).clamp(min=0)
        loss = logit - logit * target + max_val + \
               ((-max_val).exp() + (-logit - max_val).exp()).log()

        invprobs = F.logsigmoid(-logit * (target * 2.0 - 1.0))
        loss = (invprobs * self.gamma).exp() * loss
        if len(loss.size())==2:
            loss = loss.sum(dim=1)
        return loss.mean()

In [47]:
weights = [w_0, w_1]
class_weights=torch.FloatTensor(weights).cuda()
# nn.CrossEntropyLoss(weight=class_weights)
roc_auc = RocAuc()

learn = tabular_learner(dls, loss_func=FocalLoss(), metrics=roc_auc)

In [48]:
learn.fit_one_cycle(3)

epoch,train_loss,valid_loss,roc_auc_score,time
0,0.021782,0.023738,0.5,01:20
1,0.020169,0.023134,0.5,01:21
2,0.025252,0.022846,0.5,01:21
