In [17]:
import pandas as pd

from fastai.tabular.all import *

In [18]:
# Relative path of the pickle file that is loaded below as the training DataFrame.
DATA_PATH = "../data/preprocessed_training_data.pkl"

In [82]:
# Read the pickled frame and cast the `install` target column to a categorical
# dtype in a single expression, then print the column/dtype summary.
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
df = pd.read_pickle(DATA_PATH).astype({'install': 'category'})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3738937 entries, 0 to 3738936
Data columns (total 18 columns):
 #   Column              Dtype         
---  ------              -----         
 0   id                  object        
 1   timestamp           datetime64[ns]
 2   campaignId          object        
 3   platform            object        
 4   softwareVersion     object        
 5   sourceGameId        object        
 6   country             object        
 7   startCount          int64         
 8   viewCount           int64         
 9   clickCount          int64         
 10  installCount        int64         
 11  lastStart           datetime64[ns]
 12  startCount1d        int64         
 13  startCount7d        int64         
 14  connectionType      object        
 15  deviceType          object        
 16  install             category      
 17  timeSinceLastStart  float64       
dtypes: category(1), datetime64[ns](2), float64(1), int64(6), object(8)
memory usage: 4

In [83]:
# Number of rows per `install` class. `.size()` counts rows directly, so the
# result does not depend on another column (previously `installCount`) being
# non-null. `observed=False` spells out the grouping behaviour for the
# categorical key and keeps every category in the result.
class_count_df = df.groupby('install', observed=False).size()

n_0, n_1 = class_count_df[0], class_count_df[1]

# "Balanced" class weights: total / (n_classes * class_count), so the rarer
# class receives the proportionally larger weight.
n_total = n_0 + n_1
w_0 = n_total / (2.0 * n_0)
w_1 = n_total / (2.0 * n_1)

# Fixed seed so the 80/20 train/validation split (and therefore every metric
# reported further down) is reproducible after Restart & Run All.
SPLIT_SEED = 42
splits = RandomSplitter(valid_pct=0.2, seed=SPLIT_SEED)(range_of(df))

In [84]:
# Feature columns, split into categorical and continuous inputs.
CAT_NAMES = [
    'campaignId',
    'platform',
    'softwareVersion',
    'sourceGameId',
    'connectionType',
    'deviceType',
]
CONT_NAMES = [
    'startCount',
    'viewCount',
    'clickCount',
    'installCount',
    'startCount1d',
    'startCount7d',
    'timeSinceLastStart',
]

# The data is wrapped in a TabularPandas object (rather than going through the
# TabularDataLoaders.from_df shortcut) using the precomputed `splits`, with
# `install` as the target column.
procs_nn = [Categorify, FillMissing, Normalize]
to_nn = TabularPandas(
    df,
    procs=procs_nn,
    cat_names=CAT_NAMES,
    cont_names=CONT_NAMES,
    y_names='install',
    splits=splits,
)

# DataLoaders with a batch size of 1024; the trailing expression displays the
# processed input columns.
dls = to_nn.dataloaders(bs=1024)
dls.xs

Unnamed: 0,campaignId,platform,softwareVersion,sourceGameId,connectionType,deviceType,startCount,viewCount,clickCount,installCount,startCount1d,startCount7d,timeSinceLastStart
1066338,753,1,90,26912,3,26032,-0.225489,-0.202373,0.427989,-0.367839,-0.068810,-0.281545,-0.298751
2674864,6827,1,87,610,3,10019,-0.435658,-0.353790,-0.304065,0.933313,-0.416908,-0.310255,-0.304602
1611613,3372,1,90,10384,3,26730,-0.236551,-0.164519,-0.157654,-0.367839,0.018215,-0.367675,-0.305229
3031601,7072,1,137,12125,1,26292,-0.136997,-0.038338,-0.157654,0.933313,-0.503933,-0.511224,1.569238
3659657,7289,1,141,22376,1,20280,-0.159120,-0.050956,-0.157654,-0.367839,-0.503933,-0.224125,0.012615
...,...,...,...,...,...,...,...,...,...,...,...,...,...
29811,9542,2,40,27600,3,23368,0.117417,0.277115,-0.157654,0.282737,-0.503933,0.263943,0.012615
3361041,6748,2,40,14576,3,23366,-0.435658,-0.379027,-0.304065,-0.367839,0.192264,-0.310255,-0.303766
441862,2865,1,137,5891,1,7463,0.681554,0.857547,-0.157654,-0.367839,-0.329884,0.407493,-0.034194
1672017,5031,1,130,24101,1,20154,2.075303,0.113079,3.502618,-0.367839,-0.242859,0.005554,-0.305229


In [85]:
# Presumably an interactive documentation lookup for the DataLoaders object;
# nothing is assigned here, so later cells do not use its result.
# NOTE(review): looks like a leftover exploration cell - confirm before removing.
doc(dls)

In [86]:
# Per-class loss weights, ordered by class index (class 0 first, then class 1).
weights = [w_0, w_1]
# Build the weight tensor on the device the DataLoaders use instead of calling
# .cuda() unconditionally, so this cell also runs on a CPU-only machine while
# still landing on the GPU when one is in use.
class_weights = torch.tensor(weights, dtype=torch.float32, device=dls.device)
my_loss_func = CrossEntropyLossFlat(weight=class_weights)
roc_auc = RocAucBinary()

# Tabular model with two hidden layers (500 and 250 units), trained with the
# class-weighted cross-entropy loss and reporting binary ROC AUC.
learn = tabular_learner(dls, layers=[500, 250], loss_func=my_loss_func, metrics=roc_auc)
learn.loss_func

FlattenedLoss of CrossEntropyLoss()

In [87]:
# Train for three epochs using the learner's one-cycle fitting routine.
learn.fit_one_cycle(n_epoch=3)

epoch,train_loss,valid_loss,roc_auc_score,time
0,0.609632,0.605401,0.735232,01:58
1,0.568285,0.595737,0.748474,01:58
2,0.554826,0.59956,0.748663,01:58


In [88]:
# Display the architecture of the learner's model.
# NOTE(review): the saved output below shows hidden Linear layers of width 200
# and 100, while the learner is created with layers=[500, 250] - the output
# appears to come from an earlier run; re-run the notebook top to bottom to refresh it.
learn.model

TabularModel(
  (embeds): ModuleList(
    (0): Embedding(9693, 273)
    (1): Embedding(3, 3)
    (2): Embedding(196, 31)
    (3): Embedding(34850, 559)
    (4): Embedding(4, 3)
    (5): Embedding(28324, 498)
  )
  (emb_drop): Dropout(p=0.0, inplace=False)
  (bn_cont): BatchNorm1d(7, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): LinBnDrop(
      (0): BatchNorm1d(1374, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): Linear(in_features=1374, out_features=200, bias=False)
      (2): ReLU(inplace=True)
    )
    (1): LinBnDrop(
      (0): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): Linear(in_features=200, out_features=100, bias=False)
      (2): ReLU(inplace=True)
    )
    (2): LinBnDrop(
      (0): Linear(in_features=100, out_features=2, bias=True)
    )
  )
)