In [1]:
import pandas as pd

from fastai.tabular.all import *

In [2]:
# Relative path to the preprocessed training set, stored as a pickle file.
DATA_PATH = '../data/preprocessed_training_data.pkl'

In [3]:
# Load the preprocessed training frame from DATA_PATH.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
df = pd.read_pickle(DATA_PATH)

In [4]:
# Summarise the loaded frame: row count, column names, dtypes and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3738937 entries, 0 to 3738936
Data columns (total 18 columns):
 #   Column              Dtype         
---  ------              -----         
 0   id                  object        
 1   timestamp           datetime64[ns]
 2   campaignId          object        
 3   platform            object        
 4   softwareVersion     object        
 5   sourceGameId        object        
 6   country             object        
 7   startCount          int64         
 8   viewCount           int64         
 9   clickCount          int64         
 10  installCount        int64         
 11  lastStart           datetime64[ns]
 12  startCount1d        int64         
 13  startCount7d        int64         
 14  connectionType      object        
 15  deviceType          object        
 16  install             int64         
 17  timeSinceLastStart  float64       
dtypes: datetime64[ns](2), float64(1), int64(7), object(8)
memory usage: 513.5+ MB


In [5]:
# Row count per class of the binary `install` target. The counts are looked up
# by class label, so they do not depend on the order of the groups or on any
# other column being free of missing values.
class_counts = df['install'].value_counts()
n_0, n_1 = class_counts.loc[0], class_counts.loc[1]

# Balanced class weights, n_samples / (n_classes * n_class_samples): the rarer
# a class is, the larger the weight it receives in the loss.
n_total = n_0 + n_1
w_0 = n_total / (2.0 * n_0)
w_1 = n_total / (2.0 * n_1)

w_0, w_1

(0.5060559911190347, 41.78143438226354)

In [6]:
# Store the target column as a pandas categorical instead of int64.
# NOTE(review): presumably done so that fastai treats `install` as a
# classification target rather than a regression one — confirm.
df['install'] = df['install'].astype('category')
# Print the dtypes again to confirm the conversion took effect.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3738937 entries, 0 to 3738936
Data columns (total 18 columns):
 #   Column              Dtype         
---  ------              -----         
 0   id                  object        
 1   timestamp           datetime64[ns]
 2   campaignId          object        
 3   platform            object        
 4   softwareVersion     object        
 5   sourceGameId        object        
 6   country             object        
 7   startCount          int64         
 8   viewCount           int64         
 9   clickCount          int64         
 10  installCount        int64         
 11  lastStart           datetime64[ns]
 12  startCount1d        int64         
 13  startCount7d        int64         
 14  connectionType      object        
 15  deviceType          object        
 16  install             category      
 17  timeSinceLastStart  float64       
dtypes: category(1), datetime64[ns](2), float64(1), int64(6), object(8)
memory usage: 4

In [7]:
# Feature columns handed to the tabular model, split by how they are encoded:
# CAT_NAMES are categorical columns, CONT_NAMES are numeric columns.
# NOTE(review): `country` is present in the frame but is not listed in either
# group — confirm that leaving it out is intentional.
CAT_NAMES = ['campaignId', 'platform', 'softwareVersion',
             'sourceGameId', 'connectionType', 'deviceType']
CONT_NAMES = ['startCount', 'viewCount', 'clickCount',
              'installCount', 'startCount1d', 'startCount7d',
              'timeSinceLastStart']

# Hold out 20% of the rows for validation. The seed is fixed so the same rows
# are selected on every run and the validation metrics stay comparable.
SPLIT_SEED = 42
splits = RandomSplitter(valid_pct=0.2, seed=SPLIT_SEED)(range_of(df))

# `from_df` picks the validation rows through `valid_idx`; it has no `splits`
# parameter, so the held-out indices (second element of `splits`) are passed
# explicitly to make sure this exact split is the one that gets used.
dls = TabularDataLoaders.from_df(df, y_names='install',
                                 cat_names=CAT_NAMES,
                                 cont_names=CONT_NAMES,
                                 procs=[Categorify, FillMissing, Normalize],
                                 valid_idx=splits[1])

In [8]:
# Inspect the model inputs held by the DataLoaders. In the displayed output the
# categorical columns appear as integer codes and the continuous columns as
# scaled floats.
dls.xs

Unnamed: 0,campaignId,platform,softwareVersion,sourceGameId,connectionType,deviceType,startCount,viewCount,clickCount,installCount,startCount1d,startCount7d,timeSinceLastStart
3129785,7047,1,137,15671,1,24250,-0.490315,-0.416920,-0.158685,-0.367223,-0.241971,-0.453782,-0.304636
2663098,1723,2,33,34113,1,23383,0.327100,0.416161,1.465967,-0.367223,0.537111,1.297458,-0.305262
1153718,7280,1,119,27212,3,10659,0.636392,0.731722,-0.010989,0.931814,-0.501665,-0.023149,0.574101
2267185,2332,1,85,25535,3,6636,-0.147884,-0.025624,-0.158685,-0.367223,2.181839,0.436192,-0.275630
2871394,1918,2,40,27927,3,23383,-0.435084,-0.379053,-0.306381,0.282295,-0.501665,-0.539909,1.848071
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3010953,4415,2,40,34205,3,23390,-0.523453,-0.454787,-0.306381,-0.367223,-0.501665,-0.539909,-0.305470
3231064,6107,1,119,13543,3,26399,1.166608,1.362844,0.727489,6.127961,0.190852,0.895534,-0.305053
267655,7562,2,39,13761,3,23339,-0.479269,-0.404297,-0.158685,-0.367223,-0.155407,-0.425073,-0.304010
2280037,6774,1,101,18355,3,6635,-0.048469,-0.126604,-0.158685,2.230850,-0.415101,-0.453782,-0.277299


In [11]:
# Inspect the validation subset of the DataLoaders.
dls.valid_ds

                               id           timestamp  campaignId  platform  \
1555029  5c3b6428ede23481264dd85a 2019-01-13 16:15:36        4845         1   
3265350  5c397567cc1b39775c5338b5 2019-01-12 05:04:39        6347         1   
2186817  5c3af1395896a93d73c768f5 2019-01-13 08:05:13        6025         1   
3016192  5c432c90e234beb2a7c4319d 2019-01-19 13:56:32        8052         1   
1003313  5c366f18bc70363add3764c5 2019-01-09 22:00:56        7905         1   
...                           ...                 ...         ...       ...   
726591   5c353af75d3dc263db54d53c 2019-01-09 00:06:15        7044         1   
671103   5c36535346b35d2b21169257 2019-01-09 20:02:27         698         1   
3530163  5c42907200fe445fd3c82945 2019-01-19 02:50:26        9294         1   
3471701  5c3b26223e0e2959661e2f50 2019-01-13 11:50:58        2472         1   
808388   5c3c0f84a299a4d5d3189412 2019-01-14 04:26:44        1214         1   

         softwareVersion  sourceGameId country  sta

In [14]:
# Per-class loss weights, ordered by class index (0 = no install, 1 = install),
# so that errors on the rare positive class count for more.
weights = [w_0, w_1]
# Place the weights on fastai's default device (GPU when available, otherwise
# CPU) instead of calling `.cuda()` unconditionally, which fails without a GPU.
class_weights = torch.FloatTensor(weights).to(default_device())
# The keyword accepted by the underlying `nn.CrossEntropyLoss` is `weight`
# (singular); `weights` is not a parameter of that loss.
loss_func = CrossEntropyLossFlat(weight=class_weights)
roc_auc = RocAucBinary()

learn = tabular_learner(dls, loss_func=loss_func, metrics=roc_auc)
# Show the configured loss as a quick sanity check.
learn.loss_func

FlattenedLoss of CrossEntropyLoss()

In [15]:
# learn.lr_find()

In [None]:
# Train the learner for 3 epochs using fastai's one-cycle schedule.
learn.fit_one_cycle(3)

epoch,train_loss,valid_loss,roc_auc_score,time


In [None]:
# Display the network that tabular_learner built for this data.
learn.model

In [None]:
# Second 3-epoch training run on the same learner object.
# NOTE(review): presumably this continues from the weights left by the earlier
# run with a fresh schedule — confirm it is intended and not a leftover
# duplicate of the previous training cell.
learn.fit_one_cycle(3)