In [1]:
!pip install pytorch-tabnet

Collecting pytorch-tabnet
  Downloading pytorch_tabnet-4.1.0-py3-none-any.whl (44 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 44.5/44.5 kB 14.4 MB/s eta 0:00:00
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-4.1.0

In [1]:
import gc

import numpy as np
import pandas as pd
import torch
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm

In [2]:
# Read the gzip-compressed training CSV (pandas infers the compression from
# the ".gz" suffix) and preview its first five rows.
df = pd.read_csv("train.gz")
df.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1.000009e+18,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,2,15706,320,50,1722,0,35,-1,79
1,1.000017e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
2,1.000037e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
3,1.000064e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15706,320,50,1722,0,35,100084,79
4,1.000068e+19,0,14102100,1005,1,fe8cc448,9166c161,0569f928,ecad2386,7801e8d9,...,1,0,18993,320,50,2161,0,35,-1,157


In [3]:
# Column roles. Every model input is treated as categorical; `num_features`
# is kept (empty) so numeric columns can be added without touching later cells.
cat_features = [
    'hour', 'C1', 'banner_pos',
    'site_id', 'site_domain', 'site_category',
    'app_id', 'app_domain', 'app_category',
    'device_id', 'device_ip', 'device_model', 'device_type', 'device_conn_type',
    'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
]
num_features = []

# Model input columns, categorical first, then numeric.
features = [*cat_features, *num_features]

# Binary target column.
label_col = 'click'

In [4]:
class MultyLabelEncoder():
    """Apply an independent ``LabelEncoder`` to each of several DataFrame columns.

    The fitted encoders are stored in ``self.encoders`` in the same order as
    ``cat_cols``. Both methods overwrite the columns of the frame they are
    given (no copy is made) and return that same frame.
    """

    def __init__(self, cat_cols):
        # Names of the columns to encode, in encoding order.
        self.cat_cols = cat_cols

    def fit_transform(self, df):
        """Fit a fresh encoder per column and replace each column by its codes.

        Any encoders from a previous call are discarded.
        """
        self.encoders = []
        for column in tqdm(self.cat_cols):
            encoder = LabelEncoder()
            df[column] = encoder.fit_transform(df[column])
            self.encoders.append(encoder)
        return df

    def transform(self, df):
        """Encode ``df`` using the encoders stored by ``fit_transform``.

        Requires ``fit_transform`` to have been called first, since that is
        the only place ``self.encoders`` is created.
        """
        fitted_pairs = zip(self.cat_cols, self.encoders)
        for column, encoder in tqdm(fitted_pairs, total=len(self.cat_cols)):
            df[column] = encoder.transform(df[column])
        return df

In [5]:
# First encoding pass: replace every categorical column with integer codes.
# Note that `le` is rebound by the second encoding pass further down.
le = MultyLabelEncoder(cat_cols=cat_features)
df = le.fit_transform(df=df)

  0%|          | 0/22 [00:00<?, ?it/s]

In [6]:
class ColsCollect():
    """Collapse rare categories: keep each column's most frequent values, map the rest to -1.

    For every column in ``cat_cols`` the ``top_k`` most frequent values (as
    ranked by ``Series.value_counts``) are kept unchanged; every other value is
    replaced by the sentinel ``-1``. The kept values are remembered in
    ``self.good_cols`` (one ``pandas.Index`` per column, in ``cat_cols`` order)
    so that other frames can be mapped identically with ``transform``.

    Both methods overwrite the columns of the frame they are given (no copy is
    made) and return that same frame.

    NOTE: ``-1`` must not be a legitimate value of the columns; in this
    notebook the columns are label-encoded beforehand.
    """

    def __init__(self,cat_cols,top_k=512):
        # Names of the columns to process, in processing order.
        self.cat_cols = cat_cols
        # Maximum number of distinct values kept per column.
        self.top_k = top_k

    @staticmethod
    def _collapse(series, kept):
        """Return ``series`` with every value outside ``kept`` replaced by -1."""
        # Vectorised equivalent of a per-row `x if x in kept else -1`.
        return series.where(series.isin(kept), -1)

    def fit_transform(self,df):
        """Learn the kept values of each column from ``df`` and collapse the rest.

        Returns ``df`` itself, modified in place.
        """
        self.good_cols = []
        for col in tqdm(self.cat_cols):
            # value_counts sorts by descending frequency, so the first
            # `top_k` index entries are the most frequent values.
            kept = df[col].value_counts().index[:self.top_k]
            df[col] = self._collapse(df[col], kept)
            self.good_cols.append(kept)
        return df

    def transform(self,df):
        """Collapse ``df`` using the kept values learned by ``fit_transform``.

        Raises:
            RuntimeError: if ``fit_transform`` has not been called yet.
        """
        if len(getattr(self, "good_cols", [])) != len(self.cat_cols):
            raise RuntimeError("ColsCollect.transform called before fit_transform")
        for col, kept in tqdm(zip(self.cat_cols, self.good_cols), total=len(self.cat_cols)):
            df[col] = self._collapse(df[col], kept)
        return df

In [7]:
# Keep only the most frequent values of each categorical column (default
# top_k=512); everything else is replaced by -1.
cols_collect = ColsCollect(cat_cols=cat_features)
df = cols_collect.fit_transform(df=df)

  0%|          | 0/22 [00:00<?, ?it/s]

In [8]:
# Second encoding pass, run after rare values were collapsed. This rebinds
# `le`, so the encoder object from the first pass is no longer reachable
# through that name.
le = MultyLabelEncoder(cat_cols=cat_features)
df = le.fit_transform(df=df)

  0%|          | 0/22 [00:00<?, ?it/s]

In [9]:
# Positions, within `features`, of the columns listed in `cat_features`.
cat_idx = [
    position
    for position, name in enumerate(features)
    if name in cat_features
]

In [10]:
# Number of distinct values in each categorical column, in `cat_features` order.
cat_dims = [
    df[name].nunique()
    for name in tqdm(cat_features)
]

  0%|          | 0/22 [00:00<?, ?it/s]

In [None]:
# Scratch arithmetic (roughly 3 + 10*sqrt(3), about 20.32). The value is not
# assigned to a name and no visible cell uses it.
0.2 * 15 + 10 * 3 ** 0.5

In [11]:
# Display the per-column distinct-value counts; this list is later passed to
# the model as `cat_dims`.
cat_dims

[240,
 7,
 7,
 513,
 513,
 26,
 513,
 513,
 36,
 513,
 513,
 513,
 5,
 4,
 513,
 8,
 9,
 435,
 4,
 68,
 172,
 60]

In [12]:
# Hyper-parameters, forwarded unchanged to the TabNet constructor below.
params = dict(
    n_d=64,
    n_a=64,
    n_steps=8,
    gamma=1.9,
    cat_emb_dim=32,
    seed=56,
    optimizer_fn=torch.optim.AdamW,
    optimizer_params=dict(lr=2e-2),
    lambda_sparse=1e-3,
    # Which input positions are categorical, and how many distinct values each has.
    cat_idxs=cat_idx,
    cat_dims=cat_dims,
    verbose=1,
    device_name='cuda',
)

# NOTE: despite its name, `reg` holds a classifier; the name is kept because
# later cells refer to it.
reg = TabNetClassifier(**params)



In [13]:
# Display the model object created from `params`.
reg

In [14]:
# Hold out 10% of the rows for validation; the fixed random_state makes the
# split repeatable.
train_df, val_df = train_test_split(
    df,
    test_size=0.1,
    random_state=56,
)

In [15]:
# Drop the notebook-level reference to the full frame; the visible cells below
# only use train_df / val_df. Re-running earlier cells that read `df` requires
# reloading it first.
del df

In [16]:
# Trigger a garbage-collection pass after the `del df` in the previous cell.
# The cell output is gc.collect()'s return value.
gc.collect()

40

In [17]:
# Pull the model inputs and the target out of each split as NumPy arrays.
X_train, y_train = train_df[features].values, train_df[label_col].values
X_val, y_val = val_df[features].values, val_df[label_col].values

In [26]:
# Fit on the first 10,000,000 training rows. Progress is reported on two
# evaluation sets: the first 1,000,000 training rows ("train") and the first
# 100,000 validation rows ("valid").
reg.fit(
    X_train=X_train[:10_000_000],
    y_train=y_train[:10_000_000],
    eval_set=[
        (X_train[:1_000_000], y_train[:1_000_000]),
        (X_val[:100_000], y_val[:100_000]),
    ],
    eval_name=["train", "valid"],
    max_epochs=200,
    batch_size=4096,
    num_workers=12,
)

epoch 0  | loss: 0.41909 | train_auc: 0.72811 | valid_auc: 0.72669 |  0:06:47s
epoch 1  | loss: 0.40234 | train_auc: 0.73567 | valid_auc: 0.73543 |  0:13:31s


KeyboardInterrupt: 