In [30]:
!pip install lightgbm feature_engine catboost

Collecting catboost
  Downloading catboost-1.2.2-cp39-cp39-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting graphviz
  Downloading graphviz-0.20.1-py3-none-any.whl (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.0/47.0 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting plotly
  Downloading plotly-5.18.0-py3-none-any.whl (15.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.6/15.6 MB[0m [31m75.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting tenacity>=6.2.0
  Downloading tenacity-8.2.3-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, graphviz, plotly, catboost
Successfully installed catboost-1.2.2 graphviz-0.20.1 plotly-5.18.0 tenacity-8.2.3
[0m

In [31]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from tqdm.contrib import tzip
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from feature_engine.encoding import OneHotEncoder as OneHotEncoderV2
import lightgbm as lgb
from catboost import CatBoostClassifier,Pool

In [3]:
df = pd.read_csv("train.gz")

In [4]:
df = df[:1_000_000]

In [5]:
cat_features = ['hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
       'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
       'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',
       'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']
num_features = []
features = cat_features + num_features
label_col = 'click'

In [6]:
for col in tqdm(cat_features):
    df[col] = df[col].astype('category')

  0%|          | 0/22 [00:00<?, ?it/s]

In [10]:
class MultiOneHotEncoder():
    def __init__(self,encoded_columns,encoded_features):
        self.encoded_columns = encoded_columns
        self.encoded_features = encoded_features
        
    def fit_transform(self,df):
        self.encoders = []
        for col,max_feat in tzip(self.encoded_columns,self.encoded_features):
            ohe = OneHotEncoderV2(top_categories=max_feat)
            df = pd.concat([df,ohe.fit_transform(df[[col]])],axis=1)
            self.encoders += [ohe]
        return df.drop(self.encoded_columns,axis=1)
            
    def transform(self,df):
        for col,max_feat,ohe in tzip(self.encoded_columns,self.encoded_features,self.encoders):
            df = pd.concat([df,ohe.transform(df[[col]])],axis=1)
        return df.drop(self.encoded_columns,axis=1)

In [35]:
ohe = MultiOneHotEncoder(cat_features,[128] * len(cat_features))

In [36]:
df_tr = ohe.fit_transform(df)

  0%|          | 0/22 [00:00<?, ?it/s]

In [40]:
train_df,val_df = train_test_split(df_tr,test_size=0.2,random_state=56)

In [41]:
train_pool = Pool(train_df.drop(['id', 'click'],axis=1),
                    label=train_df['click'],)

eval_pool = Pool(val_df.drop(['id', 'click'],axis=1),
                  label=val_df['click'])

In [42]:
params = {'iterations':1000,
          'learning_rate':0.1,
          'loss_function':'CrossEntropy',
          'max_depth':7,
          'eval_metric':'AUC',
          'task_type':'GPU',
          'gpu_ram_part':0.8,
          'random_seed':56}

model = CatBoostClassifier(**params)

In [43]:
model.fit(train_pool,eval_set=eval_pool,verbose=100)

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6761326	best: 0.6761326 (0)	total: 9.8ms	remaining: 9.8s
100:	test: 0.7393850	best: 0.7393850 (100)	total: 896ms	remaining: 7.97s
200:	test: 0.7468751	best: 0.7468751 (200)	total: 1.79s	remaining: 7.1s
300:	test: 0.7498643	best: 0.7498643 (300)	total: 2.68s	remaining: 6.23s
400:	test: 0.7519017	best: 0.7519017 (400)	total: 3.58s	remaining: 5.35s
500:	test: 0.7533420	best: 0.7533420 (500)	total: 4.49s	remaining: 4.48s
600:	test: 0.7540974	best: 0.7540974 (600)	total: 5.4s	remaining: 3.58s
700:	test: 0.7548704	best: 0.7548704 (700)	total: 6.33s	remaining: 2.7s
800:	test: 0.7555005	best: 0.7555005 (800)	total: 7.26s	remaining: 1.8s
900:	test: 0.7559161	best: 0.7559161 (900)	total: 8.19s	remaining: 900ms
999:	test: 0.7563154	best: 0.7563154 (999)	total: 9.11s	remaining: 0us
bestTest = 0.7563153803
bestIteration = 999


<catboost.core.CatBoostClassifier at 0x7f6f3c3fe040>

In [20]:
train_ds = lgb.Dataset(train_df.drop(['id', 'click'],axis=1),
                    label=train_df['click'],
                      )

val_ds = lgb.Dataset(val_df.drop(['id', 'click'],axis=1),
                  label=val_df['click'],
                    )

In [26]:
lgbm_params = {'objective':'binary',
               'boosting':'gbdt',
               #'extra_trees':False,
               'metric':'auc',
               'learning_rate':0.1,
               #'num_leaves':256,
               'seed':56,
               'max_depth':6,
               #'xgboost_dart_mode':False,
               #'lambda_l1':0,
              # 'data_sample_strategy':'goss',
              }


model = lgb.train( lgbm_params,
                   train_ds,
                   valid_sets=val_ds,
                   num_boost_round=2_000,
                   )

[LightGBM] [Info] Number of positive: 128289, number of negative: 671711
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.096004 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 914
[LightGBM] [Info] Number of data points in the train set: 800000, number of used features: 457
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.160361 -> initscore=-1.655543
[LightGBM] [Info] Start training from score -1.655543


KeyboardInterrupt: 

In [23]:
model

<lightgbm.basic.Booster at 0x7f6fc58f5070>