In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp39-cp39-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting plotly
  Downloading plotly-5.18.0-py3-none-any.whl (15.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.6/15.6 MB[0m [31m101.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting graphviz
  Downloading graphviz-0.20.1-py3-none-any.whl (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.0/47.0 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
Collecting tenacity>=6.2.0
  Downloading tenacity-8.2.3-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, graphviz, plotly, catboost
Successfully installed catboost-1.2.2 graphviz-0.20.1 plotly-5.18.0 tenacity-8.2.3
[0m

In [1]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier,Pool,cv

In [2]:
df = pd.read_csv("train.gz")

In [3]:
#df = df[:1_000_000]

In [4]:
def get_features(df_train,df_test,cols):
    for col in tqdm(cols):
        group = df_train[['click',col]].groupby(col)#.agg(['mean','count'])
        agg_group = group.agg(['mean','count'])
        
        agg_group[col] = agg_group.index
        agg_group[f'{col}_mean'] = agg_group['click']['mean']
        agg_group[f'{col}_count'] = agg_group['click']['count']
        agg_group = agg_group.drop(['click'],axis=1)
        agg_group = agg_group.set_axis([col, f'{col}_count',f'{col}_mean'], axis=1)
        agg_group.index = agg_group.index.rename('index')
        
        df_train[f'{col}_mean'] = group.transform('mean')
        df_train[f'{col}_count'] = group.transform('count')
        
        df_test = df_test.merge(agg_group,how='inner')
    return df_train,df_test

In [5]:
cat_features = ['C1', 'banner_pos', 'site_id', 'site_domain',
       'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
       'device_ip', 'device_model', 'device_type', 'device_conn_type',
        'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']
feature_cols = ['site_id','app_id','device_ip','C14','device_model','device_id','site_domain',
                'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']
num_features = [ 'app_id_count','device_ip_count', 'C14_count',
                 'app_id_mean','device_ip_mean', 'C14_mean']#['month', 'day','week', 'quarter', 'weekofyear', 'dayofweek',
               # 'dayofyear', 'all_time','all_week']
features = cat_features + num_features
label_col = 'click'

In [6]:
class ColsCollect():
    def __init__(self,cat_cols,top_k=512):
        self.cat_cols = cat_cols
        self.top_k = top_k
    
    def fit_transform(self,df):
        self.good_cols = []
        for col in tqdm(self.cat_cols):
            good_cols = df[col].value_counts().index[:self.top_k]
            df[col] = df[col].apply(lambda x: x if x in good_cols else -1)
        return df

In [7]:
#cols_collect = ColsCollect(cat_features)
#df = cols_collect.fit_transform(df)

In [8]:
#df['dt'] = pd.to_datetime(df['hour'], format = '%y%m%d%H')

In [9]:
def make_base_time_features(df,col_dt='dt'):
    df['month'] = df[col_dt].apply(lambda x:x.month)
    df['day'] = df[col_dt].apply(lambda x:x.day)
    df['week'] = df[col_dt].apply(lambda x:x.week)
    df['hour'] = df[col_dt].apply(lambda x:x.hour)
    df['quarter'] = df[col_dt].apply(lambda x:x.quarter)
    #df['year'] = df[col_dt].apply(lambda x: x.year - 2012)
    
    df['weekofyear'] = df[col_dt].apply(lambda x:x.weekofyear)
    df['dayofweek'] = df[col_dt].apply(lambda x:x.dayofweek)
    df['dayofyear'] = df[col_dt].apply(lambda x:x.dayofyear)
    df['week'] = df[col_dt].apply(lambda x:x.week)
    
    df['all_time'] = df['dayofyear'] * 24 + df['hour']
    df['all_week'] = df['weekofyear'] + df['dayofweek'] / 7
    return df

def get_polynoms_from_column(df,col):
    min_v = df[col].min()
    max_v = df[col].max()
    
    df[f'sin_{col}'] = df[col].apply(sin)
    df[f'cos_{col}'] = df[col].apply(cos)
    df[f'sin_{col}^2'] = df[col].apply(sin) * df[col].apply(sin)
    df[f'cos_{col}^2'] = df[col].apply(cos) * df[col].apply(cos)
    
    df[f'{col}_sin'] = df[col].apply(lambda x: sin((x - min_v) / max_v * 2 * pi))
    df[f'{col}_cos'] = df[col].apply(lambda x: cos((x - min_v) / max_v * 2 * pi))
    df[f'{col}_sin^2'] = df[f'{col}_sin'] * df[f'{col}_sin']
    df[f'{col}_cos^2'] = df[f'{col}_cos'] * df[f'{col}_cos']
    return df

def get_dop_features(df):
    day = 24
    year = 365.2425*day
    df['vday_sin'] = (df.hour * 2 * np.pi / day)
    df['vday_sin'] = df['vday_sin'].apply(sin)
    df['vday_cos'] = (df.hour * 2 * np.pi / day)
    df['vday_cos'] = df['vday_cos'].apply(cos)
    df['vyear_sin'] = (df.hour * 2 * np.pi / year)
    df['vyear_sin'] = df['vyear_sin'].apply(sin)
    df['vyear_cos'] = (df.hour * 2 * np.pi / year)
    df['vyear_cos'] = df['vyear_cos'].apply(cos)
    
    return df

In [10]:
#df = make_base_time_features(df)

In [11]:
df.shape[0] * 0.8

32343173.6

In [12]:
train_df = df[:32343173]
val_df = df[32343173:]
#train_df,val_df = train_test_split(df,test_size=0.2,random_state=56)

In [13]:
del df

In [14]:
def get_features(df_train,df_test,cols):
    for col in tqdm(cols):
        group = df_train[['click',col]].groupby(col)#.agg(['mean','count'])
        agg_group = group.agg(['mean','count'])
        
        agg_group[col] = agg_group.index
        agg_group[f'{col}_mean'] = agg_group['click']['mean']
        agg_group[f'{col}_count'] = agg_group['click']['count']
        agg_group = agg_group.drop(['click'],axis=1)
        agg_group = agg_group.set_axis([col, f'{col}_count',f'{col}_mean'], axis=1)
        agg_group.index = agg_group.index.rename('index')
        
        df_train[f'{col}_mean'] = group.transform('mean')
        df_train[f'{col}_count'] = group.transform('count')
        
        df_test = df_test.merge(agg_group,how='inner')
    return df_train,df_test

In [15]:
train_df,val_df = get_features(train_df,val_df,['app_id','app_id','device_ip','C14'])

  0%|          | 0/4 [00:00<?, ?it/s]

In [16]:
train_df.columns

Index(['id', 'click', 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
       'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
       'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',
       'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'app_id_mean',
       'app_id_count', 'device_ip_mean', 'device_ip_count', 'C14_mean',
       'C14_count'],
      dtype='object')

In [17]:
val_df

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,C18,C19,C20,C21,app_id_count,app_id_mean,device_ip_count,device_ip_mean,C14_count,C14_mean
0,4.990036e+18,0,14102823,1005,0,85f751fd,c4e18dd6,50e219e0,e2fcccd2,5c5a694b,...,3,1839,-1,171,0.200071,876207,0.093675,1676,0.13986,3718
1,6.701787e+18,0,14102900,1005,0,85f751fd,c4e18dd6,50e219e0,e2fcccd2,5c5a694b,...,3,1839,-1,171,0.200071,876207,0.093675,1676,0.13986,3718
2,1.175641e+19,0,14102905,1005,0,85f751fd,c4e18dd6,50e219e0,e2fcccd2,5c5a694b,...,3,1839,-1,171,0.200071,876207,0.093675,1676,0.13986,3718
3,9.406604e+18,0,14102906,1005,0,85f751fd,c4e18dd6,50e219e0,e2fcccd2,5c5a694b,...,3,1839,-1,171,0.200071,876207,0.093675,1676,0.13986,3718
4,1.269040e+19,0,14102914,1005,0,85f751fd,c4e18dd6,50e219e0,e2fcccd2,5c5a694b,...,3,1831,-1,171,0.200071,876207,0.093675,1676,0.13986,3718
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3721574,2.973288e+18,0,14102916,1007,0,85f751fd,c4e18dd6,50e219e0,d627cae2,5406e4db,...,3,175,100114,100,0.666667,3,1.000000,1,0.50000,4
3721575,3.179110e+18,1,14102916,1007,0,85f751fd,c4e18dd6,50e219e0,d627cae2,5406e4db,...,3,175,100114,100,0.666667,3,1.000000,1,0.50000,4
3721576,3.247450e+18,0,14102916,1007,0,85f751fd,c4e18dd6,50e219e0,d627cae2,5406e4db,...,3,175,100114,100,0.666667,3,1.000000,1,0.50000,4
3721577,3.885457e+18,0,14102916,1007,0,85f751fd,c4e18dd6,50e219e0,d627cae2,5406e4db,...,3,175,100114,100,0.666667,3,1.000000,1,0.50000,4


In [18]:
train_pool = Pool(train_df[features],
                  label=train_df[label_col],
                  cat_features=cat_features)

eval_pool = Pool(val_df[features],
                  label=val_df[label_col],
                  cat_features=cat_features)

In [19]:
params = {'iterations':800,
          'learning_rate':0.1,
          'loss_function':'CrossEntropy',
          'max_depth':7,
          'eval_metric':'AUC',
          'task_type':'GPU',
          'leaf_estimation_method':'Newton',
          'gpu_ram_part':0.8,
          'one_hot_max_size':32,
          'random_seed':56}
model = CatBoostClassifier(**params)

In [None]:
model.fit(train_pool,eval_set=eval_pool,verbose=1)

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5928556	best: 0.5928556 (0)	total: 1.64s	remaining: 21m 49s
1:	total: 3.45s	remaining: 22m 56s
2:	total: 5.26s	remaining: 23m 17s
3:	total: 6.83s	remaining: 22m 38s
4:	total: 8.51s	remaining: 22m 32s
5:	test: 0.4833190	best: 0.5928556 (0)	total: 10.3s	remaining: 22m 39s
6:	total: 12.1s	remaining: 22m 48s
7:	total: 13.9s	remaining: 22m 59s
8:	total: 15.7s	remaining: 23m
9:	total: 17.5s	remaining: 23m 3s


In [25]:
preds = model.predict_proba(eval_pool)[:,1]

In [27]:
from sklearn.metrics import roc_auc_score

In [29]:
roc_auc_score(val_df[label_col],preds)

0.810079574859153

In [41]:
model.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,app_id,18.001241
1,site_id,15.552773
2,device_ip,14.183223
3,site_domain,8.822575
4,device_model,8.410217
5,C14,7.657343
6,device_id,5.039237
7,C17,4.853359
8,C21,3.243395
9,site_category,2.463928


In [21]:
model.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,site_id,16.763649
1,app_id,14.090229
2,device_ip,13.515227
3,device_model,8.493402
4,device_id,7.925742
5,C14,5.94886
6,device_ip_count,5.792664
7,C21,5.673746
8,site_domain,4.858984
9,C17,3.220845
