In [1]:
import gc
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

In [2]:
path = './input/'

train_columns = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed']
test_columns  = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'click_id']
dtypes = {
        'ip'            : 'uint32',
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'is_attributed' : 'uint8',
        'click_id'      : 'uint32'
        }

# 97903891 = 184903891 - 87000000
train = pd.read_csv(path+'train.csv', usecols=train_columns, dtype=dtypes, skiprows=range(1, 97903891), nrows=87000000, parse_dates=['click_time'])
# train = pd.read_csv(path+'train.csv', usecols=train_columns, dtype=dtypes, parse_dates=['click_time'])
test = pd.read_csv(path+'test.csv', usecols=test_columns, dtype=dtypes, parse_dates=['click_time'])

In [3]:
# 训练集label
y_train = train['is_attributed'].values

sub = test[['click_id']]
del test['click_id']

# 训练集与测试集合并
data = pd.concat([train, test], axis=0)
del train, test
gc.collect()

# 时间处理
data['day'] = data['click_time'].dt.day.astype('uint8')
data['hour'] = data['click_time'].dt.hour.astype('uint8')

## 目标编码

In [4]:
# for cols in tqdm([['ip'], ['app'], ['ip','app'], ['ip','hour'], ['ip','os','device'], ['ip','app','os','device'], ['app','os','channel']]):
#     name = '_'.join(cols)
#     res = pd.DataFrame()
#     temp = data[cols + ['day', 'is_attributed']]
#     for period in [7,8,9,10]:
#         mean_ = temp[temp['day']<period].groupby(cols)['is_attributed'].mean().reset_index(name=name + '_mean_is_attributed')
#         mean_['day'] = period
#         res = res.append(mean_, ignore_index=True)
    
#     data = pd.merge(data, res, how='left', on=['day']+cols)

In [5]:
# 保留7 8 9 10号数据
data = data[data['day']>=7]
gc.collect()

14

## 统计特征

In [6]:
# count
for cols in tqdm([['ip'],['ip','os','device'],['ip','day','hour']]):
    name = '_'.join(cols)
    data[name+'_cnts'] = data.groupby(cols)['click_time'].transform('count')
    data[name+'_cnts'] = data[name+'_cnts'].astype('uint16')

# nunique
for f1 in ['ip']:
    for f2 in tqdm(['app','device','os','channel']):
        data[f1+'_'+f2+'_nuni'] = data.groupby([f1])[f2].transform('nunique')
        data[f1+'_'+f2+'_nuni'] = data[f1+'_'+f2+'_nuni'].astype('uint16') 
            
gc.collect()

100%|██████████| 3/3 [00:48<00:00, 13.99s/it]
100%|██████████| 4/4 [03:51<00:00, 60.41s/it]


0

## 时间差特征

In [7]:
for cols in tqdm([['ip','os','device','app'],['ip','os','device','app','day']]):
    for i in range(1,6):
        
        data['ct'] = (data['click_time'].astype(np.int64)//10**9).astype(np.int32)
        
        name = '{}_next_{}_click'.format('_'.join(cols), str(i))
        data[name] = (data.groupby(cols).ct.shift(-i)-data.ct).astype(np.float32)
        data[name] = data[name].fillna(data[name].mean())
        data[name] = data[name].astype('uint16')
        
        name = '{}_lag_{}_click'.format('_'.join(cols), str(i))
        data[name] = (data.groupby(cols).ct.shift(i)-data.ct).astype(np.float32)
        data[name] = data[name].fillna(data[name].mean())
        data[name] = data[name].astype('uint16')
        
        data.drop(['ct'],axis=1,inplace=True)

100%|██████████| 2/2 [22:39<00:00, 652.64s/it]


In [8]:
subset = ['ip', 'os', 'device', 'app']
data['click_user_lab'] = 0
pos = data.duplicated(subset=subset, keep=False)
data.loc[pos, 'click_user_lab'] = 1
pos = (~data.duplicated(subset=subset, keep='first')) & data.duplicated(subset=subset, keep=False)
data.loc[pos, 'click_user_lab'] = 2
pos = (~data.duplicated(subset=subset, keep='last')) & data.duplicated(subset=subset, keep=False)
data.loc[pos, 'click_user_lab'] = 3

## 排序特征

In [9]:
for cols in tqdm([['ip','os','device','app'],['ip','os','device','app','day']]):
    name = '{}_click_asc_rank'.format('_'.join(cols)) 
    data[name] = data.groupby(cols)['click_time'].rank(ascending=True)
    
    name = '{}_click_dec_rank'.format('_'.join(cols)) 
    data[name] = data.groupby(cols)['click_time'].rank(ascending=True)

100%|██████████| 2/2 [05:57<00:00, 174.67s/it]


## 训练集/验证集/测试集

In [10]:
categorical_features = ['ip','app','os','channel','device','day','hour']
features = [f for f in data.columns if f not in ['click_time','is_attributed']]

In [11]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [12]:
data = reduce_mem_usage(data)
gc.collect()

Mem. usage decreased to 9584.52 Mb (28.0% reduction)


0

## 负采样

In [13]:
# df_train_neg = data[(data['is_attributed'] == 0)&(data['day'] < 9)]
# df_rest = data[(data['is_attributed'] == 1)|(data['day'] >= 9)]
# del data
# gc.collect()

# df_train_neg = df_train_neg.sample(n=1000000)
# data = pd.concat([df_train_neg,df_rest]).sample(frac=1)
# del df_train_neg
# del df_rest
# gc.collect()

In [14]:
trn_x = data[:82000000][features]
val_x = data[82000000:87000000][features]
trn_y = y_train[:82000000]
val_y = y_train[82000000:87000000]

# test_x = data[87000000:][features]

# trn_x = data[data['day']<9][features]
# trn_y = data[data['day']<9]['is_attributed']

# val_x = data[data['day']==9][features]
# val_y = data[data['day']==9]['is_attributed']

# test_x = data[data['day']>9][features]

del data
gc.collect()

0

In [13]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(pd.concat([trn_x, val_x, test_x]))
trn_x[:] = scaler.transform(trn_x)
val_x[:] = scaler.transform(val_x)
test_x[:] = scaler.transform(test_x)

trn_x = trn_x.fillna(0)
val_x = val_x.fillna(0)
test_x = test_x.fillna(0)

  return self.partial_fit(X, y)
  after removing the cwd from sys.path.
  """
  


## LR

In [20]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(C=5, solver='sag')
model.fit(trn_x, trn_y)
val_preds = model.predict_proba(val_x)[:,1]
preds = model.predict_proba(test_x)[:,1]

In [21]:
from sklearn.metrics import roc_auc_score
roc_auc_score(val_y, val_preds)

0.8884591003977454

In [22]:
sub['is_attributed'] = None
sub['is_attributed'] = preds
sub.to_csv('lr_baseline.csv', index=False)

## XGBoost

In [13]:
params = {'eta': 0.2,
          'max_leaves': 2**9-1,  
          'max_depth': 9, 
          'subsample': 0.7, 
          'colsample_bytree': 0.9, 
          'objective': 'binary:logistic', 
          'scale_pos_weight':9,
          'eval_metric': 'auc', 
          'n_jobs':24,
          'random_state': 2020,
          'silent': True}
          
dtrain = xgb.DMatrix(trn_x, trn_y)
dvalid = xgb.DMatrix(val_x, val_y)
gc.collect()
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
model = xgb.train(params, dtrain, 200, watchlist, early_stopping_rounds = 20, verbose_eval=10)

[10:55:01] Tree method is automatically selected to be 'approx' for faster speed. to use old behavior(exact greedy algorithm on single machine), set tree_method to 'exact'
[0]	train-auc:0.93852	valid-auc:0.943594
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 20 rounds.
[10]	train-auc:0.972529	valid-auc:0.973617
[20]	train-auc:0.980228	valid-auc:0.980547
[30]	train-auc:0.9836	valid-auc:0.984491
[40]	train-auc:0.985808	valid-auc:0.986155
[50]	train-auc:0.986874	valid-auc:0.986675
[60]	train-auc:0.987723	valid-auc:0.987025
[70]	train-auc:0.988345	valid-auc:0.987108
[80]	train-auc:0.988892	valid-auc:0.987193
[90]	train-auc:0.989311	valid-auc:0.987247
[100]	train-auc:0.989615	valid-auc:0.987265
[110]	train-auc:0.99008	valid-auc:0.98721
Stopping. Best iteration:
[97]	train-auc:0.989531	valid-auc:0.987325



In [14]:
dtest = xgb.DMatrix(test_x)
sub['is_attributed'] = None
sub['is_attributed'] = model.predict(dtest, ntree_limit=model.best_ntree_limit)
sub.to_csv('talkingdata_xgboost.csv', index=False)

## CatBoost

In [None]:
params = {'learning_rate': 0.1,
          'depth': 8,
          'l2_leaf_reg': 10,
          'bootstrap_type': 'Bernoulli',
          'od_type': 'Iter',
          'od_wait': 50,
          'random_seed': 11,
          'allow_writing_files': False}
          
model = CatBoostClassifier(iterations=20000, eval_metric='AUC', **params)
model.fit(trn_x, trn_y,
          eval_set=(val_x, val_y),
          cat_features=categorical_features, 
          use_best_model=True, 
          early_stopping_rounds=50,
          verbose=10)

0:	test: 0.9425938	best: 0.9425938 (0)	total: 1m 3s	remaining: 14d 16h 41m 38s
10:	test: 0.9533948	best: 0.9533948 (10)	total: 10m 47s	remaining: 13d 14h 38m 9s


In [None]:
sub['is_attributed'] = None
sub['is_attributed'] = model.predict_proba(test_x)[:,1]
sub.to_csv('talkingdata_catboost.csv', index=False)