In [1]:
import gc
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

In [2]:
path = './input/'

train_columns = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed']
test_columns  = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'click_id']
dtypes = {
        'ip'            : 'uint32',
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'is_attributed' : 'uint8',
        'click_id'      : 'uint32'
        }

train = pd.read_csv(path+'train.csv', usecols=train_columns, dtype=dtypes, parse_dates=['click_time'])
test = pd.read_csv(path+'test.csv', usecols=test_columns, dtype=dtypes, parse_dates=['click_time'])

In [3]:
# 训练集label
y_train = train['is_attributed'].values

sub = test[['click_id']]
del test['click_id']

# 训练集与测试集合并
data = pd.concat([train, test], axis=0)
del train, test
gc.collect()

# 时间处理
data['day'] = data['click_time'].dt.day.astype('uint8')
data['hour'] = data['click_time'].dt.hour.astype('uint8')

## 目标编码

In [4]:
for cols in tqdm([['ip'], ['app'], ['ip','app'], ['ip','hour'], ['ip','os','device'], ['ip','app','os','device'], ['app','os','channel']]):
    name = '_'.join(cols)
    res = pd.DataFrame()
    temp = data[cols + ['day', 'is_attributed']]
    for period in [7,8,9,10]:
        mean_ = temp[temp['day']<period].groupby(cols)['is_attributed'].mean().reset_index(name=name + '_mean_is_attributed')
        mean_['day'] = period
        res = res.append(mean_, ignore_index=True)
    
    data = pd.merge(data, res, how='left', on=['day']+cols)

100%|██████████| 7/7 [4:44:09<00:00, 2435.68s/it]  


In [5]:
# # 保留7 8 9 10号数据
data = data[data['day']>=7]
gc.collect()

65

## 统计特征

In [6]:
# count
for cols in tqdm([['ip'],['ip','app'],['ip','os','device'],['ip','day','hour'],\
                  ['app','channel','day','hour'],['ip','device','day','hour']]):
    name = '_'.join(cols)
    data[name+'_cnts'] = data.groupby(cols)['click_time'].transform('count')
    data[name+'_cnts'] = data[name+'_cnts'].astype('uint16')

# nunique
for f1 in ['ip']:
    for f2 in tqdm(['app','device','os','channel']):
        data[f1+'_'+f2+'_nuni'] = data.groupby([f1])[f2].transform('nunique')
        data[f1+'_'+f2+'_nuni'] = data[f1+'_'+f2+'_nuni'].astype('uint16') 
            
gc.collect()

100%|██████████| 6/6 [04:10<00:00, 41.67s/it]
100%|██████████| 4/4 [07:00<00:00, 105.15s/it]


59

## 时间差特征

In [7]:
for cols in tqdm([['ip','os','device','app'],['ip','os','device','app','day']]):
    for i in range(1,6):
        
        data['ct'] = (data['click_time'].astype(np.int64)//10**9).astype(np.int32)
        
        name = '{}_next_{}_click'.format('_'.join(cols), str(i))
        data[name] = (data.groupby(cols).ct.shift(-i)-data.ct).astype(np.float32)
        data[name] = data[name].fillna(data[name].mean())
        data[name] = data[name].astype('uint16')
        
        name = '{}_lag_{}_click'.format('_'.join(cols), str(i))
        data[name] = (data.groupby(cols).ct.shift(i)-data.ct).astype(np.float32)
        data[name] = data[name].fillna(data[name].mean())
        data[name] = data[name].astype('uint16')
        
        data.drop(['ct'],axis=1,inplace=True)

100%|██████████| 2/2 [2:00:50<00:00, 3625.38s/it]


In [8]:
subset = ['ip', 'os', 'device', 'app']
data['click_user_lab'] = 0
pos = data.duplicated(subset=subset, keep=False)
data.loc[pos, 'click_user_lab'] = 1
pos = (~data.duplicated(subset=subset, keep='first')) & data.duplicated(subset=subset, keep=False)
data.loc[pos, 'click_user_lab'] = 2
pos = (~data.duplicated(subset=subset, keep='last')) & data.duplicated(subset=subset, keep=False)
data.loc[pos, 'click_user_lab'] = 3

## 排序特征

In [9]:
for cols in tqdm([['ip','os','device','app'],['ip','os','device','app','day']]):
    name = '{}_click_asc_rank'.format('_'.join(cols)) 
    data[name] = data.groupby(cols)['click_time'].rank(ascending=True)
    
    name = '{}_click_dec_rank'.format('_'.join(cols)) 
    data[name] = data.groupby(cols)['click_time'].rank(ascending=True)

100%|██████████| 2/2 [12:40<00:00, 380.28s/it]


## 训练集/验证集/测试集

In [10]:
categorical_features = ['ip','app','os','channel','device','day','hour']
features = [f for f in data.columns if f not in ['click_time','is_attributed']]
features

['ip',
 'app',
 'device',
 'os',
 'channel',
 'day',
 'hour',
 'ip_mean_is_attributed',
 'app_mean_is_attributed',
 'ip_app_mean_is_attributed',
 'ip_hour_mean_is_attributed',
 'ip_os_device_mean_is_attributed',
 'ip_app_os_device_mean_is_attributed',
 'app_os_channel_mean_is_attributed',
 'ip_cnts',
 'ip_app_cnts',
 'ip_os_device_cnts',
 'ip_day_hour_cnts',
 'app_channel_day_hour_cnts',
 'ip_device_day_hour_cnts',
 'ip_app_nuni',
 'ip_device_nuni',
 'ip_os_nuni',
 'ip_channel_nuni',
 'ip_os_device_app_next_1_click',
 'ip_os_device_app_lag_1_click',
 'ip_os_device_app_next_2_click',
 'ip_os_device_app_lag_2_click',
 'ip_os_device_app_next_3_click',
 'ip_os_device_app_lag_3_click',
 'ip_os_device_app_next_4_click',
 'ip_os_device_app_lag_4_click',
 'ip_os_device_app_next_5_click',
 'ip_os_device_app_lag_5_click',
 'ip_os_device_app_day_next_1_click',
 'ip_os_device_app_day_lag_1_click',
 'ip_os_device_app_day_next_2_click',
 'ip_os_device_app_day_lag_2_click',
 'ip_os_device_app_day_nex

In [11]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [12]:
data = reduce_mem_usage(data)
gc.collect()

Mem. usage decreased to 21318.78 Mb (40.7% reduction)


15

## 负采样

In [13]:
# # 对训练集进行负采样
# df_train_neg = data[(data['is_attributed'] == 0)&(data['day'] < 9)]
# df_train_neg = df_train_neg.sample(n=1000000)

# # 合并成新的数据集
# df_rest = data[(data['is_attributed'] == 1)|(data['day'] >= 9)]
# data = pd.concat([df_train_neg, df_rest]).sample(frac=1)
# del df_train_neg
# del df_rest
# gc.collect()

In [None]:
trn_x = data[data['day']<9][features]
trn_y = data[data['day']<9]['is_attributed']

val_x = data[data['day']==9][features]
val_y = data[data['day']==9]['is_attributed']

test_x = data[data['day']>9][features]

del data
gc.collect()

## LightGBM

In [15]:
params = {'num_leaves': 127,
          'min_data_in_leaf': 32, 
          'objective':'binary',
          'max_depth': -1,
          'learning_rate': 0.1,
          'min_child_samples': 20,
          'boosting': 'gbdt',
          'feature_fraction': 0.8,
          'bagging_freq': 1,
          'bagging_fraction': 0.8 ,
          'bagging_seed': 11,
          'metric': 'auc',
          'lambda_l1': 0.1,
          'verbosity': -1
         }
train_data = lgb.Dataset(trn_x.values.astype(np.float32), label=trn_y,
                         categorical_feature=categorical_features, feature_name=features)
valid_data = lgb.Dataset(val_x.values.astype(np.float32), label=val_y,
                         categorical_feature=categorical_features, feature_name=features)

clf = lgb.train(params,
                train_data,
                10000,
                early_stopping_rounds=100,
                valid_sets=[valid_data],
                verbose_eval=1
                )



[1]	valid_0's auc: 0.913983
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's auc: 0.555097
[3]	valid_0's auc: 0.715026
[4]	valid_0's auc: 0.724532
[5]	valid_0's auc: 0.759976
[6]	valid_0's auc: 0.76629
[7]	valid_0's auc: 0.796175
[8]	valid_0's auc: 0.833505
[9]	valid_0's auc: 0.84212
[10]	valid_0's auc: 0.843283
[11]	valid_0's auc: 0.850733
[12]	valid_0's auc: 0.871978
[13]	valid_0's auc: 0.875328
[14]	valid_0's auc: 0.875165
[15]	valid_0's auc: 0.87586
[16]	valid_0's auc: 0.880049
[17]	valid_0's auc: 0.88159
[18]	valid_0's auc: 0.883265
[19]	valid_0's auc: 0.886364
[20]	valid_0's auc: 0.884776
[21]	valid_0's auc: 0.886904
[22]	valid_0's auc: 0.890412
[23]	valid_0's auc: 0.888791
[24]	valid_0's auc: 0.89301
[25]	valid_0's auc: 0.894779
[26]	valid_0's auc: 0.894971
[27]	valid_0's auc: 0.895712
[28]	valid_0's auc: 0.895321
[29]	valid_0's auc: 0.8961
[30]	valid_0's auc: 0.896595
[31]	valid_0's auc: 0.900489
[32]	valid_0's auc: 0.900557
[33]	valid_0's auc: 0.901

In [None]:
params = {'eta': 0.2,
          'max_leaves': 2**9-1,  
          'max_depth': 9, 
          'subsample': 0.7, 
          'colsample_bytree': 0.9, 
          'objective': 'binary:logistic', 
          'scale_pos_weight':9,
          'eval_metric': 'auc', 
          'n_jobs':24,
          'random_state': 2020,
          'silent': True}
          
dtrain = xgb.DMatrix(trn_x, trn_y)
dvalid = xgb.DMatrix(val_x, val_y)
gc.collect()
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
model = xgb.train(params, dtrain, 200, watchlist, early_stopping_rounds = 20, verbose_eval=10)

### 训练集合并验证集

In [None]:
trn_x = pd.concat([trn_x, val_x], axis=0, ignore_index=True)
trn_y = np.r_[trn_y, val_y] # 是按列连接两个矩阵，就是把两矩阵上下相加，要求列数相等，类似于pandas中的concat() 
del val_x
del val_y
gc.collect()

In [None]:
train_data = lgb.Dataset(trn_x.values.astype(np.float32), label=trn_y,
                        categorical_feature=categorical_features, feature_name=features)

In [None]:
trees = 400
clf = lgb.train(params,
                train_data,
                int(trees * 1.2),
                valid_sets=[train_data],
                verbose_eval=10
                )