In [25]:
import gc
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

from tensorflow.keras.layers import *
import tensorflow.keras.backend as K
import tensorflow as tf
from tensorflow.keras.models import Model

In [2]:
path = './input/'

train_columns = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed']
test_columns  = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'click_id']
dtypes = {
        'ip'            : 'uint32',
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'is_attributed' : 'uint8',
        'click_id'      : 'uint32'
        }

# 97903891 = 184903891 - 87000000
train = pd.read_csv(path+'train.csv', usecols=train_columns, dtype=dtypes, skiprows=range(1, 97903891), nrows=87000000, parse_dates=['click_time'])
test = pd.read_csv(path+'test.csv', usecols=test_columns, dtype=dtypes, parse_dates=['click_time'])

In [3]:
 # 训练集label
y_train = train['is_attributed'].values

sub = test[['click_id']]
del test['click_id']

# 训练集与测试集合并
data = pd.concat([train, test], axis=0)
del train, test
gc.collect()

# 时间处理
data['day'] = data['click_time'].dt.day.astype('uint8')
data['hour'] = data['click_time'].dt.hour.astype('uint8')

## 统计特征

In [4]:
# count
for cols in tqdm([['ip'],['ip','os','device'],['ip','day','hour']]):
    name = '_'.join(cols)
    data[name+'_cnts'] = data.groupby(cols)['click_time'].transform('count')
    data[name+'_cnts'] = data[name+'_cnts'].astype('uint16')

# nunique
for f1 in ['ip']:
    for f2 in tqdm(['app','device','os','channel']):
        data[f1+'_'+f2+'_nuni'] = data.groupby([f1])[f2].transform('nunique')
        data[f1+'_'+f2+'_nuni'] = data[f1+'_'+f2+'_nuni'].astype('uint16') 
            
gc.collect()

100%|██████████| 3/3 [00:48<00:00, 14.06s/it]
100%|██████████| 4/4 [03:50<00:00, 60.26s/it]


12

## 时间差特征

In [5]:
for cols in tqdm([['ip','os','device','app'],['ip','os','device','app','day']]):
    for i in range(1,6):
        
        data['ct'] = (data['click_time'].astype(np.int64)//10**9).astype(np.int32)
        
        name = '{}_next_{}_click'.format('_'.join(cols), str(i))
        data[name] = (data.groupby(cols).ct.shift(-i)-data.ct).astype(np.float32)
        data[name] = data[name].fillna(data[name].mean())
        data[name] = data[name].astype('uint16')
        
        name = '{}_lag_{}_click'.format('_'.join(cols), str(i))
        data[name] = (data.groupby(cols).ct.shift(i)-data.ct).astype(np.float32)
        data[name] = data[name].fillna(data[name].mean())
        data[name] = data[name].astype('uint16')
        
        data.drop(['ct'],axis=1,inplace=True)

100%|██████████| 2/2 [22:58<00:00, 660.81s/it]


In [6]:
subset = ['ip', 'os', 'device', 'app']
data['click_user_lab'] = 0
pos = data.duplicated(subset=subset, keep=False)
data.loc[pos, 'click_user_lab'] = 1
pos = (~data.duplicated(subset=subset, keep='first')) & data.duplicated(subset=subset, keep=False)
data.loc[pos, 'click_user_lab'] = 2
pos = (~data.duplicated(subset=subset, keep='last')) & data.duplicated(subset=subset, keep=False)
data.loc[pos, 'click_user_lab'] = 3

## 排序特征

In [7]:
for cols in tqdm([['ip','os','device','app'],['ip','os','device','app','day']]):
    name = '{}_click_asc_rank'.format('_'.join(cols)) 
    data[name] = data.groupby(cols)['click_time'].rank(ascending=True)
    
    name = '{}_click_dec_rank'.format('_'.join(cols)) 
    data[name] = data.groupby(cols)['click_time'].rank(ascending=True)

100%|██████████| 2/2 [06:00<00:00, 175.75s/it]


## 训练集/验证集/测试集

In [8]:
categorical_features = ['ip','app','os','channel','device','day','hour']
features = [f for f in data.columns if f not in ['click_time','is_attributed']]
numerical_features = [f for f in features if f not in categorical_features]

In [9]:
def process_sparse_feats(data, cols):
    for f in cols:
        data[f] = data[f].fillna(-999)
        data[f] = data[f].map(dict(zip(data[f].unique(), range(0, data[f].nunique()))))
    return data

data = process_sparse_feats(data, categorical_features)

In [10]:
trn_x = data[:82000000][features]
val_x = data[82000000:87000000][features]
trn_y = y_train[:82000000]
val_y = y_train[82000000:87000000]

test_x = data[87000000:][features]

del data
gc.collect()

0

In [14]:
def deepfm_model(sparse_columns, dense_columns, train, test):
    
    ####### 稀疏特征处理部分 ##########
    sparse_input = []
    lr_embedding = []
    fm_embedding = []
    for col in sparse_columns:
        ####### 稀疏特征转换 ##########
        _input = Input(shape=(1,))
        sparse_input.append(_input)
        
        nums = pd.concat((train[col], test[col])).nunique()
        embed = Embedding(nums, 1, embeddings_regularizer=tf.keras.regularizers.l2(0.1))(_input)
        embed = Flatten()(embed)
        lr_embedding.append(embed)
        
        ####### fm层喂入向量初始化 ##########
        embed = Embedding(nums, 10, embeddings_regularizer=tf.keras.regularizers.l2(0.2))(_input)
        reshape = Reshape((10,))(embed)
        fm_embedding.append(reshape)
    
    ####### fm处理层 ##########
    fm_square = Lambda(lambda x: K.square(x))(Add()(fm_embedding))
    square_fm = Add()([Lambda(lambda x:K.square(x))(embed)
                     for embed in fm_embedding])
    snd_order_sparse_layer = subtract([fm_square, square_fm])
    snd_order_sparse_layer = Lambda(lambda x: x * 0.5)(snd_order_sparse_layer)
    
    ####### 数值特征处理 ##########
    dense_input = []
    for col in dense_columns:
        _input = Input(shape=(1,))
        dense_input.append(_input)
    concat_dense_input = concatenate(dense_input)
    fst_order_dense_layer = Activation(activation="relu")(BatchNormalization()(Dense(4)(concat_dense_input)))
    
    ####### 线性部分拼接 ##########
    fst_order_sparse_layer = concatenate(lr_embedding)
    linear_part = concatenate([fst_order_dense_layer, fst_order_sparse_layer])
    
    ####### fm向量与数值特征拼接喂入FC部分 ##########
    concat_fm_embedding = concatenate(fm_embedding)
    concat_fm_embedding_dense = concatenate([concat_fm_embedding, fst_order_dense_layer])
    fc_layer = Dropout(0.2)(Activation(activation="relu")(BatchNormalization()(Dense(128)(concat_fm_embedding_dense))))
    fc_layer = Dropout(0.2)(Activation(activation="relu")(BatchNormalization()(Dense(64)(fc_layer))))
    fc_layer = Dropout(0.2)(Activation(activation="relu")(BatchNormalization()(Dense(32)(fc_layer))))
    
    ######## 输出层 ##########
    output_layer = concatenate([linear_part, snd_order_sparse_layer, fc_layer])
    output_layer = Dense(1, activation='sigmoid')(output_layer)
    
    model = Model(inputs=sparse_input+dense_input, outputs=output_layer)
    
    return model

In [15]:
model = deepfm_model(categorical_features, numerical_features, trn_x, val_x)
model.compile(optimizer="adam", 
              loss="binary_crossentropy", 
              metrics=["binary_crossentropy", tf.keras.metrics.AUC(name='auc')])

In [16]:
train_sparse_x = [trn_x[f].values for f in categorical_features]
train_dense_x = [trn_x[f].values for f in numerical_features]
train_label = [trn_y]

valid_sparse_x = [val_x[f].values for f in categorical_features]
valid_dense_x = [val_x[f].values for f in numerical_features]
valid_label = [val_y]

In [None]:
from keras.callbacks import *
# 回调函数
filepath = "deepfm_model.h5"
checkpoint = ModelCheckpoint(
    filepath, monitor='val_auc', verbose=1, save_best_only=True, mode='max', save_weights_only=True)
reduce_lr = ReduceLROnPlateau(
    monitor='val_auc', factor=0.5, patience=3, min_lr=0.0001, verbose=1)
earlystopping = EarlyStopping(
    monitor='val_auc', min_delta=0.0001, patience=5, verbose=1, mode='max')

callbacks = [checkpoint, earlystopping]

hist = model.fit(train_sparse_x+train_dense_x, 
                  train_label,
                  batch_size=8192,
                  epochs=50,
                  validation_data=(valid_sparse_x+valid_dense_x, valid_label),
                  callbacks=callbacks,
                  shuffle=True)

Train on 82000000 samples, validate on 5000000 samples
Epoch 1/50
Epoch 00001: val_auc improved from -inf to 0.95615, saving model to deepfm_model.h5
Epoch 2/50
Epoch 00002: val_auc did not improve from 0.95615
Epoch 3/50
Epoch 00003: val_auc improved from 0.95615 to 0.95988, saving model to deepfm_model.h5
Epoch 4/50

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





In [26]:
test_sparse_x = [test_x[f].values for f in categorical_features]
test_dense_x = [test_x[f].values for f in numerical_features]

In [1]:
test_pred = model.predict(test_sparse_x+test_dense_x, batch_size=4096, verbose=100)