# Import packages

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

%matplotlib inline
%config InlineBackend.figure_format = 'svg'

import warnings
import gc
import time
import itertools
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, log_loss, roc_auc_score
from matplotlib import pyplot as plt
import seaborn as sns
import gzip
import torch
from deepctr_torch.models import DeepFM,DCN
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names

In [2]:
np.random.seed(42)


# Data Preprocessing and Feature Engineering

In [3]:
#load data
train = pd.read_csv("./data/my_train.csv", header=0, dtype=str)
test_df = pd.read_csv("./data/my_test.csv", header=0, dtype=str)

In [4]:
test_df['data_label'] = 'test'
train['data_label'] = 'train'
data1 = pd.concat([test_df,train])

In [5]:
data1['hour'] = pd.to_datetime(data1['hour'], format = '%y%m%d%H')
data1['click'] = pd.to_numeric(data1['click'], errors='coerce')
data1['dt'] = data1['hour'].apply(lambda x:str(x)[:10])
device_id_sum = data1.groupby(['dt','device_id']).click.sum().reset_index(name='id_sum')
device_id_cnt = data1.groupby(['dt','device_id']).click.count().reset_index(name='id_cnt')
cnt_sum = pd.merge(device_id_cnt,device_id_sum,on=(['dt','device_id']),how='left')
cnt_sum['device_ctr'] = cnt_sum['id_sum']/cnt_sum['id_cnt']
cnt_sum = cnt_sum.sort_values(['device_id','dt'])
cnt_sum['device_ctr'] = cnt_sum['id_sum']/cnt_sum['id_cnt']
cnt_sum = cnt_sum.sort_values(['device_id','dt'])
cnt_sum['t_1']=cnt_sum.groupby(['device_id'])['device_ctr'].shift(1)
cnt_sum.groupby('device_id')['dt'].count().reset_index().sort_values('dt',ascending=False)
data1 = pd.merge(data1,cnt_sum,on=(['dt','device_id']),how='left')

In [6]:
data1['hour_of_day'] = data1['hour'].dt.hour
data1['weekday'] = (data1['hour'].dt.dayofweek)

In [7]:
obj_cols = ['site_id',
 'site_domain',
 'site_category',
 'app_id',
 'app_domain',
 'app_category',
 'device_id',
 'device_ip',
 'device_model','device_type','device_conn_type'
,'hour_of_day','weekday','banner_pos']
c_cols = ['C14',
 'C15',
 'C16',
 'C17',
 'C18',
 'C19',
 'C20',
 'C21','C1']

In [8]:
# sparse特征列
sparse_features = obj_cols+['hour_of_day','weekday']+ c_cols


In [9]:
# Padding
for f in ['device_type','device_conn_type','banner_pos'
          ,'C14','C15','C16','C17','C18', 'C19', 'C20', 'C21', 'C1']:
    data1[f] = data1[f].astype(str)

In [10]:
for f in sparse_features:
    data1[f] = data1[f].fillna('-1', )

In [11]:
# 对sparse onehot
for feat in sparse_features:
    lbe = LabelEncoder()
    data1[feat] = lbe.fit_transform(data1[feat])

In [12]:
# Continuous Features
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
dense_features = ['id_cnt',
 'id_sum',
 'device_ctr',
 't_1']
for f in dense_features:
    data1[f] = data1[f].fillna(0, )

mms = MinMaxScaler(feature_range=(0, 1))
data1[dense_features] = mms.fit_transform(data1[dense_features])


In [13]:
# 2.count #unique features for each sparse field,and record dense feature field name

fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data1[feat].nunique(),embedding_dim=4 )
                       for i,feat in enumerate(sparse_features)] + [DenseFeat(feat, 1,)
                      for feat in dense_features]
# fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data1[feat].nunique(),embedding_dim=4 )
#                        for i,feat in enumerate(sparse_features)]


In [14]:
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)


# Create Dataset 1 and Test Set 1

In [15]:
train = data1[data1['data_label']=='train']
Dataset_1 = train[:27000]
test_1 = data1[data1['data_label']=='test']


In [16]:
train_1_model_input = {name:Dataset_1[name] for name in feature_names}

test_1_model_input = {name:test_1[name] for name in feature_names}

# Train Model 1 and 8 on Dataset 1

In [17]:
from deepctr_torch.models import DeepFM,AFM,xDeepFM,AutoInt,DCN,DIFM
device = 'cpu'
use_cuda = True
if use_cuda and torch.cuda.is_available():
    print('cuda ready...')
    device = 'cuda:0'
# Set random seed
torch.manual_seed(42)
np.random.seed(42)

In [18]:
model_1 = DCN(linear_feature_columns,dnn_feature_columns,task='binary',device=device)
model_1.compile("adam", "binary_crossentropy",metrics=['binary_crossentropy'], )
history = model_1.fit(train_1_model_input,Dataset_1['click'].values,batch_size=128,epochs=1,verbose=2,validation_split=0.1)

test_1['pred_1']= model_1.predict(test_1_model_input, batch_size=128)
        

cpu
Train on 24300 samples, validate on 2700 samples, 190 steps per epoch
Epoch 1/1
4s - loss:  0.4312 - binary_crossentropy:  0.4312 - val_binary_crossentropy:  0.4038


In [19]:
model_best = DeepFM(linear_feature_columns,dnn_feature_columns,task='binary',device=device)
model_best.compile("adam", "binary_crossentropy",metrics=['binary_crossentropy'], )
history = model_best.fit(train_1_model_input,Dataset_1['click'].values,batch_size=128,epochs=1,verbose=2,validation_split=0.1)
test_1['pred_best']= model_best.predict(test_1_model_input, batch_size=128)


cpu
Train on 24300 samples, validate on 2700 samples, 190 steps per epoch
Epoch 1/1
4s - loss:  0.4268 - binary_crossentropy:  0.4267 - val_binary_crossentropy:  0.3968


# Create Dataset 2 and Save the offline Data

In [20]:
train_2 = train[30000:360000]
train_2_model_input = {name:train_2[name] for name in feature_names}
train_2['pred']= model_1.predict(train_2_model_input, batch_size=128)
Dataset_2 = train_2.groupby(train_2.index // 10).apply(lambda x: x.loc[x["pred"].idxmax()])

In [21]:
Test_set_2 = Dataset_2[:3000]
Test_set_2_model_input = {name:Test_set_2[name] for name in feature_names}
new_train=Dataset_2[3000:]
new_train_model_input = {name:new_train[name] for name in feature_names}


In [22]:
Dataset_2['click'].to_csv("data/offline_data.csv")

# Train Model 2-7 on Dataset 2

In [None]:
model2 = DIFM(linear_feature_columns,dnn_feature_columns,task='binary',device=device)
model2.compile("adam", "binary_crossentropy",metrics=['binary_crossentropy'], )
history = model2.fit(new_train_model_input,new_train['click'].values,batch_size=64,epochs=1,verbose=2,validation_split=0.1)
test_1['pred_2']= model2.predict(test_1_model_input, batch_size=128)

model3 = DeepFM(linear_feature_columns,dnn_feature_columns,task='binary',device=device)
model3.compile("adam", "binary_crossentropy",metrics=['binary_crossentropy'], )
history = model3.fit(new_train_model_input,new_train['click'].values,batch_size=64,epochs=1,verbose=2,validation_split=0.1)
test_1['pred_3']= model3.predict(test_1_model_input, batch_size=128)

model4 = DCN(linear_feature_columns,dnn_feature_columns,task='binary',device=device)
model4.compile("adam", "binary_crossentropy",metrics=['binary_crossentropy'], )
history = model4.fit(new_train_model_input,new_train['click'].values,batch_size=64,epochs=1,verbose=2,validation_split=0.1)
test_1['pred_4']= model4.predict(test_1_model_input, batch_size=128)

model5 = DIFM(linear_feature_columns,dnn_feature_columns,task='binary',device=device, dnn_hidden_units=(256, 128, 64),)
model5.compile("adam", "binary_crossentropy",metrics=['binary_crossentropy'], )
history = model5.fit(new_train_model_input,new_train['click'].values,batch_size=64,epochs=1,verbose=2,validation_split=0.1)
test_1['pred_5']= model5.predict(test_1_model_input, batch_size=128)

model6 = DeepFM(linear_feature_columns,dnn_feature_columns,task='binary',device=device, dnn_hidden_units=(128, 128, 64))
model6.compile("adam", "binary_crossentropy",metrics=['binary_crossentropy'], )
history = model6.fit(new_train_model_input,new_train['click'].values,batch_size=64,epochs=1,verbose=2,validation_split=0.1)
test_1['pred_6']= model6.predict(test_1_model_input, batch_size=128)

model7 = DCN(linear_feature_columns,dnn_feature_columns,task='binary',device=device, dnn_hidden_units=(256, 128, 64),)
model7.compile("adam", "binary_crossentropy",metrics=['binary_crossentropy'], )
history = model7.fit(new_train_model_input,new_train['click'].values,batch_size=64,epochs=1,verbose=2,validation_split=0.1)
test_1['pred_7']= model7.predict(test_1_model_input, batch_size=128)

cpu
Train on 27000 samples, validate on 3000 samples, 422 steps per epoch
Epoch 1/1
11s - loss:  0.5953 - binary_crossentropy:  0.5953 - val_binary_crossentropy:  0.5768


# Evaluate all models on the Two test sets and generate reward table

In [None]:
import torch.nn as nn

# Define Binary Cross-Entropy loss function
criterion = nn.BCELoss()


In [None]:
test_1["pred_1"]= model_1.predict(test_1_model_input, batch_size=128)
test_1["pred_2"]= model2.predict(test_1_model_input, batch_size=128)
test_1["pred_3"]= model3.predict(test_1_model_input, batch_size=128)
test_1["pred_4"]= model4.predict(test_1_model_input, batch_size=128)
test_1["pred_5"]= model5.predict(test_1_model_input, batch_size=128)
test_1["pred_6"]= model6.predict(test_1_model_input, batch_size=128)
test_1["pred_7"]= model7.predict(test_1_model_input, batch_size=128)

test_1["pred_best"]= model_best.predict(test_1_model_input, batch_size=128)

for num_model in ["pred_1","pred_2","pred_3","pred_4","pred_5","pred_6","pred_7","pred_best"]:
    print(num_model, ", binary cross entropy loss: ", criterion(torch.tensor(test_1[num_model].values, dtype=torch.float32)
                                                                ,torch.tensor(test_1["click"].values, dtype=torch.float32)).item())

In [None]:
Test_set_2["pred_1"]= model.predict(Test_set_2_model_input, batch_size=128)
Test_set_2["pred_2"]= model2.predict(Test_set_2_model_input, batch_size=128)
Test_set_2["pred_3"]= model3.predict(Test_set_2_model_input, batch_size=128)
Test_set_2["pred_4"]= model4.predict(Test_set_2_model_input, batch_size=128)
Test_set_2["pred_5"]= model5.predict(Test_set_2_model_input, batch_size=128)
Test_set_2["pred_6"]= model6.predict(Test_set_2_model_input, batch_size=128)
Test_set_2["pred_7"]= model7.predict(Test_set_2_model_input, batch_size=128)

Test_set_2["pred_best"]= model_best.predict(Test_set_2_model_input, batch_size=128)

for num_model in ["pred_1","pred_2","pred_3","pred_4","pred_5","pred_6","pred_7","pred_best"]:
    print(num_model, ", binary cross entropy loss: ", criterion(torch.tensor(Test_set_2[num_model].values, dtype=torch.float32)
                                                                ,torch.tensor(Test_set_2["click"].values, dtype=torch.float32)).item())

In [None]:
reward1 = test_1.groupby(test_1.index // 10).apply(lambda x: x.loc[x["pred_1"].idxmax()])[["click"]]
for num_model in ["pred_2","pred_3","pred_4","pred_5","pred_6","pred_7","pred_best"]:
    rw =test_1.groupby(test_1.index // 10).apply(lambda x: x.loc[x[num_model].idxmax()])[["click"]]
    reward1[num_model] = rw["click"].values

In [None]:
reward1.rename(columns={'click': 'pred_1'}, inplace=True)


In [None]:
reward1.to_csv("data/reward_table.csv")