# 天猫用户复购预测-DIN

* Content：天猫用户复购预测,使用Attention机制的DNN模型完成预测;
* Author:  HuiHui
* Date:    2020-03-28
* Reference:
* 数据集：该数据集包含“双11”前6个月和“双11”当天匿名用户的购物日志，以及显示他们是否为重复购买者的标签信息。
    * label: 1'表示'user_id'是'merchant_id'的重复买家，而'0'则相反。'-1'表示'user_id'不是给定商家的新客户
    * activity_log: {用户id，商家id}之间的一组交互记录，其中每个记录都是一个动作，表示为“项目id:category id:brand id:time\u stamp:action\u type”#'用于分隔两个相邻元素。记录不按任何特定顺序排序

In [0]:
from google.colab import drive
drive.mount('/content/gdrive') #挂载网盘

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import os
os.chdir("/content/gdrive/My Drive/RS/Repeat Buyers Prediction") #改变当前工作目录到指定的路径

In [0]:
!pip install tqdm
%tensorflow_version 1.x
!pip install -q deepctr[gpu]

TensorFlow 1.x selected.


In [0]:
#!pip uninstall -y tensorflow
#!pip install tensorflow-gpu==1.14.0

In [0]:
#环境
import sys
print(sys.version)

import deepctr
print(deepctr.__version__)

import tensorflow as tf
print(tf.__version__)

if tf.test.gpu_device_name():
  print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
else:
  print("Please install GPU version of TF")

3.6.9 (default, Nov  7 2019, 10:44:02) 
[GCC 8.3.0]
0.7.4
1.15.2
Default GPU Device: /device:GPU:0


In [0]:
import gc
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# time1=time.time()
# time2=time.time()
# print(time2-time1)

# 用户行为，使用format1进行加载
# 加载全量样本
user_log = pd.read_csv('./Repeat Buyers Prediction DataSet/data_format1/user_log_format1.csv', dtype={'time_stamp':'str'})
user_info = pd.read_csv('./Repeat Buyers Prediction DataSet/data_format1/user_info_format1.csv')
train_data1 = pd.read_csv('./Repeat Buyers Prediction DataSet/data_format1/train_format1.csv')
submission = pd.read_csv('./Repeat Buyers Prediction DataSet/data_format1/test_format1.csv')

# # 加载小样本
# user_log = pd.read_csv('./Repeat Buyers Prediction DataSet/data_format1_small/sample_user_log.csv', dtype={'time_stamp':'str'})
# user_info = pd.read_csv('./Repeat Buyers Prediction DataSet/data_format1_small/sample_user_info.csv')
# train_data1 = pd.read_csv('./Repeat Buyers Prediction DataSet/data_format1_small/train.csv')
# submission = pd.read_csv('./Repeat Buyers Prediction DataSet/data_format1_small/test.csv')

train_data = pd.read_csv('./Repeat Buyers Prediction DataSet/data_format2/train_format2.csv')

###### 数据处理 #######
train_data1['origin'] = 'train'
submission['origin'] = 'test'
matrix = pd.concat([train_data1, submission], ignore_index=True, sort=False)
#print(matrix.head())

# 使用merchant_id（原列名seller_id）
user_log.rename(columns={'seller_id':'merchant_id'}, inplace=True)
# 格式化
user_log['user_id'] = user_log['user_id'].astype('int32')
user_log['merchant_id'] = user_log['merchant_id'].astype('int32')
user_log['item_id'] = user_log['item_id'].astype('int32')
user_log['cat_id'] = user_log['cat_id'].astype('int32')
user_log['brand_id'].fillna(0, inplace=True)
user_log['brand_id'] = user_log['brand_id'].astype('int32')
user_log['time_stamp'] = pd.to_datetime(user_log['time_stamp'], format='%H%M')

# 对离散特征做LabelEncoder（0～n-1）
lbe_merchant_id=LabelEncoder()
lbe_merchant_id.fit(np.r_[0,user_log['merchant_id'].values])
user_log['merchant_id']=lbe_merchant_id.transform(user_log['merchant_id'])
matrix['merchant_id']=lbe_merchant_id.transform(matrix['merchant_id'])

lbe_user_id=LabelEncoder()
user_log['user_id']=lbe_user_id.fit_transform(user_log['user_id'])
user_info['user_id']=lbe_user_id.transform(user_info['user_id'])
matrix['user_id']=lbe_user_id.transform(matrix['user_id'])

lbe_item_id=LabelEncoder()
user_log['item_id']=lbe_item_id.fit_transform(user_log['item_id'])
lbe_cat_id=LabelEncoder()
user_log['cat_id']=lbe_cat_id.fit_transform(user_log['cat_id'])
lbe_brand_id=LabelEncoder()
user_log['brand_id']=lbe_brand_id.fit_transform(user_log['brand_id'])

user_log['merchant_id'].max(),user_log['user_id'].max() #统计不同商家个数：4994+1，不同买家的个数：19111+1(小样本时)
print(user_log['merchant_id'].max())
print(user_log['user_id'].max())
matrix = matrix.merge(user_info, on='user_id', how='left')

# 1 for <18; 2 for [18,24]; 3 for [25,29]; 4 for [30,34]; 5 for [35,39]; 6 for [40,49]; 7 and 8 for >= 50; 0 and NULL for unknown
matrix['age_range'].fillna(0, inplace=True)
# 0:female, 1:male, 2:unknown
matrix['gender'].fillna(2, inplace=True)
matrix['age_range'] = matrix['age_range'].astype('int8')
matrix['gender'] = matrix['gender'].astype('int8')
matrix['label'] = matrix['label'].astype('str')
matrix['user_id'] = matrix['user_id'].astype('int32')
matrix['merchant_id'] = matrix['merchant_id'].astype('int32')
del user_info, train_data1
gc.collect()

# User特征处理
groups = user_log.groupby(['user_id'])
# 用户交互行为数量 u1
temp = groups.size().reset_index().rename(columns={0:'u1'})
matrix = matrix.merge(temp, on='user_id', how='left')
# 使用agg 基于列的聚合操作，统计唯一值的个数 item_id, cat_id, merchant_id, brand_id
#temp = groups['item_id', 'cat_id', 'merchant_id', 'brand_id'].nunique().reset_index().rename(columns={'item_id':'u2', 'cat_id':'u3', 'merchant_id':'u4', 'brand_id':'u5'})
temp = groups['item_id'].agg([('u2', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
temp = groups['cat_id'].agg([('u3', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
temp = groups['merchant_id'].agg([('u4', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
temp = groups['brand_id'].agg([('u5', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')

# 时间间隔特征 u6 按照小时
temp = groups['time_stamp'].agg([('F_time', 'min'), ('L_time', 'max')]).reset_index()
temp['u6'] = (temp['L_time'] - temp['F_time']).dt.seconds/3600 #用户一开始在淘宝买东西和最近一次在淘宝买东西的间隔时间
matrix = matrix.merge(temp[['user_id', 'u6']], on='user_id', how='left')
# 统计action_type为0，1，2，3的个数（原始操作，没有补0）
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'u7', 1:'u8', 2:'u9', 3:'u10'})
matrix = matrix.merge(temp, on='user_id', how='left')
#print(matrix)

# 商家特征处理
groups = user_log.groupby(['merchant_id'])
# 商家被交互行为数量 m1
temp = groups.size().reset_index().rename(columns={0:'m1'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
# 统计商家被交互的user_id, item_id, cat_id, brand_id 唯一值
temp = groups['user_id', 'item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(columns={'user_id':'m2', 'item_id':'m3', 'cat_id':'m4', 'brand_id':'m5'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
# 统计商家被交互的action_type 唯一值
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'m6', 1:'m7', 2:'m8', 3:'m9'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
# 按照merchant_id 统计随机负采样的个数
temp = train_data[train_data['label']==-1].groupby(['merchant_id']).size().reset_index().rename(columns={0:'m10'})
matrix = matrix.merge(temp, on='merchant_id', how='left')

# 按照user_id, merchant_id分组
groups = user_log.groupby(['user_id', 'merchant_id'])
temp = groups.size().reset_index().rename(columns={0:'um1'}) #统计行为个数
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(columns={'item_id':'um2', 'cat_id':'um3', 'brand_id':'um4'}) #统计item_id, cat_id, brand_id唯一个数
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'um5', 1:'um6', 2:'um7', 3:'um8'})#统计不同action_type唯一个数
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['time_stamp'].agg([('first', 'min'), ('last', 'max')]).reset_index()
temp['um9'] = (temp['last'] - temp['first']).dt.seconds/3600
temp.drop(['first', 'last'], axis=1, inplace=True)
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left') #统计时间间隔

#用户购买点击比
matrix['r1'] = matrix['u9']/matrix['u7'] 
#商家购买点击比
matrix['r2'] = matrix['m8']/matrix['m6'] 
#不同用户不同商家购买点击比
matrix['r3'] = matrix['um7']/matrix['um5']
matrix.fillna(0, inplace=True)
# # 修改age_range字段名称为 age_0, age_1, age_2... age_8
temp = pd.get_dummies(matrix['age_range'], prefix='age')
matrix = pd.concat([matrix, temp], axis=1)
temp = pd.get_dummies(matrix['gender'], prefix='g')
matrix = pd.concat([matrix, temp], axis=1)
matrix.drop(['age_range', 'gender'], axis=1, inplace=True)

lbe_action_type={0:1,1:2,2:3,3:4}
user_log['action_type']=user_log['action_type'].map(lbe_action_type) #action_type映射成1,2,3,4；方便后面用0补缺固定长度

# 用户行为sequence
# 把user_log里同user的这些数据合并成一个list
temp=pd.DataFrame(user_log.groupby('user_id')['merchant_id','action_type'].agg(lambda x:list(x)))
# 列名称改成hist_merchant_id 和 hist_action_type 
temp.columns=['hist_merchant_id','hist_action_type']
matrix = matrix.merge(temp, on=['user_id'], how='left')

# 截取，补缺到定长M个
M=500
for feature in ['hist_merchant_id','hist_action_type']:
    matrix[feature]=matrix[feature].map(lambda x:np.array(x+[0]*(M-len(x)))[:M])
#print(matrix.info())

# 分割训练数据和测试数据
train_data = matrix[matrix['origin'] == 'train'].drop(['origin'], axis=1)
test_data = matrix[matrix['origin'] == 'test'].drop(['label', 'origin'], axis=1)
train_X, train_y = train_data.drop(['label'], axis=1), train_data['label']

#保存特征文件，调试不同算法模型
train_X.to_csv('./train_X.csv', index=False)
train_y.to_csv('./train_y.csv', index=False)
test_data.to_csv('./test_data.csv', index=False)
print("ok")

4995
424169




ok


In [0]:
##### 使用DIN模型 ######
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import log_loss
from deepctr.inputs import SparseFeat,VarLenSparseFeat,DenseFeat,get_feature_names
from deepctr.models import DIN, DIEN, DSIN
from sklearn.metrics import classification_report

M=500
#读取submission,存储结果
import gc
import numpy as np
import pandas as pd
import re
submission = pd.read_csv('./Repeat Buyers Prediction DataSet/data_format1/test_format1.csv') # 加载全量样本时
# submission = pd.read_csv('./Repeat Buyers Prediction DataSet/data_format1_small/test.csv') # 加载小样本时
submission['origin'] = 'test'
print(submission.head())

#读取保存好的特征文件，注意这里重新读取会将'hist_merchant_id','hist_action_type'读取成str,而不是list，因此需要将train_X和test_data中的hist_merchant_id，hist_action_type转回list
train_X=pd.read_csv('./train_X.csv')
train_y=pd.read_csv('./train_y.csv')
test_data=pd.read_csv('./test_data.csv')
#将train_X和test_data中的hist_merchant_id，hist_action_type转回list
def strlist_to_list(s):
  s=re.sub('[\[\]]','',s)#删除中括号
  s=s.replace("\n", "")#删除换行符号
  s=s.split() # 以空格（一个或多个）分隔
  s=[float(num) for num in s]
  return s
train_X['hist_merchant_id']=train_X['hist_merchant_id'].map(strlist_to_list)
train_X['hist_action_type']=train_X['hist_action_type'].map(strlist_to_list)
test_data['hist_merchant_id']=test_data['hist_merchant_id'].map(strlist_to_list)
test_data['hist_action_type']=test_data['hist_action_type'].map(strlist_to_list)
#print(type(train_X['hist_merchant_id'].values))
print("ok")

train_X['action_type']=3 #因为这里用户在商家是购买过商品的，所以添加了一列action_type，且设为3

#格式转换（将之前的特征名封装成了一个类）
feature_columns = []
for column in train_X.columns:
  if column != 'hist_merchant_id' and column != 'hist_action_type':
    #print(column)
    num = train_X[column].nunique()
    if num > 10000:
        dim = 10
    else:
        if num > 1000:
            dim = 8
        else:
            dim = 4
    #print(num)
    if column  == 'user_id':
        feature_columns += [SparseFeat(column, 424169+1, embedding_dim=dim)]
    elif column  == 'merchant_id':
        feature_columns += [SparseFeat(column, 4995+1, embedding_dim=dim)]
    elif column  == 'action_type':
        feature_columns += [SparseFeat(column, 4+1, embedding_dim=dim)]
    else:
        feature_columns += [DenseFeat(column, 1)]

#print(train_X['hist_merchant_id'].shape)
#M = len(train_X['hist_merchant_id'])
print('M=', M)

# maxlen为历史信息的长度，vocabulary_size为onehot的长度；VarLenSparseFeat为序列类型特征
# 注意正确使用VarLenSparseFeat，不要加参数：weight_name='hist_merchant_id'，weight_name='hist_action_type'
feature_columns += [VarLenSparseFeat(SparseFeat('hist_merchant_id', vocabulary_size=424169+1, embedding_dim=8), maxlen=M),
                   VarLenSparseFeat(SparseFeat('hist_action_type', vocabulary_size=4+1, embedding_dim=4),maxlen=M)]
hist_features=['merchant_id','action_type']
print(feature_columns)

# 使用DIN模型
# 这里要求inputs shapes删除维度（/通道数）以后形状相同;通道数位置为-1，即倒数第一个数
model=DIN(feature_columns, hist_features)
# 使用Adam优化器，二分类的交叉熵
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
# 组装train_model_input，得到feature names，将train_X转换为字典格式
feature_names=list(train_X.columns)
train_model_input ={name:train_X[name].values for name in feature_names}

# histroy输入必须是二维数组
#进度条
from tqdm import tqdm
for fea in ['hist_merchant_id','hist_action_type']:
    l = []
    for i in tqdm(train_model_input[fea]):
        l.append(i)
    train_model_input[fea]=np.array(l) #转换成二维数组

history = model.fit(train_model_input, train_y, verbose=True, epochs=10, validation_split=0.2,batch_size=512)

# 转换test__model_input
test_data['action_type']=3
test_model_input = {name:test_data[name].values for name in feature_names}#字典
from tqdm import tqdm
for fea in ['hist_merchant_id','hist_action_type']:
    l = []
    for i in tqdm(test_model_input[fea]):
        l.append(i)
    test_model_input[fea]=np.array(l)#二维数组

# 得到预测结果
prob = model.predict(test_model_input)
submission['prob'] = prob
print(submission.head())
submission.drop(['origin'], axis=1, inplace=True)
submission.to_csv('./prediction.csv', index=False)

   user_id  merchant_id  prob origin
0   163968         4605   NaN   test
1   360576         1581   NaN   test
2    98688         1964   NaN   test
3    98688         3645   NaN   test
4   295296         3361   NaN   test
ok
M= 500
[SparseFeat(name='user_id', vocabulary_size=424170, embedding_dim=10, use_hash=False, dtype='int32', embedding_name='user_id', group_name='default_group'), SparseFeat(name='merchant_id', vocabulary_size=4996, embedding_dim=8, use_hash=False, dtype='int32', embedding_name='merchant_id', group_name='default_group'), DenseFeat(name='prob', dimension=1, dtype='float32'), DenseFeat(name='u1', dimension=1, dtype='float32'), DenseFeat(name='u2', dimension=1, dtype='float32'), DenseFeat(name='u3', dimension=1, dtype='float32'), DenseFeat(name='u4', dimension=1, dtype='float32'), DenseFeat(name='u5', dimension=1, dtype='float32'), DenseFeat(name='u6', dimension=1, dtype='float32'), DenseFeat(name='u7', dimension=1, dtype='float32'), DenseFeat(name='u8', dimension=1, 

100%|██████████| 260864/260864 [00:00<00:00, 2331063.48it/s]
100%|██████████| 260864/260864 [00:00<00:00, 2328345.11it/s]


Train on 208691 samples, validate on 52173 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


100%|██████████| 261477/261477 [00:00<00:00, 2427937.70it/s]
100%|██████████| 261477/261477 [00:00<00:00, 2496333.11it/s]


   user_id  merchant_id  prob origin
0   163968         4605   0.0   test
1   360576         1581   0.0   test
2    98688         1964   0.0   test
3    98688         3645   0.0   test
4   295296         3361   0.0   test


In [0]:
print(submission.describe())

             user_id    merchant_id      prob
count  261477.000000  261477.000000  261477.0
mean   212121.259128    2539.620077       0.0
std    122480.366678    1451.697856       0.0
min         2.000000       2.000000       0.0
25%    106317.000000    1340.000000       0.0
50%    212289.000000    2482.000000       0.0
75%    318194.000000    3898.000000       0.0
max    424169.000000    4993.000000       0.0
