In [1]:
import pickle
import os
import re
import copy
from collections import Counter

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns

In [2]:
%matplotlib inline

In [3]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.optim

In [4]:
from sklearn.metrics import mean_squared_error

In [5]:
if torch.cuda.is_available():
    DEVICE = 'cuda:0'
else:
    DEVICE = 'cpu'

# 1.读取数据

In [6]:
data_df = pd.read_pickle('../data/processed/data_df.pkl')

In [7]:
total_usr = len(set(data_df.usr_id.tolist()))
total_item = len(set(data_df.item_id.tolist()))

In [8]:
usr_feature_df = pd.read_pickle('../data/processed/usr_feature_df.pkl')
item_feature_df = pd.read_pickle('../data/processed/item_feature_df.pkl')

In [9]:
u_feature_dim = usr_feature_df['features'][0].shape[0]
i_feature_dim = item_feature_df['features'][0].shape[0]

### 取特征辅助函数

In [10]:
def get_UI_feature(uid,iid,
           usr_feautre_df=usr_feature_df,
           item_feature_df=item_feature_df):
    '''给定用户和物品id进行特征查找 此处id是处理后的id即-1后的id
    '''
    
    u_feature = usr_feature_df.loc[uid,'features']

    u_feature = np.array(u_feature)

    i_feature = item_feature_df.loc[iid,'features']

    i_feature = np.array(i_feature)

    return u_feature,i_feature

# 2.深度模型 修改加入特征处理部分

In [11]:
class RecManModel(torch.nn.Module):
    def __init__(self,
                 usr_num:int,
                 item_num:int,
                 emb_usr_size:int,
                 emb_item_size:int,
                 usr_feature_dim:int,
                 item_feature_dim:int,
                 feature_hidden_size:int,
                 interact_hidden_size:int):
        '''
        usr_num 和 item_num 为对应用户和物品的总数
        emb_size设定用户和物品的隐变量维度
        feature_dim 为对应用户和物品的特征维度 处理后拼接
        feature_hidden_size 为输入特征映射的维度
        interact_hidden_size 为原来交互时特征维度
        '''
        super(RecManModel,self).__init__()
        self.usr_num = usr_num
        self.item_num = item_num
        self.emb_usr_size = emb_usr_size
        self.emb_item_size = emb_item_size
        self.usr_feature_dim = usr_feature_dim
        self.item_feature_dim = item_feature_dim
        self.feature_hidden_size = feature_hidden_size
        self.interact_hidden_size = interact_hidden_size
        
        # 查看显卡设备是否可用 
        if torch.cuda.is_available():
            self.device = 'cuda:0'
        else:
            self.device = 'cpu'
            
        self.UserEmbeddingLayer = torch.nn.Embedding(num_embeddings=self.usr_num,
                                                     embedding_dim=self.emb_usr_size)
        self.ItemEmbeddingLayer = torch.nn.Embedding(num_embeddings=self.item_num,
                                                     embedding_dim=self.emb_item_size)
        
        self.UserFeatureLayer = torch.nn.Linear(in_features=self.emb_usr_size,
                                                out_features=self.interact_hidden_size)
        self.ItemFeatureLayer = torch.nn.Linear(in_features=self.emb_item_size,
                                               out_features=self.interact_hidden_size)
        
        self.UserMannualFeatureLayer = torch.nn.Linear(in_features=self.usr_feature_dim,
                                                       out_features=self.feature_hidden_size)
        self.ItemMannualFeatureLayer = torch.nn.Linear(in_features=self.item_feature_dim,
                                                       out_features=self.feature_hidden_size)
        
        
        # to device
        self.UserEmbeddingLayer.to(self.device)
        self.UserFeatureLayer.to(self.device)
        self.ItemEmbeddingLayer.to(self.device)
        self.ItemFeatureLayer.to(self.device)
        self.UserMannualFeatureLayer.to(self.device)
        self.ItemMannualFeatureLayer.to(self.device)
        
        
    def forward(self,uid_batch,iid_batch,u_man_feature_batch,i_man_feature_batch):
        '''输入一个batch的usr和item进行交互
        '''
        if not torch.is_tensor(uid_batch):
            u_batch_tensor = torch.tensor(uid_batch)
        else:
            u_batch_tensor = uid_batch
        if not torch.is_tensor(iid_batch):
            i_batch_tensor = torch.tensor(iid_batch)
        else:
            i_batch_tensor = iid_batch
        
        if not torch.is_tensor(u_man_feature_batch):
            u_man_feature_batch = torch.tensor(u_man_feature_batch)
        if not torch.is_tensor(i_man_feature_batch):
            i_man_feature_batch = torch.tensor(i_man_feature_batch)
        
        # 装入设备
        u_batch_tensor = u_batch_tensor.to(self.device)
        i_batch_tensor = i_batch_tensor.to(self.device)
        u_man_feature_batch = u_man_feature_batch.to(self.device)
        i_man_feature_batch = i_man_feature_batch.to(self.device)
        
        # 数据类型转换
        u_man_feature_batch = u_man_feature_batch.to(torch.float)
        i_man_feature_batch = i_man_feature_batch.to(torch.float)
        
        # 嵌入 向量化
        
        u_emb_tensor = self.UserEmbeddingLayer(u_batch_tensor)
        i_emb_tensor = self.ItemEmbeddingLayer(i_batch_tensor)
        
        # 特征抽取 和 非线性化
        u_feature = self.UserFeatureLayer(u_emb_tensor)
        i_feature = self.ItemFeatureLayer(i_emb_tensor)
        
        u_feature = torch.relu(u_feature)
        i_feature = torch.relu(i_feature)
        
        # 外部特征映射
        
        u_mannual_feature = self.UserMannualFeatureLayer(u_man_feature_batch)
        i_mannual_feature = self.ItemMannualFeatureLayer(i_man_feature_batch)
        
        u_mannual_feature = torch.relu(u_mannual_feature)
        i_mannual_feature = torch.relu(i_mannual_feature)
        
        # 隐式显式特征拼接
        u_final_feature = torch.cat([u_feature,u_mannual_feature],dim=1)
        i_final_feature = torch.cat([i_feature,i_mannual_feature],dim=1)
        
        batch_size = u_feature.shape[0]
        interact_hidden_size = self.feature_hidden_size + self.interact_hidden_size
        u_final_feature = u_final_feature.reshape(batch_size,1,interact_hidden_size)
        i_final_feature = i_final_feature.reshape(batch_size,interact_hidden_size,1)
        
        output = torch.bmm(u_final_feature,i_final_feature)
        output = torch.squeeze(output)

        return output

In [12]:
nn = RecManModel(usr_num=total_usr,item_num=total_item,
                 emb_usr_size=50,emb_item_size=150,
                 usr_feature_dim=u_feature_dim,item_feature_dim=i_feature_dim,
                 feature_hidden_size=10,interact_hidden_size=25)

In [13]:
sample = data_df.sample(n=32)

usr_batch = [int(i) for i in sample.usr_id.tolist()]
item_batch = [int(i) for i in sample.item_id.tolist()]
sample =sample.reset_index(drop=True)

usr_feature,item_feature = [],[]
for num,i in enumerate(sample.itertuples()):
    
    usr_id = int(sample.loc[num,'usr_id'])-1
    item_id = int(sample.loc[num,'item_id'])-1
    u,i = get_UI_feature(usr_id,item_id)
    usr_feature.append(u)
    item_feature.append(i)
usr_feature = np.stack(usr_feature)
item_feature = np.stack(item_feature)

In [14]:
nn(usr_batch,item_batch,usr_feature,item_feature)

tensor([0.6906, 2.4191, 0.8169, 1.6972, 1.1681, 3.0045, 2.5780, 2.0790, 1.3601,
        0.4192, 1.9775, 1.6817, 3.8452, 1.9936, 1.0571, 1.5272, 0.8252, 2.3773,
        1.6979, 2.1573, 1.8671, 4.0534, 1.7376, 0.5534, 0.3037, 2.3241, 1.1503,
        0.7811, 3.0241, 1.0510, 0.7794, 0.5163], grad_fn=<SqueezeBackward0>)

### 交叉验证

In [15]:
class MLManDataSet(Dataset):
    def __init__(self,df,usr_feature_df=usr_feature_df,item_feature_df=item_feature_df):
        '''
        输入df 构造dataset
        输出 样本编号和标注
        '''
        self.df = df.copy()
        # 注意 原始数据用户id和物品id从1开始的，但是在embedding过程中是从0算的，因此此处减一
        self.df['usr_id'] = df['usr_id'].apply(lambda x:int(x)-1)
        self.df['item_id'] = df['item_id'].apply(lambda x:int(x)-1)
        self.df['rating'] = df['rating'].apply(lambda x:int(x))
        
    def __len__(self):
        return self.df.shape[0]
    def __getitem__(self,idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        sample_df = self.df.iloc[idx]
        usr_feature,item_feature = get_UI_feature(sample_df.usr_id,sample_df.item_id)
        sample = (sample_df.usr_id,sample_df.item_id,usr_feature,item_feature,sample_df.rating)
        return sample

In [16]:
cv_df_lst = []
for i in range(5):
    df_fname = '../data/processed/cv_{0}_df.pkl'.format(i+1)
    df = pd.read_pickle(df_fname)
    cv_df_lst.append(df)

### 定义模型、优化器、损失函数

In [17]:
model = RecManModel(usr_num=total_usr,item_num=total_item,
                 emb_usr_size=50,emb_item_size=150,
                 usr_feature_dim=u_feature_dim,item_feature_dim=i_feature_dim,
                 feature_hidden_size=10,interact_hidden_size=25)

In [18]:
model

RecManModel(
  (UserEmbeddingLayer): Embedding(943, 50)
  (ItemEmbeddingLayer): Embedding(1682, 150)
  (UserFeatureLayer): Linear(in_features=50, out_features=25, bias=True)
  (ItemFeatureLayer): Linear(in_features=150, out_features=25, bias=True)
  (UserMannualFeatureLayer): Linear(in_features=3, out_features=10, bias=True)
  (ItemMannualFeatureLayer): Linear(in_features=20, out_features=10, bias=True)
)

In [19]:
optimizer = torch.optim.SGD([
        {'params': model.parameters()},
                ], lr=0.005,momentum=0.9)

In [20]:
BATCH_SIZE = 1024

### 训练测试过程

仅训练一个step的结果

In [None]:
mse_lst = []

for test_idx in range(5):
    print('----------------')
    print('Test on cv_{0}_df'.format(test_idx+1))
    train_idx_lst = [i for i in range(5) if i!=test_idx]
    # 训练过程
    train_df_lst = []
    for train_idx in train_idx_lst:
        df = cv_df_lst[train_idx]
        train_df_lst.append(df[df['type']=='test'])
    train_df = pd.concat(train_df_lst)
    train_dataset = MLManDataSet(train_df)
    train_dataloader = DataLoader(train_dataset,batch_size=BATCH_SIZE)
    

    model.train()
    
    print('Starting training porcess...')
    for num,(uid_batch,iid_batch,usr_feature,item_feature,true_y) in enumerate(train_dataloader):
        optimizer.zero_grad()
        criterion = torch.nn.MSELoss()
        pred_y = model(uid_batch,iid_batch,usr_feature,item_feature)
        true_y = true_y.to(torch.float).to(DEVICE)
        loss = criterion(pred_y,true_y)
        loss.requires_grad_(True)
        loss.backward()
        print('Batch:{0} Loss:{1}'.format(num,loss))
        optimizer.step()
    
    # 测试过程
    model.eval()
    df = cv_df_lst[test_idx]
    test_df = df[df['type']=='test']

    test_dataset = MLManDataSet(test_df)
    test_dataloader = DataLoader(test_dataset,batch_size=BATCH_SIZE)
    
    temp_mse_lst = []
    # 分batch进行MSE计算 最后平均
    for num,(test_uid_batch,test_iid_batch,test_usr_feature,test_item_feature,test_true_y) in enumerate(train_dataloader):
    
        test_pred_y = model(test_uid_batch,test_iid_batch,test_usr_feature,test_item_feature)
        test_pred_y = test_pred_y.cpu().detach().numpy()
        
        mse_batch = mean_squared_error(test_true_y,test_pred_y)
        temp_mse_lst.append(mse_batch)
        
    mse = np.average(temp_mse_lst)
    mse_lst.append(mse)
    print('MSE on cv_{0}_df:{1}'.format(test_idx+1,mse))
print('Average MSE:{0}'.format(np.average(mse_lst)))

----------------
Test on cv_1_df
Starting training porcess...
Batch:0 Loss:5.99376106262207
Batch:1 Loss:6.159246921539307
Batch:2 Loss:6.090018272399902
Batch:3 Loss:6.892715930938721
Batch:4 Loss:4.588624954223633
Batch:5 Loss:5.650768756866455
Batch:6 Loss:4.795273303985596
Batch:7 Loss:5.318187713623047
Batch:8 Loss:4.009875297546387
Batch:9 Loss:3.886538505554199
Batch:10 Loss:3.368165969848633
Batch:11 Loss:2.714223861694336
Batch:12 Loss:2.88714861869812
Batch:13 Loss:2.632943868637085
Batch:14 Loss:2.704464912414551
Batch:15 Loss:3.0262200832366943
Batch:16 Loss:3.131519317626953
Batch:17 Loss:4.6460771560668945
Batch:18 Loss:3.559967041015625
Batch:19 Loss:3.6407463550567627
Batch:20 Loss:3.476149082183838
Batch:21 Loss:3.1928277015686035
Batch:22 Loss:2.7037534713745117
Batch:23 Loss:2.392691135406494
Batch:24 Loss:2.4546163082122803
Batch:25 Loss:2.0850327014923096
Batch:26 Loss:2.214890718460083
Batch:27 Loss:2.2167513370513916
Batch:28 Loss:2.7848289012908936
Batch:29 Loss

Batch:4 Loss:0.9505516886711121
Batch:5 Loss:1.237420916557312
Batch:6 Loss:1.1586445569992065
Batch:7 Loss:1.4052116870880127
Batch:8 Loss:1.0392869710922241
Batch:9 Loss:1.255163311958313
Batch:10 Loss:1.0966659784317017
Batch:11 Loss:1.1131491661071777
Batch:12 Loss:1.1196949481964111
Batch:13 Loss:0.9964392185211182
Batch:14 Loss:0.9976995587348938
Batch:15 Loss:1.0105879306793213
Batch:16 Loss:1.097186803817749
Batch:17 Loss:1.1171531677246094
Batch:18 Loss:1.1770986318588257
Batch:19 Loss:1.1074777841567993
Batch:20 Loss:1.2725296020507812
Batch:21 Loss:0.9205043911933899
Batch:22 Loss:1.1705653667449951
Batch:23 Loss:1.1528295278549194
Batch:24 Loss:1.103266954421997
Batch:25 Loss:1.0771379470825195
Batch:26 Loss:1.130826711654663
Batch:27 Loss:0.9080814719200134
Batch:28 Loss:1.0639129877090454
Batch:29 Loss:1.0799121856689453
Batch:30 Loss:1.1695717573165894


### 对比lgbRegression

```
----------------
Test on cv_1_df
MSE on cv_5_df:0.9916941295553173
----------------
Test on cv_2_df
MSE on cv_5_df:0.9696588575493511
----------------
Test on cv_3_df
MSE on cv_5_df:0.9576872585148317
----------------
Test on cv_4_df
MSE on cv_5_df:0.9672447937961466
----------------
Test on cv_5_df
MSE on cv_5_df:0.97248790766502
Average MSE:0.9717545894161332
```