# 一个简单的推荐系统搭建(回归)

# 一、项目背景

试图模仿youtube经典双塔推荐系统架构

# 二、数据集简介

本项目用所采用的的数据集是movielens 1m的数据集(100K)

## 数据加载和预处理

In [1]:
# 因为数据中没有用户信息，所以处理电影信息
import pandas as pd
import numpy as np
movie_data = pd.read_csv("/home/aistudio/data/data104339/movies.csv",usecols=[0,2])
attrs = []
for index,row in movie_data.iterrows():
    # print(row[1])
    genres = row[1].split("|")
    for genre in genres:
        if genre not in attrs:
            attrs.append(genre)

In [2]:
movie_attrs = []
for row in movie_data["genres"].str.split("|"):
    temp = np.zeros(shape=(20))
    for attr in row:
        if attr in attrs:
            temp[attrs.index(attr)] = 1
    movie_attrs.append(temp)
movie_attrs = pd.DataFrame(movie_attrs)
movie_data = pd.concat([movie_data,movie_attrs],axis=1)
movie_data.drop("genres",axis=1,inplace=True)
movie_data.head()

Unnamed: 0,movieId,0,1,2,3,4,5,6,7,8,...,10,11,12,13,14,15,16,17,18,19
0,1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
rating = pd.read_csv("/home/aistudio/data/data104339/ratings.csv",usecols=[0,1,2])
data = pd.merge(rating,movie_data,how='left',on="movieId")
col = [*range(23)]
data.columns = col
X = data.iloc[:,[0]+[*range(3,23)]].values
y = data.iloc[:,2].values.reshape(-1,1)

In [4]:
import paddle
from paddle.io import Dataset

class MyDataset(Dataset):
    """
    步骤一：继承paddle.io.Dataset类
    """
    def __init__(self,x,y,num_samples):
        """
        步骤二：实现构造函数，定义数据集大小
        """
        super(MyDataset, self).__init__()
        self.num_samples = num_samples
        self.x = x
        self.y = y
    def __getitem__(self, index):
        """
        步骤三：实现__getitem__方法，定义指定index时如何获取数据，并返回单条数据（训练数据，对应的标签）
        """
        data = paddle.to_tensor(self.x[index], dtype='float32')
        label = paddle.to_tensor(self.y[index],dtype='float32')

        return data, label

    def __len__(self):
        """
        步骤四：实现__len__方法，返回数据集总数目
        """
        return self.num_samples

# 测试定义的数据集
custom_dataset = MyDataset(X,y,1000)
train_loader = paddle.io.DataLoader(custom_dataset,batch_size=64, shuffle=True)


## 模型选择和开发

In [6]:
import paddle
class Rec(paddle.nn.Layer):
    def __init__(self):
        super(Rec, self).__init__()

        self.linear_1 = paddle.nn.Linear(21, 512)
        self.linear_2 = paddle.nn.Linear(512, 256)
        self.linear_3 = paddle.nn.Linear(256, 128)
        self.output= paddle.nn.Linear(128, 1)
        self.relu = paddle.nn.ReLU()
        self.relu_2 = paddle.nn.ReLU()

    def forward(self, inputs):
        y = self.linear_1(inputs)
        y = self.relu(y)
        y = self.linear_2(y)
        y = self.relu_2(y)
        y = self.linear_3(y)
        y = self.output(y)
       # y = paddle.clip(y,min=0,max=5)

        return y

rec = Rec()

## 模型网络结构可视化

In [12]:
model_.summary()

---------------------------------------------------------------------------
 Layer (type)       Input Shape          Output Shape         Param #    
   Linear-1          [[64, 21]]           [64, 512]           11,264     
    ReLU-1          [[64, 512]]           [64, 512]              0       
   Linear-2         [[64, 512]]           [64, 256]           131,328    
    ReLU-2          [[64, 256]]           [64, 256]              0       
   Linear-3         [[64, 256]]           [64, 128]           32,896     
   Linear-4         [[64, 128]]            [64, 1]              129      
Total params: 175,617
Trainable params: 175,617
Non-trainable params: 0
---------------------------------------------------------------------------
Input size (MB): 0.01
Forward/backward pass size (MB): 0.81
Params size (MB): 0.67
Estimated Total Size (MB): 1.49
---------------------------------------------------------------------------



{'total_params': 175617, 'trainable_params': 175617}

In [7]:
from paddle.metric import Metric
import numpy
# 定义MSE 评分
class MSE(Metric):
    """
    Precision (also called positive predictive value) is the fraction of
    relevant instances among the retrieved instances. Refer to
    https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers
    Noted that this class manages the precision score only for binary
    classification task.

    ......
    """

    def __init__(self, name='MSE', *args, **kwargs):
        super(MSE, self).__init__(*args, **kwargs)
        self._name = name
        self.MSE = 0

    def update(self, preds, labels):
        """
        Update the states based on the current mini-batch prediction results.
        Args:
            preds (numpy.ndarray): The prediction result, usually the output
               of two-class sigmoid function. It should be a vector (column
               vector or row vector) with data type: 'float64' or 'float32'.
           labels (numpy.ndarray): The ground truth (labels),
               the shape should keep the same as preds.
               The data type is 'int32' or 'int64'.
        """
        # if isinstance(preds, paddle.Tensor):
        #     preds = preds.numpy()
        # if isinstance(labels, paddle.Tensor):
        #     labels = labels.numpy()
        
        temp = paddle.to_tensor(preds - labels)
        self.MSE += paddle.sum(paddle.pow(temp,2)) # Σ (preds - labels)^2
            

    def reset(self):
        """
        Resets all of the metric state.
        """
        self.MSE = 0

    def accumulate(self):
        """
        Calculate the final precision.

        Returns:
           A scaler float: results of the calculated precision.
        """
        # self.MSE = paddle.to_tensor(self.MSE)
        return paddle.mean(self.MSE)

    def name(self):
        """
        Returns metric name
        """
        return self._name

## 模型训练

In [8]:
model_ = paddle.Model(rec)
model_.prepare(optimizer=paddle.optimizer.Adam(parameters=model_.parameters()),
              loss=paddle.nn.MSELoss(reduction="mean"),
              metrics=MSE())

In [9]:
model_.fit(custom_dataset,
            epochs=100,
            batch_size=64,
            verbose=1)

The loss value printed in the log is the current step, and the metric is the average value of previous steps.
Epoch 1/100
       [4258.49804688]) - 9ms/step           
Epoch 2/100
       [1865.50012207]) - 6ms/step          


  return (isinstance(seq, collections.Sequence) and


Epoch 3/100
       [1534.73840332]) - 9ms/step          
Epoch 4/100
       [1378.05822754]) - 7ms/step         
Epoch 5/100
       [1291.77917480]) - 7ms/step         
Epoch 6/100
       [1233.90051270]) - 6ms/step         
Epoch 7/100
       [1273.40795898]) - 7ms/step         
Epoch 8/100
       [1159.14343262]) - 6ms/step         
Epoch 9/100
       [1106.58752441]) - 8ms/step         
Epoch 10/100
       [1096.23547363]) - 7ms/step         
Epoch 11/100
       [1120.56762695]) - 6ms/step         
Epoch 12/100
       [1076.82934570]) - 7ms/step         
Epoch 13/100
       [1042.75830078]) - 8ms/step         
Epoch 14/100
       [1052.34692383]) - 7ms/step         
Epoch 15/100
       [1034.26525879]) - 7ms/step         
Epoch 16/100
       [1023.22753906]) - 6ms/step         
Epoch 17/100
       [993.00885010]) - 6ms/step          
Epoch 18/100
       [1022.95935059]) - 7ms/step         
Epoch 19/100
       [1017.78143311]) - 6ms/step         
Epoch 20/100
       [1017.09179688]) 

## 效果展示

In [11]:

y_pre = np.array(model_.predict(train_loader.dataset[120:140][0]))
y_true = np.array(train_loader.dataset[120:140][1])
for i in range(20):
    print("预测值:真实值 : ",np.round(y_pre[0,i],1),y_true[i],end="\n")

Predict begin...
Predict samples: 420
预测值:真实值 :  [4.9] [5.]
预测值:真实值 :  [4.2] [4.]
预测值:真实值 :  [5.5] [5.]
预测值:真实值 :  [4.6] [4.]
预测值:真实值 :  [5.7] [5.]
预测值:真实值 :  [4.7] [4.]
预测值:真实值 :  [4.9] [5.]
预测值:真实值 :  [5.2] [5.]
预测值:真实值 :  [4.9] [5.]
预测值:真实值 :  [4.] [3.]
预测值:真实值 :  [4.5] [5.]
预测值:真实值 :  [4.9] [4.]
预测值:真实值 :  [3.7] [4.]
预测值:真实值 :  [4.5] [4.]
预测值:真实值 :  [5.] [5.]
预测值:真实值 :  [5.5] [5.]
预测值:真实值 :  [5.5] [5.]
预测值:真实值 :  [4.9] [5.]
预测值:真实值 :  [5.3] [5.]
预测值:真实值 :  [4.5] [4.]
