In [None]:
from collections import defaultdict
from itertools import count

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from scipy.sparse import csr
from scipy.sparse import csr_matrix
from sklearn.feature_extraction import DictVectorizer
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [None]:
import os
from google.colab import drive

# Mount Google Drive so the project directory and dataset become visible.
drive.mount('/content/drive')
loadpath = "/content/drive/My Drive/Colab Notebooks/FM推荐系统"
# os.chdir() returns None, so printing its result only ever showed "None".
# Change directory first, then report where we actually are.
os.chdir(loadpath)
print(os.getcwd())
print(os.listdir(loadpath))  # contents of the project directory
# One-time setup (kept for reference): unpack datasets.zip into the project directory.
# import zipfile
# # open the archive datasets.zip
# zip_file = zipfile.ZipFile(loadpath+'/datasets.zip')
# # list the archive contents
# f_content = zip_file.namelist()
# f_content
# # extract everything into the project directory
# zip_extract = zip_file.extractall(loadpath)

In [None]:
# ---- Experiment configuration ----
# Directory on the mounted Drive that contains the ua.base / ua.test files.
path = "/content/drive/My Drive/Colab Notebooks/FM推荐系统/datasets"

# Optimisation hyper-parameters and the regression loss.
batch_size = 64
learning_rate = 0.0001
weight_decay = 0.001
loss_fn = nn.MSELoss()

# DEBUG switches between a short run and the full number of epochs.
DEBUG = True
if DEBUG:
    epochs = 3
else:
    epochs = 20

# Per-epoch bookkeeping; the training loop appends to both lists.
loss_dict = []
epochs_dict = []

In [None]:
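# This notebook mainly uses ua.base and ua.test.
# The ua.base / ua.test files split the u.data dataset into a training set and a test set; every user has exactly 10 ratings in the test set.
# Each record in ua.base and ua.test consists of four values (user_id | item_id | rating | timestamp).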

In [None]:
# Dataset note: 943 users, 1680 items, a little over 90570 rating records.
cols = ['user','item','rating','timestamp']
# The file is tab-separated and has no header row, so column names are supplied.
train = pd.read_csv(path + '/ua.base', sep='\t', names=cols)
# Regression target: the rating column as a NumPy array.
y_train = train['rating'].to_numpy()
train.head()

In [None]:
# (number of rating records, number of columns) of the training frame
train.shape

In [None]:
# How many training ratings each user has
train['user'].value_counts()

In [None]:
# How many training ratings each item has
train['item'].value_counts()

In [None]:
# All training rows (rated items) that belong to user 36
train[train['user']==36]

In [None]:
def vectorize_dic(dic,ix=None,p=None,n=0,g=0):
    """
    Build a one-hot sparse design matrix from parallel feature columns.

    Each sample is one row. Every feature group in ``dic`` contributes a
    single 1.0 to that row, in the column assigned to the sample's value for
    that group. Columns are handed out in order of first appearance, so each
    distinct (value, group) pair owns exactly one column.

    dic -- mapping of feature-group name (e.g. 'users', 'items') to a sequence
           holding one raw value per sample
    ix -- existing ``{str(value) + str(group): column}`` index to reuse, e.g. the
          one returned for the training data; it is updated in place
          (default None: start from an empty index)
    p -- number of columns in the result (default None: the number of indexed
         features). Entries whose column is >= p are dropped, so a feature that
         was not known when ``p`` was fixed simply contributes nothing.
    n -- number of samples (rows)
    g -- number of feature groups, i.e. non-zero entries per row

    Returns a tuple ``(X, ix)``: the ``n x p`` CSR matrix and the feature index.
    """
    if ix is None:
        ix = dict()
    nz = n * g
    col_ix = np.empty(nz, dtype=int)
    for group_pos, (group, values) in enumerate(dic.items()):
        for t in range(len(values)):
            key = str(values[t]) + str(group)
            # A feature seen for the first time takes the next free column;
            # a known feature keeps the column it already has.
            # Slot layout: sample t owns positions t*g .. t*g + g - 1.
            col_ix[group_pos + t * g] = ix.setdefault(key, len(ix))
    row_ix = np.repeat(np.arange(0, n), g)
    data = np.ones(nz)
    if p is None:
        p = len(ix)
    # Keep only entries that fit inside the requested number of columns.
    keep = np.where(col_ix < p)
    return csr_matrix((data[keep], (row_ix[keep], col_ix[keep])), shape=(n, p)), ix

# Iterate over the data in mini-batches
def batcher(X_, y_=None, batch_size=-1):
    """
    Yield consecutive ``(X_batch, y_batch)`` slices of the data.

    X_ -- array-like with a ``shape`` attribute; sliced along the first axis
    y_ -- optional labels aligned with ``X_``; when omitted, ``y_batch`` is None
    batch_size -- rows per batch; -1 (default) means a single batch with all rows

    The last batch may be smaller than ``batch_size``. Because this is a
    generator, the ValueError for an unsupported ``batch_size`` is raised when
    iteration starts, not when the function is called.
    """
    n_samples = X_.shape[0]
    if batch_size == -1:
        # max(..., 1) keeps empty input from tripping the validity check below;
        # it then simply yields nothing.
        batch_size = max(n_samples, 1)
    if batch_size < 1:
        raise ValueError('Parameter batch_size={} 是不支持的'.format(batch_size))
    for i in range(0, n_samples, batch_size):
        upper_bound = min(i + batch_size, n_samples)
        ret_x = X_[i:upper_bound]
        ret_y = y_[i:upper_bound] if y_ is not None else None
        # Yield for every batch, with or without labels.
        yield (ret_x, ret_y)

In [None]:
# Build the sparse design matrix from the two feature groups (user id and
# item id), one row per training rating. `ix` is the feature -> column index
# returned by vectorize_dic.
x_train, ix = vectorize_dic({'users': train['user'].values,
                             'items': train['item'].values},
                            n=len(train.index),
                            g=2)
# Densify to a plain ndarray. todense() would return numpy.matrix, which NumPy
# no longer recommends and which forces list round-trips before tensor conversion.
x_train = x_train.toarray()
x_train.shape

In [None]:
# Inspect the encoded feature row of the first training sample
x_train[0]

In [None]:
# Design-matrix dimensions: n samples (rows) and p feature columns.
n = x_train.shape[0]
p = x_train.shape[1]
# k is the first dimension of the model's factor matrix `v` (k x p).
# The original author's note asks why 10 was chosen; it is a tunable value.
k = 10
class FM_model(nn.Module):
    """
    Second-order factorization machine for regression.

    For an input row x the prediction is

        linear(x) + 0.5 * sum_f [ (x . v_f)^2 - (x^2 . v_f^2) ]

    where ``v`` is a ``k x p`` matrix of latent factors. The bracketed term is
    the usual linear-time form of the sum of pairwise feature interactions.

    p -- number of input features
    k -- number of latent factors per feature
    """
    def __init__(self,p,k):
        super(FM_model,self).__init__()
        self.p = p
        self.k = k
        # First-order part: one weight per feature plus a bias, single output.
        self.linear = nn.Linear(self.p,1,bias=True)
        # Latent factors, stored as (k, p).
        self.v = nn.Parameter(torch.randn(self.k,self.p))
    def fm_layer(self,x):
        """Return predictions of shape (batch, 1) for ``x`` of shape (batch, p)."""
        linear_part = self.linear(x)                                     # (batch, 1)
        inter_part1 = torch.mm(x,self.v.t())                             # (batch, k)
        inter_part2 = torch.mm(torch.pow(x,2),torch.pow(self.v,2).t())   # (batch, k)
        # Reduce over the factor axis only. Summing over every axis would add
        # the whole batch's interaction total to each individual prediction.
        pairwise = 0.5*torch.sum(torch.pow(inter_part1,2) - inter_part2, dim=1, keepdim=True)
        return linear_part + pairwise
    def forward(self,x):
        """Forward pass; delegates to ``fm_layer``."""
        return self.fm_layer(x)

In [None]:
# p input features, k latent factors per feature.
model = FM_model(p, k)
# Take the optimiser settings from the configuration constants instead of
# repeating the literals, so changing the config actually changes training.
# `loss_fn` is already created in the configuration cell and is not re-created here.
optimer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
print(model)

In [None]:
# Load the test split from the same configured dataset directory as the
# training split, rather than a path relative to the current working directory.
test = pd.read_csv(path + '/ua.test', delimiter='\t', names=cols)
# Reuse the training feature index and column count so both matrices share
# the same column layout.
x_test, ix = vectorize_dic({'users': test['user'].values,
                            'items': test['item'].values},
                           ix, x_train.shape[1], n=len(test.index), g=2)
y_test = test['rating'].values
# Plain ndarray rather than the discouraged numpy.matrix returned by todense().
x_test = x_test.toarray()

In [None]:
# Train for `epochs` epochs as set in the configuration cell (driven by DEBUG);
# `loss_fn`, `batch_size`, `loss_dict` and `epochs_dict` also come from there.
# Number of mini-batches per epoch (ceil division): sizes the progress bar and
# is the divisor for the mean batch loss.
n_batches = (x_train.shape[0] + batch_size - 1) // batch_size
for epoch in range(epochs):
    loss_all = 0.0
    # Visit the samples in a fresh random order every epoch.
    perm = np.random.permutation(x_train.shape[0])
    model.train()
    for x, y in tqdm(batcher(x_train[perm], y_train[perm], batch_size), total=n_batches):
        model.zero_grad()
        # np.asarray handles both ndarray and np.matrix input without a list round-trip.
        x = torch.as_tensor(np.asarray(x), dtype=torch.float).reshape(-1, p)
        y = torch.as_tensor(np.asarray(y), dtype=torch.float).reshape(-1, 1)
        preds = model(x)
        loss = loss_fn(preds, y)
        loss_all += loss.item()
        loss.backward()
        optimer.step()
    # Mean of the per-batch losses. Dividing by len(x) would divide by the size
    # of the last batch, not by the number of batches.
    loss_epoch = loss_all / max(n_batches, 1)
    loss_dict.append(loss_epoch)
    epochs_dict.append(epoch)
    print(f"Epoch [{epoch + 1}/{epochs}], "
          f"Loss: {loss_epoch:.8f} ")

In [None]:
# Save a checkpoint: epoch count, model weights, best epoch loss and optimiser state.
checkpoint = {
    'epoch': epochs,
    'state_dict': model.state_dict(),
    'best_loss': min(loss_dict),
    'optimizer': optimer.state_dict(),
}
torch.save(checkpoint, './parameter.pkl')

In [None]:
# Plot the mean training loss recorded for each epoch.
fig, ax = plt.subplots()
# Derive the x positions from loss_dict itself. Using range(0, epochs) breaks
# with a length mismatch whenever the number of recorded losses differs from
# `epochs`, e.g. after running the training cell more than once.
x1 = range(len(loss_dict))
y1 = loss_dict
ax.plot(x1, y1)
ax.set_xlabel('Epoch #')
ax.set_ylabel('Loss')
plt.show()