In [1]:
import numpy as np
import pandas as pd


In [None]:
file_name='./data/ratings.csv'
movie_file_name='./data/movies.csv'

df=pd.read_csv(file_name,sep=',')
df_sort=df.sort_values(by=['userId','timestamp'],
                       ascending=(True,False))
user_num=df_sort['userId'].max()
item_num=df_sort['movieId'].max()

movie_df=pd.read_csv(movie_file_name,sep=',')
# print(df_sort.head())
# print(movie_df.head()) # movieId,


In [None]:
user_last_movie=[]
item_count=[]
for i in range(1,user_num+1):
    user_last_movie.append([i,df_sort.loc[df_sort['userId']==i]['movieId'].iloc[0]])

In [None]:
last_movie=np.zeros([df_sort.shape[0],2])
for user,movie in user_last_movie:
    last_movie[df_sort['userId']==user]=[user,movie]

last_movie_df=pd.DataFrame(last_movie,columns=['userId','lastMovie'],dtype=np.int64)

In [None]:
concat_df=pd.DataFrame(np.hstack([df_sort.values,last_movie_df.values]),columns=['user','item','rating','timestamp','niubi','last_movie'])

In [None]:
result_df=concat_df.iloc[:,[0,1,2,5]]

In [None]:
result_df.to_csv('./data/rating_lastMovie.csv')

In [None]:
result_df.head()

# 2.分数据集

In [None]:
data_df=pd.read_csv('./data/rating_lastMovie.csv',usecols=['user','item','rating','last_movie'])

In [None]:
data_df = data_df.sample(frac=1.0)  # 全部打乱
cut_idx = int(round(0.2 * data_df.shape[0]))
df_test, df_train = data_df.iloc[:cut_idx], data_df.iloc[cut_idx:]

In [None]:
print(df_test.shape,df_train.shape)

In [None]:
df_train.to_csv('./data/rating_train.csv')
df_test.to_csv('./data/rating_test.csv')

In [None]:
df_train.head(),df_test.head()

# 3.生成特征

In [2]:
import torch
import torch.nn as nn
import torch.utils.data as Data
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer

In [3]:
df_train=pd.read_csv('./data/rating_train.csv',usecols=['user','item','rating','last_movie'])
df_test=pd.read_csv('./data/rating_test.csv',usecols=['user','item','rating','last_movie'])


In [None]:
df_train.head(),df_test.head()

In [None]:
train=df_train #时间戳是不相关信息，可以去掉
test=df_test

# DictVectorizer会把数字识别为连续特征，这里把用户id、item id和lastmovie强制转为 catogorical identifier
train["item"]=train["item"].apply(lambda x:"c"+str(x))
train["user"]=train["user"].apply(lambda  x:"u"+str(x))
train["last_movie"]=train["last_movie"].apply(lambda  x:"l"+str(x))

test["item"]=test["item"].apply(lambda x:"c"+str(x))
test["user"]=test["user"].apply(lambda x:"u"+str(x))
test["last_movie"]=test["last_movie"].apply(lambda  x:"l"+str(x))

In [None]:
# 在构造特征向量时应该不考虑评分，只考虑用户数和电影数
train_no_rating=train.drop(['rating'],axis=1)
test_no_rating=test.drop(['rating'],axis=1)
all_df=pd.concat([train_no_rating,test_no_rating])
# all_df=pd.concat([train,test])
data_num=all_df.shape
print("all_df shape",all_df.shape)
# 打印前10行
# print("all_df head",all_df.head(10))

# 进行特征向量化,有多少特征，就会新创建多少列
vec=DictVectorizer()
vec.fit_transform(all_df.to_dict(orient='record'))
# 合并训练集与验证集，是为了one hot,用完可以释放
del all_df

x_train=vec.transform(train.to_dict(orient='record')).toarray()
x_test=vec.transform(test.to_dict(orient='record')).toarray()
# print(vec.feature_names_)   #查看转换后的别名
print("x_train shape",x_train.shape)
print("x_test shape",x_test.shape)

In [None]:
y_train=train['rating'].values.reshape(-1,1)
y_test=test['rating'].values.reshape(-1,1)
print("y_train shape",y_train.shape)
print("y_test shape",y_test.shape)

In [None]:
# train_dataset = Data.TensorDataset(torch.tensor(x_train),torch.tensor(y_train))
test_dataset=Data.TensorDataset(torch.tensor(x_test),torch.tensor(y_test))


In [None]:
BATCH_SIZE=25
test_loader=Data.DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False
)

In [None]:
for feature,rating in test_loader:
    print(feature.shape)
    print(rating.shape)
    break

In [None]:
class FM(nn.Module):
    def __init__(self,feature_num,factor_num):
        super(FM,self).__init__()
        self.feature_num=feature_num
        self.factor_num=factor_num
        self.linear=nn.Linear(self.feature_num,1,bias=True) # Linear 
        self.v=nn.Parameter(torch.rand(self.feature_num,self.factor_num)) # Interaction
        
    def forward(self,x):
        inter_1=torch.matmul(x,self.v).pow(2).sum(1,keepdim=True)
        inter_2=torch.matmul(x.pow(2),self.v.pow(2)).sum(1,keepdim=True)
        
        out_inter=0.5*(inter_1-inter_2)
        out_linear=self.linear(x)
        
        return out_inter+out_linear

In [None]:
data_num,feature_num=x_test.shape

In [None]:
model=FM(feature_num=feature_num,factor_num=5)
optimizer=torch.optim.SGD(model.parameters(),lr=1e-2)
loss_fn=nn.MSELoss()

In [None]:
from tqdm import tqdm_notebook as tqdm

In [None]:
loss_train_set=[]
for epoch in range(35):
    step=0
    loss_sum=0.0
    for batch_x,batch_y in tqdm(test_loader):
        batch_x = batch_x.clone().detach().float()
        batch_y = batch_y.clone().detach().float()
        output=model(batch_x)
        loss=loss_fn(output,batch_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
#         print(output.dtype)
#         print(batch_y.dtype)
#         break
        loss_sum+=loss.item()
        step+=1
#     break
    loss_train_set.append(loss_sum/step)
    print('Loss of Epoch {}: {:.2f}'.format(epoch,loss_sum/step))
        

In [None]:
plt.plot(loss_train_set)

# FFM 的实现
对于FFM，其差别是将特征的域区分开来，并认为域之间的交互是不同的，
FM中的输入为[样本个数，特征维度]，

FFM中的输入为[样本个数，域维度，特征维度]

其实和FM是一样的，只不过求和的维度不一样而已，最后再一起求和

不同域的特征维度可能不一样，但是其Embedding维度应该是一样的

In [4]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler,OneHotEncoder

In [5]:
df_feature=df_test.drop(['rating'],axis=1)

In [6]:
df_feature

Unnamed: 0,user,item,last_movie
0,560.0,27815.0,80831.0
1,43.0,208.0,1356.0
2,606.0,224.0,2355.0
3,97.0,593.0,377.0
4,387.0,296.0,348.0
...,...,...,...
20162,525.0,914.0,120807.0
20163,606.0,27178.0,2355.0
20164,600.0,151.0,6874.0
20165,239.0,4022.0,8529.0


In [7]:
field_list=[]
for feature in df_feature.columns:
    lbe=OneHotEncoder()
    field_list.append(lbe.fit_transform(np.expand_dims(df_feature[feature].values,axis=1)).toarray())

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [11]:
field_list[1].shape

(20167, 5134)

In [None]:
field_list[0].sum(axis=1)

In [12]:
dtype=torch.FloatTensor
class FFM(nn.Module):
    """
    feature_num应该是一个列表，里面是不同域的维度
    """
    def __init__(self,feature_num,field_num,factor_num):
        super(FFM,self).__init__()
        self.feature_num=feature_num
        self.factor_num=factor_num
        self.field_num=field_num
        for index,fea_num in enumerate(feature_num):
            setattr(self,
                    'params_'+str(index),
                    nn.Parameter(torch.rand(fea_num,factor_num)).type(dtype))
            setattr(self,
                    'linear_'+str(index),
                    nn.Linear(fea_num,1,bias=True).type(dtype))
            
    def embedding(self,feature_list):
        linear_sum=0
        sample_num=feature_list[0].shape[0]
        embedding_mat=torch.zeros([self.field_num,sample_num,self.factor_num])
        for i in range(self.field_num):
            linear_layer=getattr(self,'linear_'+str(i))
            params=getattr(self,'params_'+str(i))
            linear_sum+=linear_layer(feature_list[i])
            embedding_mat[i,:]=torch.mm(feature_list[i],params)
        
        return linear_sum,embedding_mat
    
    def forward(self,x):
        linear_sum,embedding_mat=self.embedding(x)
        embedding_mat=embedding_mat.permute(1,0,2)
        square_of_sum=torch.pow(torch.sum(embedding_mat,dim=1,keepdim=True),2)
        sum_of_square=torch.sum(embedding_mat*embedding_mat,dim=1,keepdim=True)
        cross_term=square_of_sum-sum_of_square
        cross_term=0.5*torch.sum(cross_term,dim=2,keepdim=False)
        
        return linear_sum+cross_term
        
    

In [13]:
feature_num=np.array([feature.shape[1] for feature in field_list])
field_num=feature_num.shape[0]
factor_num=3

In [14]:
feature_num

array([ 610, 5134,  510])

In [15]:
model=FFM(feature_num,field_num,factor_num)

In [16]:
tensor_list=[]
for feature in field_list:
    tensor_list.append(torch.tensor(feature[:4],dtype=torch.float32))

In [17]:
tensor_list

[tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]),
 tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]),
 tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]])]

In [18]:
model.forward(tensor_list)

tensor([[1.7725],
        [3.0419],
        [2.4380],
        [1.9246]], grad_fn=<AddBackward0>)