In [1]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat,get_feature_names

In [2]:
#加载数据
data=pd.read_csv("ratings.csv")
data

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580
...,...,...,...,...
1048570,7120,168,5.0,1175543061
1048571,7120,253,4.0,1175542225
1048572,7120,260,5.0,1175542035
1048573,7120,261,4.0,1175543376


In [10]:
data=data.rename(columns={'movieId':'movie_id','userId':'user_id'})
data

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1,3.5,1112486027
1,1,28,3.5,1112484676
2,1,31,3.5,1112484819
3,1,46,3.5,1112484727
4,1,49,3.5,1112484580
...,...,...,...,...
1048570,7120,163,5.0,1175543061
1048571,7120,247,4.0,1175542225
1048572,7120,254,5.0,1175542035
1048573,7120,255,4.0,1175543376


In [11]:
sparse_features = ["movie_id", "user_id"]
target = ['rating']

In [12]:
# 对特征标签进行编码
for feature in sparse_features:
    lbe = LabelEncoder()
    data[feature] = lbe.fit_transform(data[feature])
# 计算每个特征中的 不同特征值的个数
fixlen_feature_columns = [SparseFeat(feature, data[feature].nunique()) for feature in sparse_features]
fixlen_feature_columns

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


[SparseFeat(name='movie_id', vocabulary_size=14026, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.RandomNormal object at 0x7fa9cc98be50>, embedding_name='movie_id', group_name='default_group', trainable=True),
 SparseFeat(name='user_id', vocabulary_size=7120, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.RandomNormal object at 0x7fa9cca8cfd0>, embedding_name='user_id', group_name='default_group', trainable=True)]

In [14]:
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
feature_names

['movie_id', 'user_id']

In [15]:
# 将数据集切分成训练集和测试集
train, test = train_test_split(data, test_size=0.2)
train_model_input = {name:train[name].values for name in feature_names}
train_model_input

{'movie_id': array([1624, 1234,  753, ...,  447, 6003, 4626]),
 'user_id': array([ 698, 4699, 1087, ..., 2030, 2702,  130])}

In [16]:
test_model_input = {name:test[name].values for name in feature_names}
test_model_input

{'movie_id': array([4035,  287, 9414, ..., 9328, 1469, 1737]),
 'user_id': array([ 115, 6147,  728, ..., 6070, 6484,  970])}

In [17]:
# 使用DeepFM进行训练
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
model.compile("adam", "mse", metrics=['mse'], )
history = model.fit(train_model_input, train[target].values, batch_size=256, epochs=1, verbose=True, validation_split=0.2, )
model

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "




<tensorflow.python.keras.engine.training.Model at 0x7fa9cc97f210>

In [18]:
# 使用DeepFM进行预测
pred_ans = model.predict(test_model_input, batch_size=256)
pred_ans

array([[1.4501697],
       [3.9530144],
       [3.0553806],
       ...,
       [3.9695551],
       [2.559624 ],
       [2.7558515]], dtype=float32)

In [19]:
# 输出MSE
mse = round(mean_squared_error(test[target].values, pred_ans), 4)
mse

0.738

In [20]:
# 输出RMSE
rmse = mse ** 0.5
rmse

0.8590692637965812