In [12]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat,get_feature_names

In [10]:
#数据加载
data = pd.read_csv("movielens_sample.txt")
data

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
0,3299,235,4,968035345,Ed Wood (1994),Comedy|Drama,F,25,4,19119
1,3630,3256,3,966536874,Patriot Games (1992),Action|Thriller,M,18,4,77005
2,517,105,4,976203603,"Bridges of Madison County, The (1995)",Drama|Romance,F,25,14,55408
3,785,2115,3,975430389,Indiana Jones and the Temple of Doom (1984),Action|Adventure,M,18,19,29307
4,5848,909,5,957782527,"Apartment, The (1960)",Comedy|Drama,M,50,20,20009
...,...,...,...,...,...,...,...,...,...,...
195,1427,3596,3,974840560,Screwed (2000),Comedy,M,25,12,21401
196,3868,1626,3,965855033,Fire Down Below (1997),Action|Drama|Thriller,M,18,12,73112
197,249,2369,3,976730191,Desperately Seeking Susan (1985),Comedy|Romance,F,18,14,48126
198,5720,349,4,958503395,Clear and Present Danger (1994),Action|Adventure|Thriller,M,25,0,60610


In [14]:
sparse_features = ["movie_id", "user_id", "gender", "age", "occupation", "zip"]
target = ['rating']

# 对特征标签进行编码
for feature in sparse_features:
    lbe = LabelEncoder()
    data[feature] = lbe.fit_transform(data[feature])
# 计算每个特征中的 不同特征值的个数
fixlen_feature_columns = [SparseFeat(feature, data[feature].nunique()) for feature in sparse_features]
fixlen_feature_columns

[SparseFeat(name='movie_id', vocabulary_size=187, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.RandomNormal object at 0x7f8c8006cb50>, embedding_name='movie_id', group_name='default_group', trainable=True),
 SparseFeat(name='user_id', vocabulary_size=193, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.RandomNormal object at 0x7f8c805c5490>, embedding_name='user_id', group_name='default_group', trainable=True),
 SparseFeat(name='gender', vocabulary_size=2, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.RandomNormal object at 0x7f8ca3744990>, embedding_name='gender', group_name='default_group', trainable=True),
 SparseFeat(name='age', vocabulary_size=7, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.RandomNormal object at 0x7f8ca3786650>, emb

In [16]:
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
feature_names

['movie_id', 'user_id', 'gender', 'age', 'occupation', 'zip']

In [17]:
# 将数据集切分成训练集和测试集
train, test = train_test_split(data, test_size=0.2)
train_model_input = {name:train[name].values for name in feature_names}
train_model_input

{'movie_id': array([  6, 104, 185, 134, 116, 141,  46, 105,  89,  24, 130, 113,  30,
        148, 107,  11,  60,  17,  43, 129,  69,  75,   2, 100,  25, 146,
         96, 112,  97,  36, 138, 178,  13, 136,  67, 114, 112, 160, 103,
        152, 108,  12,   7,  50,  39, 173,  76, 176, 101, 142, 170, 167,
         98,  72, 149,  15, 125, 161, 186, 140, 166, 137,  78, 124,  40,
        154,   8,  87, 156, 144,  57,  99,  66, 163, 180,   9,  44, 164,
        126,  73, 106,  52,  16, 132, 133, 184,   9, 109,  86,  56, 155,
        170, 182,  64,  33,  54, 102, 157,   1, 135,  38,  32,  35, 172,
         66,  28,  42,  19,  72,  81,  34, 173,  27,   5, 174,  79,  55,
         85,  22, 181,  47,  61, 127, 158,  83,  70, 126,  84,  49, 165,
        117, 120,  35,  31,  77,  82,  21, 111, 123,  26,  34,  94,  74,
          3,  62,  92,  71, 145,  10, 175,  20,  37, 115, 151,  90,  63,
          4,  91, 147, 118]),
 'user_id': array([ 12,  54,  28, 112, 144, 163,  81, 121, 131,  43, 146, 188, 134

In [18]:
test_model_input = {name:test[name].values for name in feature_names}
test_model_input

{'movie_id': array([169,  95,  27, 123,  45,  18, 150, 179,  48, 110,  88,  14,  29,
        183,  59,  68,  51,  41, 143, 169, 168,  65, 149,  53,  93, 159,
        121, 171,   0, 119, 177,  23, 122, 162,  80,  58, 131, 139, 153,
        128]),
 'user_id': array([110,  70, 189,  41, 187, 158,  29, 137,  74,  88, 118,  23, 108,
         99, 169, 161,  79,  34, 114, 123, 192,  53, 142,   2,  40, 164,
         62, 122,  26,  87, 115,  93,  65,  44, 171, 140,  75,  83, 127,
         49]),
 'gender': array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1,
        1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1]),
 'age': array([3, 4, 0, 2, 5, 3, 2, 2, 1, 1, 1, 2, 1, 1, 1, 6, 5, 2, 3, 1, 2, 5,
        2, 1, 3, 5, 3, 6, 1, 2, 1, 3, 1, 3, 2, 2, 3, 5, 0, 2]),
 'occupation': array([ 7, 19,  9, 18, 19, 16,  0, 18,  4,  4, 16, 15,  4,  0,  9,  2, 11,
         3,  0,  4,  6, 15,  0,  4,  4, 11,  5, 16, 13,  3,  4,  0,  0, 14,
        11, 16,  3, 13,  9,  0]),
 'zip': array([1

In [19]:
# 使用DeepFM进行训练
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
model.compile("adam", "mse", metrics=['mse'], )
history = model.fit(train_model_input, train[target].values, batch_size=256, epochs=1, verbose=True, validation_split=0.2, )
model

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "




<tensorflow.python.keras.engine.training.Model at 0x7f8c241acf50>

In [20]:
# 使用DeepFM进行预测
pred_ans = model.predict(test_model_input, batch_size=256)
pred_ans

array([[0.0144382 ],
       [0.01441939],
       [0.01500458],
       [0.01428198],
       [0.01447808],
       [0.01450632],
       [0.01657853],
       [0.01427349],
       [0.01441241],
       [0.01448798],
       [0.01463262],
       [0.01426773],
       [0.01449382],
       [0.01654253],
       [0.01421927],
       [0.01420567],
       [0.01446282],
       [0.01373824],
       [0.01580603],
       [0.01446905],
       [0.01381005],
       [0.0143879 ],
       [0.01546564],
       [0.01449203],
       [0.01630777],
       [0.01441208],
       [0.01442638],
       [0.01449333],
       [0.01446844],
       [0.01372336],
       [0.01446677],
       [0.01453829],
       [0.01459706],
       [0.01637971],
       [0.01430004],
       [0.01435131],
       [0.01385697],
       [0.01445062],
       [0.0140951 ],
       [0.01619717]], dtype=float32)

In [22]:
# 输出MSE
mse = round(mean_squared_error(test[target].values, pred_ans), 4)
mse

12.1536

In [23]:
# 输出RMSE
rmse = mse ** 0.5
rmse

3.4862013711201483