In [1]:
from google.colab import drive 
drive.mount('/content/drive') 

import os 
os.chdir("/content/drive/My Drive/Colab Notebooks/Week7")

Mounted at /content/drive


In [None]:
!pip install deepctr[gpu]

In [58]:
import pandas as pd
import numpy as np
from deepctr.models import WDL
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, get_feature_names
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from warnings import filterwarnings
filterwarnings('ignore')

### 数据加载
参考movielens_sample数据集，转换了一个10万行的数据文件

In [51]:
# data = pd.read_csv('./movielens_sample.txt')
data = pd.read_csv('../data/movielens_100k.txt')
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
0,196,242,3,881250949,Kolya (1996),Comedy,M,49,20,55105
1,196,257,2,881251577,Men in Black (1997),Action|Adventure|Comedy|Sci-Fi,M,49,20,55105
2,196,111,4,881251793,"Truth About Cats & Dogs, The (1996)",Comedy|Romance,M,49,20,55105
3,196,25,4,881251955,"Birdcage, The (1996)",Comedy,M,49,20,55105
4,196,382,4,881251843,"Adventures of Priscilla, Queen of the Desert, ...",Comedy|Drama,M,49,20,55105


### 特征选择与编码

In [52]:
# 选择稀疏特征
sparse_features = ['user_id', 'movie_id', 'gender', 'age', 'occupation', 'zip']
target = ['rating']

# 对稀疏特征标签进行编码
for feature in sparse_features:
    lbe = LabelEncoder()
    data[feature] = lbe.fit_transform(data[feature])

# 计算每个特征中的 不同特征值的个数
fixlen_feature_columns = [SparseFeat(feature, data[feature].nunique()) for feature in sparse_features]  # 默认embedding_dim=4
print(fixlen_feature_columns)

[SparseFeat(name='user_id', vocabulary_size=943, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x7fdc87f06668>, embedding_name='user_id', group_name='default_group', trainable=True), SparseFeat(name='movie_id', vocabulary_size=1682, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x7fdc87f06828>, embedding_name='movie_id', group_name='default_group', trainable=True), SparseFeat(name='gender', vocabulary_size=2, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x7fdc87f06588>, embedding_name='gender', group_name='default_group', trainable=True), SparseFeat(name='age', vocabulary_size=61, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializ

#### 1. 不使用genres字段

In [83]:
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
feature_names

['user_id', 'movie_id', 'gender', 'age', 'occupation', 'zip']

In [84]:
# 将数据集切分成训练集和测试集
train, test = train_test_split(data, test_size=0.2, random_state=12)
train_model_input = {name:train[name].values for name in feature_names}
test_model_input = {name:test[name].values for name in feature_names}

# 使用Wide&Deep进行训练
model = WDL(linear_feature_columns, dnn_feature_columns, task='regression', dnn_hidden_units=(32, 64, 32))
model.compile("adam", "mse", metrics=['mse'], )
history = model.fit(train_model_input, train[target].values, batch_size=256, epochs=1, verbose=True, validation_split=0.2, )
# 使用Wide&Deep进行预测
pred_ans = model.predict(test_model_input, batch_size=256)
# 输出RMSE或MSE
mse = round(mean_squared_error(test[target].values, pred_ans), 4)
rmse = mse ** 0.5
print("test RMSE", rmse)

test RMSE 0.9603124491539199


#### 2. 使用genres字段

In [64]:
genres_set = set()
for ls_g in data['genres'].str.split('|'):
  genres_set = genres_set.union(ls_g)
key2index = {genre: idx for idx, genre in enumerate(sorted(list(genres_set)))}
key2index

{'Action': 0,
 'Adventure': 1,
 'Animation': 2,
 "Children's": 3,
 'Comedy': 4,
 'Crime': 5,
 'Documentary': 6,
 'Drama': 7,
 'Fantasy': 8,
 'Film-Noir': 9,
 'Horror': 10,
 'Musical': 11,
 'Mystery': 12,
 'Romance': 13,
 'Sci-Fi': 14,
 'Thriller': 15,
 'War': 16,
 'Western': 17,
 'unknown': 18}

In [65]:
genres_list = [list(map(lambda k: key2index[k], ls_g)) for ls_g in data['genres'].str.split('|')]
display(data.iloc[:3])
print("genres对应编码示例：\n", genres_list[:3])

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
0,195,241,3,881250949,Kolya (1996),Comedy,1,39,20,415
1,195,256,2,881251577,Men in Black (1997),Action|Adventure|Comedy|Sci-Fi,1,39,20,415
2,195,110,4,881251793,"Truth About Cats & Dogs, The (1996)",Comedy|Romance,1,39,20,415


genres对应编码示例：
 [[4], [0, 1, 4, 14], [4, 13]]


In [72]:
genres_length = np.array(list(map(len, genres_list)))
max_len = max(genres_length)
# Notice : padding=`post`
genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', )

varlen_feature_columns = [VarLenSparseFeat(SparseFeat('genres',vocabulary_size= len(
    key2index) + 1,embedding_dim=4), maxlen=max_len, combiner='mean',weight_name=None)]


In [85]:
linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
feature_names

['user_id', 'movie_id', 'gender', 'age', 'occupation', 'zip', 'genres']

### 切分数据集与模型训练

In [86]:
# 将数据集切分成训练集和测试集
data["genres"] = genres_list
train, test = train_test_split(data, test_size=0.2, random_state=12)
train_model_input = {name:train[name].values for name in feature_names}
test_model_input = {name:test[name].values for name in feature_names}

# 使用Wide&Deep进行训练
model = WDL(linear_feature_columns, dnn_feature_columns, task='regression', dnn_hidden_units=(32, 64, 32))
model.compile("adam", "mse", metrics=['mse'], )
history = model.fit(train_model_input, train[target].values, batch_size=256, epochs=1, verbose=True, validation_split=0.2, )
# 使用Wide&Deep进行预测
pred_ans = model.predict(test_model_input, batch_size=256)
# 输出RMSE或MSE
mse = round(mean_squared_error(test[target].values, pred_ans), 4)
rmse = mse ** 0.5
print("test RMSE", rmse)

test RMSE 0.9554580053565934


### 总结
使用了genres字段后，测试集RMSE从不使用的0.9603减少为0.9555，减少了0.50%。可能还需要进一步调整模型参数来提高预测效果。