In [99]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [100]:
movies_df=pd.read_csv('/Users/bytedance/Desktop/MovieLens-Recommendation-System/data/ml-1m/movies.csv')
ratings_df=pd.read_csv('/Users/bytedance/Desktop/MovieLens-Recommendation-System/data/ml-1m/ratings.csv')
users_df=pd.read_csv('/Users/bytedance/Desktop/MovieLens-Recommendation-System/data/ml-1m/users.csv')

# 查看数据基本情况

In [101]:
movies_df.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [102]:
ratings_df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [103]:
users_df.head()

Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [104]:
print("用户样本数:",len(users_df))
print("电影样本数:",len(movies_df))
print("评分样本数:", len(ratings_df))

用户样本数: 6040
电影样本数: 3883
评分样本数: 1000209


In [105]:
# 查看 movies_df 的空值情况
print("movies_df 的空值情况：")
print(movies_df.isna().sum())

# 查看 ratings_df 的空值情况
print("\nratings_df 的空值情况：")
print(ratings_df.isna().sum())

# 查看 users_df 的空值情况
print("\nusers_df 的空值情况：")
print(users_df.isna().sum())

movies_df 的空值情况：
movie_id    0
title       0
genres      0
dtype: int64

ratings_df 的空值情况：
user_id      0
movie_id     0
rating       0
timestamp    0
dtype: int64

users_df 的空值情况：
user_id       0
gender        0
age           0
occupation    0
zip_code      0
dtype: int64


# 数据预处理

In [106]:
genres=movies_df['genres'].str.get_dummies(sep='|')
movies=pd.concat([movies_df, genres], axis=1)

# 合并评分和电影数据
data=pd.merge(ratings_df, movies, on='movie_id')

In [107]:
genres

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3879,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3880,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3881,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [108]:
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [109]:
# 有哪些电影没有参与过评分
unrated_movies = movies_df[~movies_df['movie_id'].isin(ratings_df['movie_id'])]
print("没有参与过评分的电影:")
print(unrated_movies)
print("没有参与过评分的电影数量为:", len(unrated_movies))

没有参与过评分的电影:
      movie_id                                title                 genres
50          51                Guardian Angel (1994)  Action|Drama|Thriller
107        109  Headless Body in Topless Bar (1995)                 Comedy
113        115     Happiness Is in the Field (1995)                 Comedy
141        143                         Gospa (1995)                  Drama
281        284                  New York Cop (1996)           Action|Crime
...        ...                                  ...                    ...
3581      3650            Anguish (Angustia) (1986)                 Horror
3681      3750                Boricua's Bond (2000)                  Drama
3759      3829               Mad About Mambo (2000)         Comedy|Romance
3786      3856                  Autumn Heart (1999)                  Drama
3837      3907   Prince of Central Park, The (1999)                  Drama

[177 rows x 3 columns]
没有参与过评分的电影数量为: 177


In [110]:
# 有哪些用户没有参与过评分
unrated_users=users_df[~users_df['user_id'].isin(ratings_df['user_id'])]
print("没有参与过评分的用户:")
print(unrated_users)
print("没有参与过评分的用户数量为:", len(unrated_users))

没有参与过评分的用户:
Empty DataFrame
Columns: [user_id, gender, age, occupation, zip_code]
Index: []
没有参与过评分的用户数量为: 0


# 特征工程

## 用户特征工程

### 1. 用户统计特征

In [111]:
user_stats=data.groupby('user_id')['rating'].agg([
    ('mean_rating', 'mean'), # 平均评分
    ('rating_std', 'std'), # 评分标准差
    ('rating_count','count'), # 评分次数
    ('rating_min','min'), # 最小评分
    ('rating_max','max') # 最大评分
]).reset_index()


# 计算用户评分严格程度
global_mean_rating=data['rating'].mean()
user_stats['rating_strictness']=global_mean_rating-user_stats['mean_rating']

# 计算用户评分波动程度
user_stats['rating_variability']=user_stats['rating_std']/user_stats['mean_rating']

# 查看结果
print(user_stats.head())

   user_id  mean_rating  rating_std  rating_count  rating_min  rating_max  \
0        1     4.188679    0.680967            53           3           5   
1        2     3.713178    1.001513           129           1           5   
2        3     3.901961    0.984985            51           1           5   
3        4     4.190476    1.077917            21           1           5   
4        5     3.146465    1.132699           198           1           5   

   rating_strictness  rating_variability  
0          -0.607115            0.162573  
1          -0.131614            0.269719  
2          -0.320396            0.252433  
3          -0.608912            0.257230  
4           0.435100            0.359991  


### 2. 用户电影类型偏好特征

In [112]:
# 获取所有电影类型列
genre_columns=[col for col in data.columns if col not in ['user_id', 'movie_id', 'rating', 'timestamp','title', 'genres']]


# 计算用户对每种类型的评分次数和平均评分（用户ID、电影类型以及评分次数）
user_genre_stats=data.groupby('user_id')[genre_columns].sum().reset_index()

# 计算用户对每种类型的偏好程度（按行处理，将每一个用户对某一类型的评分，除以该用户对所有类型评分的总和）
for genre in genre_columns:
    user_genre_stats[f'{genre}_favorite_degree']=user_genre_stats[genre]/user_genre_stats[genre_columns].sum(axis=1)

for genre in genre_columns:
    user_genre_stats[f'{genre}_rating_cnt']=user_genre_stats[genre]

# 计算用户最喜欢的类型（返回最大值所在的索引），axis=1表示按照行操作
user_genre_stats['favorite_genre']=user_genre_stats[genre_columns].idxmax(axis=1)

# 计算用户喜欢的类型数量（评分过的类型数）（得到该行中评分大于0的类型数量）
user_genre_stats['num_liked_genres']=(user_genre_stats[genre_columns]>0).sum(axis=1)

user_genre_stats.drop(columns=genre_columns,inplace=True)

# 合并所有用户特征
user_features=pd.merge(user_stats, user_genre_stats, on='user_id')

# 添加用户活跃度分段特征
user_features['activity_level'] = pd.cut(
    user_features['rating_count'],
    bins=[0, 5, 20, 100, float('inf')],
    labels=['inactive', 'casual', 'active', 'super']
)

# 用户活跃度特征编码
activity_encoder = LabelEncoder()
user_features['activity_level_encoded'] = activity_encoder.fit_transform(user_features['activity_level'])

# 用户最喜欢的类型编码
genre_encoder = LabelEncoder()
user_features['favorite_genre_encoded'] = genre_encoder.fit_transform(user_features['favorite_genre'])


In [113]:
pd.set_option('display.max_columns', None)
user_features.head()

Unnamed: 0,user_id,mean_rating,rating_std,rating_count,rating_min,rating_max,rating_strictness,rating_variability,Action_favorite_degree,Adventure_favorite_degree,Animation_favorite_degree,Children's_favorite_degree,Comedy_favorite_degree,Crime_favorite_degree,Documentary_favorite_degree,Drama_favorite_degree,Fantasy_favorite_degree,Film-Noir_favorite_degree,Horror_favorite_degree,Musical_favorite_degree,Mystery_favorite_degree,Romance_favorite_degree,Sci-Fi_favorite_degree,Thriller_favorite_degree,War_favorite_degree,Western_favorite_degree,Action_rating_cnt,Adventure_rating_cnt,Animation_rating_cnt,Children's_rating_cnt,Comedy_rating_cnt,Crime_rating_cnt,Documentary_rating_cnt,Drama_rating_cnt,Fantasy_rating_cnt,Film-Noir_rating_cnt,Horror_rating_cnt,Musical_rating_cnt,Mystery_rating_cnt,Romance_rating_cnt,Sci-Fi_rating_cnt,Thriller_rating_cnt,War_rating_cnt,Western_rating_cnt,favorite_genre,num_liked_genres,activity_level,activity_level_encoded,favorite_genre_encoded
0,1,4.188679,0.680967,53,3,5,-0.607115,0.162573,0.043103,0.043103,0.155172,0.172414,0.12069,0.017241,0.0,0.181034,0.025862,0.0,0.0,0.12069,0.0,0.051724,0.025862,0.025862,0.017241,0.0,5,5,18,20,14,2,0,21,3,0,0,14,0,6,3,3,2,0,Drama,13,active,0,7
1,2,3.713178,1.001513,129,1,5,-0.131614,0.269719,0.194444,0.065972,0.0,0.0,0.086806,0.041667,0.0,0.274306,0.003472,0.003472,0.006944,0.0,0.010417,0.083333,0.059028,0.107639,0.052083,0.010417,56,19,0,0,25,12,0,79,1,1,2,0,3,24,17,31,15,3,Drama,14,super,2,7
2,3,3.901961,0.984985,51,1,5,-0.320396,0.252433,0.186992,0.203252,0.02439,0.02439,0.243902,0.0,0.0,0.065041,0.01626,0.0,0.02439,0.00813,0.00813,0.04065,0.04878,0.04065,0.01626,0.04878,23,25,3,3,30,0,0,8,2,0,3,1,1,5,6,5,2,6,Comedy,15,active,0,4
3,4,4.190476,1.077917,21,1,5,-0.608912,0.25723,0.327586,0.103448,0.0,0.017241,0.0,0.017241,0.0,0.103448,0.034483,0.0,0.051724,0.0,0.0,0.034483,0.155172,0.068966,0.051724,0.034483,19,6,0,1,0,1,0,6,2,0,3,0,0,2,9,4,3,2,Action,12,active,0,0
4,5,3.146465,1.132699,198,1,5,0.4351,0.359991,0.088068,0.025568,0.011364,0.017045,0.159091,0.059659,0.017045,0.295455,0.0,0.008523,0.028409,0.008523,0.022727,0.085227,0.042614,0.110795,0.017045,0.002841,31,9,4,6,56,21,6,104,0,3,10,3,8,30,15,39,6,1,Drama,17,super,2,7


## 电影特征工程

In [114]:
# 计算电影的基本评分统计
movie_stats=data.groupby('movie_id')['rating'].agg([
    ('movie_mean_rating', 'mean'),
    ('movie_rating_std', 'std'),
    ('movie_rating_count', 'count')
])

# 填充可能存在的NaN值
movie_stats['movie_rating_std']=movie_stats['movie_rating_std'].fillna(0)

print("movie_stats:")
print(movie_stats)

# 合并电影原始信息
movie_features=pd.merge(movies, movie_stats, on='movie_id')

# 添加电影热度特征
movie_features['popularity'] = pd.cut(
    movie_features['movie_rating_count'],
    bins=[0, 10, 100, 500, float('inf')],
    labels=['niche', 'moderate', 'popular', 'blockbuster']
)

# 电影热度编码
popularity_encoder = LabelEncoder()
movie_features['popularity_encoded'] = popularity_encoder.fit_transform(movie_features['popularity'])

# 计算电影类型纯度（类型数量越少，纯度越高）
movie_features['genre_purity']=1/movie_features[genre_columns].sum(axis=1)

# 提取电影发布年份
movie_features['year'] = movie_features['title'].str.extract(r'\((\d{4})\)')
movie_features['year'] = movie_features['year'].fillna('1990').astype(int)

# 创建标题长度特征
movie_features['title_length'] = movie_features['title'].str.len()

movie_stats:
          movie_mean_rating  movie_rating_std  movie_rating_count
movie_id                                                         
1                  4.146846          0.852349                2077
2                  3.201141          0.983172                 701
3                  3.016736          1.071712                 478
4                  2.729412          1.013381                 170
5                  3.006757          1.025086                 296
...                     ...               ...                 ...
3948               3.635731          1.014196                 862
3949               4.115132          1.009804                 304
3950               3.666667          1.046107                  54
3951               3.900000          1.057331                  40
3952               3.780928          0.935074                 388

[3706 rows x 3 columns]


In [115]:
print("电影维度:", movie_features.shape)
print("用户维度:", user_features.shape)

电影维度: (3706, 29)
用户维度: (6040, 49)


## 检查user_features和movie_features的空值情况

In [116]:
# 检查user_features中哪些列含有空值
user_features_null_columns = user_features.columns[user_features.isna().any()].tolist()
print("user_features中含有空值的列:", user_features_null_columns)

# 检查movie_features中哪些列含有空值
movie_features_null_columns = movie_features.columns[movie_features.isna().any()].tolist()
print("movie_features中含有空值的列:", movie_features_null_columns)

user_features中含有空值的列: []
movie_features中含有空值的列: []


## 保存特征数据

In [117]:
user_features.to_csv('/Users/bytedance/Desktop/MovieLens-Recommendation-System/data/features/user_features.csv', index=False)
movie_features.to_csv('/Users/bytedance/Desktop/MovieLens-Recommendation-System/data/features/movie_features.csv', index=False)

In [118]:
pd.set_option('display.max_columns', None)
user_features.head()

Unnamed: 0,user_id,mean_rating,rating_std,rating_count,rating_min,rating_max,rating_strictness,rating_variability,Action_favorite_degree,Adventure_favorite_degree,Animation_favorite_degree,Children's_favorite_degree,Comedy_favorite_degree,Crime_favorite_degree,Documentary_favorite_degree,Drama_favorite_degree,Fantasy_favorite_degree,Film-Noir_favorite_degree,Horror_favorite_degree,Musical_favorite_degree,Mystery_favorite_degree,Romance_favorite_degree,Sci-Fi_favorite_degree,Thriller_favorite_degree,War_favorite_degree,Western_favorite_degree,Action_rating_cnt,Adventure_rating_cnt,Animation_rating_cnt,Children's_rating_cnt,Comedy_rating_cnt,Crime_rating_cnt,Documentary_rating_cnt,Drama_rating_cnt,Fantasy_rating_cnt,Film-Noir_rating_cnt,Horror_rating_cnt,Musical_rating_cnt,Mystery_rating_cnt,Romance_rating_cnt,Sci-Fi_rating_cnt,Thriller_rating_cnt,War_rating_cnt,Western_rating_cnt,favorite_genre,num_liked_genres,activity_level,activity_level_encoded,favorite_genre_encoded
0,1,4.188679,0.680967,53,3,5,-0.607115,0.162573,0.043103,0.043103,0.155172,0.172414,0.12069,0.017241,0.0,0.181034,0.025862,0.0,0.0,0.12069,0.0,0.051724,0.025862,0.025862,0.017241,0.0,5,5,18,20,14,2,0,21,3,0,0,14,0,6,3,3,2,0,Drama,13,active,0,7
1,2,3.713178,1.001513,129,1,5,-0.131614,0.269719,0.194444,0.065972,0.0,0.0,0.086806,0.041667,0.0,0.274306,0.003472,0.003472,0.006944,0.0,0.010417,0.083333,0.059028,0.107639,0.052083,0.010417,56,19,0,0,25,12,0,79,1,1,2,0,3,24,17,31,15,3,Drama,14,super,2,7
2,3,3.901961,0.984985,51,1,5,-0.320396,0.252433,0.186992,0.203252,0.02439,0.02439,0.243902,0.0,0.0,0.065041,0.01626,0.0,0.02439,0.00813,0.00813,0.04065,0.04878,0.04065,0.01626,0.04878,23,25,3,3,30,0,0,8,2,0,3,1,1,5,6,5,2,6,Comedy,15,active,0,4
3,4,4.190476,1.077917,21,1,5,-0.608912,0.25723,0.327586,0.103448,0.0,0.017241,0.0,0.017241,0.0,0.103448,0.034483,0.0,0.051724,0.0,0.0,0.034483,0.155172,0.068966,0.051724,0.034483,19,6,0,1,0,1,0,6,2,0,3,0,0,2,9,4,3,2,Action,12,active,0,0
4,5,3.146465,1.132699,198,1,5,0.4351,0.359991,0.088068,0.025568,0.011364,0.017045,0.159091,0.059659,0.017045,0.295455,0.0,0.008523,0.028409,0.008523,0.022727,0.085227,0.042614,0.110795,0.017045,0.002841,31,9,4,6,56,21,6,104,0,3,10,3,8,30,15,39,6,1,Drama,17,super,2,7


In [119]:
movie_features.head()

Unnamed: 0,movie_id,title,genres,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,movie_mean_rating,movie_rating_std,movie_rating_count,popularity,popularity_encoded,genre_purity,year,title_length
0,1,Toy Story (1995),Animation|Children's|Comedy,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,4.146846,0.852349,2077,blockbuster,0,0.333333,1995,16
1,2,Jumanji (1995),Adventure|Children's|Fantasy,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,3.201141,0.983172,701,blockbuster,0,0.333333,1995,14
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,3.016736,1.071712,478,popular,3,0.5,1995,23
3,4,Waiting to Exhale (1995),Comedy|Drama,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,2.729412,1.013381,170,popular,3,0.5,1995,24
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3.006757,1.025086,296,popular,3,1.0,1995,34


## 拼接交互数据和user_features以及item_features

In [120]:
interactions = data[['user_id', 'movie_id', 'rating']]
full_data = pd.merge(interactions, user_features, on='user_id')
full_data = pd.merge(full_data, movie_features, on='movie_id')

In [121]:
# 筛选非数值型的列
non_numeric_columns = full_data.select_dtypes(exclude=['number']).columns

# 根据筛选出的列名获取对应的数据
non_numeric_data = full_data[non_numeric_columns]

print("非数值型的列名：", non_numeric_columns)
print("非数值型的数据：")
print(non_numeric_data.head())

remaining_columns = [col for col in full_data.columns if col not in non_numeric_columns]
new_data=full_data[remaining_columns]
print("还剩下的列数量:", len(remaining_columns))


非数值型的列名： Index(['favorite_genre', 'activity_level', 'title', 'genres', 'popularity'], dtype='object')
非数值型的数据：
  favorite_genre activity_level                                   title  \
0          Drama         active  One Flew Over the Cuckoo's Nest (1975)   
1          Drama         active        James and the Giant Peach (1996)   
2          Drama         active                     My Fair Lady (1964)   
3          Drama         active                  Erin Brockovich (2000)   
4          Drama         active                    Bug's Life, A (1998)   

                         genres   popularity  
0                         Drama  blockbuster  
1  Animation|Children's|Musical  blockbuster  
2               Musical|Romance  blockbuster  
3                         Drama  blockbuster  
4   Animation|Children's|Comedy  blockbuster  
还剩下的列数量: 74


## 检查movie_features和user_features的空值情况

In [122]:
user_numeric_cols = [
    'mean_rating', 'rating_std', 'rating_count', 'rating_min', 'rating_max',
    'rating_strictness', 'rating_variability', 'num_liked_genres'
] + [col for col in user_features.columns if '_favorite_degree' in col] + [
    'activity_level_encoded', 'favorite_genre_encoded'
]

# 电影特征列
movie_numeric_cols = [
    'movie_mean_rating', 'movie_rating_std', 'movie_rating_count', 
    'genre_purity', 'year', 'title_length'
] + [col for col in movie_features.columns if col.startswith('genre_') and not col.endswith('_sum')] + [
    'popularity_encoded'
]
print('user_numeric_cols:', user_numeric_cols)
print('movie_numeric_cols:', movie_numeric_cols)

user_numeric_cols: ['mean_rating', 'rating_std', 'rating_count', 'rating_min', 'rating_max', 'rating_strictness', 'rating_variability', 'num_liked_genres', 'Action_favorite_degree', 'Adventure_favorite_degree', 'Animation_favorite_degree', "Children's_favorite_degree", 'Comedy_favorite_degree', 'Crime_favorite_degree', 'Documentary_favorite_degree', 'Drama_favorite_degree', 'Fantasy_favorite_degree', 'Film-Noir_favorite_degree', 'Horror_favorite_degree', 'Musical_favorite_degree', 'Mystery_favorite_degree', 'Romance_favorite_degree', 'Sci-Fi_favorite_degree', 'Thriller_favorite_degree', 'War_favorite_degree', 'Western_favorite_degree', 'activity_level_encoded', 'favorite_genre_encoded']
movie_numeric_cols: ['movie_mean_rating', 'movie_rating_std', 'movie_rating_count', 'genre_purity', 'year', 'title_length', 'genre_purity', 'popularity_encoded']


In [123]:
# 找出哪些电影特/征列包含NaN值
nan_columns = [col for col in movie_numeric_cols 
               if movie_features[col].isna().any()]
print("包含NaN值的特征列:", nan_columns)

# 查看NaN值的具体分布
print("\nNaN值统计:")
print(movie_features[nan_columns].isna().sum())

包含NaN值的特征列: []

NaN值统计:
Series([], dtype: float64)


In [124]:
# 找出哪些电影特/征列包含NaN值
nan_columns = [col for col in user_numeric_cols 
               if user_features[col].isna().any()]
print("包含NaN值的特征列:", nan_columns)

# 查看NaN值的具体分布
print("\nNaN值统计:")
print(user_features[nan_columns].isna().sum())

包含NaN值的特征列: []

NaN值统计:
Series([], dtype: float64)


# 构建解藕的双塔模型

In [170]:
# 准备数值特征列（排除ID和分类列）
user_feat_cols = [col for col in user_features.columns 
                 if col not in ['user_id', 'activity_level', 'favorite_genre']]
movie_feat_cols = [col for col in movie_features.columns 
                  if col not in ['movie_id', 'title', 'genres', 'popularity']]

print("用户数值型特征数量:", len(user_feat_cols))
print("电影数值型特征数量:", len(movie_feat_cols))

用户数值型特征数量: 46
电影数值型特征数量: 25


## 构建用户塔

In [153]:
def build_user_tower(embedding_dim=64):
    user_id_input = Input(shape=(1,), name='user_id')
    user_features_input = Input(shape=(len(user_feat_cols),), name='user_features')
    
    # 用户ID嵌入层
    user_embedding = Embedding(input_dim=user_features['user_id'].max()+1, 
                             output_dim=embedding_dim)(user_id_input)
    user_embedding = Flatten()(user_embedding)
    
    # 合并特征
    user_tower = Concatenate()([user_embedding, user_features_input])
    user_tower = Dense(256, activation='relu')(user_tower)
    user_tower = Dense(128, activation='relu')(user_tower)
    user_tower = Dense(embedding_dim, name='user_embedding')(user_tower)  # 确保输出维度一致
    
    return Model(inputs=[user_id_input, user_features_input], outputs=user_tower)

## 构建电影塔

In [154]:
def build_movie_tower(embedding_dim=64):
    movie_id_input = Input(shape=(1,), name='movie_id')
    movie_features_input = Input(shape=(len(movie_feat_cols),), name='movie_features')
    
    # 电影ID嵌入层
    movie_embedding = Embedding(input_dim=movie_features['movie_id'].max()+1, 
                              output_dim=embedding_dim)(movie_id_input)
    movie_embedding = Flatten()(movie_embedding)
    
    # 合并特征
    movie_tower = Concatenate()([movie_embedding, movie_features_input])
    movie_tower = Dense(256, activation='relu')(movie_tower)
    movie_tower = Dense(128, activation='relu')(movie_tower)
    movie_tower = Dense(embedding_dim, name='movie_embedding')(movie_tower)  # 确保输出维度一致
    
    return Model(inputs=[movie_id_input, movie_features_input], outputs=movie_tower)

# 构建双塔模型
user_tower = build_user_tower()
movie_tower = build_movie_tower()

# 查看模型结构
user_tower.summary()
movie_tower.summary()

In [None]:
# 测试用户塔输出形状
test_user_id = np.array([[1]])
test_user_features = user_features[user_features['user_id'] == 1][user_feat_cols].values.reshape(1, -1) # [1, 46]
print("用户特征性状:", test_user_features.shape)
user_embedding = user_tower.predict({'user_id': test_user_id, 'user_features': test_user_features}) # [1, 64]
print("用户嵌入形状:", user_embedding.shape)

# 测试电影塔输出形状
test_movie_id = np.array([[1]])
test_movie_features = movie_features[movie_features['movie_id'] == 1][movie_feat_cols].values.reshape(1, -1) # [1,25]
print("电影特征性状:", test_movie_features.shape)
movie_embedding = movie_tower.predict({'movie_id': test_movie_id, 'movie_features': test_movie_features}) # [1, 64]
print("电影嵌入形状:", movie_embedding.shape)

用户特征性状: (1, 46)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
用户嵌入形状: (1, 64)
电影特征性状: (1, 25)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
电影嵌入形状: (1, 64)


# 准备pairwise训练数据

In [None]:
from collections import defaultdict
import numpy as np
from tqdm import tqdm

# 准备用户-电影交互数据
user_rated_movies = defaultdict(set)
for _, row in ratings_df.iterrows():
    user_rated_movies[row['user_id']].add(row['movie_id'])

# 获取所有电影ID
all_movie_ids = movie_features['movie_id'].unique()

# 生成训练样本
def prepare_pairwise_data(data, num_negatives=4):
    # 获取所有电影ID
    all_movie_ids = data['movie_id'].unique()
    
    pairs = []
    labels = []
    
    # 为每个用户生成正负样本对
    for user_id, group in tqdm(data.groupby('user_id'), desc='生成正负样本对'):
        # 用户评分过的电影(正样本)
        pos_movies = group['movie_id'].values
        
        # 用户未评分过的电影(负样本)
        neg_movies = np.setdiff1d(all_movie_ids, pos_movies)
        
        # 为每个正样本生成num_negatives个负样本
        for pos_movie in pos_movies:
            # 随机选择负样本
            neg_samples = np.random.choice(neg_movies, size=num_negatives, replace=False)
            
            # 创建正负对
            for neg_movie in neg_samples:
                pairs.append((user_id, pos_movie, neg_movie))
                labels.append(1)  # 1表示正样本得分应高于负样本
    
    return np.array(pairs), np.array(labels)


# 生成训练对
# 其中train_pairs包含三个东西，一是user_id, 二是pos_movie_id, 三是neg_movie_id
train_pairs, train_labels = prepare_pairwise_data(ratings_df)

# 划分训练验证集
from sklearn.model_selection import train_test_split
train_pairs, val_pairs = train_test_split(train_pairs, test_size=0.2, random_state=42)

生成正负样本对: 100%|██████████| 6040/6040 [00:30<00:00, 199.17it/s]


# 创建数据生成器

In [225]:
def data_generator(pairs, batch_size=512):
    while True:
        # 随机打乱
        indices = np.random.permutation(len(pairs))
        
        for i in range(0, len(indices), batch_size):
            batch_indices = indices[i:i+batch_size]
            batch_pairs = pairs[batch_indices]
            
            # 获取用户和电影ID
            user_ids = batch_pairs[:, 0].astype('int32')
            pos_movie_ids = batch_pairs[:, 1].astype('int32')
            neg_movie_ids = batch_pairs[:, 2].astype('int32')
            
            # 获取用户特征
            user_feats = user_features[user_features['user_id'].isin(user_ids)]
            user_feats = user_feats[user_feat_cols].values.astype('float32')
            
            # 获取正样本电影特征
            pos_movie_feats = movie_features[movie_features['movie_id'].isin(pos_movie_ids)]
            pos_movie_feats = pos_movie_feats[movie_feat_cols].values.astype('float32')
            
            # 获取负样本电影特征
            neg_movie_feats = movie_features[movie_features['movie_id'].isin(neg_movie_ids)]
            neg_movie_feats = neg_movie_feats[movie_feat_cols].values.astype('float32')
            
            # 准备正样本输入
            pos_inputs = {
                'user_id': user_ids,
                'user_features': user_feats,
                'movie_id': pos_movie_ids,
                'movie_features': pos_movie_feats
            }
            
            # 准备负样本输入
            neg_inputs = {
                'user_id': user_ids,
                'user_features': user_feats,
                'movie_id': neg_movie_ids,
                'movie_features': neg_movie_feats
            }
            
            # 合并正负样本
            inputs = {
                'user_id': np.concatenate([pos_inputs['user_id'], neg_inputs['user_id']]),
                'user_features': np.concatenate([pos_inputs['user_features'], neg_inputs['user_features']]),
                'movie_id': np.concatenate([pos_inputs['movie_id'], neg_inputs['movie_id']]),
                'movie_features': np.concatenate([pos_inputs['movie_features'], neg_inputs['movie_features']]),
                'dummy_labels': np.zeros(len(pos_inputs['user_id']) + len(neg_inputs['user_id']))  # 虚拟标签
            }
            
            # 输出可以是任意值，因为损失由ContrastiveLossLayer计算
            dummy_output = np.zeros(len(inputs['user_id']))
            
            yield inputs, dummy_output

# 自定义对比损失函数

In [226]:
class ContrastiveLossLayer(Layer):
    def __init__(self, margin=1.0, **kwargs):
        super(ContrastiveLossLayer, self).__init__(**kwargs)
        self.margin = margin
    
    def call(self, inputs):
        y_true, y_pred = inputs
        # 分离正样本和负样本的预测得分
        pos_scores = y_pred[:tf.shape(y_pred)[0]//2]  # 前半部分是正样本
        neg_scores = y_pred[tf.shape(y_pred)[0]//2:]  # 后半部分是负样本
        
        # 计算对比损失
        loss = tf.maximum(0.0, self.margin - (pos_scores - neg_scores))
        self.add_loss(tf.reduce_mean(loss))
        return y_pred  # 通常返回输入以便于模型构建
    
    def get_config(self):
        config = super().get_config()
        config.update({"margin": self.margin})
        return config

# 连体模型构建

In [227]:
from tensorflow.keras.layers import Layer
import tensorflow as tf

class DotProductSimilarity(Layer):
    def __init__(self, **kwargs):
        super(DotProductSimilarity, self).__init__(**kwargs)
    
    def call(self, inputs):
        user_embedding, movie_embedding = inputs
        return tf.reduce_sum(user_embedding * movie_embedding, axis=1, keepdims=True)
    
    def get_config(self):
        return super().get_config()
    
def build_siamese_model_with_loss(user_tower, movie_tower):
    # 用户输入
    user_id_input = Input(shape=(1,), name='user_id')
    user_features_input = Input(shape=(len(user_feat_cols),), name='user_features')
    
    # 电影输入
    movie_id_input = Input(shape=(1,), name='movie_id')
    movie_features_input = Input(shape=(len(movie_feat_cols),), name='movie_features')
    
    # 获取嵌入向量
    user_embedding = user_tower([user_id_input, user_features_input])
    movie_embedding = movie_tower([movie_id_input, movie_features_input])
    
    # 使用自定义层计算点积相似度
    similarity_score = DotProductSimilarity()([user_embedding, movie_embedding])

    # 添加虚拟标签输入
    dummy_labels = Input(shape=(1,), name='dummy_labels')
    
    # 添加损失层
    loss_output = ContrastiveLossLayer(margin=1.0)([dummy_labels, similarity_score])
    
    return Model(
        inputs={
            'user_id': user_id_input,
            'user_features': user_features_input,
            'movie_id': movie_id_input,
            'movie_features': movie_features_input,
            'dummy_labels': dummy_labels
        },
        outputs=loss_output
    )

In [228]:
# 构建双塔模型
user_tower = build_user_tower()
movie_tower = build_movie_tower()

# 构建连体模型
siamese_model = build_siamese_model(user_tower, movie_tower)

In [229]:
# 编译模型
margin = 1.0
loss = ContrastiveLoss(margin=margin)
siamese_model.compile(optimizer='adam', loss=loss)

# 创建训练和验证数据生成器
train_generator = data_generator(train_pairs, batch_size=512)
val_generator = data_generator(val_pairs, batch_size=512)

# 计算训练和验证步骤数
train_steps = len(train_pairs) // 512
val_steps = len(val_pairs) // 512

# 训练模型
siamese_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))


In [230]:

# 训练模型
history = siamese_model.fit(
    train_generator,
    steps_per_epoch=len(train_pairs)//512,
    epochs=10,
    validation_data=val_generator,
    validation_steps=len(val_pairs)//512
)

Epoch 1/10


Expected: {'user_id': 'user_id', 'user_features': 'user_features', 'movie_id': 'movie_id', 'movie_features': 'movie_features'}
Received: inputs={'user_id': 'Tensor(shape=(None,))', 'user_features': 'Tensor(shape=(None, 46))', 'movie_id': 'Tensor(shape=(None,))', 'movie_features': 'Tensor(shape=(None, 25))', 'dummy_labels': 'Tensor(shape=(None,))'}


ValueError: Exception encountered when calling Functional.call().

[1mInvalid input shape for input Tensor("functional_67_1/Cast:0", shape=(None,), dtype=float32). Expected shape (None, 25), but input has incompatible shape (None,)[0m

Arguments received by Functional.call():
  • inputs={'user_id': 'tf.Tensor(shape=(None,), dtype=int32)', 'user_features': 'tf.Tensor(shape=(None, 46), dtype=float32)', 'movie_id': 'tf.Tensor(shape=(None,), dtype=int32)', 'movie_features': 'tf.Tensor(shape=(None, 25), dtype=float32)', 'dummy_labels': 'tf.Tensor(shape=(None,), dtype=float64)'}
  • training=True
  • mask={'user_id': 'None', 'user_features': 'None', 'movie_id': 'None', 'movie_features': 'None', 'dummy_labels': 'None'}