In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [2]:
movies_df=pd.read_csv('/Users/bytedance/Desktop/MovieLens-Recommendation-System/data/ml-1m/movies.csv')
ratings_df=pd.read_csv('/Users/bytedance/Desktop/MovieLens-Recommendation-System/data/ml-1m/ratings.csv')
users_df=pd.read_csv('/Users/bytedance/Desktop/MovieLens-Recommendation-System/data/ml-1m/users.csv')

# 查看数据基本情况

In [3]:
movies_df.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings_df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [5]:
users_df.head()

Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [6]:
print("用户样本数:",len(users_df))
print("电影样本数:",len(movies_df))
print("评分样本数:", len(ratings_df))

用户样本数: 6040
电影样本数: 3883
评分样本数: 1000209


In [7]:
# 查看 movies_df 的空值情况
print("movies_df 的空值情况：")
print(movies_df.isna().sum())

# 查看 ratings_df 的空值情况
print("\nratings_df 的空值情况：")
print(ratings_df.isna().sum())

# 查看 users_df 的空值情况
print("\nusers_df 的空值情况：")
print(users_df.isna().sum())

movies_df 的空值情况：
movie_id    0
title       0
genres      0
dtype: int64

ratings_df 的空值情况：
user_id      0
movie_id     0
rating       0
timestamp    0
dtype: int64

users_df 的空值情况：
user_id       0
gender        0
age           0
occupation    0
zip_code      0
dtype: int64


In [112]:
# 1. 按照 'user_id' 进行分组，并计算每个用户的评论数量
user_review_counts = ratings_df.groupby('user_id')['movie_id'].count()

# 2. 计算平均评论数量
average_reviews_per_user = user_review_counts.mean()

# 3. 打印结果
print(f"平均每个用户评论过的电影数量为: {average_reviews_per_user:.2f} 部")

# 可选：查看评论数量的分布情况
print("\n每个用户评论过的电影数量的统计信息:")
print(user_review_counts.describe())

平均每个用户评论过的电影数量为: 165.60 部

每个用户评论过的电影数量的统计信息:
count    6040.000000
mean      165.597517
std       192.747029
min        20.000000
25%        44.000000
50%        96.000000
75%       208.000000
max      2314.000000
Name: movie_id, dtype: float64


# 数据预处理

In [8]:
genres=movies_df['genres'].str.get_dummies(sep='|')
movies=pd.concat([movies_df, genres], axis=1)

# 合并评分和电影数据
data=pd.merge(ratings_df, movies, on='movie_id')

In [9]:
genres

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3879,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3880,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3881,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [10]:
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,0,0,1,1,...,0,0,0,1,0,0,0,0,0,0
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# 有哪些电影没有参与过评分
unrated_movies = movies_df[~movies_df['movie_id'].isin(ratings_df['movie_id'])]
print("没有参与过评分的电影:")
print(unrated_movies)
print("没有参与过评分的电影数量为:", len(unrated_movies))

没有参与过评分的电影:
      movie_id                                title                 genres
50          51                Guardian Angel (1994)  Action|Drama|Thriller
107        109  Headless Body in Topless Bar (1995)                 Comedy
113        115     Happiness Is in the Field (1995)                 Comedy
141        143                         Gospa (1995)                  Drama
281        284                  New York Cop (1996)           Action|Crime
...        ...                                  ...                    ...
3581      3650            Anguish (Angustia) (1986)                 Horror
3681      3750                Boricua's Bond (2000)                  Drama
3759      3829               Mad About Mambo (2000)         Comedy|Romance
3786      3856                  Autumn Heart (1999)                  Drama
3837      3907   Prince of Central Park, The (1999)                  Drama

[177 rows x 3 columns]
没有参与过评分的电影数量为: 177


In [12]:
# 有哪些用户没有参与过评分
unrated_users=users_df[~users_df['user_id'].isin(ratings_df['user_id'])]
print("没有参与过评分的用户:")
print(unrated_users)
print("没有参与过评分的用户数量为:", len(unrated_users))

没有参与过评分的用户:
Empty DataFrame
Columns: [user_id, gender, age, occupation, zip_code]
Index: []
没有参与过评分的用户数量为: 0


# 特征工程

## 用户特征工程

### 1. 用户统计特征

In [13]:
user_stats=data.groupby('user_id')['rating'].agg([
    ('mean_rating', 'mean'), # 平均评分
    ('rating_std', 'std'), # 评分标准差
    ('rating_count','count'), # 评分次数
    ('rating_min','min'), # 最小评分
    ('rating_max','max') # 最大评分
]).reset_index()


# 计算用户评分严格程度
global_mean_rating=data['rating'].mean()
user_stats['rating_strictness']=global_mean_rating-user_stats['mean_rating']

# 计算用户评分波动程度
user_stats['rating_variability']=user_stats['rating_std']/user_stats['mean_rating']

# 查看结果
print(user_stats.head())

   user_id  mean_rating  rating_std  rating_count  rating_min  rating_max  \
0        1     4.188679    0.680967            53           3           5   
1        2     3.713178    1.001513           129           1           5   
2        3     3.901961    0.984985            51           1           5   
3        4     4.190476    1.077917            21           1           5   
4        5     3.146465    1.132699           198           1           5   

   rating_strictness  rating_variability  
0          -0.607115            0.162573  
1          -0.131614            0.269719  
2          -0.320396            0.252433  
3          -0.608912            0.257230  
4           0.435100            0.359991  


### 2. 用户电影类型偏好特征

In [14]:
# 获取所有电影类型列
genre_columns=[col for col in data.columns if col not in ['user_id', 'movie_id', 'rating', 'timestamp','title', 'genres']]


# 计算用户对每种类型的评分次数和平均评分（用户ID、电影类型以及评分次数）
user_genre_stats=data.groupby('user_id')[genre_columns].sum().reset_index()

# 计算用户对每种类型的偏好程度（按行处理，将每一个用户对某一类型的评分，除以该用户对所有类型评分的总和）
for genre in genre_columns:
    user_genre_stats[f'{genre}_favorite_degree']=user_genre_stats[genre]/user_genre_stats[genre_columns].sum(axis=1)

for genre in genre_columns:
    user_genre_stats[f'{genre}_rating_cnt']=user_genre_stats[genre]

# 计算用户最喜欢的类型（返回最大值所在的索引），axis=1表示按照行操作
user_genre_stats['favorite_genre']=user_genre_stats[genre_columns].idxmax(axis=1)

# 计算用户喜欢的类型数量（评分过的类型数）（得到该行中评分大于0的类型数量）
user_genre_stats['num_liked_genres']=(user_genre_stats[genre_columns]>0).sum(axis=1)

user_genre_stats.drop(columns=genre_columns,inplace=True)

# 合并所有用户特征
user_features=pd.merge(user_stats, user_genre_stats, on='user_id')

# 添加用户活跃度分段特征
user_features['activity_level'] = pd.cut(
    user_features['rating_count'],
    bins=[0, 5, 20, 100, float('inf')],
    labels=['inactive', 'casual', 'active', 'super']
)

# 用户活跃度特征编码
activity_encoder = LabelEncoder()
user_features['activity_level_encoded'] = activity_encoder.fit_transform(user_features['activity_level'])

# 用户最喜欢的类型编码
genre_encoder = LabelEncoder()
user_features['favorite_genre_encoded'] = genre_encoder.fit_transform(user_features['favorite_genre'])


In [15]:
pd.set_option('display.max_columns', None)
user_features.head()

Unnamed: 0,user_id,mean_rating,rating_std,rating_count,rating_min,rating_max,rating_strictness,rating_variability,Action_favorite_degree,Adventure_favorite_degree,Animation_favorite_degree,Children's_favorite_degree,Comedy_favorite_degree,Crime_favorite_degree,Documentary_favorite_degree,Drama_favorite_degree,Fantasy_favorite_degree,Film-Noir_favorite_degree,Horror_favorite_degree,Musical_favorite_degree,Mystery_favorite_degree,Romance_favorite_degree,Sci-Fi_favorite_degree,Thriller_favorite_degree,War_favorite_degree,Western_favorite_degree,Action_rating_cnt,Adventure_rating_cnt,Animation_rating_cnt,Children's_rating_cnt,Comedy_rating_cnt,Crime_rating_cnt,Documentary_rating_cnt,Drama_rating_cnt,Fantasy_rating_cnt,Film-Noir_rating_cnt,Horror_rating_cnt,Musical_rating_cnt,Mystery_rating_cnt,Romance_rating_cnt,Sci-Fi_rating_cnt,Thriller_rating_cnt,War_rating_cnt,Western_rating_cnt,favorite_genre,num_liked_genres,activity_level,activity_level_encoded,favorite_genre_encoded
0,1,4.188679,0.680967,53,3,5,-0.607115,0.162573,0.043103,0.043103,0.155172,0.172414,0.12069,0.017241,0.0,0.181034,0.025862,0.0,0.0,0.12069,0.0,0.051724,0.025862,0.025862,0.017241,0.0,5,5,18,20,14,2,0,21,3,0,0,14,0,6,3,3,2,0,Drama,13,active,0,7
1,2,3.713178,1.001513,129,1,5,-0.131614,0.269719,0.194444,0.065972,0.0,0.0,0.086806,0.041667,0.0,0.274306,0.003472,0.003472,0.006944,0.0,0.010417,0.083333,0.059028,0.107639,0.052083,0.010417,56,19,0,0,25,12,0,79,1,1,2,0,3,24,17,31,15,3,Drama,14,super,2,7
2,3,3.901961,0.984985,51,1,5,-0.320396,0.252433,0.186992,0.203252,0.02439,0.02439,0.243902,0.0,0.0,0.065041,0.01626,0.0,0.02439,0.00813,0.00813,0.04065,0.04878,0.04065,0.01626,0.04878,23,25,3,3,30,0,0,8,2,0,3,1,1,5,6,5,2,6,Comedy,15,active,0,4
3,4,4.190476,1.077917,21,1,5,-0.608912,0.25723,0.327586,0.103448,0.0,0.017241,0.0,0.017241,0.0,0.103448,0.034483,0.0,0.051724,0.0,0.0,0.034483,0.155172,0.068966,0.051724,0.034483,19,6,0,1,0,1,0,6,2,0,3,0,0,2,9,4,3,2,Action,12,active,0,0
4,5,3.146465,1.132699,198,1,5,0.4351,0.359991,0.088068,0.025568,0.011364,0.017045,0.159091,0.059659,0.017045,0.295455,0.0,0.008523,0.028409,0.008523,0.022727,0.085227,0.042614,0.110795,0.017045,0.002841,31,9,4,6,56,21,6,104,0,3,10,3,8,30,15,39,6,1,Drama,17,super,2,7


## 电影特征工程

In [16]:
# 计算电影的基本评分统计
movie_stats=data.groupby('movie_id')['rating'].agg([
    ('movie_mean_rating', 'mean'),
    ('movie_rating_std', 'std'),
    ('movie_rating_count', 'count')
])

# 填充可能存在的NaN值
movie_stats['movie_rating_std']=movie_stats['movie_rating_std'].fillna(0)

print("movie_stats:")
print(movie_stats)

# 合并电影原始信息
movie_features=pd.merge(movies, movie_stats, on='movie_id')

# 添加电影热度特征
movie_features['popularity'] = pd.cut(
    movie_features['movie_rating_count'],
    bins=[0, 10, 100, 500, float('inf')],
    labels=['niche', 'moderate', 'popular', 'blockbuster']
)

# 电影热度编码
popularity_encoder = LabelEncoder()
movie_features['popularity_encoded'] = popularity_encoder.fit_transform(movie_features['popularity'])

# 计算电影类型纯度（类型数量越少，纯度越高）
movie_features['genre_purity']=1/movie_features[genre_columns].sum(axis=1)

# 提取电影发布年份
movie_features['year'] = movie_features['title'].str.extract(r'\((\d{4})\)')
movie_features['year'] = movie_features['year'].fillna('1990').astype(int)

# 创建标题长度特征
movie_features['title_length'] = movie_features['title'].str.len()

movie_stats:
          movie_mean_rating  movie_rating_std  movie_rating_count
movie_id                                                         
1                  4.146846          0.852349                2077
2                  3.201141          0.983172                 701
3                  3.016736          1.071712                 478
4                  2.729412          1.013381                 170
5                  3.006757          1.025086                 296
...                     ...               ...                 ...
3948               3.635731          1.014196                 862
3949               4.115132          1.009804                 304
3950               3.666667          1.046107                  54
3951               3.900000          1.057331                  40
3952               3.780928          0.935074                 388

[3706 rows x 3 columns]


In [17]:
print("电影维度:", movie_features.shape)
print("用户维度:", user_features.shape)

电影维度: (3706, 29)
用户维度: (6040, 49)


## 检查user_features和movie_features的空值情况

In [18]:
# 检查user_features中哪些列含有空值
user_features_null_columns = user_features.columns[user_features.isna().any()].tolist()
print("user_features中含有空值的列:", user_features_null_columns)

# 检查movie_features中哪些列含有空值
movie_features_null_columns = movie_features.columns[movie_features.isna().any()].tolist()
print("movie_features中含有空值的列:", movie_features_null_columns)

user_features中含有空值的列: []
movie_features中含有空值的列: []


## 保存特征数据

In [19]:
user_features.to_csv('/Users/bytedance/Desktop/MovieLens-Recommendation-System/data/features/user_features.csv', index=False)
movie_features.to_csv('/Users/bytedance/Desktop/MovieLens-Recommendation-System/data/features/movie_features.csv', index=False)

In [20]:
pd.set_option('display.max_columns', None)
user_features.head()

Unnamed: 0,user_id,mean_rating,rating_std,rating_count,rating_min,rating_max,rating_strictness,rating_variability,Action_favorite_degree,Adventure_favorite_degree,Animation_favorite_degree,Children's_favorite_degree,Comedy_favorite_degree,Crime_favorite_degree,Documentary_favorite_degree,Drama_favorite_degree,Fantasy_favorite_degree,Film-Noir_favorite_degree,Horror_favorite_degree,Musical_favorite_degree,Mystery_favorite_degree,Romance_favorite_degree,Sci-Fi_favorite_degree,Thriller_favorite_degree,War_favorite_degree,Western_favorite_degree,Action_rating_cnt,Adventure_rating_cnt,Animation_rating_cnt,Children's_rating_cnt,Comedy_rating_cnt,Crime_rating_cnt,Documentary_rating_cnt,Drama_rating_cnt,Fantasy_rating_cnt,Film-Noir_rating_cnt,Horror_rating_cnt,Musical_rating_cnt,Mystery_rating_cnt,Romance_rating_cnt,Sci-Fi_rating_cnt,Thriller_rating_cnt,War_rating_cnt,Western_rating_cnt,favorite_genre,num_liked_genres,activity_level,activity_level_encoded,favorite_genre_encoded
0,1,4.188679,0.680967,53,3,5,-0.607115,0.162573,0.043103,0.043103,0.155172,0.172414,0.12069,0.017241,0.0,0.181034,0.025862,0.0,0.0,0.12069,0.0,0.051724,0.025862,0.025862,0.017241,0.0,5,5,18,20,14,2,0,21,3,0,0,14,0,6,3,3,2,0,Drama,13,active,0,7
1,2,3.713178,1.001513,129,1,5,-0.131614,0.269719,0.194444,0.065972,0.0,0.0,0.086806,0.041667,0.0,0.274306,0.003472,0.003472,0.006944,0.0,0.010417,0.083333,0.059028,0.107639,0.052083,0.010417,56,19,0,0,25,12,0,79,1,1,2,0,3,24,17,31,15,3,Drama,14,super,2,7
2,3,3.901961,0.984985,51,1,5,-0.320396,0.252433,0.186992,0.203252,0.02439,0.02439,0.243902,0.0,0.0,0.065041,0.01626,0.0,0.02439,0.00813,0.00813,0.04065,0.04878,0.04065,0.01626,0.04878,23,25,3,3,30,0,0,8,2,0,3,1,1,5,6,5,2,6,Comedy,15,active,0,4
3,4,4.190476,1.077917,21,1,5,-0.608912,0.25723,0.327586,0.103448,0.0,0.017241,0.0,0.017241,0.0,0.103448,0.034483,0.0,0.051724,0.0,0.0,0.034483,0.155172,0.068966,0.051724,0.034483,19,6,0,1,0,1,0,6,2,0,3,0,0,2,9,4,3,2,Action,12,active,0,0
4,5,3.146465,1.132699,198,1,5,0.4351,0.359991,0.088068,0.025568,0.011364,0.017045,0.159091,0.059659,0.017045,0.295455,0.0,0.008523,0.028409,0.008523,0.022727,0.085227,0.042614,0.110795,0.017045,0.002841,31,9,4,6,56,21,6,104,0,3,10,3,8,30,15,39,6,1,Drama,17,super,2,7


In [21]:
movie_features.head()

Unnamed: 0,movie_id,title,genres,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,movie_mean_rating,movie_rating_std,movie_rating_count,popularity,popularity_encoded,genre_purity,year,title_length
0,1,Toy Story (1995),Animation|Children's|Comedy,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,4.146846,0.852349,2077,blockbuster,0,0.333333,1995,16
1,2,Jumanji (1995),Adventure|Children's|Fantasy,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,3.201141,0.983172,701,blockbuster,0,0.333333,1995,14
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,3.016736,1.071712,478,popular,3,0.5,1995,23
3,4,Waiting to Exhale (1995),Comedy|Drama,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,2.729412,1.013381,170,popular,3,0.5,1995,24
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3.006757,1.025086,296,popular,3,1.0,1995,34


## 数据准备：生成正负样本对

### 读取数值型特征

In [22]:
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models

# 从 user_features 和 movie_features 中筛选出数值型特征
def select_numeric_columns(df):
    # 选择数值型列（int 和 float）
    numeric_columns = df.select_dtypes(include=['number']).columns
    return df[numeric_columns]

# 筛选用户和电影的数值型特征
user_features_numeric = select_numeric_columns(user_features)
movie_features_numeric = select_numeric_columns(movie_features)

user_features_numeric = user_features_numeric.astype('float32')
movie_features_numeric = movie_features_numeric.astype('float32')

# 合并用户特征和电影特征
user_movie_pairs = pd.merge(ratings_df[['user_id', 'movie_id', 'rating']], user_features_numeric, on='user_id')
user_movie_pairs = pd.merge(user_movie_pairs, movie_features_numeric, on='movie_id')



In [23]:
user_movie_pairs = pd.merge(user_movie_pairs, user_stats[['user_id', 'mean_rating']], on='user_id')

# 确保删除了合并时产生的无用列
user_movie_pairs.drop(columns=['mean_rating_x'], inplace=True, errors='ignore')
user_movie_pairs.rename(columns={'mean_rating_y': 'mean_rating'}, inplace=True)

# 根据用户的评分和平均评分来设定正负样本
user_movie_pairs['label'] = user_movie_pairs.apply(
    lambda x: 1 if x['rating'] > x['mean_rating'] else 0, axis=1
)


### 负样本采样

In [24]:
import random
import pandas as pd
import numpy as np
from tqdm import tqdm

def vectorized_generate_negatives(user_movie_pairs, movies_df, ratings_df, num_hard_negatives=5, num_disliked_negatives=3):
    """
    为每个用户生成负样本，包括：
    - 随机负样本（用户未观看的）
    - Hard Negative（热门高评分未观看）
    - 用户已看但不喜欢的电影（低于该电影平均分）

    Args:
        user_movie_pairs (pd.DataFrame): 用户-电影交互对。
        movies_df (pd.DataFrame): 包含电影信息。
        ratings_df (pd.DataFrame): 包含用户评分数据。
        num_hard_negatives (int): 每个用户生成的 Hard Negative 样本数。
        num_disliked_negatives (int): 每个用户生成的低评分负样本数。

    Returns:
        pd.DataFrame: 包含负样本的 DataFrame，列为 'user_id', 'movie_id', 'label'
    """
    # 创建用户-电影交互矩阵
    user_movie_matrix = pd.crosstab(user_movie_pairs['user_id'], user_movie_pairs['movie_id'])

    all_users = user_movie_matrix.index
    all_movies = user_movie_matrix.columns

    # 计算电影平均评分与观看次数
    movie_stats = ratings_df.groupby('movie_id')['rating'].agg(['mean', 'count']).reset_index()
    movie_stats.rename(columns={'mean': 'avg_rating', 'count': 'rating_count'}, inplace=True)

    # 合并电影统计信息
    movies_with_stats = pd.merge(movies_df, movie_stats, on='movie_id', how='left')
    movies_with_stats['rating_count'].fillna(0, inplace=True)

    # 热门高分电影筛选
    popularity_threshold = movies_with_stats['rating_count'].quantile(0.75)
    high_rating_threshold = 4.0 # 高评分阈值
    popular_high_rated_movies = movies_with_stats[
        (movies_with_stats['avg_rating'] >= high_rating_threshold) &
        (movies_with_stats['rating_count'] >= popularity_threshold)
    ]['movie_id'].tolist()

    # 缓存用户已观看电影
    user_history = user_movie_pairs.groupby('user_id')['movie_id'].apply(list).to_dict()

    # 缓存用户评分信息
    user_ratings = ratings_df.groupby('user_id').apply(lambda x: x.set_index('movie_id')['rating'].to_dict()).to_dict()
    movie_avg_rating_map = movie_stats.set_index('movie_id')['avg_rating'].to_dict()

    negative_pairs = []

    for user_id in tqdm(all_users, desc='为每个用户生成负样本'):
        seen_movies = set(user_movie_matrix.columns[user_movie_matrix.loc[user_id] > 0])
        user_watched_movies = user_history.get(user_id, [])
        unseen_movies = list(set(all_movies) - seen_movies)

        # 1. 未看过的随机电影
        if unseen_movies:
            random_negative_movie = np.random.choice(unseen_movies)
            negative_pairs.append({
                'user_id': user_id,
                'movie_id': random_negative_movie,
                'label': 0
            })

        # 2. 热门高评分负样本
        hard_negatives = generate_popular_high_rated_negatives(
            user_watched_movies,
            popular_high_rated_movies,
            num_hard_negatives
        )
        for mid in hard_negatives:
            negative_pairs.append({'user_id': user_id, 'movie_id': mid, 'label': 0})

        # 3. 看过但评分低于平均值的电影
        user_movie_ratings = user_ratings.get(user_id, {})
        disliked = []
        for mid, rating in user_movie_ratings.items():
            avg_rating = movie_avg_rating_map.get(mid, 0)
            if rating < avg_rating:
                disliked.append(mid)
        sampled_disliked = random.sample(disliked, min(len(disliked), num_disliked_negatives))
        for mid in sampled_disliked:
            negative_pairs.append({'user_id': user_id, 'movie_id': mid, 'label': 0})

    return pd.DataFrame(negative_pairs)


def generate_popular_high_rated_negatives(user_watched_movies, all_popular_high_rated_movies, num_negatives):
    available_negatives = list(set(all_popular_high_rated_movies) - set(user_watched_movies))
    if not available_negatives:
        return []
    return list(np.random.choice(available_negatives, size=min(num_negatives, len(available_negatives)), replace=False))


def create_features(data, user_features, movie_features):
    user_data = user_features[user_features['user_id'].isin(data['user_id'])].set_index('user_id')
    movie_data = movie_features[movie_features['movie_id'].isin(data['movie_id'])].set_index('movie_id')
    
    user_feats = user_data.loc[data['user_id']].values
    movie_feats = movie_data.loc[data['movie_id']].values
    
    return user_feats, movie_feats


In [25]:
# 获取负样本
negative_samples=vectorized_generate_negatives(user_movie_pairs, movies_df, ratings_df)

# 获取正样本
positive_samples = user_movie_pairs[user_movie_pairs['label']==1][['user_id', 'movie_id', 'label']]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  movies_with_stats['rating_count'].fillna(0, inplace=True)
  user_ratings = ratings_df.groupby('user_id').apply(lambda x: x.set_index('movie_id')['rating'].to_dict()).to_dict()
为每个用户生成负样本: 100%|██████████| 6040/6040 [00:02<00:00, 2600.15it/s]


In [26]:
# 将正样本和负样本合并在一起
data=pd.concat([positive_samples, negative_samples])


In [27]:
def build_pairwise_training_data(pos_df, neg_df):
    """
    构造 (user, pos_item, neg_item) 三元组
    要求：每个用户有正样本和负样本
    """
    # 保证负样本存在
    neg_df_grouped = neg_df.groupby('user_id')
    triplets = []

    for idx, row in tqdm(pos_df.iterrows(), desc='构建三元组进度'):
        user_id = row['user_id']
        pos_item = row['movie_id']

        if user_id not in neg_df_grouped.groups:
            continue

        user_negatives = neg_df_grouped.get_group(user_id)['movie_id'].values
        for neg_item in user_negatives:
            triplets.append({'user_id': user_id, 'pos_movie_id': pos_item, 'neg_movie_id': neg_item})

    return pd.DataFrame(triplets)

pairs_pos_neg=build_pairwise_training_data(positive_samples, negative_samples)

构建三元组进度: 543158it [00:34, 15636.85it/s]


## 构建双塔模型

In [63]:
user_feature_dim = user_features_numeric.shape[1]-1
movie_feature_dim = movie_features_numeric.shape[1]-1
num_users=max(user_features['user_id'])+1
num_movies=max(movie_features['movie_id'])+1
embedding_dim = 64


In [73]:
# 2. 构建用户塔模型
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Embedding, Input, Dense, Multiply, Flatten, Dropout

def create_user_tower(num_users, user_feature_dim, embedding_dim, dropout_rate=0.2):
    input_user_id = Input(shape=(1,), name='user_id_input')
    embedding_user = Embedding(input_dim=num_users, output_dim=embedding_dim, name='user_embedding')(input_user_id)
    embedding_user_flat = Flatten()(embedding_user)

    input_user_features = Input(shape=(user_feature_dim,), name='user_features_input')
    merged_features = layers.Concatenate()([embedding_user_flat, input_user_features])

    tower = Dense(128, activation='relu')(merged_features)
    tower=Dropout(dropout_rate)(tower)
    tower = Dense(embedding_dim, name='user_embedding_output')(tower)
    return models.Model(inputs=[input_user_id, input_user_features], outputs=tower)

# 3. 构建电影塔模型 (包含 movie_id)
def create_movie_tower(num_movies, movie_feature_dim, embedding_dim, dropout_rate=0.2):
    input_movie_id = Input(shape=(1,), name='movie_id_input')
    embedding_movie = Embedding(input_dim=num_movies, output_dim=embedding_dim, name='movie_embedding')(input_movie_id)
    embedding_movie_flat = Flatten()(embedding_movie)

    input_movie_features = Input(shape=(movie_feature_dim,), name='movie_features_input')
    merged_features = layers.Concatenate()([embedding_movie_flat, input_movie_features])

    tower = Dense(128, activation='relu')(merged_features)
    tower=Dropout(dropout_rate)(tower)
    tower = Dense(embedding_dim, name='movie_embedding_output')(tower)
    return models.Model(inputs=[input_movie_id, input_movie_features], outputs=tower)


# 创建用户塔和电影塔实例
user_tower = create_user_tower(num_users, user_feature_dim, embedding_dim)
movie_tower = create_movie_tower(num_movies, movie_feature_dim, embedding_dim)


In [74]:
user_tower.summary()

In [75]:
movie_tower.summary()

## 定义pairwise损失函数

In [67]:
# 4. 定义 pairwise 损失函数 (Margin Ranking Loss)
def pairwise_loss(positive_similarity, negative_similarity, margin=0.1):
    loss = tf.maximum(0.0, margin - positive_similarity + negative_similarity)
    return tf.reduce_mean(loss)

In [77]:
user_ids = pairs_pos_neg['user_id'].values.astype('int32').reshape(-1, 1)
positive_movie_ids = pairs_pos_neg['pos_movie_id'].values.astype('int32').reshape(-1, 1)
negative_movie_ids = pairs_pos_neg['neg_movie_id'].values.astype('int32').reshape(-1, 1)

user_feature_dict = user_features_numeric.set_index('user_id').to_dict('index')
movie_feature_dict = movie_features_numeric.set_index('movie_id').to_dict('index')

user_feature_list = np.array([list(user_feature_dict[uid].values()) for uid in pairs_pos_neg['user_id']])
positive_movie_feature_list = np.array([list(movie_feature_dict[mid].values()) for mid in pairs_pos_neg['pos_movie_id']])
negative_movie_feature_list = np.array([list(movie_feature_dict[mid].values()) for mid in pairs_pos_neg['neg_movie_id']])


In [78]:
user_ids_train, user_ids_val, user_features_train, user_features_val, pos_movie_ids_train, pos_movie_ids_val, pos_movie_features_train, pos_movie_features_val, neg_movie_ids_train, neg_movie_ids_val, neg_movie_features_train, neg_movie_features_val = train_test_split(
    user_ids, user_feature_list, positive_movie_ids, positive_movie_feature_list, negative_movie_ids, negative_movie_feature_list, test_size=0.2, random_state=42
)

In [86]:
print("训练集数据量:", len(user_ids_train))
print("测试集数据量:", len(user_ids_val))

训练集数据量: 3910201
测试集数据量: 977551


## 模型训练

In [None]:
# 5. 定义模型训练步骤 (需要处理 ID 输入)
def train_step_with_id(user_ids_batch, user_features_batch, positive_movie_ids_batch, positive_movie_features_batch, negative_movie_ids_batch, negative_movie_features_batch, user_tower, movie_tower, optimizer):
    with tf.GradientTape() as tape:
        user_embeddings = user_tower([user_ids_batch, user_features_batch])
        positive_movie_embeddings = movie_tower([positive_movie_ids_batch, positive_movie_features_batch])
        negative_movie_embeddings = movie_tower([negative_movie_ids_batch, negative_movie_features_batch])

        # 计算余弦相似度
        positive_similarity = tf.reduce_sum(tf.multiply(user_embeddings, positive_movie_embeddings), axis=-1)
        negative_similarity = tf.reduce_sum(tf.multiply(user_embeddings, negative_movie_embeddings), axis=-1)

        loss = pairwise_loss(positive_similarity, negative_similarity)

    gradients = tape.gradient(loss, user_tower.trainable_variables + movie_tower.trainable_variables)
    optimizer.apply_gradients(zip(gradients, user_tower.trainable_variables + movie_tower.trainable_variables))
    return loss

In [95]:
# 7. 定义训练参数
epochs = 2
batch_size = 1000
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

# 8. 开始训练
for epoch in range(epochs):
    print(f"Epoch {epoch+1}")
    num_batches = len(user_ids_train) // batch_size
    for batch in range(num_batches):
        start_index = batch * batch_size
        end_index = (batch + 1) * batch_size

        user_ids_batch = user_ids_train[start_index:end_index]
        user_features_batch = user_features_train[start_index:end_index]
        pos_movie_ids_batch = pos_movie_ids_train[start_index:end_index]
        pos_movie_features_batch = pos_movie_features_train[start_index:end_index]
        neg_movie_ids_batch = neg_movie_ids_train[start_index:end_index]
        neg_movie_features_batch = neg_movie_features_train[start_index:end_index]

        loss = train_step_with_id(
            user_ids_batch, user_features_batch,
            pos_movie_ids_batch, pos_movie_features_batch,
            neg_movie_ids_batch, neg_movie_features_batch,
            user_tower, movie_tower, optimizer
        )
        if batch % 100 == 0:
            print(f"Batch {batch}/{num_batches}, Loss: {loss.numpy():.4f}")

Epoch 1
Batch 0/3910, Loss: 0.0675
Batch 100/3910, Loss: 0.0571
Batch 200/3910, Loss: 0.0723
Batch 300/3910, Loss: 0.1260
Batch 400/3910, Loss: 0.1726
Batch 500/3910, Loss: 0.1199
Batch 600/3910, Loss: 0.0664
Batch 700/3910, Loss: 0.0632
Batch 800/3910, Loss: 0.0502
Batch 900/3910, Loss: 0.0658
Batch 1000/3910, Loss: 0.2159
Batch 1100/3910, Loss: 0.1043
Batch 1200/3910, Loss: 0.0533
Batch 1300/3910, Loss: 0.0685
Batch 1400/3910, Loss: 0.0488
Batch 1500/3910, Loss: 0.0657
Batch 1600/3910, Loss: 0.0389
Batch 1700/3910, Loss: 0.0726
Batch 1800/3910, Loss: 0.0678
Batch 1900/3910, Loss: 0.0330
Batch 2000/3910, Loss: 0.0473
Batch 2100/3910, Loss: 0.0355
Batch 2200/3910, Loss: 0.0506
Batch 2300/3910, Loss: 0.0482
Batch 2400/3910, Loss: 0.0497
Batch 2500/3910, Loss: 0.0378
Batch 2600/3910, Loss: 0.0812
Batch 2700/3910, Loss: 0.0397
Batch 2800/3910, Loss: 0.0302
Batch 2900/3910, Loss: 0.0475
Batch 3000/3910, Loss: 0.0381
Batch 3100/3910, Loss: 0.0253
Batch 3200/3910, Loss: 0.0339
Batch 3300/391

## 将电影embedding存储到FAISS中

### 获取movie embedding

In [96]:
import faiss

# 1. 从 movie_features 中提取 movie_id 和特征数据
all_movie_ids = movie_features['movie_id'].values.astype('int32').reshape(-1, 1)
movie_feature_columns = [col for col in movie_features_numeric.columns if col != 'movie_id']
all_movie_features = movie_features_numeric[movie_feature_columns].values.astype('float32')


# 2. 使用 movie_tower 的 predict 方法生成 movie embeddings
movie_embeddings = movie_tower.predict([all_movie_ids, all_movie_features])

# 3. 打印 movie_embeddings 的形状，确认已成功生成
print("Shape of movie_embeddings:", movie_embeddings.shape)

[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 398us/step
Shape of movie_embeddings: (3706, 64)


### 将movie embedding存储到FAISS中

In [97]:
# 1. 获取电影 Embedding 和对应的 movie_id
movie_ids = movie_features['movie_id'].values
movie_embeddings_array = movie_embeddings.astype('float32')  # FAISS 需要 float32 类型

# 2. 指定 Embedding 的维度
embedding_dimension = movie_embeddings_array.shape[1]

In [98]:
# 3. 构建 FAISS Index
index = faiss.IndexFlatL2(embedding_dimension)

# 4. 将电影 Embedding 添加到 FAISS Index 中
index.add(movie_embeddings_array)

# 5. 创建一个 movie_id 到 FAISS 索引的映射，方便后续查找
movie_id_to_index = {movie_id: i for i, movie_id in enumerate(movie_ids)}


### 使用FAISS进行召回

In [113]:
def get_similar_movies(user_embedding, faiss_index, movie_ids, top_k=50):
    """
    根据用户 Embedding 在 FAISS 索引中查找最相似的电影。

    Args:
        user_embedding (np.array): 用户 Embedding 向量 (shape: (1, embedding_dimension)).
        faiss_index: 构建好的 FAISS 索引。
        movie_ids (np.array): 所有电影的 movie_id 列表，与 FAISS 索引中的顺序对应。
        top_k (int): 需要召回的电影数量。

    Returns:
        list: 最相似的 top_k 个电影的 movie_id 列表。
    """
    distances, indices = faiss_index.search(np.expand_dims(user_embedding.astype('float32'), axis=0), top_k)
    similar_movie_ids = [movie_ids[i] for i in indices[0]]
    return similar_movie_ids

In [114]:
# 1. 获取验证集用户的embeddings
user_embeddings_val = user_tower.predict([user_ids_val, user_features_val])

[1m30549/30549[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 310us/step


In [119]:
all_retrieved_movie_ids = []
for user_embedding in tqdm(user_embeddings_val, '获取每个用户的召回结果'):
    top_k = 50  # 你想为每个用户召回的电影数量
    similar_movie_ids = get_similar_movies(user_embedding, index, movie_features['movie_id'].values, top_k=top_k)
    all_retrieved_movie_ids.append(similar_movie_ids)

获取每个用户的召回结果: 100%|██████████| 977551/977551 [00:37<00:00, 25991.17it/s]


In [120]:
# 3. 获取验证集用户的正向交互电影 (ground truth)
positive_interactions_val = positive_samples[positive_samples['user_id'].isin(user_ids_val.flatten())]

In [121]:
# 为每个验证集用户创建一个他们实际交互过的电影列表
ground_truth = {}
for user_id in np.unique(user_ids_val):
    ground_truth[user_id] = positive_interactions_val[positive_interactions_val['user_id'] == user_id]['movie_id'].tolist()

In [122]:
ordered_ground_truth = [ground_truth.get(user_id, []) for user_id in user_ids_val.flatten()]

## 离线指标评估

### NDCG

In [125]:
def calculate_ndcg_at_k(ground_truth, predictions, k):
    """
    计算 NDCG@K。

    Args:
        ground_truth (list of lists): 每个用户实际交互过的物品 ID 列表。
        predictions (list of lists): 每个用户召回的 Top-K 个物品 ID 列表。
        k (int): Top-K 的值.

    Returns:
        float: NDCG@K 的平均值.
    """
    all_ndcg = []
    for i in tqdm(range(len(ground_truth)), desc='计算 NDCG@{}'.format(k)):
        actual = ground_truth[i]
        predicted = predictions[i][:k]

        # 创建相关性得分，实际交互过的为 1，否则为 0
        relevance = np.array([1 if item_id in actual else 0 for item_id in predicted])

        # 如果实际相关的物品数量大于 0，则计算 NDCG
        if len(actual) > 0 and len(predicted) > 0:
            # 创建理想的 relevance 列表 (将实际相关的物品排在最前面)
            ideal_relevance = np.ones(min(k, len(actual)))
            ideal_dcg = np.sum(ideal_relevance / np.log2(np.arange(2, len(ideal_relevance) + 2)))

            # 计算实际的 DCG
            actual_dcg = np.sum(relevance / np.log2(np.arange(2, k + 2)))

            if ideal_dcg > 0:
                all_ndcg.append(actual_dcg / ideal_dcg)
            else:
                all_ndcg.append(0.0)  # 如果没有相关的物品，NDCG 为 0
        else:
            all_ndcg.append(0.0)

    return np.mean(all_ndcg) if all_ndcg else 0.0

In [126]:
ndcg_at_k = calculate_ndcg_at_k(ordered_ground_truth, all_retrieved_movie_ids, 50)
print(f"NDCG@{top_k} on the validation set: {ndcg_at_k:.4f}")

计算 NDCG@50: 100%|██████████| 977551/977551 [04:32<00:00, 3582.32it/s]


NDCG@50 on the validation set: 0.0135


In [127]:
def calculate_recall_at_k(ground_truth, predictions, k):
    """
    计算 Recall@K。

    Args:
        ground_truth (list of lists): 每个用户实际交互过的物品 ID 列表。
        predictions (list of lists): 每个用户召回的 Top-K 个物品 ID 列表。
        k (int): Top-K 的值。

    Returns:
        float: Recall@K 的平均值。
    """
    total_recall = 0
    num_users = len(ground_truth)
    for i in tqdm(range(num_users), desc=f"计算 Recall@{k}"):
        actual_items = set(ground_truth[i])
        predicted_items = set(predictions[i][:k])
        if len(actual_items) > 0:
            intersection = len(actual_items.intersection(predicted_items))
            recall = intersection / len(actual_items)
            total_recall += recall
    return total_recall / num_users if num_users > 0 else 0.0

In [128]:
top_k_recall = 50
recall_at_k = calculate_recall_at_k(ordered_ground_truth, all_retrieved_movie_ids, top_k_recall)
print(f"Recall@{top_k_recall} on the validation set: {recall_at_k:.4f}")

计算 Recall@50: 100%|██████████| 977551/977551 [00:06<00:00, 152599.93it/s]

Recall@50 on the validation set: 0.0025



