In [47]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [48]:
movies_df=pd.read_csv('/Users/bytedance/Desktop/MovieLens-Recommendation-System/data/ml-1m/movies.csv')
ratings_df=pd.read_csv('/Users/bytedance/Desktop/MovieLens-Recommendation-System/data/ml-1m/ratings.csv')
users_df=pd.read_csv('/Users/bytedance/Desktop/MovieLens-Recommendation-System/data/ml-1m/users.csv')

# 查看数据基本情况

In [49]:
movies_df.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [50]:
ratings_df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [51]:
users_df.head()

Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [52]:
print("用户样本数:",len(users_df))
print("电影样本数:",len(movies_df))
print("评分样本数:", len(ratings_df))

用户样本数: 6040
电影样本数: 3883
评分样本数: 1000209


In [53]:
# 查看 movies_df 的空值情况
print("movies_df 的空值情况：")
print(movies_df.isna().sum())

# 查看 ratings_df 的空值情况
print("\nratings_df 的空值情况：")
print(ratings_df.isna().sum())

# 查看 users_df 的空值情况
print("\nusers_df 的空值情况：")
print(users_df.isna().sum())

movies_df 的空值情况：
movie_id    0
title       0
genres      0
dtype: int64

ratings_df 的空值情况：
user_id      0
movie_id     0
rating       0
timestamp    0
dtype: int64

users_df 的空值情况：
user_id       0
gender        0
age           0
occupation    0
zip_code      0
dtype: int64


# 数据预处理

In [54]:
genres=movies_df['genres'].str.get_dummies(sep='|')
movies=pd.concat([movies_df, genres], axis=1)

# 合并评分和电影数据
data=pd.merge(ratings_df, movies, on='movie_id')

In [55]:
genres

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3879,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3880,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3881,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [56]:
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [57]:
# 有哪些电影没有参与过评分
unrated_movies = movies_df[~movies_df['movie_id'].isin(ratings_df['movie_id'])]
print("没有参与过评分的电影:")
print(unrated_movies)
print("没有参与过评分的电影数量为:", len(unrated_movies))

没有参与过评分的电影:
      movie_id                                title                 genres
50          51                Guardian Angel (1994)  Action|Drama|Thriller
107        109  Headless Body in Topless Bar (1995)                 Comedy
113        115     Happiness Is in the Field (1995)                 Comedy
141        143                         Gospa (1995)                  Drama
281        284                  New York Cop (1996)           Action|Crime
...        ...                                  ...                    ...
3581      3650            Anguish (Angustia) (1986)                 Horror
3681      3750                Boricua's Bond (2000)                  Drama
3759      3829               Mad About Mambo (2000)         Comedy|Romance
3786      3856                  Autumn Heart (1999)                  Drama
3837      3907   Prince of Central Park, The (1999)                  Drama

[177 rows x 3 columns]
没有参与过评分的电影数量为: 177


In [58]:
# 有哪些用户没有参与过评分
unrated_users=users_df[~users_df['user_id'].isin(ratings_df['user_id'])]
print("没有参与过评分的用户:")
print(unrated_users)
print("没有参与过评分的用户数量为:", len(unrated_users))

没有参与过评分的用户:
Empty DataFrame
Columns: [user_id, gender, age, occupation, zip_code]
Index: []
没有参与过评分的用户数量为: 0


# 特征工程

## 用户特征工程

### 1. 用户统计特征

In [59]:
user_stats=data.groupby('user_id')['rating'].agg([
    ('mean_rating', 'mean'), # 平均评分
    ('rating_std', 'std'), # 评分标准差
    ('rating_count','count'), # 评分次数
    ('rating_min','min'), # 最小评分
    ('rating_max','max') # 最大评分
]).reset_index()


# 计算用户评分严格程度
global_mean_rating=data['rating'].mean()
user_stats['rating_strictness']=global_mean_rating-user_stats['mean_rating']

# 计算用户评分波动程度
user_stats['rating_variability']=user_stats['rating_std']/user_stats['mean_rating']

# 查看结果
print(user_stats.head())

   user_id  mean_rating  rating_std  rating_count  rating_min  rating_max  \
0        1     4.188679    0.680967            53           3           5   
1        2     3.713178    1.001513           129           1           5   
2        3     3.901961    0.984985            51           1           5   
3        4     4.190476    1.077917            21           1           5   
4        5     3.146465    1.132699           198           1           5   

   rating_strictness  rating_variability  
0          -0.607115            0.162573  
1          -0.131614            0.269719  
2          -0.320396            0.252433  
3          -0.608912            0.257230  
4           0.435100            0.359991  


### 2. 用户电影类型偏好特征

In [60]:
# 获取所有电影类型列
genre_columns=[col for col in data.columns if col not in ['user_id', 'movie_id', 'rating', 'timestamp','title', 'genres']]


# 计算用户对每种类型的评分次数和平均评分（用户ID、电影类型以及评分次数）
user_genre_stats=data.groupby('user_id')[genre_columns].sum().reset_index()

# 计算用户对每种类型的偏好程度（按行处理，将每一个用户对某一类型的评分，除以该用户对所有类型评分的总和）
for genre in genre_columns:
    user_genre_stats[f'{genre}_favorite_degree']=user_genre_stats[genre]/user_genre_stats[genre_columns].sum(axis=1)

for genre in genre_columns:
    user_genre_stats[f'{genre}_rating_cnt']=user_genre_stats[genre]

# 计算用户最喜欢的类型（返回最大值所在的索引），axis=1表示按照行操作
user_genre_stats['favorite_genre']=user_genre_stats[genre_columns].idxmax(axis=1)

# 计算用户喜欢的类型数量（评分过的类型数）（得到该行中评分大于0的类型数量）
user_genre_stats['num_liked_genres']=(user_genre_stats[genre_columns]>0).sum(axis=1)

user_genre_stats.drop(columns=genre_columns,inplace=True)

# 合并所有用户特征
user_features=pd.merge(user_stats, user_genre_stats, on='user_id')

# 添加用户活跃度分段特征
user_features['activity_level'] = pd.cut(
    user_features['rating_count'],
    bins=[0, 5, 20, 100, float('inf')],
    labels=['inactive', 'casual', 'active', 'super']
)

# 用户活跃度特征编码
activity_encoder = LabelEncoder()
user_features['activity_level_encoded'] = activity_encoder.fit_transform(user_features['activity_level'])

# 用户最喜欢的类型编码
genre_encoder = LabelEncoder()
user_features['favorite_genre_encoded'] = genre_encoder.fit_transform(user_features['favorite_genre'])


In [61]:
pd.set_option('display.max_columns', None)
user_features.head()

Unnamed: 0,user_id,mean_rating,rating_std,rating_count,rating_min,rating_max,rating_strictness,rating_variability,Action_favorite_degree,Adventure_favorite_degree,Animation_favorite_degree,Children's_favorite_degree,Comedy_favorite_degree,Crime_favorite_degree,Documentary_favorite_degree,Drama_favorite_degree,Fantasy_favorite_degree,Film-Noir_favorite_degree,Horror_favorite_degree,Musical_favorite_degree,Mystery_favorite_degree,Romance_favorite_degree,Sci-Fi_favorite_degree,Thriller_favorite_degree,War_favorite_degree,Western_favorite_degree,Action_rating_cnt,Adventure_rating_cnt,Animation_rating_cnt,Children's_rating_cnt,Comedy_rating_cnt,Crime_rating_cnt,Documentary_rating_cnt,Drama_rating_cnt,Fantasy_rating_cnt,Film-Noir_rating_cnt,Horror_rating_cnt,Musical_rating_cnt,Mystery_rating_cnt,Romance_rating_cnt,Sci-Fi_rating_cnt,Thriller_rating_cnt,War_rating_cnt,Western_rating_cnt,favorite_genre,num_liked_genres,activity_level,activity_level_encoded,favorite_genre_encoded
0,1,4.188679,0.680967,53,3,5,-0.607115,0.162573,0.043103,0.043103,0.155172,0.172414,0.12069,0.017241,0.0,0.181034,0.025862,0.0,0.0,0.12069,0.0,0.051724,0.025862,0.025862,0.017241,0.0,5,5,18,20,14,2,0,21,3,0,0,14,0,6,3,3,2,0,Drama,13,active,0,7
1,2,3.713178,1.001513,129,1,5,-0.131614,0.269719,0.194444,0.065972,0.0,0.0,0.086806,0.041667,0.0,0.274306,0.003472,0.003472,0.006944,0.0,0.010417,0.083333,0.059028,0.107639,0.052083,0.010417,56,19,0,0,25,12,0,79,1,1,2,0,3,24,17,31,15,3,Drama,14,super,2,7
2,3,3.901961,0.984985,51,1,5,-0.320396,0.252433,0.186992,0.203252,0.02439,0.02439,0.243902,0.0,0.0,0.065041,0.01626,0.0,0.02439,0.00813,0.00813,0.04065,0.04878,0.04065,0.01626,0.04878,23,25,3,3,30,0,0,8,2,0,3,1,1,5,6,5,2,6,Comedy,15,active,0,4
3,4,4.190476,1.077917,21,1,5,-0.608912,0.25723,0.327586,0.103448,0.0,0.017241,0.0,0.017241,0.0,0.103448,0.034483,0.0,0.051724,0.0,0.0,0.034483,0.155172,0.068966,0.051724,0.034483,19,6,0,1,0,1,0,6,2,0,3,0,0,2,9,4,3,2,Action,12,active,0,0
4,5,3.146465,1.132699,198,1,5,0.4351,0.359991,0.088068,0.025568,0.011364,0.017045,0.159091,0.059659,0.017045,0.295455,0.0,0.008523,0.028409,0.008523,0.022727,0.085227,0.042614,0.110795,0.017045,0.002841,31,9,4,6,56,21,6,104,0,3,10,3,8,30,15,39,6,1,Drama,17,super,2,7


## 电影特征工程

In [62]:
# 计算电影的基本评分统计
movie_stats=data.groupby('movie_id')['rating'].agg([
    ('movie_mean_rating', 'mean'),
    ('movie_rating_std', 'std'),
    ('movie_rating_count', 'count')
])

# 填充可能存在的NaN值
movie_stats['movie_rating_std']=movie_stats['movie_rating_std'].fillna(0)

print("movie_stats:")
print(movie_stats)

# 合并电影原始信息
movie_features=pd.merge(movies, movie_stats, on='movie_id')

# 添加电影热度特征
movie_features['popularity'] = pd.cut(
    movie_features['movie_rating_count'],
    bins=[0, 10, 100, 500, float('inf')],
    labels=['niche', 'moderate', 'popular', 'blockbuster']
)

# 电影热度编码
popularity_encoder = LabelEncoder()
movie_features['popularity_encoded'] = popularity_encoder.fit_transform(movie_features['popularity'])

# 计算电影类型纯度（类型数量越少，纯度越高）
movie_features['genre_purity']=1/movie_features[genre_columns].sum(axis=1)

# 提取电影发布年份
movie_features['year'] = movie_features['title'].str.extract(r'\((\d{4})\)')
movie_features['year'] = movie_features['year'].fillna('1990').astype(int)

# 创建标题长度特征
movie_features['title_length'] = movie_features['title'].str.len()

movie_stats:
          movie_mean_rating  movie_rating_std  movie_rating_count
movie_id                                                         
1                  4.146846          0.852349                2077
2                  3.201141          0.983172                 701
3                  3.016736          1.071712                 478
4                  2.729412          1.013381                 170
5                  3.006757          1.025086                 296
...                     ...               ...                 ...
3948               3.635731          1.014196                 862
3949               4.115132          1.009804                 304
3950               3.666667          1.046107                  54
3951               3.900000          1.057331                  40
3952               3.780928          0.935074                 388

[3706 rows x 3 columns]


In [63]:
print("电影维度:", movie_features.shape)
print("用户维度:", user_features.shape)

电影维度: (3706, 29)
用户维度: (6040, 49)


## 检查user_features和movie_features的空值情况

In [64]:
# 检查user_features中哪些列含有空值
user_features_null_columns = user_features.columns[user_features.isna().any()].tolist()
print("user_features中含有空值的列:", user_features_null_columns)

# 检查movie_features中哪些列含有空值
movie_features_null_columns = movie_features.columns[movie_features.isna().any()].tolist()
print("movie_features中含有空值的列:", movie_features_null_columns)

user_features中含有空值的列: []
movie_features中含有空值的列: []


## 保存特征数据

In [65]:
user_features.to_csv('/Users/bytedance/Desktop/MovieLens-Recommendation-System/data/features/user_features.csv', index=False)
movie_features.to_csv('/Users/bytedance/Desktop/MovieLens-Recommendation-System/data/features/movie_features.csv', index=False)

In [66]:
pd.set_option('display.max_columns', None)
user_features.head()

Unnamed: 0,user_id,mean_rating,rating_std,rating_count,rating_min,rating_max,rating_strictness,rating_variability,Action_favorite_degree,Adventure_favorite_degree,Animation_favorite_degree,Children's_favorite_degree,Comedy_favorite_degree,Crime_favorite_degree,Documentary_favorite_degree,Drama_favorite_degree,Fantasy_favorite_degree,Film-Noir_favorite_degree,Horror_favorite_degree,Musical_favorite_degree,Mystery_favorite_degree,Romance_favorite_degree,Sci-Fi_favorite_degree,Thriller_favorite_degree,War_favorite_degree,Western_favorite_degree,Action_rating_cnt,Adventure_rating_cnt,Animation_rating_cnt,Children's_rating_cnt,Comedy_rating_cnt,Crime_rating_cnt,Documentary_rating_cnt,Drama_rating_cnt,Fantasy_rating_cnt,Film-Noir_rating_cnt,Horror_rating_cnt,Musical_rating_cnt,Mystery_rating_cnt,Romance_rating_cnt,Sci-Fi_rating_cnt,Thriller_rating_cnt,War_rating_cnt,Western_rating_cnt,favorite_genre,num_liked_genres,activity_level,activity_level_encoded,favorite_genre_encoded
0,1,4.188679,0.680967,53,3,5,-0.607115,0.162573,0.043103,0.043103,0.155172,0.172414,0.12069,0.017241,0.0,0.181034,0.025862,0.0,0.0,0.12069,0.0,0.051724,0.025862,0.025862,0.017241,0.0,5,5,18,20,14,2,0,21,3,0,0,14,0,6,3,3,2,0,Drama,13,active,0,7
1,2,3.713178,1.001513,129,1,5,-0.131614,0.269719,0.194444,0.065972,0.0,0.0,0.086806,0.041667,0.0,0.274306,0.003472,0.003472,0.006944,0.0,0.010417,0.083333,0.059028,0.107639,0.052083,0.010417,56,19,0,0,25,12,0,79,1,1,2,0,3,24,17,31,15,3,Drama,14,super,2,7
2,3,3.901961,0.984985,51,1,5,-0.320396,0.252433,0.186992,0.203252,0.02439,0.02439,0.243902,0.0,0.0,0.065041,0.01626,0.0,0.02439,0.00813,0.00813,0.04065,0.04878,0.04065,0.01626,0.04878,23,25,3,3,30,0,0,8,2,0,3,1,1,5,6,5,2,6,Comedy,15,active,0,4
3,4,4.190476,1.077917,21,1,5,-0.608912,0.25723,0.327586,0.103448,0.0,0.017241,0.0,0.017241,0.0,0.103448,0.034483,0.0,0.051724,0.0,0.0,0.034483,0.155172,0.068966,0.051724,0.034483,19,6,0,1,0,1,0,6,2,0,3,0,0,2,9,4,3,2,Action,12,active,0,0
4,5,3.146465,1.132699,198,1,5,0.4351,0.359991,0.088068,0.025568,0.011364,0.017045,0.159091,0.059659,0.017045,0.295455,0.0,0.008523,0.028409,0.008523,0.022727,0.085227,0.042614,0.110795,0.017045,0.002841,31,9,4,6,56,21,6,104,0,3,10,3,8,30,15,39,6,1,Drama,17,super,2,7


In [67]:
movie_features.head()

Unnamed: 0,movie_id,title,genres,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,movie_mean_rating,movie_rating_std,movie_rating_count,popularity,popularity_encoded,genre_purity,year,title_length
0,1,Toy Story (1995),Animation|Children's|Comedy,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,4.146846,0.852349,2077,blockbuster,0,0.333333,1995,16
1,2,Jumanji (1995),Adventure|Children's|Fantasy,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,3.201141,0.983172,701,blockbuster,0,0.333333,1995,14
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,3.016736,1.071712,478,popular,3,0.5,1995,23
3,4,Waiting to Exhale (1995),Comedy|Drama,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,2.729412,1.013381,170,popular,3,0.5,1995,24
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3.006757,1.025086,296,popular,3,1.0,1995,34


## 数据准备：生成正负样本对

### 读取数值型特征

In [68]:
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models

# 从 user_features 和 movie_features 中筛选出数值型特征
def select_numeric_columns(df):
    # 选择数值型列（int 和 float）
    numeric_columns = df.select_dtypes(include=['number']).columns
    return df[numeric_columns]

# 筛选用户和电影的数值型特征
user_features_numeric = select_numeric_columns(user_features)
movie_features_numeric = select_numeric_columns(movie_features)

user_features_numeric = user_features_numeric.astype('float32')
movie_features_numeric = movie_features_numeric.astype('float32')

# 合并用户特征和电影特征
user_movie_pairs = pd.merge(ratings_df[['user_id', 'movie_id', 'rating']], user_features_numeric, on='user_id')
user_movie_pairs = pd.merge(user_movie_pairs, movie_features_numeric, on='movie_id')

In [69]:
user_movie_pairs = pd.merge(user_movie_pairs, user_stats[['user_id', 'mean_rating']], on='user_id')

# 确保删除了合并时产生的无用列
user_movie_pairs.drop(columns=['mean_rating_x'], inplace=True, errors='ignore')
user_movie_pairs.rename(columns={'mean_rating_y': 'mean_rating'}, inplace=True)

# 根据用户的评分和平均评分来设定正负样本
user_movie_pairs['label'] = user_movie_pairs.apply(
    lambda x: 1 if x['rating'] > x['mean_rating'] else 0, axis=1
)


### 负样本采样

In [70]:
import random
import pandas as pd
import numpy as np
from tqdm import tqdm

def vectorized_generate_negatives(user_movie_pairs, movies_df, ratings_df, num_hard_negatives=5, num_disliked_negatives=3):
    """
    为每个用户生成负样本，包括：
    - 随机负样本（用户未观看的）
    - Hard Negative（热门高评分未观看）
    - 用户已看但不喜欢的电影（低于该电影平均分）

    Args:
        user_movie_pairs (pd.DataFrame): 用户-电影交互对。
        movies_df (pd.DataFrame): 包含电影信息。
        ratings_df (pd.DataFrame): 包含用户评分数据。
        num_hard_negatives (int): 每个用户生成的 Hard Negative 样本数。
        num_disliked_negatives (int): 每个用户生成的低评分负样本数。

    Returns:
        pd.DataFrame: 包含负样本的 DataFrame，列为 'user_id', 'movie_id', 'label'
    """
    # 创建用户-电影交互矩阵
    user_movie_matrix = pd.crosstab(user_movie_pairs['user_id'], user_movie_pairs['movie_id'])

    all_users = user_movie_matrix.index
    all_movies = user_movie_matrix.columns

    # 计算电影平均评分与观看次数
    movie_stats = ratings_df.groupby('movie_id')['rating'].agg(['mean', 'count']).reset_index()
    movie_stats.rename(columns={'mean': 'avg_rating', 'count': 'rating_count'}, inplace=True)

    # 合并电影统计信息
    movies_with_stats = pd.merge(movies_df, movie_stats, on='movie_id', how='left')
    movies_with_stats['rating_count'].fillna(0, inplace=True)

    # 热门高分电影筛选
    popularity_threshold = movies_with_stats['rating_count'].quantile(0.75)
    high_rating_threshold = 4.0 # 高评分阈值
    popular_high_rated_movies = movies_with_stats[
        (movies_with_stats['avg_rating'] >= high_rating_threshold) &
        (movies_with_stats['rating_count'] >= popularity_threshold)
    ]['movie_id'].tolist()

    # 缓存用户已观看电影
    user_history = user_movie_pairs.groupby('user_id')['movie_id'].apply(list).to_dict()

    # 缓存用户评分信息
    user_ratings = ratings_df.groupby('user_id').apply(lambda x: x.set_index('movie_id')['rating'].to_dict()).to_dict()
    movie_avg_rating_map = movie_stats.set_index('movie_id')['avg_rating'].to_dict()

    negative_pairs = []

    for user_id in tqdm(all_users, desc='为每个用户生成负样本'):
        seen_movies = set(user_movie_matrix.columns[user_movie_matrix.loc[user_id] > 0])
        user_watched_movies = user_history.get(user_id, [])
        unseen_movies = list(set(all_movies) - seen_movies)

        # 1. 未看过的随机电影
        if unseen_movies:
            random_negative_movie = np.random.choice(unseen_movies)
            negative_pairs.append({
                'user_id': user_id,
                'movie_id': random_negative_movie,
                'label': 0
            })

        # 2. 热门高评分负样本
        hard_negatives = generate_popular_high_rated_negatives(
            user_watched_movies,
            popular_high_rated_movies,
            num_hard_negatives
        )
        for mid in hard_negatives:
            negative_pairs.append({'user_id': user_id, 'movie_id': mid, 'label': 0})

        # 3. 看过但评分低于平均值的电影
        user_movie_ratings = user_ratings.get(user_id, {})
        disliked = []
        for mid, rating in user_movie_ratings.items():
            avg_rating = movie_avg_rating_map.get(mid, 0)
            if rating < avg_rating:
                disliked.append(mid)
        sampled_disliked = random.sample(disliked, min(len(disliked), num_disliked_negatives))
        for mid in sampled_disliked:
            negative_pairs.append({'user_id': user_id, 'movie_id': mid, 'label': 0})

    return pd.DataFrame(negative_pairs)


def generate_popular_high_rated_negatives(user_watched_movies, all_popular_high_rated_movies, num_negatives):
    available_negatives = list(set(all_popular_high_rated_movies) - set(user_watched_movies))
    if not available_negatives:
        return []
    return list(np.random.choice(available_negatives, size=min(num_negatives, len(available_negatives)), replace=False))


def create_features(data, user_features, movie_features):
    user_data = user_features[user_features['user_id'].isin(data['user_id'])].set_index('user_id')
    movie_data = movie_features[movie_features['movie_id'].isin(data['movie_id'])].set_index('movie_id')
    
    user_feats = user_data.loc[data['user_id']].values
    movie_feats = movie_data.loc[data['movie_id']].values
    
    return user_feats, movie_feats


In [71]:
# 获取负样本
negative_samples=vectorized_generate_negatives(user_movie_pairs, movies_df, ratings_df)

# 获取正样本
positive_samples = user_movie_pairs[user_movie_pairs['label']==1][['user_id', 'movie_id', 'label']]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  movies_with_stats['rating_count'].fillna(0, inplace=True)
  user_ratings = ratings_df.groupby('user_id').apply(lambda x: x.set_index('movie_id')['rating'].to_dict()).to_dict()
为每个用户生成负样本: 100%|██████████| 6040/6040 [00:02<00:00, 2294.58it/s]


In [72]:
# 将正样本和负样本合并在一起
data=pd.concat([positive_samples, negative_samples])


In [73]:
def build_pairwise_training_data(pos_df, neg_df):
    """
    构造 (user, pos_item, neg_item) 三元组
    要求：每个用户有正样本和负样本
    """
    # 保证负样本存在
    neg_df_grouped = neg_df.groupby('user_id')
    triplets = []

    for idx, row in tqdm(pos_df.iterrows(), desc='构建三元组进度'):
        user_id = row['user_id']
        pos_item = row['movie_id']

        if user_id not in neg_df_grouped.groups:
            continue

        user_negatives = neg_df_grouped.get_group(user_id)['movie_id'].values
        for neg_item in user_negatives:
            triplets.append({'user_id': user_id, 'pos_movie_id': pos_item, 'neg_movie_id': neg_item})

    return pd.DataFrame(triplets)

pairs_pos_neg=build_pairwise_training_data(positive_samples, negative_samples)

构建三元组进度: 543158it [00:37, 14543.10it/s]


### 划分训练集和验证集并获取特征

In [74]:
# 将三元组数据分为训练集和验证集
train_triplets, val_triplets = train_test_split(pairs_pos_neg, test_size=0.2, random_state=42)

In [75]:
train_triplets.head()

Unnamed: 0,user_id,pos_movie_id,neg_movie_id
3305865,4050,2288,2580
4092274,5046,637,3196
1246519,1523,1259,2762
1025136,1273,1188,1610
2610226,3280,3498,1233


In [98]:
def create_pairwise_features(triplets_df, user_features, movie_features):
    user_data = user_features.set_index('user_id')
    movie_data = movie_features.set_index('movie_id')

    # 获取用户特征和用户 ID
    user_feats = user_data.loc[triplets_df['user_id']].values
    user_ids = triplets_df['user_id'].values

    # 获取正样本电影特征和正样本电影 ID
    pos_movie_feats = movie_data.loc[triplets_df['pos_movie_id']].values
    pos_movie_ids = triplets_df['pos_movie_id'].values

    # 获取负样本电影特征和负样本电影 ID
    neg_movie_feats = movie_data.loc[triplets_df['neg_movie_id']].values
    neg_movie_ids = triplets_df['neg_movie_id'].values

    return (
        tf.convert_to_tensor(user_ids, dtype=tf.int32),
        tf.convert_to_tensor(user_feats, dtype=tf.float32),
        tf.convert_to_tensor(pos_movie_ids, dtype=tf.int32),
        tf.convert_to_tensor(pos_movie_feats, dtype=tf.float32),
        tf.convert_to_tensor(neg_movie_ids, dtype=tf.int32),
        tf.convert_to_tensor(neg_movie_feats, dtype=tf.float32)
    )

In [99]:
train_user_ids, train_user_feats, train_pos_movie_ids, train_pos_feats, train_neg_movie_ids, train_neg_feats = create_pairwise_features(train_triplets, user_features_numeric, movie_features_numeric)

val_user_ids, val_user_feats,val_pos_movie_ids, val_pos_feats, val_neg_movie_ids, val_neg_feats= create_pairwise_features(val_triplets, user_features_numeric, movie_features_numeric)

## 构建双塔模型

In [100]:
import tensorflow as tf

def build_user_tower(user_id_num, user_feat_dim, embedding_dim=16):
    # 用户 ID 输入
    user_id_input = tf.keras.Input(shape=(1,), dtype=tf.int32, name='user_id_input')
    # 用户特征输入
    user_feat_input = tf.keras.Input(shape=(user_feat_dim,), name='user_feat_input')

    # 用户 ID 嵌入层
    user_id_emb = tf.keras.layers.Embedding(input_dim=user_id_num, output_dim=embedding_dim)(user_id_input)
    user_id_emb = tf.keras.layers.Flatten()(user_id_emb)

    # 拼接用户 ID 嵌入和用户特征
    x = tf.keras.layers.Concatenate()([user_id_emb, user_feat_input])

    x = tf.keras.layers.Dense(64, activation='relu')(x)
    x = tf.keras.layers.Dense(32, activation='relu')(x)
    outputs = tf.keras.layers.Lambda(lambda x: tf.math.l2_normalize(x, axis=1))(x)

    return tf.keras.Model(inputs=[user_id_input, user_feat_input], outputs=outputs, name="user_tower")

def build_movie_tower(movie_id_num, movie_feat_dim, embedding_dim=16):
    # 电影 ID 输入
    movie_id_input = tf.keras.Input(shape=(1,), dtype=tf.int32, name='movie_id_input')
    # 电影特征输入
    movie_feat_input = tf.keras.Input(shape=(movie_feat_dim,), name='movie_feat_input')

    # 电影 ID 嵌入层
    movie_id_emb = tf.keras.layers.Embedding(input_dim=movie_id_num, output_dim=embedding_dim)(movie_id_input)
    movie_id_emb = tf.keras.layers.Flatten()(movie_id_emb)

    # 拼接电影 ID 嵌入和电影特征
    x = tf.keras.layers.Concatenate()([movie_id_emb, movie_feat_input])

    x = tf.keras.layers.Dense(64, activation='relu')(x)
    x = tf.keras.layers.Dense(32, activation='relu')(x)
    outputs = tf.keras.layers.Lambda(lambda x: tf.math.l2_normalize(x, axis=1))(x)

    return tf.keras.Model(inputs=[movie_id_input, movie_feat_input], outputs=outputs, name="movie_tower")

class TwoTowerModel(tf.keras.Model):
    def __init__(self, user_id_num, user_feat_dim, movie_id_num, movie_feat_dim, embedding_dim=16):
        super().__init__()
        self.user_tower = build_user_tower(user_id_num, user_feat_dim, embedding_dim)
        self.movie_tower = build_movie_tower(movie_id_num, movie_feat_dim, embedding_dim)

    def call(self, inputs):
        user_emb = self.user_tower([inputs["user_id"], inputs["user_feat"]])
        pos_emb = self.movie_tower([inputs["pos_movie_id"], inputs["pos_movie_feat"]])
        neg_emb = self.movie_tower([inputs["neg_movie_id"], inputs["neg_movie_feat"]])
        return user_emb, pos_emb, neg_emb

In [122]:
user_id_dim=int(user_features_numeric['user_id'].max())+1
print("user id dim:", user_id_dim)
movie_id_dim=int(movie_features_numeric['movie_id'].max())+1
print("movie id dim:", movie_id_dim)
movie_input_dim=movie_features_numeric.shape[1]-1
print("movie_input_dim:", movie_input_dim)
user_input_dim=user_features_numeric.shape[1]-1
print("user_input_dim:", user_input_dim)
model = TwoTowerModel(user_id_dim, user_input_dim, movie_id_dim, movie_input_dim)

user id dim: 6041
movie id dim: 3953
movie_input_dim: 25
user_input_dim: 46


In [124]:
model.summary()

## 损失函数

In [125]:
def bpr_loss(user_emb, pos_emb, neg_emb, reg_lambda=1e-4):
    pos_scores = tf.reduce_sum(user_emb * pos_emb, axis=1)
    neg_scores = tf.reduce_sum(user_emb * neg_emb, axis=1)

    loss = -tf.reduce_mean(tf.math.log(tf.nn.sigmoid(pos_scores - neg_scores) + 1e-6))

    # 正则化项
    reg_loss = reg_lambda * (
        tf.reduce_mean(tf.norm(user_emb, axis=1)) +
        tf.reduce_mean(tf.norm(pos_emb, axis=1)) +
        tf.reduce_mean(tf.norm(neg_emb, axis=1))
    )

    return loss + reg_loss


## 模型训练

In [126]:
import tensorflow as tf
from tensorflow.keras import optimizers
import numpy as np
from tqdm import tqdm


def train_pairwise_model(model, train_user_ids, train_user_feats, train_pos_movie_ids, train_pos_feats, train_neg_movie_ids, train_neg_feats,
                         epochs=1, batch_size=256, learning_rate=1e-3, reg_lambda=1e-4):
    optimizer = optimizers.Adam(learning_rate=learning_rate)
    num_batches = int(train_user_feats.shape[0] // batch_size)

    for epoch in range(epochs):
        total_loss = 0.0
        with tqdm(total=num_batches, desc=f"Epoch {epoch+1}/{epochs}") as pbar:
            for batch in range(num_batches):
                start_index = batch * batch_size
                end_index = (batch + 1) * batch_size

                user_id_batch = train_user_ids[start_index:end_index]
                user_feat_batch = train_user_feats[start_index:end_index]
                pos_movie_id_batch = train_pos_movie_ids[start_index:end_index]
                pos_feat_batch = train_pos_feats[start_index:end_index]
                neg_movie_id_batch = train_neg_movie_ids[start_index:end_index]
                neg_feat_batch = train_neg_feats[start_index:end_index]

                inputs = {
                    "user_id": user_id_batch,
                    "user_feat": user_feat_batch,
                    "pos_movie_id": pos_movie_id_batch,
                    "pos_movie_feat": pos_feat_batch,
                    "neg_movie_id": neg_movie_id_batch,
                    "neg_movie_feat": neg_feat_batch
                }
                
                with tf.GradientTape() as tape:
                    user_emb, pos_emb, neg_emb = model(inputs)
                    loss = bpr_loss(user_emb, pos_emb, neg_emb, reg_lambda)

                gradients = tape.gradient(loss, model.trainable_variables)
                optimizer.apply_gradients(zip(gradients, model.trainable_variables))

                total_loss += loss.numpy()
                pbar.set_postfix(loss=f"{loss.numpy():.4f}")
                pbar.update(1)

        avg_loss = total_loss / num_batches
        print(f"Epoch {epoch+1}/{epochs} - Average Loss: {avg_loss:.4f}")

In [127]:
train_pairwise_model(model, train_user_ids, train_user_feats, train_pos_movie_ids, train_pos_feats, train_neg_movie_ids, train_neg_feats)

Epoch 1/1: 100%|██████████| 15274/15274 [04:54<00:00, 51.94it/s, loss=nan] 

Epoch 1/1 - Average Loss: nan





## 构建FAISS索引

In [128]:
print("movie_features_numeric 的列名和数据类型:")
print(movie_features_numeric.dtypes)
print("\nmovie_features_numeric 的前几行:")
print(movie_features_numeric.head())

movie_features_numeric 的列名和数据类型:
movie_id              float32
Action                float32
Adventure             float32
Animation             float32
Children's            float32
Comedy                float32
Crime                 float32
Documentary           float32
Drama                 float32
Fantasy               float32
Film-Noir             float32
Horror                float32
Musical               float32
Mystery               float32
Romance               float32
Sci-Fi                float32
Thriller              float32
War                   float32
Western               float32
movie_mean_rating     float32
movie_rating_std      float32
movie_rating_count    float32
popularity_encoded    float32
genre_purity          float32
year                  float32
title_length          float32
dtype: object

movie_features_numeric 的前几行:
   movie_id  Action  Adventure  Animation  Children's  Comedy  Crime  \
0       1.0     0.0        0.0        1.0         1.0     1.0    0.0   

In [135]:
movie_features_numeric

Unnamed: 0,movie_id,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,movie_mean_rating,movie_rating_std,movie_rating_count,popularity_encoded,genre_purity,year,title_length
0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.146846,0.852349,2077.0,0.0,0.333333,1995.0,16.0
1,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.201141,0.983172,701.0,0.0,0.333333,1995.0,14.0
2,3.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.016737,1.071712,478.0,3.0,0.500000,1995.0,23.0
3,4.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.729412,1.013381,170.0,3.0,0.500000,1995.0,24.0
4,5.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.006757,1.025086,296.0,3.0,1.000000,1995.0,34.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3701,3948.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.635731,1.014196,862.0,0.0,1.000000,2000.0,23.0
3702,3949.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.115131,1.009804,304.0,3.0,1.000000,2000.0,26.0
3703,3950.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.666667,1.046107,54.0,1.0,1.000000,2000.0,16.0
3704,3951.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.900000,1.057331,40.0,1.0,1.000000,2000.0,23.0


In [140]:
# 获取电影的embedding向量
movie_embeddings = []
feature_columns = [col for col in movie_features_numeric.columns if col != 'movie_id']

for index, row in tqdm(movie_features_numeric.iterrows(), total=len(movie_features_numeric), desc='获取电影的embedding向量进度'):
    # 构造输入字典
    features = row[feature_columns].values.astype('float32')
    inputs = {
        "movie_id_input": np.array([row["movie_id"]]),  # 假设 movie_feat 是包含 movie_id 的字典
        "movie_feat_input": np.array([features])  # 假设 movie_feat 是包含 features 的字典
    }
    movie_emb= model.movie_tower(inputs)
    movie_embeddings.append(movie_emb.numpy().squeeze())

movie_embeddings = np.array(movie_embeddings).astype('float32')

获取电影的embedding向量进度: 100%|██████████| 3706/3706 [00:05<00:00, 650.48it/s]


In [141]:
embedding_dimension = movie_embeddings.shape[1]  # 获取 embedding 的维度
num_movies = movie_embeddings.shape[0]
print(embedding_dimension)
print(num_movies)

32
3706


In [144]:
import faiss
# 选择一个FAISS索引类型
index = faiss.IndexFlatL2(embedding_dimension)
# 将embedding向量添加到FAISS索引中
index.add(movie_embeddings.astype('float32'))
# 将电影ID和embedding关联起来
movie_ids = movie_features_numeric['movie_id'].values.astype(np.int64)

In [145]:
faiss.write_index(index, "./embedding/movie_embeddings.faiss")