In [29]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [30]:
movies_df=pd.read_csv('/Users/bytedance/Desktop/MovieLens-Recommendation-System/data/ml-1m/movies.csv')
ratings_df=pd.read_csv('/Users/bytedance/Desktop/MovieLens-Recommendation-System/data/ml-1m/ratings.csv')
users_df=pd.read_csv('/Users/bytedance/Desktop/MovieLens-Recommendation-System/data/ml-1m/users.csv')

# 查看数据基本情况

In [31]:
movies_df.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [32]:
ratings_df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [33]:
users_df.head()

Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [34]:
print("用户样本数:",len(users_df))
print("电影样本数:",len(movies_df))
print("评分样本数:", len(ratings_df))

用户样本数: 6040
电影样本数: 3883
评分样本数: 1000209


In [35]:
# 查看 movies_df 的空值情况
print("movies_df 的空值情况：")
print(movies_df.isna().sum())

# 查看 ratings_df 的空值情况
print("\nratings_df 的空值情况：")
print(ratings_df.isna().sum())

# 查看 users_df 的空值情况
print("\nusers_df 的空值情况：")
print(users_df.isna().sum())

movies_df 的空值情况：
movie_id    0
title       0
genres      0
dtype: int64

ratings_df 的空值情况：
user_id      0
movie_id     0
rating       0
timestamp    0
dtype: int64

users_df 的空值情况：
user_id       0
gender        0
age           0
occupation    0
zip_code      0
dtype: int64


# 数据预处理

In [36]:
genres=movies_df['genres'].str.get_dummies(sep='|')
movies=pd.concat([movies_df, genres], axis=1)

# 合并评分和电影数据
data=pd.merge(ratings_df, movies, on='movie_id')

In [37]:
genres

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3879,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3880,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3881,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [38]:
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [39]:
# 有哪些电影没有参与过评分
unrated_movies = movies_df[~movies_df['movie_id'].isin(ratings_df['movie_id'])]
print("没有参与过评分的电影:")
print(unrated_movies)
print("没有参与过评分的电影数量为:", len(unrated_movies))

没有参与过评分的电影:
      movie_id                                title                 genres
50          51                Guardian Angel (1994)  Action|Drama|Thriller
107        109  Headless Body in Topless Bar (1995)                 Comedy
113        115     Happiness Is in the Field (1995)                 Comedy
141        143                         Gospa (1995)                  Drama
281        284                  New York Cop (1996)           Action|Crime
...        ...                                  ...                    ...
3581      3650            Anguish (Angustia) (1986)                 Horror
3681      3750                Boricua's Bond (2000)                  Drama
3759      3829               Mad About Mambo (2000)         Comedy|Romance
3786      3856                  Autumn Heart (1999)                  Drama
3837      3907   Prince of Central Park, The (1999)                  Drama

[177 rows x 3 columns]
没有参与过评分的电影数量为: 177


In [40]:
# 有哪些用户没有参与过评分
unrated_users=users_df[~users_df['user_id'].isin(ratings_df['user_id'])]
print("没有参与过评分的用户:")
print(unrated_users)
print("没有参与过评分的用户数量为:", len(unrated_users))

没有参与过评分的用户:
Empty DataFrame
Columns: [user_id, gender, age, occupation, zip_code]
Index: []
没有参与过评分的用户数量为: 0


# 特征工程

## 用户特征工程

### 1. 用户统计特征

In [41]:
user_stats=data.groupby('user_id')['rating'].agg([
    ('mean_rating', 'mean'), # 平均评分
    ('rating_std', 'std'), # 评分标准差
    ('rating_count','count'), # 评分次数
    ('rating_min','min'), # 最小评分
    ('rating_max','max') # 最大评分
]).reset_index()


# 计算用户评分严格程度
global_mean_rating=data['rating'].mean()
user_stats['rating_strictness']=global_mean_rating-user_stats['mean_rating']

# 计算用户评分波动程度
user_stats['rating_variability']=user_stats['rating_std']/user_stats['mean_rating']

# 查看结果
print(user_stats.head())

   user_id  mean_rating  rating_std  rating_count  rating_min  rating_max  \
0        1     4.188679    0.680967            53           3           5   
1        2     3.713178    1.001513           129           1           5   
2        3     3.901961    0.984985            51           1           5   
3        4     4.190476    1.077917            21           1           5   
4        5     3.146465    1.132699           198           1           5   

   rating_strictness  rating_variability  
0          -0.607115            0.162573  
1          -0.131614            0.269719  
2          -0.320396            0.252433  
3          -0.608912            0.257230  
4           0.435100            0.359991  


### 2. 用户电影类型偏好特征

In [42]:
# 获取所有电影类型列
genre_columns=[col for col in data.columns if col not in ['user_id', 'movie_id', 'rating', 'timestamp','title', 'genres']]


# 计算用户对每种类型的评分次数和平均评分（用户ID、电影类型以及评分次数）
user_genre_stats=data.groupby('user_id')[genre_columns].sum().reset_index()

# 计算用户对每种类型的偏好程度（按行处理，将每一个用户对某一类型的评分，除以该用户对所有类型评分的总和）
for genre in genre_columns:
    user_genre_stats[f'{genre}_favorite_degree']=user_genre_stats[genre]/user_genre_stats[genre_columns].sum(axis=1)

for genre in genre_columns:
    user_genre_stats[f'{genre}_rating_cnt']=user_genre_stats[genre]

# 计算用户最喜欢的类型（返回最大值所在的索引），axis=1表示按照行操作
user_genre_stats['favorite_genre']=user_genre_stats[genre_columns].idxmax(axis=1)

# 计算用户喜欢的类型数量（评分过的类型数）（得到该行中评分大于0的类型数量）
user_genre_stats['num_liked_genres']=(user_genre_stats[genre_columns]>0).sum(axis=1)

user_genre_stats.drop(columns=genre_columns,inplace=True)

# 合并所有用户特征
user_features=pd.merge(user_stats, user_genre_stats, on='user_id')

# 添加用户活跃度分段特征
user_features['activity_level'] = pd.cut(
    user_features['rating_count'],
    bins=[0, 5, 20, 100, float('inf')],
    labels=['inactive', 'casual', 'active', 'super']
)

# 用户活跃度特征编码
activity_encoder = LabelEncoder()
user_features['activity_level_encoded'] = activity_encoder.fit_transform(user_features['activity_level'])

# 用户最喜欢的类型编码
genre_encoder = LabelEncoder()
user_features['favorite_genre_encoded'] = genre_encoder.fit_transform(user_features['favorite_genre'])


In [43]:
pd.set_option('display.max_columns', None)
user_features.head()

Unnamed: 0,user_id,mean_rating,rating_std,rating_count,rating_min,rating_max,rating_strictness,rating_variability,Action_favorite_degree,Adventure_favorite_degree,Animation_favorite_degree,Children's_favorite_degree,Comedy_favorite_degree,Crime_favorite_degree,Documentary_favorite_degree,Drama_favorite_degree,Fantasy_favorite_degree,Film-Noir_favorite_degree,Horror_favorite_degree,Musical_favorite_degree,Mystery_favorite_degree,Romance_favorite_degree,Sci-Fi_favorite_degree,Thriller_favorite_degree,War_favorite_degree,Western_favorite_degree,Action_rating_cnt,Adventure_rating_cnt,Animation_rating_cnt,Children's_rating_cnt,Comedy_rating_cnt,Crime_rating_cnt,Documentary_rating_cnt,Drama_rating_cnt,Fantasy_rating_cnt,Film-Noir_rating_cnt,Horror_rating_cnt,Musical_rating_cnt,Mystery_rating_cnt,Romance_rating_cnt,Sci-Fi_rating_cnt,Thriller_rating_cnt,War_rating_cnt,Western_rating_cnt,favorite_genre,num_liked_genres,activity_level,activity_level_encoded,favorite_genre_encoded
0,1,4.188679,0.680967,53,3,5,-0.607115,0.162573,0.043103,0.043103,0.155172,0.172414,0.12069,0.017241,0.0,0.181034,0.025862,0.0,0.0,0.12069,0.0,0.051724,0.025862,0.025862,0.017241,0.0,5,5,18,20,14,2,0,21,3,0,0,14,0,6,3,3,2,0,Drama,13,active,0,7
1,2,3.713178,1.001513,129,1,5,-0.131614,0.269719,0.194444,0.065972,0.0,0.0,0.086806,0.041667,0.0,0.274306,0.003472,0.003472,0.006944,0.0,0.010417,0.083333,0.059028,0.107639,0.052083,0.010417,56,19,0,0,25,12,0,79,1,1,2,0,3,24,17,31,15,3,Drama,14,super,2,7
2,3,3.901961,0.984985,51,1,5,-0.320396,0.252433,0.186992,0.203252,0.02439,0.02439,0.243902,0.0,0.0,0.065041,0.01626,0.0,0.02439,0.00813,0.00813,0.04065,0.04878,0.04065,0.01626,0.04878,23,25,3,3,30,0,0,8,2,0,3,1,1,5,6,5,2,6,Comedy,15,active,0,4
3,4,4.190476,1.077917,21,1,5,-0.608912,0.25723,0.327586,0.103448,0.0,0.017241,0.0,0.017241,0.0,0.103448,0.034483,0.0,0.051724,0.0,0.0,0.034483,0.155172,0.068966,0.051724,0.034483,19,6,0,1,0,1,0,6,2,0,3,0,0,2,9,4,3,2,Action,12,active,0,0
4,5,3.146465,1.132699,198,1,5,0.4351,0.359991,0.088068,0.025568,0.011364,0.017045,0.159091,0.059659,0.017045,0.295455,0.0,0.008523,0.028409,0.008523,0.022727,0.085227,0.042614,0.110795,0.017045,0.002841,31,9,4,6,56,21,6,104,0,3,10,3,8,30,15,39,6,1,Drama,17,super,2,7


## 电影特征工程

In [44]:
# 计算电影的基本评分统计
movie_stats=data.groupby('movie_id')['rating'].agg([
    ('movie_mean_rating', 'mean'),
    ('movie_rating_std', 'std'),
    ('movie_rating_count', 'count')
])

# 填充可能存在的NaN值
movie_stats['movie_rating_std']=movie_stats['movie_rating_std'].fillna(0)

print("movie_stats:")
print(movie_stats)

# 合并电影原始信息
movie_features=pd.merge(movies, movie_stats, on='movie_id')

# 添加电影热度特征
movie_features['popularity'] = pd.cut(
    movie_features['movie_rating_count'],
    bins=[0, 10, 100, 500, float('inf')],
    labels=['niche', 'moderate', 'popular', 'blockbuster']
)

# 电影热度编码
popularity_encoder = LabelEncoder()
movie_features['popularity_encoded'] = popularity_encoder.fit_transform(movie_features['popularity'])

# 计算电影类型纯度（类型数量越少，纯度越高）
movie_features['genre_purity']=1/movie_features[genre_columns].sum(axis=1)

# 提取电影发布年份
movie_features['year'] = movie_features['title'].str.extract(r'\((\d{4})\)')
movie_features['year'] = movie_features['year'].fillna('1990').astype(int)

# 创建标题长度特征
movie_features['title_length'] = movie_features['title'].str.len()

movie_stats:
          movie_mean_rating  movie_rating_std  movie_rating_count
movie_id                                                         
1                  4.146846          0.852349                2077
2                  3.201141          0.983172                 701
3                  3.016736          1.071712                 478
4                  2.729412          1.013381                 170
5                  3.006757          1.025086                 296
...                     ...               ...                 ...
3948               3.635731          1.014196                 862
3949               4.115132          1.009804                 304
3950               3.666667          1.046107                  54
3951               3.900000          1.057331                  40
3952               3.780928          0.935074                 388

[3706 rows x 3 columns]


In [45]:
print("电影维度:", movie_features.shape)
print("用户维度:", user_features.shape)

电影维度: (3706, 29)
用户维度: (6040, 49)


## 检查user_features和movie_features的空值情况

In [46]:
# 检查user_features中哪些列含有空值
user_features_null_columns = user_features.columns[user_features.isna().any()].tolist()
print("user_features中含有空值的列:", user_features_null_columns)

# 检查movie_features中哪些列含有空值
movie_features_null_columns = movie_features.columns[movie_features.isna().any()].tolist()
print("movie_features中含有空值的列:", movie_features_null_columns)

user_features中含有空值的列: []
movie_features中含有空值的列: []


## 保存特征数据

In [47]:
user_features.to_csv('/Users/bytedance/Desktop/MovieLens-Recommendation-System/data/features/user_features.csv', index=False)
movie_features.to_csv('/Users/bytedance/Desktop/MovieLens-Recommendation-System/data/features/movie_features.csv', index=False)

In [48]:
pd.set_option('display.max_columns', None)
user_features.head()

Unnamed: 0,user_id,mean_rating,rating_std,rating_count,rating_min,rating_max,rating_strictness,rating_variability,Action_favorite_degree,Adventure_favorite_degree,Animation_favorite_degree,Children's_favorite_degree,Comedy_favorite_degree,Crime_favorite_degree,Documentary_favorite_degree,Drama_favorite_degree,Fantasy_favorite_degree,Film-Noir_favorite_degree,Horror_favorite_degree,Musical_favorite_degree,Mystery_favorite_degree,Romance_favorite_degree,Sci-Fi_favorite_degree,Thriller_favorite_degree,War_favorite_degree,Western_favorite_degree,Action_rating_cnt,Adventure_rating_cnt,Animation_rating_cnt,Children's_rating_cnt,Comedy_rating_cnt,Crime_rating_cnt,Documentary_rating_cnt,Drama_rating_cnt,Fantasy_rating_cnt,Film-Noir_rating_cnt,Horror_rating_cnt,Musical_rating_cnt,Mystery_rating_cnt,Romance_rating_cnt,Sci-Fi_rating_cnt,Thriller_rating_cnt,War_rating_cnt,Western_rating_cnt,favorite_genre,num_liked_genres,activity_level,activity_level_encoded,favorite_genre_encoded
0,1,4.188679,0.680967,53,3,5,-0.607115,0.162573,0.043103,0.043103,0.155172,0.172414,0.12069,0.017241,0.0,0.181034,0.025862,0.0,0.0,0.12069,0.0,0.051724,0.025862,0.025862,0.017241,0.0,5,5,18,20,14,2,0,21,3,0,0,14,0,6,3,3,2,0,Drama,13,active,0,7
1,2,3.713178,1.001513,129,1,5,-0.131614,0.269719,0.194444,0.065972,0.0,0.0,0.086806,0.041667,0.0,0.274306,0.003472,0.003472,0.006944,0.0,0.010417,0.083333,0.059028,0.107639,0.052083,0.010417,56,19,0,0,25,12,0,79,1,1,2,0,3,24,17,31,15,3,Drama,14,super,2,7
2,3,3.901961,0.984985,51,1,5,-0.320396,0.252433,0.186992,0.203252,0.02439,0.02439,0.243902,0.0,0.0,0.065041,0.01626,0.0,0.02439,0.00813,0.00813,0.04065,0.04878,0.04065,0.01626,0.04878,23,25,3,3,30,0,0,8,2,0,3,1,1,5,6,5,2,6,Comedy,15,active,0,4
3,4,4.190476,1.077917,21,1,5,-0.608912,0.25723,0.327586,0.103448,0.0,0.017241,0.0,0.017241,0.0,0.103448,0.034483,0.0,0.051724,0.0,0.0,0.034483,0.155172,0.068966,0.051724,0.034483,19,6,0,1,0,1,0,6,2,0,3,0,0,2,9,4,3,2,Action,12,active,0,0
4,5,3.146465,1.132699,198,1,5,0.4351,0.359991,0.088068,0.025568,0.011364,0.017045,0.159091,0.059659,0.017045,0.295455,0.0,0.008523,0.028409,0.008523,0.022727,0.085227,0.042614,0.110795,0.017045,0.002841,31,9,4,6,56,21,6,104,0,3,10,3,8,30,15,39,6,1,Drama,17,super,2,7


In [49]:
movie_features.head()

Unnamed: 0,movie_id,title,genres,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,movie_mean_rating,movie_rating_std,movie_rating_count,popularity,popularity_encoded,genre_purity,year,title_length
0,1,Toy Story (1995),Animation|Children's|Comedy,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,4.146846,0.852349,2077,blockbuster,0,0.333333,1995,16
1,2,Jumanji (1995),Adventure|Children's|Fantasy,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,3.201141,0.983172,701,blockbuster,0,0.333333,1995,14
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,3.016736,1.071712,478,popular,3,0.5,1995,23
3,4,Waiting to Exhale (1995),Comedy|Drama,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,2.729412,1.013381,170,popular,3,0.5,1995,24
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3.006757,1.025086,296,popular,3,1.0,1995,34


## 数据准备：生成正负样本对

In [50]:
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models

# 从 user_features 和 movie_features 中筛选出数值型特征
def select_numeric_columns(df):
    # 选择数值型列（int 和 float）
    numeric_columns = df.select_dtypes(include=['number']).columns
    return df[numeric_columns]

# 筛选用户和电影的数值型特征
user_features_numeric = select_numeric_columns(user_features)
movie_features_numeric = select_numeric_columns(movie_features)

user_features_numeric = user_features_numeric.astype('float32')
movie_features_numeric = movie_features_numeric.astype('float32')

# 合并用户特征和电影特征
user_movie_pairs = pd.merge(ratings_df[['user_id', 'movie_id', 'rating']], user_features_numeric, on='user_id')
user_movie_pairs = pd.merge(user_movie_pairs, movie_features_numeric, on='movie_id')

In [51]:
user_movie_pairs = pd.merge(user_movie_pairs, user_stats[['user_id', 'mean_rating']], on='user_id')

# 确保删除了合并时产生的无用列
user_movie_pairs.drop(columns=['mean_rating_x'], inplace=True, errors='ignore')
user_movie_pairs.rename(columns={'mean_rating_y': 'mean_rating'}, inplace=True)

# 根据用户的评分和平均评分来设定正负样本
user_movie_pairs['label'] = user_movie_pairs.apply(
    lambda x: 1 if x['rating'] > x['mean_rating'] else 0, axis=1
)


In [52]:
import random
from tqdm import tqdm
# 获取 easy negative 和 hard negative
def vectorized_generate_negatives(positive_pairs, user_movie_pairs, user_stats):
    # 创建用户-电影矩阵
    user_movie_matrix = pd.crosstab(
        user_movie_pairs['user_id'], 
        user_movie_pairs['movie_id']
    )
    
    # 获取所有用户和电影
    all_users = user_movie_matrix.index
    all_movies = user_movie_matrix.columns
    
    # 为每个用户生成负样本
    negative_pairs = []
    
    for user_id in tqdm(all_users, desc='为每个用户获取负样本'):
        # 获取用户已看过的电影
        seen_movies = set(user_movie_matrix.columns[user_movie_matrix.loc[user_id] > 0])
        
        # 获取未看过的电影
        unseen_movies = list(set(all_movies) - seen_movies)
        
        if unseen_movies:
            # 随机选择负样本
            neg_movie = np.random.choice(unseen_movies)
            negative_pairs.append({
                'user_id': user_id,
                'movie_id': neg_movie,
                'label': 0
            })
    
    return pd.DataFrame(negative_pairs)

In [53]:
# 划分训练集和测试集
train_data, test_data = train_test_split(user_movie_pairs, test_size=0.2, random_state=42)

# 为训练集和测试集生成负样本
train_data_with_negatives = vectorized_generate_negatives(train_data, user_movie_pairs, user_stats)
test_data_with_negatives = vectorized_generate_negatives(test_data, user_movie_pairs, user_stats)

# 训练集和测试集特征提取
def create_features(data, user_features, movie_features):
    user_data = user_features[user_features['user_id'].isin(data['user_id'])].set_index('user_id')
    movie_data = movie_features[movie_features['movie_id'].isin(data['movie_id'])].set_index('movie_id')
    
    user_feats = user_data.loc[data['user_id']].values
    movie_feats = movie_data.loc[data['movie_id']].values
    
    return user_feats, movie_feats

train_user_feats, train_movie_feats = create_features(train_data_with_negatives, user_features_numeric, movie_features_numeric)
test_user_feats, test_movie_feats = create_features(test_data_with_negatives, user_features_numeric, movie_features_numeric)

# 标签
train_labels = train_data_with_negatives['label'].values
test_labels = test_data_with_negatives['label'].values


为每个用户获取负样本: 100%|██████████| 6040/6040 [00:02<00:00, 2588.21it/s]
为每个用户获取负样本: 100%|██████████| 6040/6040 [00:02<00:00, 2647.14it/s]


## 构建双塔模型

In [54]:
def create_advanced_tower(input_dim, output_dim):
    tower = models.Sequential([
        layers.Input(shape=(input_dim,)),
        layers.Dense(256, activation='relu'),  # 更深的层
        layers.BatchNormalization(),  # BatchNormalization
        layers.Dropout(0.5),  # Dropout防止过拟合
        layers.Dense(128, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu'),
        layers.BatchNormalization(),
        layers.Dense(output_dim, activation=None)  # 输出嵌入向量
    ])
    return tower

# 定义双塔模型
user_input = layers.Input(shape=(train_user_feats.shape[1],))
movie_input = layers.Input(shape=(train_movie_feats.shape[1],))

# 用户和电影塔
user_tower = create_advanced_tower(train_user_feats.shape[1], 64)
movie_tower = create_advanced_tower(train_movie_feats.shape[1], 64)

user_embedding = user_tower(user_input)  # 用户向量
movie_embedding = movie_tower(movie_input)  # 物料向量

# 点积操作计算用户与电影之间的相似度
similarity = layers.Dot(axes=1)([user_embedding, movie_embedding])  # 点积操作

# 输出层：单一的概率值，表示用户喜欢该电影的概率
output = layers.Dense(1, activation='sigmoid')(similarity)

# 构建模型
model = models.Model(inputs=[user_input, movie_input], outputs=output)

# 编译模型
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

In [56]:
# 训练模型
model.fit([train_user_feats, train_movie_feats], train_labels, epochs=5, batch_size=512, validation_split=0.1)

# 测试模型
test_loss, test_acc = model.evaluate([test_user_feats, test_movie_feats], test_labels)
print(f'Test Accuracy: {test_acc}')

Epoch 1/5
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.5854 - loss: 1.1585 - val_accuracy: 0.3990 - val_loss: 11.2008
Epoch 2/5
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.6172 - loss: 1.0059 - val_accuracy: 0.5281 - val_loss: 6.0298
Epoch 3/5
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7513 - loss: 0.8266 - val_accuracy: 0.6093 - val_loss: 4.6726
Epoch 4/5
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8395 - loss: 0.6038 - val_accuracy: 0.7715 - val_loss: 1.7110
Epoch 5/5
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9181 - loss: 0.4117 - val_accuracy: 0.8758 - val_loss: 0.7684
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 440us/step - accuracy: 0.8658 - loss: 0.9508
Test Accuracy: 0.867384135723114
