In [118]:
import pickle
import pandas as pd 
pd.set_option('display.max_columns', None)

## 读取数据

In [119]:
# 需要的用户特征和电影特征列
user_features_list, movie_features_list=pickle.load(open('./config/feature_keep.p', 'rb'))

ratings_df=pd.read_csv('/Users/bytedance/Desktop/MovieLens-Recommendation-System/data/ml-1m/ratings.csv')
movies_df=pd.read_csv('/Users/bytedance/Desktop/MovieLens-Recommendation-System/data/ml-1m/movies.csv')

import json
def read_json_file(filepath):
    """
    读取指定路径的 JSON 文件。

    Args:
        filepath (str): JSON 文件的完整路径。

    Returns:
        dict or list or None: 如果成功读取，则返回 JSON 文件中的数据（通常是字典或列表）。
                                如果文件不存在或读取失败，则返回 None。
    """
    try:
        with open(filepath, 'r') as f:
            data = json.load(f)
        return data
    except FileNotFoundError:
        print(f"错误: 文件 '{filepath}' 未找到。")
        return None
    except json.JSONDecodeError:
        print(f"错误: 文件 '{filepath}' 不是有效的 JSON 格式。")
        return None
    except Exception as e:
        print(f"读取文件 '{filepath}' 时发生未知错误: {e}")
        return None
    
movie_embedding_config=read_json_file('./config/movie_embedding_config.json')
user_embedding_config=read_json_file('./config/user_embedding_config.json')
with open('./features/movie_features.pkl', 'rb') as f:
    movie_features = pickle.load(f)
with open('./features/user_features.pkl', 'rb') as f:
    user_features=pickle.load(f)
pos_neg_samples=pd.read_csv('./data/pos_neg_data.csv')

In [120]:
pos_neg_samples

Unnamed: 0,user_id,movie_id,label
0,1,1193,1
1,1,2355,1
2,1,1287,1
3,1,2804,1
4,1,595,1
...,...,...,...
2538797,6040,1658,0
2538798,6040,1936,0
2538799,6040,2138,0
2538800,6040,3892,0


## 构建模型

### 用户塔

In [121]:
user_features.head()

Unnamed: 0,user_id,mean_rating,rating_std,rating_count,rating_min,rating_max,rating_strictness,rating_variability,Action_favorite_degree,Adventure_favorite_degree,Animation_favorite_degree,Children's_favorite_degree,Comedy_favorite_degree,Crime_favorite_degree,Documentary_favorite_degree,Drama_favorite_degree,Fantasy_favorite_degree,Film-Noir_favorite_degree,Horror_favorite_degree,Musical_favorite_degree,Mystery_favorite_degree,Romance_favorite_degree,Sci-Fi_favorite_degree,Thriller_favorite_degree,War_favorite_degree,Western_favorite_degree,num_liked_genres,activity_level_encoded,favorite_genre_encoded,gender_encoded,occupation_encoded,age_encoded
0,1,4.188679,-1.615782,-0.584221,3.199606,0.061461,-0.607115,0.162573,0.043103,0.043103,0.155172,0.172414,0.12069,0.017241,0.0,0.181034,0.025862,0.0,0.0,0.12069,0.0,0.051724,0.025862,0.025862,0.017241,0.0,-1.097206,1,7,0,10,0
1,2,3.713178,-0.042568,-0.189889,-0.504394,0.061461,-0.131614,0.269718,0.194444,0.065972,0.0,0.0,0.086806,0.041667,0.0,0.274306,0.003472,0.003472,0.006944,0.0,0.010417,0.083333,0.059028,0.107639,0.052083,0.010417,-0.665949,2,7,1,16,4
2,3,3.901961,-0.123684,-0.594598,-0.504394,0.061461,-0.320396,0.252433,0.186992,0.203252,0.02439,0.02439,0.243902,0.0,0.0,0.065041,0.01626,0.0,0.02439,0.00813,0.00813,0.04065,0.04878,0.04065,0.01626,0.04878,-0.234692,1,4,1,15,1
3,4,4.190476,0.332416,-0.750255,-0.504394,0.061461,-0.608912,0.25723,0.327586,0.103448,0.0,0.017241,0.0,0.017241,0.0,0.103448,0.034483,0.0,0.051724,0.0,0.0,0.034483,0.155172,0.068966,0.051724,0.034483,-1.528462,1,0,1,7,3
4,5,3.146465,0.601283,0.168123,-0.504394,0.061461,0.4351,0.35999,0.088068,0.025568,0.011364,0.017045,0.159091,0.059659,0.017045,0.295455,0.0,0.008523,0.028409,0.008523,0.022727,0.085227,0.042614,0.110795,0.017045,0.002841,0.627821,2,7,1,20,1


In [122]:
user_embedding_feature=['user_id', 'activity_level_encoded', 'favorite_genre_encoded', 'gender_encoded', 'occupation_encoded', 'age_encoded']
user_not_embedding_feature=[col for col in user_features.columns if col not in user_embedding_feature]
print("需要送入embedding层的特征数量为:", len(user_embedding_feature))
print("不需要送入embedding层的特征数量为:", len(user_not_embedding_feature))

# 合并顺序：embedding 特征在前，其他特征在后
ordered_columns = user_embedding_feature + user_not_embedding_feature

# 按照指定顺序重新排列 DataFrame 的列
user_features = user_features[ordered_columns]

user_features.to_csv('./features/user_features_final.csv')

需要送入embedding层的特征数量为: 6
不需要送入embedding层的特征数量为: 26


In [123]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dense, Concatenate, Flatten
from tensorflow.keras.models import Model

def build_user_tower_from_config(user_embedding_config, dense_input_dim, output_dim=64):
    inputs = []
    embed_layers = []

    # 1. 处理每个 embedding 特征
    for feature_name, config in user_embedding_config.items():
        vocab_size = config['vocab_size']
        emb_dim = config['embedding_dim']

        inp = Input(shape=(1,), name=feature_name)
        emb = Embedding(input_dim=vocab_size, output_dim=emb_dim, name=f"{feature_name}_emb")(inp)
        emb = Flatten()(emb)

        inputs.append(inp)
        embed_layers.append(emb)

    # 2. dense 特征处理（不需要 embedding）
    if dense_input_dim > 0:
        dense_input = Input(shape=(dense_input_dim,), name='dense_input_user')
        inputs.append(dense_input)
        embed_layers.append(dense_input)

    # 3. 合并特征 + MLP
    x = Concatenate()(embed_layers)
    x = Dense(128, activation='relu')(x)
    x = Dense(output_dim, activation='relu')(x)

    model = Model(inputs=inputs, outputs=x, name="UserTower")
    return model


In [124]:
dense_user_input_dim = len(user_not_embedding_feature)

user_tower = build_user_tower_from_config(
    user_embedding_config=user_embedding_config,
    dense_input_dim=dense_user_input_dim,
    output_dim=64
)

user_tower.summary()


### 物品塔

In [125]:
movie_embedding_feature=['movie_id', 'title', 'genres', 'popularity', 'year', 'is_old_movie']
movie_not_embedding_feature=[col for col in movie_features.columns if col not in movie_embedding_feature]
print(movie_not_embedding_feature)
print("需要送入embedding层的特征数量为:", len(movie_embedding_feature))
print("不需要送入embedding层的特征数量为:", len(movie_not_embedding_feature))

# 合并顺序：embedding 特征在前，其他特征在后
ordered_columns = movie_embedding_feature + movie_not_embedding_feature

# 按照指定顺序重新排列 DataFrame 的列
movie_features = movie_features[ordered_columns]

movie_features.to_csv('./features/movie_features_final.csv')

['movie_mean_rating', 'movie_rating_std', 'movie_rating_count', 'genre_purity', 'title_length']
需要送入embedding层的特征数量为: 6
不需要送入embedding层的特征数量为: 5


In [126]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dense, Concatenate, Flatten, GlobalAveragePooling1D
from tensorflow.keras.models import Model

def build_movie_tower_from_config(movie_embedding_config, dense_input_dim, output_dim=64, sequence_features=['title', 'genres']):
    if sequence_features is None:
        sequence_features = ['title', 'genres']
    
    inputs = []
    embed_layers = []
    
    # 1. 处理每个 embedding 特征
    for feature_name, config in movie_embedding_config.items():
        vocab_size = config['vocab_size']
        emb_dim = config['embedding_dim']
        seq_len = config['seq_len']  # 默认值为1，适用于非序列特征

        if feature_name in sequence_features:
            # 处理序列特征，如 title 和 genres
            inp = Input(shape=(seq_len,), name=feature_name)  # shape=(batch, seq_len)
            emb = Embedding(input_dim=vocab_size, output_dim=emb_dim, name=f"{feature_name}_emb")(inp)
            emb = GlobalAveragePooling1D()(emb)  # 对序列进行池化（取平均）
        else:
            # 处理非序列特征，如 movie_id、popularity 等
            inp = Input(shape=(1,), name=feature_name)  # shape=(batch, 1)
            emb = Embedding(input_dim=vocab_size, output_dim=emb_dim, name=f"{feature_name}_emb")(inp)
            emb = Flatten()(emb)  # 展平嵌入层输出
            
        inputs.append(inp)
        embed_layers.append(emb)

    # 2. 添加其他非 embedding 的 dense 特征（如 movie_mean_rating 等）
    if dense_input_dim > 0:
        dense_input = Input(shape=(dense_input_dim,), name='dense_input_movie')
        inputs.append(dense_input)
        embed_layers.append(dense_input)

    # 3. 合并所有特征
    x = Concatenate()(embed_layers)
    x = Dense(128, activation='relu')(x)
    x = Dense(output_dim, activation='relu')(x)

    # 构建模型
    model = Model(inputs=inputs, outputs=x, name="MovieTower")
    return model


In [127]:
dense_movie_input_dim = len(movie_not_embedding_feature)

movie_tower = build_movie_tower_from_config(movie_embedding_config, dense_movie_input_dim)

movie_tower.summary()

In [128]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Lambda
import tensorflow as tf

def build_dual_tower_model(user_embedding_config, movie_embedding_config, dense_user_input_dim, dense_movie_input_dim, sequence_features=['title', 'genres']):
    # Build both towers
    user_tower = build_user_tower_from_config(user_embedding_config, dense_user_input_dim)
    movie_tower = build_movie_tower_from_config(movie_embedding_config, dense_movie_input_dim, sequence_features=sequence_features)

    # Output embeddings
    user_output = user_tower.output  # shape: (None, 64)
    movie_output = movie_tower.output  # shape: (None, 64)

    # L2 normalize the output vectors
    user_normalized = Lambda(lambda x: tf.math.l2_normalize(x, axis=1))(user_output)
    movie_normalized = Lambda(lambda x: tf.math.l2_normalize(x, axis=1))(movie_output)

    # Compute cosine similarity via dot product of normalized vectors
    cosine_similarity = tf.keras.layers.Dot(axes=1)([user_normalized, movie_normalized])

    # Define the model
    model = Model(inputs=user_tower.input + movie_tower.input, outputs=cosine_similarity, name="DualTower_CosineSim")

    return model


## 划分训练集和验证集

In [129]:
from sklearn.model_selection import train_test_split
import numpy as np
data=pd.merge(pos_neg_samples, user_features, on='user_id')
data=pd.merge(data, movie_features, on='movie_id')
for col in ['genres', 'title']:
    data[col] = data[col].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
from tensorflow.keras.preprocessing.sequence import pad_sequences
# title_padded = pad_sequences(data['title'], maxlen=movie_embedding_config['title']['seq_len'], padding='post', truncating='post')
# genres_padded = pad_sequences(data['genres'], maxlen=movie_embedding_config['genres']['seq_len'], padding='post', truncating='post')
# data['title']=pd.Series(title_padded.tolist())
# data['genres']=pd.Series(genres_padded.tolist())
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)
print("训练集数量:", len(train_df))
print("测试集数量:", len(test_df))

训练集数量: 2031041
测试集数量: 507761


In [130]:
# # 假设 movie_embedding_config['title']['seq_len'] 为 16
# title_padded = pad_sequences(data['title'], maxlen=16, padding='post', truncating='post')

# # 检查结果的形状
# print(title_padded.shape)  # 应该输出 (2538802, 16)

# data['title'] = title_padded
# print(data['title'].shape)

In [131]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
import numpy as np

model = build_dual_tower_model(user_embedding_config, movie_embedding_config, dense_user_input_dim, dense_movie_input_dim, sequence_features=['title', 'genres'])

model.compile(optimizer=Adam(learning_rate=0.001), 
              loss=BinaryCrossentropy(from_logits=True),  # 使用二分类交叉熵损失
              metrics=['accuracy'])  # 使用准确率作为评估指标


In [132]:
model.summary()

## pointwise训练模型

In [133]:
def prepare_model_input(X_user_df, X_user_dense, X_movie_df, X_movie_dense):
    # 用户输入特征
    user_input = [
        X_user_df['user_id'].values.reshape(-1, 1),
        X_user_df['activity_level_encoded'].values.reshape(-1, 1),
        X_user_df['favorite_genre_encoded'].values.reshape(-1, 1),
        X_user_df['gender_encoded'].values.reshape(-1, 1),
        X_user_df['occupation_encoded'].values.reshape(-1, 1),
        X_user_df['age_encoded'].values.reshape(-1, 1),
        X_user_dense  # shape (N, 26)
    ]

    # 电影输入特征
    movie_input = [
        X_movie_df['movie_id'].values.reshape(-1, 1),
        pad_sequences(X_movie_df['title'], maxlen=movie_embedding_config['title']['seq_len'], padding='post', truncating='post'),     # shape (N, 16)
        pad_sequences(X_movie_df['genres'], maxlen=movie_embedding_config['genres']['seq_len'], padding='post', truncating='post'),    # shape (N, 7)
        X_movie_df['popularity'].values.reshape(-1, 1),
        X_movie_df['year'].values.reshape(-1, 1),
        X_movie_df['is_old_movie'].values.reshape(-1, 1),
        X_movie_dense  # shape (N, 5)
    ]
    # print("title的shape为:", X_movie_df['title'])
    
    # 打印每个特征的形状
    for i, feature in enumerate(user_input):
        print(f"user_input feature {i}: shape {feature.shape}")

    for i, feature in enumerate(movie_input):
        print(f"movie_input feature {i}: shape {feature.shape}")

    # 将两个部分合并成一个大的输入列表
    X_train = user_input + movie_input
    return X_train

# 获取训练数据和测试数据
X_train = prepare_model_input(
    train_df[user_embedding_feature], 
    train_df[user_not_embedding_feature], 
    train_df[movie_embedding_feature], 
    train_df[movie_not_embedding_feature]
)

y_train = train_df['label'].values

X_test = prepare_model_input(
    test_df[user_embedding_feature], 
    test_df[user_not_embedding_feature], 
    test_df[movie_embedding_feature], 
    test_df[movie_not_embedding_feature]
)

y_test = test_df['label'].values

user_input feature 0: shape (2031041, 1)
user_input feature 1: shape (2031041, 1)
user_input feature 2: shape (2031041, 1)
user_input feature 3: shape (2031041, 1)
user_input feature 4: shape (2031041, 1)
user_input feature 5: shape (2031041, 1)
user_input feature 6: shape (2031041, 26)
movie_input feature 0: shape (2031041, 1)
movie_input feature 1: shape (2031041, 16)
movie_input feature 2: shape (2031041, 7)
movie_input feature 3: shape (2031041, 1)
movie_input feature 4: shape (2031041, 1)
movie_input feature 5: shape (2031041, 1)
movie_input feature 6: shape (2031041, 5)
user_input feature 0: shape (507761, 1)
user_input feature 1: shape (507761, 1)
user_input feature 2: shape (507761, 1)
user_input feature 3: shape (507761, 1)
user_input feature 4: shape (507761, 1)
user_input feature 5: shape (507761, 1)
user_input feature 6: shape (507761, 26)
movie_input feature 0: shape (507761, 1)
movie_input feature 1: shape (507761, 16)
movie_input feature 2: shape (507761, 7)
movie_input 

In [134]:
for i, inp in enumerate(model.inputs):
    print(f"{i}: {inp.name}, shape: {inp.shape}")

0: user_id, shape: (None, 1)
1: activity_level_encoded, shape: (None, 1)
2: favorite_genre_encoded, shape: (None, 1)
3: gender_encoded, shape: (None, 1)
4: occupation_encoded, shape: (None, 1)
5: age_encoded, shape: (None, 1)
6: dense_input_user, shape: (None, 26)
7: movie_id, shape: (None, 1)
8: title, shape: (None, 16)
9: genres, shape: (None, 7)
10: popularity, shape: (None, 1)
11: year, shape: (None, 1)
12: is_old_movie, shape: (None, 1)
13: dense_input_movie, shape: (None, 5)


In [135]:
# 5. 训练模型
history = model.fit(
    X_train,  # 训练数据
    y_train,  # 标签
    validation_data=(X_test, y_test),  # 验证数据
    epochs=100,  # 训练周期（debug模式设置为1）
    batch_size=64  # 每批次的大小
)

# 6. 打印训练过程中的日志
print("训练完成，历史记录如下:")
print(history.history)

# 7. 在测试集上评估模型性能
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test loss: {test_loss}, Test accuracy: {test_acc}")

Epoch 1/100
[1m31736/31736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 3ms/step - accuracy: 0.8441 - loss: 0.6768 - val_accuracy: 0.8534 - val_loss: 0.6741
Epoch 2/100
[1m31736/31736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 3ms/step - accuracy: 0.8557 - loss: 0.6733 - val_accuracy: 0.8572 - val_loss: 0.6733
Epoch 3/100
[1m31736/31736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 3ms/step - accuracy: 0.8593 - loss: 0.6722 - val_accuracy: 0.8567 - val_loss: 0.6730
Epoch 4/100
[1m31736/31736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 3ms/step - accuracy: 0.8606 - loss: 0.6717 - val_accuracy: 0.8568 - val_loss: 0.6728
Epoch 5/100
[1m31736/31736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 3ms/step - accuracy: 0.8617 - loss: 0.6715 - val_accuracy: 0.8580 - val_loss: 0.6728
Epoch 6/100
[1m31736/31736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 3ms/step - accuracy: 0.8622 - loss: 0.6713 - val_accuracy: 0.8582 - val

KeyboardInterrupt: 

### 使用FAISS存储movie embedding 

In [136]:
def get_movie_input(X_movie_df, X_movie_dense):
    # 电影输入特征
    movie_input = [
        X_movie_df['movie_id'].values.reshape(-1, 1),
        pad_sequences(X_movie_df['title'], maxlen=movie_embedding_config['title']['seq_len'], padding='post', truncating='post'),     # shape (N, 16)
        pad_sequences(X_movie_df['genres'], maxlen=movie_embedding_config['genres']['seq_len'], padding='post', truncating='post'),    # shape (N, 7)
        X_movie_df['popularity'].values.reshape(-1, 1),
        X_movie_df['year'].values.reshape(-1, 1),
        X_movie_df['is_old_movie'].values.reshape(-1, 1),
        X_movie_dense  # shape (N, 5)
    ]
    # print("title的shape为:", X_movie_df['title'])
    

    for i, feature in enumerate(movie_input):
        print(f"movie_input feature {i}: shape {feature.shape}")

    return movie_input

movie_input=get_movie_input(movie_features[movie_embedding_feature], movie_features[movie_not_embedding_feature])

movie_input feature 0: shape (3706, 1)
movie_input feature 1: shape (3706, 16)
movie_input feature 2: shape (3706, 7)
movie_input feature 3: shape (3706, 1)
movie_input feature 4: shape (3706, 1)
movie_input feature 5: shape (3706, 1)
movie_input feature 6: shape (3706, 5)


In [137]:
# 获取movie_tower
movie_tower = build_movie_tower_from_config(movie_embedding_config, dense_movie_input_dim, sequence_features=['title', 'genres'])

# 获取输出并做L2归一化
movie_output = movie_tower.output  # shape: (None, 64)
movie_normalized = Lambda(lambda x: tf.math.l2_normalize(x, axis=1))(movie_output)

# 获取输出并且归一化
movie_embedding_model = Model(inputs=movie_tower.input, outputs=movie_normalized, name="MovieEmbeddingModel")

# 提取embedding
movie_vectors = movie_embedding_model.predict(movie_input, batch_size=128)

print(movie_vectors.shape)

[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
(3706, 64)


In [138]:
import faiss
movie_vectors = np.asarray(movie_vectors).astype("float32")
# 使用 FAISS 构建向量索引（使用内积，因为是归一化过的 embedding）
index = faiss.IndexFlatIP(movie_vectors.shape[1])  # 64维
index.add(movie_vectors)  # 添加所有电影向量

In [139]:
faiss.write_index(index, "./faiss/movie_index_pointwise.faiss")

### 离线指标评估

In [140]:
##########获取测试集的用户特征##########
def get_user_input(X_user_df, X_user_dense):
    # 用户输入特征
    user_input = [
        X_user_df['user_id'].values.reshape(-1, 1),
        X_user_df['activity_level_encoded'].values.reshape(-1, 1),
        X_user_df['favorite_genre_encoded'].values.reshape(-1, 1),
        X_user_df['gender_encoded'].values.reshape(-1, 1),
        X_user_df['occupation_encoded'].values.reshape(-1, 1),
        X_user_df['age_encoded'].values.reshape(-1, 1),
        X_user_dense  # shape (N, 26)
    ]

    # 打印每个特征的形状
    for i, feature in enumerate(user_input):
        print(f"user_input feature {i}: shape {feature.shape}")

    return user_input

user_input=get_user_input(user_features[user_embedding_feature], user_features[user_not_embedding_feature])


user_input feature 0: shape (6040, 1)
user_input feature 1: shape (6040, 1)
user_input feature 2: shape (6040, 1)
user_input feature 3: shape (6040, 1)
user_input feature 4: shape (6040, 1)
user_input feature 5: shape (6040, 1)
user_input feature 6: shape (6040, 26)


In [141]:
# 获取movie_tower
user_tower = build_user_tower_from_config(user_embedding_config, dense_user_input_dim)

# 获取输出并做L2归一化
user_output = user_tower.output  # shape: (None, 64)
user_normalized = Lambda(lambda x: tf.math.l2_normalize(x, axis=1))(user_output)

# 获取输出并且归一化
user_embedding_model = Model(inputs=user_tower.input, outputs=user_normalized, name="UserEmbeddingModel")

# 提取embedding
user_vectors = user_embedding_model.predict(user_input, batch_size=128)

print(user_vectors.shape)

[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
(6040, 64)


In [142]:
#######构造每个用户看过的正样本电影#########
from tqdm import tqdm
from collections import defaultdict
def build_user_to_true_movies(test_df):
    """
    test_df: 包含 user_id 和 movie_id 的 DataFrame
    返回: user_to_true_movies 字典，键是 user_id，值是 movie_id 列表
    """
    user_to_movies = defaultdict(list)
    for _, row in tqdm(test_df.iterrows(), desc='构造每个用户看过的正样本电影'):
        if row['label']==1:
            user_to_movies[row['user_id']].append(row['movie_id'])
    return user_to_movies

user_to_movies=build_user_to_true_movies(pos_neg_samples)

构造每个用户看过的正样本电影: 2538802it [00:31, 79357.67it/s]


In [143]:
#######获取movie_id_list#########
movie_id_list = movies_df['movie_id'].tolist()
print("电影数为:", len(movie_id_list))

电影数为: 3883


In [144]:
total_recall=0
total_users=len(user_vectors)
for user_id, user_vector in enumerate(user_vectors):
    true_movies = user_to_movies.get(user_id, [])
    scores = np.dot(movie_vectors, user_vector)  # 计算每个电影与用户的相似度
    top_k_indices = np.argsort(scores)[-k:][::-1]
    top_k_movie_ids = [movie_id_list[i] for i in top_k_indices]
    print(top_k_movie_ids)
    break

[3650, 3672, 1052, 3002, 2749, 2969, 2818, 197, 1343, 2545, 1621, 1497, 408, 2906, 1426, 380, 3320, 3067, 3167, 1903, 3538, 2105, 1401, 451, 2434, 1579, 3445, 2806, 1655, 325, 596, 3685, 2619, 237, 3310, 3403, 352, 2756, 3318, 220, 2490, 603, 3115, 2739, 553, 3039, 3474, 3702, 3293, 2036, 2851, 348, 3014, 987, 1375, 31, 2448, 180, 3425, 2129, 2449, 2253, 925, 469, 3411, 2673, 1765, 3029, 2170, 27, 1376, 2276, 449, 244, 266, 3659, 3610, 2917, 1880, 3548, 494, 3759, 215, 134, 1569, 1114, 3196, 304, 3737, 420, 942, 2726, 2194, 2724, 2322, 3162, 2057, 1749, 3273, 2837]


In [None]:
import numpy as np

def recall_at_k(user_vectors, movie_vectors, user_to_movies, movie_id_list, k=100):
    total_recall = 0
    total_users = len(user_vectors)
    
    for user_id, user_vector in enumerate(user_vectors):
        # 获取用户实际喜欢的电影列表
        true_movies = user_to_movies.get(user_id, [])
        
        # 计算用户和所有电影之间的相似度（可以使用余弦相似度）
        scores = np.dot(movie_vectors, user_vector)  # 计算每个电影与用户的相似度
        
        # 获取前K个得分最高的电影索引
        top_k_indices = np.argsort(scores)[-k:][::-1]
        top_k_movie_ids = [movie_id_list[i] for i in top_k_indices]
        # 计算用户在前k个推荐电影中的命中数量
        hit_count = sum([1 for m in true_movies if m in top_k_movie_ids])
        
        # 计算Recall@k
        recall = hit_count / len(true_movies) if len(true_movies) > 0 else 0
        total_recall += recall
    
    # 返回平均 Recall@k
    return total_recall / total_users

k=1000
recall = recall_at_k(user_vectors, movie_vectors, user_to_movies, movie_id_list, k)
print(f"Recall@{k}: {recall:.4f}")


Recall@1000: 0.2410
