In [42]:
import pickle
import pandas as pd 
pd.set_option('display.max_columns', None)

## 读取数据

In [43]:
# 需要的用户特征和电影特征列
user_features_list, movie_features_list=pickle.load(open('/Users/bytedance/Desktop/MovieLens-Recommendation-System/recall_dual_tower/config/feature_keep.p', 'rb'))

ratings_df=pd.read_csv('/Users/bytedance/Desktop/MovieLens-Recommendation-System/data/ml-1m/ratings.csv')
movies_df=pd.read_csv('/Users/bytedance/Desktop/MovieLens-Recommendation-System/data/ml-1m/movies.csv')

import json
def read_json_file(filepath):
    """
    读取指定路径的 JSON 文件。

    Args:
        filepath (str): JSON 文件的完整路径。

    Returns:
        dict or list or None: 如果成功读取，则返回 JSON 文件中的数据（通常是字典或列表）。
                                如果文件不存在或读取失败，则返回 None。
    """
    try:
        with open(filepath, 'r') as f:
            data = json.load(f)
        return data
    except FileNotFoundError:
        print(f"错误: 文件 '{filepath}' 未找到。")
        return None
    except json.JSONDecodeError:
        print(f"错误: 文件 '{filepath}' 不是有效的 JSON 格式。")
        return None
    except Exception as e:
        print(f"读取文件 '{filepath}' 时发生未知错误: {e}")
        return None
    
movie_embedding_config=read_json_file('/Users/bytedance/Desktop/MovieLens-Recommendation-System/recall_dual_tower/config/movie_embedding_config.json')
user_embedding_config=read_json_file('/Users/bytedance/Desktop/MovieLens-Recommendation-System/recall_dual_tower/config/user_embedding_config.json')
with open('/Users/bytedance/Desktop/MovieLens-Recommendation-System/recall_dual_tower/features/movie_features.pkl', 'rb') as f:
    movie_features = pickle.load(f)
with open('/Users/bytedance/Desktop/MovieLens-Recommendation-System/recall_dual_tower/features/user_features.pkl', 'rb') as f:
    user_features=pickle.load(f)
pos_neg_samples=pd.read_csv('/Users/bytedance/Desktop/MovieLens-Recommendation-System/recall_dual_tower/data/pos_neg_data.csv')

In [44]:
movie_embedding_config

{'movie_id': {'vocab_size': 3953, 'embedding_dim': 64, 'seq_len': 1},
 'title': {'vocab_size': 4984, 'embedding_dim': 64, 'seq_len': 16},
 'genres': {'vocab_size': 19, 'embedding_dim': 64, 'seq_len': 7},
 'popularity': {'vocab_size': 4, 'embedding_dim': 64, 'seq_len': 1},
 'year': {'vocab_size': 6, 'embedding_dim': 64, 'seq_len': 1},
 'is_old_movie': {'vocab_size': 2, 'embedding_dim': 64, 'seq_len': 1}}

In [45]:
pos_neg_samples

Unnamed: 0,user_id,movie_id,label
0,1,1193,1
1,1,2355,1
2,1,1287,1
3,1,2804,1
4,1,595,1
...,...,...,...
2538797,6040,736,0
2538798,6040,2507,0
2538799,6040,3794,0
2538800,6040,1236,0


## 构建模型

### SENet模块

In [46]:
from tensorflow.keras.layers import Input, Embedding, Dense, Concatenate, Flatten, GlobalAveragePooling1D, Reshape, Multiply

def squeeze_excite_block(input_tensor, ratio=16):
    init = input_tensor
    filters = input_tensor.shape[-1]  # 获取特征图通道数
    
    # 1. Squeeze阶段：全局平均池化
    input_tensor=Reshape((-1,1))(input_tensor)
    se = GlobalAveragePooling1D()(input_tensor)
    
    # 2. Excite阶段：通过两个全连接层来学习通道间的注意力
    se = Dense(filters // ratio, activation='relu')(se)
    se = Dense(filters, activation='sigmoid')(se)  # 最后的sigmoid来生成注意力权重

    # 3. Scale阶段：按通道进行加权
    se = Reshape((1, filters))(se)  # 将se形状调整为 (batch, 1, filters)
    x = Multiply()([init, se])  # 对输入进行加权
    return x


### 用户塔

In [47]:
user_features.head()

Unnamed: 0,user_id,mean_rating,rating_std,rating_count,rating_min,rating_max,rating_strictness,rating_variability,Action_favorite_degree,Adventure_favorite_degree,Animation_favorite_degree,Children's_favorite_degree,Comedy_favorite_degree,Crime_favorite_degree,Documentary_favorite_degree,Drama_favorite_degree,Fantasy_favorite_degree,Film-Noir_favorite_degree,Horror_favorite_degree,Musical_favorite_degree,Mystery_favorite_degree,Romance_favorite_degree,Sci-Fi_favorite_degree,Thriller_favorite_degree,War_favorite_degree,Western_favorite_degree,num_liked_genres,activity_level_encoded,favorite_genre_encoded,gender_encoded,occupation_encoded,age_encoded
0,1,4.188679,-1.615782,-0.584221,3.199606,0.061461,-0.607115,0.162573,0.043103,0.043103,0.155172,0.172414,0.12069,0.017241,0.0,0.181034,0.025862,0.0,0.0,0.12069,0.0,0.051724,0.025862,0.025862,0.017241,0.0,-1.097206,1,7,0,10,0
1,2,3.713178,-0.042568,-0.189889,-0.504394,0.061461,-0.131614,0.269718,0.194444,0.065972,0.0,0.0,0.086806,0.041667,0.0,0.274306,0.003472,0.003472,0.006944,0.0,0.010417,0.083333,0.059028,0.107639,0.052083,0.010417,-0.665949,2,7,1,16,4
2,3,3.901961,-0.123684,-0.594598,-0.504394,0.061461,-0.320396,0.252433,0.186992,0.203252,0.02439,0.02439,0.243902,0.0,0.0,0.065041,0.01626,0.0,0.02439,0.00813,0.00813,0.04065,0.04878,0.04065,0.01626,0.04878,-0.234692,1,4,1,15,1
3,4,4.190476,0.332416,-0.750255,-0.504394,0.061461,-0.608912,0.25723,0.327586,0.103448,0.0,0.017241,0.0,0.017241,0.0,0.103448,0.034483,0.0,0.051724,0.0,0.0,0.034483,0.155172,0.068966,0.051724,0.034483,-1.528462,1,0,1,7,3
4,5,3.146465,0.601283,0.168123,-0.504394,0.061461,0.4351,0.35999,0.088068,0.025568,0.011364,0.017045,0.159091,0.059659,0.017045,0.295455,0.0,0.008523,0.028409,0.008523,0.022727,0.085227,0.042614,0.110795,0.017045,0.002841,0.627821,2,7,1,20,1


In [48]:
user_embedding_feature=['user_id', 'activity_level_encoded', 'favorite_genre_encoded', 'gender_encoded', 'occupation_encoded', 'age_encoded']
user_not_embedding_feature=[col for col in user_features.columns if col not in user_embedding_feature]
print("需要送入embedding层的特征数量为:", len(user_embedding_feature))
print("不需要送入embedding层的特征数量为:", len(user_not_embedding_feature))

# 合并顺序：embedding 特征在前，其他特征在后
ordered_columns = user_embedding_feature + user_not_embedding_feature

# 按照指定顺序重新排列 DataFrame 的列
user_features = user_features[ordered_columns]

user_features.to_csv('/Users/bytedance/Desktop/MovieLens-Recommendation-System/recall_dual_tower/features/user_features_final.csv')

需要送入embedding层的特征数量为: 6
不需要送入embedding层的特征数量为: 26


In [49]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dense, Concatenate, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.layers import BatchNormalization, Dropout

def build_user_tower_from_config(user_embedding_config, dense_input_dim, output_dim=64, dropout_rate=0.3):
    inputs = []
    embed_layers = []

    # 1. 处理每个 embedding 特征
    for feature_name, config in user_embedding_config.items():
        vocab_size = config['vocab_size']
        emb_dim = config['embedding_dim']

        inp = Input(shape=(1,), name=feature_name)
        emb = Embedding(input_dim=vocab_size, output_dim=emb_dim, name=f"{feature_name}_emb")(inp)
        emb = Flatten()(emb)

        inputs.append(inp)
        embed_layers.append(emb)

    # 2. dense 特征处理（不需要 embedding）
    if dense_input_dim > 0:
        dense_input = Input(shape=(dense_input_dim,), name='dense_input_user')
        inputs.append(dense_input)
        embed_layers.append(dense_input)

    # 3. 合并特征 + MLP
    x = Concatenate()(embed_layers)

    # 4. 添加SENet模块来增强特征
    x=squeeze_excite_block(x)

    # 5. 后续处理
    x = BatchNormalization()(x)
    x = Dropout(dropout_rate)(x)
    x = Dense(128, activation='relu')(x)
    x = Dense(output_dim, activation='relu')(x)

    # 6. 展平数据，否则为(None, 1, 64)
    x = Flatten()(x)
    model = Model(inputs=inputs, outputs=x, name="UserTower")
    return model


In [50]:
dense_user_input_dim = len(user_not_embedding_feature)

user_tower = build_user_tower_from_config(
    user_embedding_config=user_embedding_config,
    dense_input_dim=dense_user_input_dim,
    output_dim=64
)

user_tower.summary()


### 物品塔

In [51]:
movie_embedding_feature=['movie_id', 'title', 'genres', 'popularity', 'year', 'is_old_movie']
movie_not_embedding_feature=[col for col in movie_features.columns if col not in movie_embedding_feature]
print(movie_not_embedding_feature)
print("需要送入embedding层的特征数量为:", len(movie_embedding_feature))
print("不需要送入embedding层的特征数量为:", len(movie_not_embedding_feature))

# 合并顺序：embedding 特征在前，其他特征在后
ordered_columns = movie_embedding_feature + movie_not_embedding_feature

# 按照指定顺序重新排列 DataFrame 的列
movie_features = movie_features[ordered_columns]

movie_features.to_csv('/Users/bytedance/Desktop/MovieLens-Recommendation-System/recall_dual_tower/features/movie_features_final.csv')

['movie_mean_rating', 'movie_rating_std', 'movie_rating_count', 'genre_purity', 'title_length']
需要送入embedding层的特征数量为: 6
不需要送入embedding层的特征数量为: 5


### pointwise双塔模型

In [52]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dense, Concatenate, Flatten, GlobalAveragePooling1D
from tensorflow.keras.models import Model

def build_movie_tower_from_config(movie_embedding_config, dense_input_dim, output_dim=64, sequence_features=['title', 'genres'], dropout_rate=0.3):
    if sequence_features is None:
        sequence_features = ['title', 'genres']
    
    inputs = []
    embed_layers = []
    
    # 1. 处理每个 embedding 特征
    for feature_name, config in movie_embedding_config.items():
        vocab_size = config['vocab_size']
        emb_dim = config['embedding_dim']
        seq_len = config['seq_len']  # 默认值为1，适用于非序列特征

        if feature_name in sequence_features:
            # 处理序列特征，如 title 和 genres
            inp = Input(shape=(seq_len,), name=feature_name)  # shape=(batch, seq_len)
            emb = Embedding(input_dim=vocab_size, output_dim=emb_dim, name=f"{feature_name}_emb")(inp)
            emb = GlobalAveragePooling1D()(emb)  # 对序列进行池化（取平均）
        else:
            # 处理非序列特征，如 movie_id、popularity 等
            inp = Input(shape=(1,), name=feature_name)  # shape=(batch, 1)
            emb = Embedding(input_dim=vocab_size, output_dim=emb_dim, name=f"{feature_name}_emb")(inp)
            emb = Flatten()(emb)  # 展平嵌入层输出
            
        inputs.append(inp)
        embed_layers.append(emb)

    # 2. 添加其他非 embedding 的 dense 特征（如 movie_mean_rating 等）
    if dense_input_dim > 0:
        dense_input = Input(shape=(dense_input_dim,), name='dense_input_movie')
        inputs.append(dense_input)
        embed_layers.append(dense_input)

    # 3. 合并所有特征
    x = Concatenate()(embed_layers)

    # 4. 添加SENet模块来增强特征
    x=squeeze_excite_block(x)

    # 5. 后续处理
    x = Dense(128, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(dropout_rate)(x)
    x = Dense(output_dim, activation='relu')(x)
    x = Flatten()(x)
    # 构建模型
    model = Model(inputs=inputs, outputs=x, name="MovieTower")
    return model



In [53]:
dense_movie_input_dim = len(movie_not_embedding_feature)

movie_tower = build_movie_tower_from_config(movie_embedding_config, dense_movie_input_dim)

movie_tower.summary()

### pointwise双塔模型

In [54]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Lambda
import tensorflow as tf
from tensorflow.keras.layers import Activation

def build_dual_tower_model(user_embedding_config, movie_embedding_config, dense_user_input_dim, dense_movie_input_dim, sequence_features=['title', 'genres']):
    # Build both towers
    user_tower = build_user_tower_from_config(user_embedding_config, dense_user_input_dim)
    movie_tower = build_movie_tower_from_config(movie_embedding_config, dense_movie_input_dim, sequence_features=sequence_features)

    # Output embeddings
    user_output = user_tower.output  # shape: (None, 64)
    movie_output = movie_tower.output  # shape: (None, 64)

    # L2 normalize the output vectors
    user_normalized = Lambda(lambda x: tf.math.l2_normalize(x, axis=1))(user_output)
    movie_normalized = Lambda(lambda x: tf.math.l2_normalize(x, axis=1))(movie_output)

    # Compute cosine similarity via dot product of normalized vectors
    cosine_similarity = tf.keras.layers.Dot(axes=1)([user_normalized, movie_normalized])

    out = Activation('sigmoid')(cosine_similarity)
    # Define the model
    model = Model(inputs=user_tower.input + movie_tower.input, outputs=out, name="DualTowerRecall")

    return model


## pointwise模型训练

### 划分训练集和验证集

In [55]:
from sklearn.model_selection import train_test_split
import numpy as np
data=pd.merge(pos_neg_samples, user_features, on='user_id')
data=pd.merge(data, movie_features, on='movie_id')
for col in ['genres', 'title']:
    data[col] = data[col].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
from tensorflow.keras.preprocessing.sequence import pad_sequences
# title_padded = pad_sequences(data['title'], maxlen=movie_embedding_config['title']['seq_len'], padding='post', truncating='post')
# genres_padded = pad_sequences(data['genres'], maxlen=movie_embedding_config['genres']['seq_len'], padding='post', truncating='post')
# data['title']=pd.Series(title_padded.tolist())
# data['genres']=pd.Series(genres_padded.tolist())
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)
print("训练集数量:", len(train_df))
print("测试集数量:", len(test_df))

训练集数量: 2031041
测试集数量: 507761


In [56]:
# # 假设 movie_embedding_config['title']['seq_len'] 为 16
# title_padded = pad_sequences(data['title'], maxlen=16, padding='post', truncating='post')

# # 检查结果的形状
# print(title_padded.shape)  # 应该输出 (2538802, 16)

# data['title'] = title_padded
# print(data['title'].shape)

In [57]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
import numpy as np
from tensorflow.keras.metrics import AUC


model_pointwise = build_dual_tower_model(user_embedding_config, movie_embedding_config, dense_user_input_dim, dense_movie_input_dim, sequence_features=['title', 'genres'])

model_pointwise.compile(optimizer=Adam(learning_rate=0.001), 
              loss='binary_crossentropy',  # 使用二分类交叉熵损失
              metrics=['accuracy', AUC(name='auc')])  # 使用准确率作为评估指标


In [58]:
model_pointwise.summary()

In [59]:
def prepare_model_input(X_user_df, X_user_dense, X_movie_df, X_movie_dense):
    # 用户输入特征
    user_input = [
        X_user_df['user_id'].values.reshape(-1, 1),
        X_user_df['activity_level_encoded'].values.reshape(-1, 1),
        X_user_df['favorite_genre_encoded'].values.reshape(-1, 1),
        X_user_df['gender_encoded'].values.reshape(-1, 1),
        X_user_df['occupation_encoded'].values.reshape(-1, 1),
        X_user_df['age_encoded'].values.reshape(-1, 1),
        X_user_dense  # shape (N, 26)
    ]

    # 电影输入特征
    movie_input = [
        X_movie_df['movie_id'].values.reshape(-1, 1),
        pad_sequences(X_movie_df['title'], maxlen=movie_embedding_config['title']['seq_len'], padding='post', truncating='post'),     # shape (N, 16)
        pad_sequences(X_movie_df['genres'], maxlen=movie_embedding_config['genres']['seq_len'], padding='post', truncating='post'),    # shape (N, 7)
        X_movie_df['popularity'].values.reshape(-1, 1),
        X_movie_df['year'].values.reshape(-1, 1),
        X_movie_df['is_old_movie'].values.reshape(-1, 1),
        X_movie_dense  # shape (N, 5)
    ]
    # print("title的shape为:", X_movie_df['title'])
    
    # 打印每个特征的形状
    for i, feature in enumerate(user_input):
        print(f"user_input feature {i}: shape {feature.shape}")

    for i, feature in enumerate(movie_input):
        print(f"movie_input feature {i}: shape {feature.shape}")

    # 将两个部分合并成一个大的输入列表
    X_train = user_input + movie_input
    return X_train

# 获取训练数据和测试数据
X_train = prepare_model_input(
    train_df[user_embedding_feature], 
    train_df[user_not_embedding_feature], 
    train_df[movie_embedding_feature], 
    train_df[movie_not_embedding_feature]
)

y_train = train_df['label'].values

X_test = prepare_model_input(
    test_df[user_embedding_feature], 
    test_df[user_not_embedding_feature], 
    test_df[movie_embedding_feature], 
    test_df[movie_not_embedding_feature]
)

y_test = test_df['label'].values

user_input feature 0: shape (2031041, 1)
user_input feature 1: shape (2031041, 1)
user_input feature 2: shape (2031041, 1)
user_input feature 3: shape (2031041, 1)
user_input feature 4: shape (2031041, 1)
user_input feature 5: shape (2031041, 1)
user_input feature 6: shape (2031041, 26)
movie_input feature 0: shape (2031041, 1)
movie_input feature 1: shape (2031041, 16)
movie_input feature 2: shape (2031041, 7)
movie_input feature 3: shape (2031041, 1)
movie_input feature 4: shape (2031041, 1)
movie_input feature 5: shape (2031041, 1)
movie_input feature 6: shape (2031041, 5)
user_input feature 0: shape (507761, 1)
user_input feature 1: shape (507761, 1)
user_input feature 2: shape (507761, 1)
user_input feature 3: shape (507761, 1)
user_input feature 4: shape (507761, 1)
user_input feature 5: shape (507761, 1)
user_input feature 6: shape (507761, 26)
movie_input feature 0: shape (507761, 1)
movie_input feature 1: shape (507761, 16)
movie_input feature 2: shape (507761, 7)
movie_input 

In [60]:
for i, inp in enumerate(model_pointwise.inputs):
    print(f"{i}: {inp.name}, shape: {inp.shape}")

0: user_id, shape: (None, 1)
1: activity_level_encoded, shape: (None, 1)
2: favorite_genre_encoded, shape: (None, 1)
3: gender_encoded, shape: (None, 1)
4: occupation_encoded, shape: (None, 1)
5: age_encoded, shape: (None, 1)
6: dense_input_user, shape: (None, 26)
7: movie_id, shape: (None, 1)
8: title, shape: (None, 16)
9: genres, shape: (None, 7)
10: popularity, shape: (None, 1)
11: year, shape: (None, 1)
12: is_old_movie, shape: (None, 1)
13: dense_input_movie, shape: (None, 5)


In [61]:
# 5. 训练模型
history = model_pointwise.fit(
    X_train,  # 训练数据
    y_train,  # 标签
    validation_data=(X_test, y_test),  # 验证数据
    epochs=1,  # 训练周期（debug模式设置为1）
    batch_size=64  # 每批次的大小
)

# 6. 打印训练过程中的日志
print("训练完成，历史记录如下:")
print(history.history)
print(f"训练期间 AUC: {history.history['auc']}")
print(f"验证期间 AUC: {history.history['val_auc']}")


# 7. 在测试集上评估模型性能
test_loss, test_acc, test_auc = model_pointwise.evaluate(X_test, y_test)
print(f"Test loss: {test_loss}, Test accuracy: {test_acc}, Test AUC: {test_auc}")

[1m31736/31736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 4ms/step - accuracy: 0.8234 - auc: 0.7549 - loss: 0.6772 - val_accuracy: 0.8541 - val_auc: 0.7706 - val_loss: 0.6735
训练完成，历史记录如下:
{'accuracy': [0.8433704972267151], 'auc': [0.761093258857727], 'loss': [0.6757306456565857], 'val_accuracy': [0.8541439771652222], 'val_auc': [0.7705793380737305], 'val_loss': [0.6734799742698669]}
训练期间 AUC: [0.761093258857727]
验证期间 AUC: [0.7705793380737305]
[1m15868/15868[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 706us/step - accuracy: 0.8539 - auc: 0.7708 - loss: 0.6736
Test loss: 0.6734820604324341, Test accuracy: 0.8541439771652222, Test AUC: 0.7705793380737305


In [62]:
def get_movie_input(X_movie_df, X_movie_dense):
    # 电影输入特征
    movie_input = [
        X_movie_df['movie_id'].values.reshape(-1, 1),
        pad_sequences(X_movie_df['title'], maxlen=movie_embedding_config['title']['seq_len'], padding='post', truncating='post'),     # shape (N, 16)
        pad_sequences(X_movie_df['genres'], maxlen=movie_embedding_config['genres']['seq_len'], padding='post', truncating='post'),    # shape (N, 7)
        X_movie_df['popularity'].values.reshape(-1, 1),
        X_movie_df['year'].values.reshape(-1, 1),
        X_movie_df['is_old_movie'].values.reshape(-1, 1),
        X_movie_dense  # shape (N, 5)
    ]
    # print("title的shape为:", X_movie_df['title'])
    

    for i, feature in enumerate(movie_input):
        print(f"movie_input feature {i}: shape {feature.shape}")

    return movie_input

movie_input=get_movie_input(movie_features[movie_embedding_feature], movie_features[movie_not_embedding_feature])

movie_input feature 0: shape (3706, 1)
movie_input feature 1: shape (3706, 16)
movie_input feature 2: shape (3706, 7)
movie_input feature 3: shape (3706, 1)
movie_input feature 4: shape (3706, 1)
movie_input feature 5: shape (3706, 1)
movie_input feature 6: shape (3706, 5)


In [63]:
# 获取movie_tower
movie_tower = build_movie_tower_from_config(movie_embedding_config, dense_movie_input_dim, sequence_features=['title', 'genres'])

# 获取输出并做L2归一化
movie_output = movie_tower.output  # shape: (None, 64)
movie_normalized = Lambda(lambda x: tf.math.l2_normalize(x, axis=1))(movie_output)

# 获取输出并且归一化
movie_embedding_model = Model(inputs=movie_tower.input, outputs=movie_output, name="MovieEmbeddingModel")

# 提取embedding
movie_vectors = movie_embedding_model.predict(movie_input, batch_size=128)

print(movie_vectors.shape)

[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
(3706, 64)


In [64]:
import faiss
movie_vectors = np.asarray(movie_vectors).astype("float32")
# 使用 FAISS 构建向量索引（使用内积，因为是归一化过的 embedding）
index = faiss.IndexFlatIP(movie_vectors.shape[1])  # 64维
index.add(movie_vectors)  # 添加所有电影向量

In [65]:
faiss.write_index(index, "/Users/bytedance/Desktop/MovieLens-Recommendation-System/recall_dual_tower/faiss/movie_index_pointwise.faiss")

In [66]:
#######构造每个用户看过的正样本电影#########
from tqdm import tqdm
from collections import defaultdict
def build_user_to_true_movies(test_df):
    """
    test_df: 包含 user_id 和 movie_id 的 DataFrame
    返回: user_to_true_movies 字典，键是 user_id，值是 movie_id 列表
    """
    user_to_movies = defaultdict(list)
    for _, row in tqdm(test_df.iterrows(), desc='构造每个用户看过的正样本电影'):
        if row['label']==1:
            user_to_movies[row['user_id']].append(row['movie_id'])
    return user_to_movies

user_to_movies=build_user_to_true_movies(pos_neg_samples)

构造每个用户看过的正样本电影: 2538802it [00:30, 82157.92it/s]


In [67]:
#######获取movie_id_list#########
movie_id_list = movies_df['movie_id'].tolist()
print("电影数为:", len(movie_id_list))

电影数为: 3883


### pairwise双塔模型

In [68]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dot, Subtract, Lambda, Activation
import tensorflow as tf

def build_pairwise_dual_tower(user_embedding_config, movie_embedding_config, 
                               dense_user_input_dim, dense_movie_input_dim, 
                               sequence_features=['title', 'genres']):
    
    # User塔
    user_tower = build_user_tower_from_config(user_embedding_config, dense_user_input_dim)
    # Positive电影塔
    movie_tower_pos = build_movie_tower_from_config(movie_embedding_config, dense_movie_input_dim)
    # Negative电影塔（共享权重）
    movie_tower_neg = movie_tower_pos  # 权重共享

    user_embedding = user_tower.output
    pos_movie_embedding = movie_tower_pos.output
    neg_movie_embedding = movie_tower_neg.output

    # 归一化
    user_embedding = Lambda(lambda x: tf.math.l2_normalize(x, axis=1))(user_embedding)
    pos_movie_embedding = Lambda(lambda x: tf.math.l2_normalize(x, axis=1))(pos_movie_embedding)
    neg_movie_embedding = Lambda(lambda x: tf.math.l2_normalize(x, axis=1))(neg_movie_embedding)

    # dot(u, p) - dot(u, n)
    pos_score = Dot(axes=1)([user_embedding, pos_movie_embedding])
    neg_score = Dot(axes=1)([user_embedding, neg_movie_embedding])
    diff = Subtract()([pos_score, neg_score])

    # BPR损失对应的sigmoid输出
    output = Activation('sigmoid')(diff)

    # 定义模型
    model = Model(inputs=user_tower.input + movie_tower_pos.input + movie_tower_neg.input, 
                  outputs=output)

    return model

In [69]:
model_pairwise=build_pairwise_dual_tower(user_embedding_config, movie_embedding_config, dense_user_input_dim, dense_movie_input_dim)
model_pairwise.compile(optimizer='adam', loss='binary_crossentropy')

model_pairwise.summary()

In [70]:
import pandas as pd
from tqdm import tqdm

def generate_pairwise_triplets(pos_neg_sample):
    # 分别提取正样本和负样本
    pos_samples = pos_neg_sample[pos_neg_sample['label'] == 1]
    neg_samples = pos_neg_sample[pos_neg_sample['label'] == 0]

    triplets = []

    # 遍历每个用户
    for user_id, group in tqdm(pos_samples.groupby('user_id'), desc='为用户ID生成三元组'):
        # 获取当前用户的正样本电影列表
        pos_movies = group['movie_id'].tolist()

        # 获取当前用户的负样本电影列表
        neg_movies = neg_samples[neg_samples['user_id'] == user_id]['movie_id'].tolist()

        # 为每个正样本电影和负样本电影组合生成三元组
        for pos_movie in pos_movies:
            for neg_movie in neg_movies:
                triplets.append((user_id, pos_movie, neg_movie))

    # 返回三元组的 DataFrame
    return pd.DataFrame(triplets, columns=['user_id', 'pos_movie_id', 'neg_movie_id'])

triplet_df = generate_pairwise_triplets(pos_neg_samples)
print(triplet_df.head())

为用户ID生成三元组: 100%|██████████| 6040/6040 [00:43<00:00, 139.53it/s]


   user_id  pos_movie_id  neg_movie_id
0        1          1193          3755
1        1          1193          2609
2        1          1193          3680
3        1          1193          1049
4        1          1193          3365


In [71]:
# triplet_df.to_csv('./data/tripled_df.csv')

In [72]:
import numpy as np

triplet_df = pd.read_csv('./data/tripled_df.csv')

In [73]:
import pandas as pd
import numpy as np
from tqdm import tqdm

def prepare_pairwise_data(triplet_df, user_features, movie_features, user_embedding_feature, movie_embedding_feature):
    """
    准备训练数据
    :param triplet_df: 生成的三元组 DataFrame
    :param user_features: 用户特征 DataFrame
    :param movie_features: 电影特征 DataFrame
    :param user_embedding_feature: 用户的嵌入特征
    :param movie_embedding_feature: 电影的嵌入特征
    :return: 模型输入字典，训练标签
    """
    # 设置索引以加速合并操作
    user_features = user_features.set_index('user_id')
    movie_features = movie_features.set_index('movie_id')

    # 合并用户特征
    user_input = np.array(user_features.loc[triplet_df['user_id'], user_embedding_feature].values.tolist())
    # 合并正样本电影特征
    pos_movie_input = np.array(movie_features.loc[triplet_df['pos_movie_id'], movie_embedding_feature].values.tolist())
    # 合并负样本电影特征
    neg_movie_input = np.array(movie_features.loc[triplet_df['neg_movie_id'], movie_embedding_feature].values.tolist())

    # 标签全为 1，因为我们想要模型学习正负样本的区别
    labels = np.ones(len(triplet_df))

    # 返回输入数据和标签
    return {
        'user_input': user_input,
        'pos_movie_input': pos_movie_input,
        'neg_movie_input': neg_movie_input
    }, labels

In [74]:
import pandas as pd
import numpy as np
from tqdm import tqdm

def prepare_pairwise_data(triplet_df, user_features, movie_features, user_embedding_feature, movie_embedding_feature):
    """
    准备训练数据
    :param triplet_df: 生成的三元组 DataFrame
    :param user_features: 用户特征 DataFrame
    :param movie_features: 电影特征 DataFrame
    :param user_embedding_feature: 用户的嵌入特征
    :param movie_embedding_feature: 电影的嵌入特征
    :return: 模型输入字典，训练标签
    """
    # 设置索引以加速合并操作
    user_features = user_features.set_index('user_id')
    movie_features = movie_features.set_index('movie_id')

    # 合并用户特征
    user_input = np.array(user_features.loc[triplet_df['user_id'], user_embedding_feature].values.tolist())
    # 合并正样本电影特征
    pos_movie_input = np.array(movie_features.loc[triplet_df['pos_movie_id'], movie_embedding_feature].values.tolist())
    # 合并负样本电影特征
    neg_movie_input = np.array(movie_features.loc[triplet_df['neg_movie_id'], movie_embedding_feature].values.tolist())

    # 标签全为 1，因为我们想要模型学习正负样本的区别
    labels = np.ones(len(triplet_df))

    # 返回输入数据和标签
    return {
        'user_input': user_input,
        'pos_movie_input': pos_movie_input,
        'neg_movie_input': neg_movie_input
    }, labels

In [75]:
import pandas as pd
import numpy as np
from tqdm import tqdm

def prepare_pairwise_data(triplet_df, user_features, movie_features, user_embedding_feature, movie_embedding_feature):
    """
    准备训练数据
    :param triplet_df: 生成的三元组 DataFrame
    :param user_features: 用户特征 DataFrame
    :param movie_features: 电影特征 DataFrame
    :param user_embedding_feature: 用户的嵌入特征
    :param movie_embedding_feature: 电影的嵌入特征
    :return: 模型输入字典，训练标签
    """
    # 设置索引以加速合并操作
    user_features = user_features.set_index('user_id')
    movie_features = movie_features.set_index('movie_id')

    # 合并用户特征
    user_input = np.array(user_features.loc[triplet_df['user_id'], user_embedding_feature].values.tolist())
    # 合并正样本电影特征
    pos_movie_input = np.array(movie_features.loc[triplet_df['pos_movie_id'], movie_embedding_feature].values.tolist())
    # 合并负样本电影特征
    neg_movie_input = np.array(movie_features.loc[triplet_df['neg_movie_id'], movie_embedding_feature].values.tolist())

    # 标签全为 1，因为我们想要模型学习正负样本的区别
    labels = np.ones(len(triplet_df))

    # 返回输入数据和标签
    return {
        'user_input': user_input,
        'pos_movie_input': pos_movie_input,
        'neg_movie_input': neg_movie_input
    }, labels

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

def prepare_pairwise_data(triplet_df, user_features, movie_features, user_embedding_feature, movie_embedding_feature):
    """
    准备训练数据
    :param triplet_df: 生成的三元组 DataFrame
    :param user_features: 用户特征 DataFrame
    :param movie_features: 电影特征 DataFrame
    :param user_embedding_feature: 用户的嵌入特征
    :param movie_embedding_feature: 电影的嵌入特征
    :return: 模型输入字典，训练标签
    """
    # 设置索引以加速合并操作
    user_features = user_features.set_index('user_id')
    movie_features = movie_features.set_index('movie_id')

    # 合并用户特征
    user_input = np.array(user_features.loc[triplet_df['user_id'], user_embedding_feature].values.tolist())
    # 合并正样本电影特征
    pos_movie_input = np.array(movie_features.loc[triplet_df['pos_movie_id'], movie_embedding_feature].values.tolist())
    # 合并负样本电影特征
    neg_movie_input = np.array(movie_features.loc[triplet_df['neg_movie_id'], movie_embedding_feature].values.tolist())

    # 标签全为 1，因为我们想要模型学习正负样本的区别
    labels = np.ones(len(triplet_df))

    # 返回输入数据和标签
    return {
        'user_input': user_input,
        'pos_movie_input': pos_movie_input,
        'neg_movie_input': neg_movie_input
    }, labels

In [76]:
user_features = user_features.set_index('user_id')
movie_features = movie_features.set_index('movie_id')


In [83]:
user_input = np.array(user_features.loc[triplet_df['user_id'], user_embedding_feature].values.tolist())
# 合并正样本电影特征
pos_movie_input = np.array(movie_features.loc[triplet_df['pos_movie_id'], movie_embedding_feature].values.tolist())
# 合并负样本电影特征
neg_movie_input = np.array(movie_features.loc[triplet_df['neg_movie_id'], movie_embedding_feature].values.tolist())

KeyError: "['user_id'] not in index"

In [77]:
# merged_user = triplet_df.merge(user_features, left_on='user_id', right_on='user_id', how='left')
# merged_pos_movie = triplet_df.merge(movie_features, left_on='pos_movie_id', right_on='movie_id', how='left')
# merged_neg_movie = triplet_df.merge(movie_features, left_on='neg_movie_id', right_on='movie_id', how='left')
# # user_input = np.array(merged_user[user_embedding_feature].values.tolist())
# # pos_movie_input = np.array(merged_pos_movie[movie_embedding_feature].values.tolist())
# # neg_movie_input = np.array(merged_neg_movie[movie_embedding_feature].values.tolist())
# pairwise_data, labels = prepare_pairwise_data(triplet_df, user_features, movie_features, 
#                                               user_embedding_feature, movie_embedding_feature)

In [78]:
import pandas as pd
import numpy as np
from tqdm import tqdm

def prepare_pairwise_data(triplet_df, user_features, movie_features, user_embedding_feature, movie_embedding_feature):
    """
    准备训练数据
    :param triplet_df: 生成的三元组 DataFrame
    :param user_features: 用户特征 DataFrame
    :param movie_features: 电影特征 DataFrame
    :param user_embedding_feature: 用户的嵌入特征
    :param movie_embedding_feature: 电影的嵌入特征
    :return: 模型输入字典，训练标签
    """
    # 设置索引以加速合并操作
    user_features = user_features.set_index('user_id')
    movie_features = movie_features.set_index('movie_id')

    # 合并用户特征
    user_input = np.array(user_features.loc[triplet_df['user_id'], user_embedding_feature].values.tolist())
    # 合并正样本电影特征
    pos_movie_input = np.array(movie_features.loc[triplet_df['pos_movie_id'], movie_embedding_feature].values.tolist())
    # 合并负样本电影特征
    neg_movie_input = np.array(movie_features.loc[triplet_df['neg_movie_id'], movie_embedding_feature].values.tolist())

    # 标签全为 1，因为我们想要模型学习正负样本的区别
    labels = np.ones(len(triplet_df))

    # 返回输入数据和标签
    return {
        'user_input': user_input,
        'pos_movie_input': pos_movie_input,
        'neg_movie_input': neg_movie_input
    }, labels

## pairwise模型训练

In [79]:
model_pairwise=build_dual_tower_model(user_embedding_config, movie_embedding_config, dense_user_input_dim, dense_movie_input_dim)

In [80]:
def contrastive_loss(y_true, y_pred, margin=1.0):
    """
    对比损失函数，通常用于相似度学习。

    Args:
        y_true: 真实标签，1表示正样本对，0表示负样本对。
        y_pred: 模型预测的相似度值（0-1之间）。
        margin: 正负样本对之间的边界。
    
    Returns:
        loss: 计算的损失值。
    """
    # 对正样本对，y_true=1，损失为 (1 - y_pred)^2
    positive_loss = y_true * tf.square(y_pred)
    # 对负样本对，y_true=0，损失为 max(0, margin - y_pred)^2
    negative_loss = (1 - y_true) * tf.square(tf.maximum(0., margin - y_pred))

    return tf.reduce_mean(positive_loss + negative_loss)

In [81]:
model_pairwise.compile(optimizer=Adam(learning_rate=0.001), 
              loss=contrastive_loss,  # 使用对比损失函数
              metrics=['accuracy', AUC(name='auc')])

In [82]:
history = model_pairwise.fit(
    X_train,  # 训练数据
    y_train,  # 标签
    epochs=5,  # 训练周期
    batch_size=64  # 每批次的大小
)

Epoch 1/5
[1m31736/31736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 4ms/step - accuracy: 0.1590 - auc: 0.1787 - loss: 0.1471
Epoch 2/5
[1m31736/31736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 4ms/step - accuracy: 0.1546 - auc: 0.1690 - loss: 0.1462
Epoch 3/5
[1m31736/31736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m132s[0m 4ms/step - accuracy: 0.1531 - auc: 0.1476 - loss: 0.1460
Epoch 4/5
[1m31736/31736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 4ms/step - accuracy: 0.1516 - auc: 0.1447 - loss: 0.1458
Epoch 5/5
[1m31736/31736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 4ms/step - accuracy: 0.1511 - auc: 0.1439 - loss: 0.1458
