In [1]:
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from two_tower_model_v2 import MovieRecommendationModel
import pickle



使用设备cpu


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # 加载预处理数据
title2int, title_count, title_set, genres2int, genres_map, features_pd, targets_pd, features, targets_values, ratings_df, users_df, movies_df, data = pickle.load(open('./data/preprocess.p', 'rb'))


In [4]:
# 原始的电影数据、用户数据和评分数据
raw_movies_df=pd.read_csv('./data/ml-1m/movies.csv')
raw_users_df=pd.read_csv('./data/ml-1m/users.csv')
raw_ratings_df=pd.read_csv('./data/ml-1m/ratings.csv')

In [5]:
raw_movies_df.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# 获取电影ID和标题的映射
movie_id_to_title = dict(zip(raw_movies_df['movie_id'], raw_movies_df['title']))

# 获取电影ID和类型的映射
movie_id_to_genres=dict(zip(raw_movies_df['movie_id'],raw_movies_df['genres']))

In [7]:
movies_df.head()

Unnamed: 0,movie_id,title,genres
0,1,"[895, 3490, 3512, 3512, 3512, 3512, 3512, 3512...","[11, 15, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7..."
1,2,"[2012, 3512, 3512, 3512, 3512, 3512, 3512, 351...","[9, 15, 3, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,..."
2,3,"[769, 5179, 5052, 3512, 3512, 3512, 3512, 3512...","[6, 5, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ..."
3,4,"[359, 390, 370, 3512, 3512, 3512, 3512, 3512, ...","[6, 4, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ..."
4,5,"[834, 3458, 2242, 2367, 663, 2801, 3512, 3512,...","[6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ..."


In [8]:
users_df.head()

Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,1,0,10,48067
1,2,0,5,16,70072
2,3,0,6,15,55117
3,4,0,2,7,2460
4,5,0,6,20,55455


In [9]:
ratings_df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [10]:
def load_model(model_path='./model_save/two_tower_model_20250401.pth'):
    embed_dim = 32
    # 用户 ID 个数
    uid_num = max(features.take(0, 1)) + 1
    # 性别个数
    gender_num = max(features.take(2, 1)) + 1
    # 年龄类别个数
    age_num = max(features.take(3, 1)) + 1
    # 职业个数
    job_num = max(features.take(4, 1)) + 1

    # 电影 ID 个数
    mid_num = max(features.take(1, 1)) + 1
    # 电影类型个数
    movie_category_num = max(genres2int.values()) + 1
    # 电影名单词个数
    movie_title_num = len(title_set)

    # 文本卷积滑动窗口
    window_sizes={2, 3, 4, 5}

    # 文本卷积核数量
    filter_num=8

    sentence_size=title_count

    dropout_keep_prob=0.5

    # 加载模型
    model = MovieRecommendationModel(uid_num, gender_num, age_num, job_num, embed_dim, 
                                mid_num, movie_category_num, movie_title_num, 
                                window_sizes, filter_num, sentence_size, dropout_keep_prob)
    model.load_state_dict(torch.load(model_path))
    model.eval()
    return model

In [11]:
model=load_model()
print(model)

MovieRecommendationModel(
  (user_tower): UserTower(
    (uid_embedding): Embedding(6041, 32)
    (gender_embedding): Embedding(2, 16)
    (age_embedding): Embedding(7, 16)
    (job_embedding): Embedding(21, 16)
    (relu): ReLU()
    (tanh): Tanh()
    (uid_fc): Linear(in_features=32, out_features=32, bias=True)
    (gender_fc): Linear(in_features=16, out_features=32, bias=True)
    (age_fc): Linear(in_features=16, out_features=32, bias=True)
    (job_fc): Linear(in_features=16, out_features=32, bias=True)
    (combine_fc): Linear(in_features=128, out_features=200, bias=True)
  )
  (movie_tower): MovieTower(
    (movie_id_embedding): Embedding(3953, 32)
    (movie_categories_embedding): Embedding(19, 32)
    (movie_title_embedding): Embedding(5217, 32)
    (relu): ReLU()
    (tanh): Tanh()
    (movie_id_fc): Linear(in_features=32, out_features=32, bias=True)
    (movie_categories_fc): Linear(in_features=32, out_features=32, bias=True)
    (conv_layers): ModuleList(
      (0): Conv2d(1

In [69]:
data.head()

Unnamed: 0,user_id,movie_id,rating,gender,age,occupation,zip_code,title,genres
0,1,1193,5,1,0,10,48067,"[4835, 130, 3852, 2242, 1296, 4153, 3512, 3512...","[4, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ..."
1,1,661,3,1,0,10,48067,"[4809, 4472, 2242, 3869, 1291, 3512, 3512, 351...","[11, 15, 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7..."
2,1,914,3,1,0,10,48067,"[4394, 1320, 1007, 3512, 3512, 3512, 3512, 351...","[0, 5, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ..."
3,1,3408,4,1,0,10,48067,"[1951, 4696, 3512, 3512, 3512, 3512, 3512, 351...","[4, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ..."
4,1,2355,5,1,0,10,48067,"[722, 932, 973, 3512, 3512, 3512, 3512, 3512, ...","[11, 15, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7..."


In [76]:
# 定义电影特征数据集类
from tqdm import tqdm

embed_dim = 32
# 用户 ID 个数
uid_num = max(features.take(0, 1)) + 1
# 性别个数
gender_num = max(features.take(2, 1)) + 1
# 年龄类别个数
age_num = max(features.take(3, 1)) + 1
# 职业个数
job_num = max(features.take(4, 1)) + 1

# 电影 ID 个数
mid_num = max(features.take(1, 1)) + 1
# 电影类型个数
movie_category_num = max(genres2int.values()) + 1
# 电影名单词个数
movie_title_num = len(title_set)


In [None]:
movies_df.head()

# 将movies_df变成features
movie_features=movies_df.values # 第一个key是id，第二个key是title，第三个key是genres

In [117]:

class MovieFeaturesDataset(Dataset):
    def __init__(self, movies_df, movie_features):
        """
        用于提取电影特征的数据集类
        :param movies_df: 电影数据框
        :param features: 原始特征数据
        """
        # 获取所有唯一的电影ID
        self.unique_movies = movies_df['movie_id'].unique()
        
        # 创建电影ID到特征的映射
        self.movie_to_feature = {}
        for feature in movie_features:
            movie_id = int(feature[0])
            self.movie_to_feature[movie_id] = feature
        
        # 检查是否有缺失的电影ID
        missing_movies = [movie_id for movie_id in self.unique_movies if movie_id not in self.movie_to_feature]
        if missing_movies:
            print(f"缺失的电影ID: {missing_movies}")
            print(len(missing_movies))
    
    def __len__(self):
        return len(self.unique_movies)
    
    def __getitem__(self, idx):
        movie_id = self.unique_movies[idx]
        feature = self.movie_to_feature[movie_id]
        
        return {
            'movie_id': torch.tensor(feature[0]),  # 电影ID
            'movie_titles': torch.tensor(feature[1]),  # 电影类型
            'movie_genres': torch.tensor(feature[2])  # 电影标题
        }


In [119]:
dataset = MovieFeaturesDataset(movies_df, features)

In [120]:

def compute_all_movie_embeddings(model, movies_df, features, batch_size=1, save_path='./embedding/movie_embeddings.pkl'):
    """
    计算所有电影的embedding并保存
    :param model: 训练好的双塔模型
    :param movies_df: 电影数据框
    :param features: 原始特征数据
    :param batch_size: 批量大小
    :param save_path: embedding保存路径
    :return: 电影ID到embedding的字典
    """
    # 创建数据集和数据加载器
    dataset = MovieFeaturesDataset(movies_df, features)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    
    # 存储结果，给定电影ID的情况下可以返回其embedding
    movie_embeddings = {}
    
    model.eval()  # 设置为评估模式
    with torch.no_grad():
        for batch in tqdm(loader, desc="计算电影embedding"):
            # 将数据转移到设备
            movie_ids = batch['movie_id'].to(device)
            categories = batch['movie_genres'].to(device)
            titles = batch['movie_titles'].to(device)
            
            # 获取电影embedding
            embeddings = model.movie_tower(movie_ids, categories, titles)
            
            # 存储结果
            for i in range(len(movie_ids)):
                movie_id = movie_ids[i].item()
                embedding = embeddings[i].cpu().numpy()
                movie_embeddings[movie_id] = embedding
    
    # 保存结果
    with open(save_path, 'wb') as f:
        pickle.dump(movie_embeddings, f)
    
    print(f"电影embedding已保存到 {save_path}")
    return movie_embeddings

In [121]:

# 计算所有电影的embedding
movie_embeddings = compute_all_movie_embeddings(model, movies_df, features)

# 示例：获取特定电影的embedding
sample_movie_id = 1  # 示例电影ID
if sample_movie_id in movie_embeddings:
    print(f"\n电影ID {sample_movie_id} 的embedding:")
    print(movie_embeddings[sample_movie_id])
    print(f"Embedding维度: {movie_embeddings[sample_movie_id].shape}")
else:
    print(f"电影ID {sample_movie_id} 不存在于数据中")

计算电影embedding:   0%|          | 0/3883 [00:00<?, ?it/s]


RuntimeError: Calculated padded input size per channel: (1 x 32). Kernel size: (2 x 32). Kernel size can't be greater than actual input size

In [48]:
def recommend_movies(user_id, model, topk=10):
    """
    给定用户ID和双塔模型，以及希望输出的电影数
    可以输出该用户最有可能感兴趣的topk个电影
    """
    # 获取用户特征
    user_features = features[np.where(features[:, 0] == user_id)[0]]

    # 判断是否找到用户ID
    if len(user_features) == 0:
        raise ValueError(f"用户 ID {user_id} 不存在")

    user_features = user_features[0]

    # 读取用户特征
    uid = torch.tensor([user_features[0]]).to(device)
    user_gender = torch.tensor([user_features[2]]).to(device)
    user_age = torch.tensor([user_features[3]]).to(device)
    user_job = torch.tensor([user_features[4]]).to(device)

    # 计算所有电影的嵌入向量
    all_movie_ids = np.unique(features[:, 1])
    movie_embeddings = []
    for movie_id in all_movie_ids:
        movie_features = features[np.where(features[:, 1] == movie_id)[0]]
        if len(movie_features) == 0:
            continue
        movie_features = movie_features[0]
        # 电影ID
        movie_id_tensor = torch.tensor([movie_features[1]]).to(device)
        movie_categories = torch.tensor([movie_features[7]]).to(device)
        movie_titles = torch.tensor([movie_features[6]]).to(device)

        with torch.no_grad():
            movie_embed = model.movie_tower(movie_id_tensor, movie_categories, movie_titles)
            movie_embeddings.append(movie_embed)

    movie_embeddings = torch.cat(movie_embeddings, dim=0)

    # 计算用户与所有电影的相似度
    with torch.no_grad():
        user_embed = model.user_tower(uid, user_gender, user_age, user_job)
        similarity = torch.matmul(user_embed, movie_embeddings.t())[0]
        print(similarity)

    # 按相似度排序，获取 topk 个电影
    _, top_movie_indices = torch.topk(similarity, topk)
    top_movie_ids = all_movie_ids[top_movie_indices.cpu().numpy()]

    return top_movie_ids

In [49]:
recommend_movies(1, model, 10)

tensor([-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000])


array([1205, 610, 2081, 3864, 588, 288, 299, 272, 235, 32], dtype=object)