In [27]:
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from two_tower_model_v2 import MovieRecommendationModel
import pickle

In [28]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # 加载预处理数据
title2int, title_count, title_set, genres2int, genres_map, features_pd, targets_pd, features, targets_values, ratings_df, users_df, movies_df, data = pickle.load(open('./data/preprocess.p', 'rb'))


In [59]:
# 原始的电影数据、用户数据和评分数据
raw_movies_df=pd.read_csv('./data/ml-1m/movies.csv')
raw_users_df=pd.read_csv('./data/ml-1m/users.csv')
raw_ratings_df=pd.read_csv('./data/ml-1m/ratings.csv')

In [61]:
raw_movies_df.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [62]:
# 获取电影ID和标题的映射
movie_id_to_title = dict(zip(raw_movies_df['movie_id'], raw_movies_df['title']))

# 获取电影ID和类型的映射
movie_id_to_genres=dict(zip(raw_movies_df['movie_id'],raw_movies_df['genres']))

In [29]:
movies_df.head()

Unnamed: 0,movie_id,title,genres
0,1,"[895, 3490, 3512, 3512, 3512, 3512, 3512, 3512...","[11, 15, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7..."
1,2,"[2012, 3512, 3512, 3512, 3512, 3512, 3512, 351...","[9, 15, 3, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,..."
2,3,"[769, 5179, 5052, 3512, 3512, 3512, 3512, 3512...","[6, 5, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ..."
3,4,"[359, 390, 370, 3512, 3512, 3512, 3512, 3512, ...","[6, 4, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ..."
4,5,"[834, 3458, 2242, 2367, 663, 2801, 3512, 3512,...","[6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ..."


In [30]:
users_df.head()

Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,1,0,10,48067
1,2,0,5,16,70072
2,3,0,6,15,55117
3,4,0,2,7,2460
4,5,0,6,20,55455


In [31]:
ratings_df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [32]:
def load_model(model_path='./model_save/two_tower_model_20250401.pth'):
    embed_dim = 32
    # 用户 ID 个数
    uid_num = max(features.take(0, 1)) + 1
    # 性别个数
    gender_num = max(features.take(2, 1)) + 1
    # 年龄类别个数
    age_num = max(features.take(3, 1)) + 1
    # 职业个数
    job_num = max(features.take(4, 1)) + 1

    # 电影 ID 个数
    mid_num = max(features.take(1, 1)) + 1
    # 电影类型个数
    movie_category_num = max(genres2int.values()) + 1
    # 电影名单词个数
    movie_title_num = len(title_set)

    # 文本卷积滑动窗口
    window_sizes={2, 3, 4, 5}

    # 文本卷积核数量
    filter_num=8

    sentence_size=title_count

    dropout_keep_prob=0.5

    # 加载模型
    model = MovieRecommendationModel(uid_num, gender_num, age_num, job_num, embed_dim, 
                                mid_num, movie_category_num, movie_title_num, 
                                window_sizes, filter_num, sentence_size, dropout_keep_prob)
    model.load_state_dict(torch.load(model_path))
    model.eval()
    return model

In [33]:
model=load_model()
print(model)

MovieRecommendationModel(
  (user_tower): UserTower(
    (uid_embedding): Embedding(6041, 32)
    (gender_embedding): Embedding(2, 16)
    (age_embedding): Embedding(7, 16)
    (job_embedding): Embedding(21, 16)
    (relu): ReLU()
    (tanh): Tanh()
    (uid_fc): Linear(in_features=32, out_features=32, bias=True)
    (gender_fc): Linear(in_features=16, out_features=32, bias=True)
    (age_fc): Linear(in_features=16, out_features=32, bias=True)
    (job_fc): Linear(in_features=16, out_features=32, bias=True)
    (combine_fc): Linear(in_features=128, out_features=200, bias=True)
  )
  (movie_tower): MovieTower(
    (movie_id_embedding): Embedding(3953, 32)
    (movie_categories_embedding): Embedding(19, 32)
    (movie_title_embedding): Embedding(5217, 32)
    (relu): ReLU()
    (tanh): Tanh()
    (movie_id_fc): Linear(in_features=32, out_features=32, bias=True)
    (movie_categories_fc): Linear(in_features=32, out_features=32, bias=True)
    (conv_layers): ModuleList(
      (0): Conv2d(1

In [85]:
class InferenceDataset(Dataset):
    def __init__(self, user_features, movie_features):
        self.user_features = user_features
        self.movie_features = movie_features
        
    def __len__(self):
        return len(self.movie_features)
    
    def __getitem__(self, idx):
        movie_feature = self.movie_features[idx]
        return {
            'user_features': torch.tensor(self.user_features),
            'movie_id': torch.tensor(movie_feature[0]),
            'movie_categories': torch.tensor(movie_feature[1]),
            'movie_titles': torch.tensor(movie_feature[2])
        }

In [35]:
def prepare_user_features(user_id):
    """给定用户ID的情况下输出用户特征"""
    user_data = users_df[users_df['user_id'] == user_id].iloc[0]
    
    # 转换为模型输入格式
    user_features = [
        user_data['user_id'],  # uid
        user_data['gender'],  # gender
        user_data['age'],      # age
        user_data['occupation']  # job
    ]
    
    return user_features

In [86]:
user_features=prepare_user_features(1)
print(user_features)

[np.int64(1), np.int64(1), np.int64(0), np.int64(10)]


In [36]:
def prepare_movie_features():
    """准备所有电影特征"""
    movie_features = []
    for _, row in movies_df.iterrows():
        # # 转换电影类型为multi-hot编码
        # categories = [genres2int[g] for g in row['genres'].split('|')]
        # # 转换标题为序列
        # title = [title2int.get(word, 0) for word in row['title'].split()[:title_count]]
        # title = title + [0] * (title_count - len(title))  # 填充
        
        movie_features.append([
            row['movie_id'],  # movie_id
            row['genres'],       # categories
            row['title']            # title
        ])
    
    return movie_features

In [89]:
def recommend_movies(model, user_id, top_k=10):
    """为指定用户推荐电影（使用forward方法）"""
    # 1. 准备用户特征
    user_features = prepare_user_features(user_id) # 4
    # print("用户特征维度:", len(user_features))
    # 2. 准备所有电影特征
    movie_features = prepare_movie_features() # 3883
    # print("电影特征维度:", len(movie_features))
    
    # 3. 创建数据集和数据加载器
    dataset = InferenceDataset(user_features, movie_features)
    loader = DataLoader(dataset, batch_size=256, shuffle=False)
    
    # 4. 获取用户向量和计算相似度
    movie_scores = []
    with torch.no_grad():
        # 准备用户输入
        uid = torch.tensor([user_features[0]]).to(device)
        gender = torch.tensor([user_features[1]]).to(device)
        age = torch.tensor([user_features[2]]).to(device)
        job = torch.tensor([user_features[3]]).to(device)
        
        for batch in loader:
            movie_ids = batch['movie_id'].to(device)
            categories = batch['movie_categories'].to(device)
            titles = batch['movie_titles'].to(device)

            # uid: [1]
            # gender: [1]
            # age: [1]
            # job: [1]
            # mmovie_ids: [256]
            # categories: [256, 18]
            # titles: [256, 15]

            # 使用forward方法计算相似度
            similarities, _, _ = model(
                uid,  
                gender,
                age,
                job,
                movie_ids,
                categories,
                titles
            )
            
            print(similarities)
            # 对角线元素就是用户与每部电影的相似度
            batch_scores = similarities.diagonal()
            
            # 收集结果
            for mid, score in zip(movie_ids.cpu().numpy(), batch_scores.cpu().numpy()):
                movie_scores.append((mid, score))
    
    # 5. 排序并获取top-k推荐
    movie_scores.sort(key=lambda x: x[1], reverse=True)
    top_movies = movie_scores[:top_k]
    
    # 6. 返回推荐结果
    recommendations = []
    movie_id_to_title = dict(zip(movies_df['movie_id'], movies_df['title']))

    for mid, score in top_movies:
        title = movie_id_to_title.get(mid, "Unknown")
        genres = movies_df[movies_df['movie_id'] == mid]['genres'].values[0]
        recommendations.append({
            'movie_id': mid,
            'title': title,
            'genres': genres,
            'score': float(score)
        })
    
    return recommendations

In [90]:
user_id = 1
recommendations = recommend_movies(model, user_id, top_k=10)

print(f"为用户 {user_id} 推荐的电影:")
for i, rec in enumerate(recommendations, 1):
    print(f"{i}. {movie_id_to_title[rec['movie_id']]} ({movie_id_to_genres[rec['movie_id']]}) - 相似度: {rec['score']:.4f}")

tensor([1])
tensor([1])
tensor([0])
tensor([10])
tensor([[-12.0654, -12.0654, -12.0654, -12.0654, -12.0654, -12.0654, -12.0654,
         -12.0654, -12.0654, -12.0654, -12.0654, -12.0654, -12.0654, -12.0654,
         -12.0654, -12.0654, -12.0654, -12.0654, -12.0654, -12.0654, -12.0654,
         -12.0654, -12.0654, -12.0654, -12.0654, -12.0654, -12.0654, -12.0654,
         -12.0654, -12.0654, -12.0654, -12.0654, -12.0654, -12.0654, -12.0654,
         -12.0654, -12.0654, -12.0654, -12.0654, -12.0654, -12.0654, -12.0654,
         -12.0654, -12.0654, -12.0654, -12.0654, -12.0654, -12.0654, -12.0654,
         -12.0654, -12.0654, -12.0654, -12.0654, -12.0654, -12.0654, -12.0654,
         -12.0654, -12.0654, -12.0654, -12.0654, -12.0654, -12.0654, -12.0654,
         -12.0654, -12.0654, -12.0654, -12.0654, -12.0654, -12.0654, -12.0654,
         -12.0654, -12.0654, -12.0654, -12.0654, -12.0654, -12.0654, -12.0654,
         -12.0654, -12.0654, -12.0654, -12.0654, -12.0654, -12.0654, -12.0654,
   