In [26]:
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from two_tower_model_v2 import MovieRecommendationModel
import pickle

## 加载数据

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # 加载预处理数据
title2int, title_count, title_set, genres2int, genres_map, features_pd, targets_pd, features, targets_values, ratings_df, users_df, movies_df, data = pickle.load(open('./data/preprocess.p', 'rb'))

In [3]:
# 原始的电影数据、用户数据和评分数据
raw_movies_df=pd.read_csv('./data/ml-1m/movies.csv')
raw_users_df=pd.read_csv('./data/ml-1m/users.csv')
raw_ratings_df=pd.read_csv('./data/ml-1m/ratings.csv')

In [5]:
raw_movies_df.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


## 加载训练好的模型

In [5]:
def load_model(model_path='./model_save/two_tower_model_20250401.pth'):
    embed_dim = 32
    # 用户 ID 个数
    uid_num = max(features.take(0, 1)) + 1
    # 性别个数
    gender_num = max(features.take(2, 1)) + 1
    # 年龄类别个数
    age_num = max(features.take(3, 1)) + 1
    # 职业个数
    job_num = max(features.take(4, 1)) + 1

    # 电影 ID 个数
    mid_num = max(features.take(1, 1)) + 1
    # 电影类型个数
    movie_category_num = max(genres2int.values()) + 1
    # 电影名单词个数
    movie_title_num = len(title_set)

    # 文本卷积滑动窗口
    window_sizes={2, 3, 4, 5}

    # 文本卷积核数量
    filter_num=8

    sentence_size=title_count

    dropout_keep_prob=0.5

    # 加载模型
    model = MovieRecommendationModel(uid_num, gender_num, age_num, job_num, embed_dim, 
                                mid_num, movie_category_num, movie_title_num, 
                                window_sizes, filter_num, sentence_size, dropout_keep_prob)
    model.load_state_dict(torch.load(model_path))
    model.eval()
    return model

In [6]:
model=load_model()
print(model)

MovieRecommendationModel(
  (user_tower): UserTower(
    (uid_embedding): Embedding(6041, 32)
    (gender_embedding): Embedding(2, 16)
    (age_embedding): Embedding(7, 16)
    (job_embedding): Embedding(21, 16)
    (relu): ReLU()
    (tanh): Tanh()
    (uid_fc): Linear(in_features=32, out_features=32, bias=True)
    (gender_fc): Linear(in_features=16, out_features=32, bias=True)
    (age_fc): Linear(in_features=16, out_features=32, bias=True)
    (job_fc): Linear(in_features=16, out_features=32, bias=True)
    (combine_fc): Linear(in_features=128, out_features=200, bias=True)
  )
  (movie_tower): MovieTower(
    (movie_id_embedding): Embedding(3953, 32)
    (movie_categories_embedding): Embedding(19, 32)
    (movie_title_embedding): Embedding(5217, 32)
    (relu): ReLU()
    (tanh): Tanh()
    (movie_id_fc): Linear(in_features=32, out_features=32, bias=True)
    (movie_categories_fc): Linear(in_features=32, out_features=32, bias=True)
    (conv_layers): ModuleList(
      (0): Conv2d(1

In [7]:
# 定义电影特征数据集类
from tqdm import tqdm

embed_dim = 32
# 用户 ID 个数
uid_num = max(features.take(0, 1)) + 1
# 性别个数
gender_num = max(features.take(2, 1)) + 1
# 年龄类别个数
age_num = max(features.take(3, 1)) + 1
# 职业个数
job_num = max(features.take(4, 1)) + 1

# 电影 ID 个数
mid_num = max(features.take(1, 1)) + 1
# 电影类型个数
movie_category_num = max(genres2int.values()) + 1
# 电影名单词个数
movie_title_num = len(title_set)


In [30]:
movies_df

Unnamed: 0,movie_id,title,genres
0,1,"[895, 3490, 3512, 3512, 3512, 3512, 3512, 3512...","[11, 15, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7..."
1,2,"[2012, 3512, 3512, 3512, 3512, 3512, 3512, 351...","[9, 15, 3, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,..."
2,3,"[769, 5179, 5052, 3512, 3512, 3512, 3512, 3512...","[6, 5, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ..."
3,4,"[359, 390, 370, 3512, 3512, 3512, 3512, 3512, ...","[6, 4, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ..."
4,5,"[834, 3458, 2242, 2367, 663, 2801, 3512, 3512,...","[6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ..."
...,...,...,...
3878,3948,"[2300, 2242, 271, 3512, 3512, 3512, 3512, 3512...","[6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ..."
3879,3949,"[3376, 3594, 1473, 1772, 3512, 3512, 3512, 351...","[4, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ..."
3880,3950,"[512, 3512, 3512, 3512, 3512, 3512, 3512, 3512...","[4, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ..."
3881,3951,"[4651, 3565, 341, 3512, 3512, 3512, 3512, 3512...","[4, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ..."


In [None]:
# 获取电影的embedding向量
# 1. 获取电影的id
movie_ids=set([item for item in movies_df['movie_id']])
print(f"总共有{len(movie_ids)}部电影")

# 2. 获取电影的映射
# key是movie_id，value是movie_title和movie_genres
movie_dict={}
for movie_id in movie_ids:
    movie_dict[movie_id]={
        'movie_title': movies_df[movies_df['movie_id']==movie_id]['title'].values[0],
        'movie_genres': movies_df[movies_df['movie_id']==movie_id]['genres'].values[0],
    }

总共有3883部电影
{1: {'movie_title': [895, 3490, 3512, 3512, 3512, 3512, 3512, 3512, 3512, 3512, 3512, 3512, 3512, 3512, 3512], 'movie_genres': [11, 15, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]}, 2: {'movie_title': [2012, 3512, 3512, 3512, 3512, 3512, 3512, 3512, 3512, 3512, 3512, 3512, 3512, 3512, 3512], 'movie_genres': [9, 15, 3, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]}, 3: {'movie_title': [769, 5179, 5052, 3512, 3512, 3512, 3512, 3512, 3512, 3512, 3512, 3512, 3512, 3512, 3512], 'movie_genres': [6, 5, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]}, 4: {'movie_title': [359, 390, 370, 3512, 3512, 3512, 3512, 3512, 3512, 3512, 3512, 3512, 3512, 3512, 3512], 'movie_genres': [6, 4, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]}, 5: {'movie_title': [834, 3458, 2242, 2367, 663, 2801, 3512, 3512, 3512, 3512, 3512, 3512, 3512, 3512, 3512], 'movie_genres': [6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]}, 6: {'movie_title': [163, 3512, 3512, 3512, 3512, 3512, 3512, 3512, 3512,

In [55]:

movie_data=MovieDataset(features)
movie_loader=DataLoader(movie_data, batch_size=256, shuffle=False, num_workers=2)

In [58]:
movie_data['movie_id']

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [48]:
for batch_i, (movie_id, movie_titles, movie_categories) in enumerate(movie_loader):
    movie_id = movie_id.to(device)
    movie_titles = movie_titles.to(device)
    movie_categories = movie_categories.to(device)
    # movie_output=model.movie_tower(movie_id, movie_titles, movie_categories)

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'MovieData' on <module '__main__' (built-in)>


KeyboardInterrupt: 