### 数据准备

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Single place to point the notebook at the project's data directory; every
# CSV below is resolved relative to it, so moving the data means editing one line.
DATA_DIR = '/Users/bytedance/Desktop/MovieLens-Recommendation-System/data'

# Per-user and per-movie feature tables (data/features).
user_features = pd.read_csv(f'{DATA_DIR}/features/user_features.csv')
movie_features = pd.read_csv(f'{DATA_DIR}/features/movie_features.csv')

# Ratings, movies and users tables (data/ml-1m).
ratings = pd.read_csv(f'{DATA_DIR}/ml-1m/ratings.csv')
movies = pd.read_csv(f'{DATA_DIR}/ml-1m/movies.csv')
users = pd.read_csv(f'{DATA_DIR}/ml-1m/users.csv')

In [4]:
# Preview the first rows of the per-user feature table to check its columns.
user_features.head()

Unnamed: 0,user_id,mean_rating,rating_std,rating_count,rating_min,rating_max,rating_strictness,rating_variability,Action,Adventure,...,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,favorite_genre,num_liked_genres
0,1,4.188679,0.680967,53,3,5,-0.607115,0.162573,0.043103,0.045028,...,0.0,0.479172,0.0,0.382257,0.297663,0.406715,0.418157,0.0,Musical,13
1,2,3.713178,1.001513,129,1,5,-0.131614,0.269719,0.194444,0.081828,...,0.02085,0.0,0.031933,0.263799,0.252818,0.613924,0.745944,0.51241,War,14
2,3,3.901961,0.984985,51,1,5,-0.320396,0.252433,0.186992,0.249533,...,0.099299,0.036615,0.037954,0.196962,0.291509,0.336156,0.195883,0.71377,Western,15
3,4,4.190476,1.077917,21,1,5,-0.608912,0.25723,0.327586,0.152565,...,0.126004,0.0,0.0,0.095535,0.472931,0.380836,0.43579,0.462981,Sci-Fi,12
4,5,3.146465,1.132699,198,1,5,0.4351,0.359991,0.088068,0.02803,...,0.088538,0.029117,0.07995,0.325581,0.240121,0.817461,0.629844,0.240616,Thriller,17


In [5]:
# Preview the first rows of the per-movie feature table to check its columns.
movie_features.head()

Unnamed: 0,movie_id,title,genres,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Mystery,Romance,Sci-Fi,Thriller,War,Western,movie_mean_rating,movie_rating_std,movie_rating_count,genre_purity
0,1,Toy Story (1995),Animation|Children's|Comedy,0,0,1,1,1,0,0,...,0,0,0,0,0,0,4.146846,0.852349,2077,0.333333
1,2,Jumanji (1995),Adventure|Children's|Fantasy,0,1,0,1,0,0,0,...,0,0,0,0,0,0,3.201141,0.983172,701,0.333333
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,1,0,0,...,0,1,0,0,0,0,3.016736,1.071712,478,0.5
3,4,Waiting to Exhale (1995),Comedy|Drama,0,0,0,0,1,0,0,...,0,0,0,0,0,0,2.729412,1.013381,170,0.5
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,1,0,0,...,0,0,0,0,0,0,3.006757,1.025086,296,1.0


In [13]:
# One-hot encode the pipe-separated `genres` string: one 0/1 column per genre.
genres = movies['genres'].str.get_dummies(sep='|')

# Attach the one-hot columns to `movies`. Any genre columns left over from a
# previous execution of this cell are dropped first, so re-running the cell
# does not append a second copy of every genre column.
movies = pd.concat(
    [movies.drop(columns=genres.columns, errors='ignore'), genres],
    axis=1,
)

# Merge ratings with movie data.
data = pd.merge(ratings, movies, on='movie_id')

# Take the genre names directly from the one-hot frame instead of "every
# merged column that is not a known id/text column": the exclusion approach
# silently picks up any unrelated extra column (and duplicated columns).
genre_columns = genres.columns.tolist()
genre_column_set = set(genre_columns)

# User feature columns: rating statistics followed by the per-genre columns
# that exist in user_features (kept in user_features' own column order).
user_cols = [
    'mean_rating', 'rating_std', 'rating_count', 'rating_min', 'rating_max',
    'rating_strictness', 'rating_variability', 'num_liked_genres'
] + [col for col in user_features.columns if col in genre_column_set]

# Movie feature columns: rating statistics followed by the per-genre columns
# that exist in movie_features (kept in movie_features' own column order).
movie_cols = [
    'movie_mean_rating', 'movie_rating_std', 'movie_rating_count', 'genre_purity'
] + [col for col in movie_features.columns if col in genre_column_set]

print("用户特征列 user_cols:", user_cols)
print("电影特征列 movie_cols:", movie_cols)

用户特征列 user_cols: ['mean_rating', 'rating_std', 'rating_count', 'rating_min', 'rating_max', 'rating_strictness', 'rating_variability', 'num_liked_genres', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
电影特征列 movie_cols: ['movie_mean_rating', 'movie_rating_std', 'movie_rating_count', 'genre_purity', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']


### 创建映射字典

In [14]:
# Lookup table: user_id -> float32 feature vector ordered like `user_cols`.
# Fail immediately if a selected column is absent; otherwise the mapping
# would stay empty and the error would only surface later at lookup time.
missing_user_cols = [col for col in user_cols if col not in user_features.columns]
if missing_user_cols:
    raise KeyError(f"Missing column in user_features: {missing_user_cols}")

# Convert the selected columns once instead of casting row by row.
# Each dict value is one row of this matrix.
user_feature_matrix = user_features[user_cols].to_numpy(dtype=np.float32)
user_id_to_feature = dict(zip(user_features['user_id'].tolist(), user_feature_matrix))

In [None]:
# Lookup table: movie_id -> float32 feature vector ordered like `movie_cols`.
# Fail immediately if a selected column is absent; otherwise the mapping
# would stay empty and the error would only surface later at lookup time.
missing_movie_cols = [col for col in movie_cols if col not in movie_features.columns]
if missing_movie_cols:
    raise KeyError(f"Missing column in movie_features: {missing_movie_cols}")

# Convert the selected columns once instead of casting row by row.
# Each dict value is one row of this matrix.
movie_feature_matrix = movie_features[movie_cols].to_numpy(dtype=np.float32)
movie_id_to_feature = dict(zip(movie_features['movie_id'].tolist(), movie_feature_matrix))

### 数据集类

In [17]:
class RatingDataset(Dataset):
    """Dataset of (user feature vector, movie feature vector, scaled rating).

    Args:
        data: Indexable collection whose items unpack to
            ``(user_id, movie_id, rating)``.
        user_id_to_feature: Mapping from user id to a 1-D numeric feature array.
        movie_id_to_feature: Mapping from movie id to a 1-D numeric feature array.
        genre_names: Ordered genre names defining the one-hot positions.
            Defaults to the notebook-level ``genre_columns`` list.
        favorite_genres: Mapping from user id to that user's favorite genre
            name. Defaults to the ``user_id`` / ``favorite_genre`` columns of
            the notebook-level ``user_features`` frame.

    Each item is a tuple of three float32 tensors: the user vector (input
    features with the favorite-genre one-hot spliced in), the movie vector,
    and a length-1 tensor holding ``(rating - 1) / 4``.
    """

    def __init__(self, data, user_id_to_feature, movie_id_to_feature,
                 genre_names=None, favorite_genres=None):
        self.data = data
        self.user_id_to_feature = user_id_to_feature
        self.movie_id_to_feature = movie_id_to_feature

        # Position of each genre inside the favorite-genre one-hot vector.
        if genre_names is None:
            genre_names = genre_columns
        self.genre_to_idx = {genre: idx for idx, genre in enumerate(genre_names)}
        self.num_genres = len(genre_names)

        # Resolve every user's favorite genre to its one-hot index once, here,
        # rather than scanning the whole user_features frame for each sample.
        if favorite_genres is None:
            favorite_genres = {}
            id_genre_pairs = zip(
                user_features['user_id'].tolist(),
                user_features['favorite_genre'].tolist(),
            )
            for uid, genre in id_genre_pairs:
                # setdefault keeps the first row for a repeated user_id.
                favorite_genres.setdefault(uid, genre)
        # Genres that are not in genre_to_idx fall back to index 0.
        self.user_id_to_genre_idx = {
            uid: self.genre_to_idx.get(genre, 0)
            for uid, genre in favorite_genres.items()
        }

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        user_id, movie_id, rating = self.data[idx]

        # Numeric feature vectors for this user / movie.
        user_feature = self.user_id_to_feature[user_id]
        movie_feature = self.movie_id_to_feature[movie_id]

        # One-hot encoding of the user's favorite genre.
        genre_onehot = np.zeros(self.num_genres, dtype=np.float32)
        genre_onehot[self.user_id_to_genre_idx[user_id]] = 1.0

        # Layout: [all but the last two inputs | genre one-hot | last input].
        # NOTE(review): this slicing assumes the vector ends with
        # (favorite_genre, num_liked_genres). When the vector is built from the
        # notebook's user_cols (num_liked_genres followed by the per-genre
        # columns), the last two entries are genre columns instead, so one genre
        # value is dropped and another lands in the final slot. Left unchanged
        # here because correcting it changes the output width that the model
        # construction depends on - confirm the intended layout.
        user_feature = np.concatenate([
            user_feature[:-2],
            genre_onehot,
            user_feature[-1:],
        ])

        # Scale the rating by (rating - 1) / 4 so a 1-5 rating maps onto 0-1.
        normalized_rating = (rating - 1) / 4.0
        return (
            torch.tensor(user_feature, dtype=torch.float32),
            torch.tensor(movie_feature, dtype=torch.float32),
            torch.tensor([float(normalized_rating)], dtype=torch.float32),
        )

### 模型架构

In [18]:
class DualTowerModel(nn.Module):
    """Two-tower rating predictor: a user tower and a movie tower feed one head.

    Expected user input layout along the last dimension:
    ``[numeric features | num_genres-wide genre one-hot | num_liked_genres]``.

    Args:
        user_feature_dim: Total width of the user input vector.
        movie_feature_dim: Width of the movie input vector.
        num_genres: Width of the genre one-hot segment in the user vector.
        embedding_dim: Output width of each tower.
    """

    def __init__(self, user_feature_dim, movie_feature_dim, num_genres, embedding_dim=64):
        super().__init__()

        # Kept so forward() splits the input with the same width the layers
        # were built for, instead of reading a notebook-level global.
        self.num_genres = num_genres

        # User tower (numeric features + genre one-hot).
        self.user_numeric = nn.Sequential(
            nn.Linear(user_feature_dim - num_genres - 1, 64),  # -1 for num_liked_genres
            nn.ReLU()
        )
        self.user_genre = nn.Sequential(
            nn.Linear(num_genres, 32),
            nn.ReLU()
        )
        self.user_combine = nn.Sequential(
            nn.Linear(64 + 32 + 1, embedding_dim),  # +1 for num_liked_genres
            nn.ReLU()
        )

        # Movie tower.
        self.movie_tower = nn.Sequential(
            nn.Linear(movie_feature_dim, 128),
            nn.ReLU(),
            nn.Linear(128, embedding_dim),
            nn.ReLU()
        )

        # Rating head; the final Sigmoid keeps predictions in the 0-1 range.
        self.rating_head = nn.Sequential(
            nn.Linear(embedding_dim * 2, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, user_features, movie_features):
        """Predict a rating per row.

        Args:
            user_features: Tensor of shape (batch, user_feature_dim).
            movie_features: Tensor of shape (batch, movie_feature_dim).

        Returns:
            The head output with all size-1 dimensions squeezed out: shape
            (batch,) for batch > 1, and a 0-d tensor for a batch of one.

        Note: inside this method the parameter names shadow the notebook-level
        ``user_features`` / ``movie_features`` DataFrames.
        """
        # Split the user vector into its three segments.
        genre_start = user_features.shape[1] - self.num_genres - 1
        numeric_part = user_features[:, :genre_start]
        genre_part = user_features[:, genre_start:-1]
        num_liked = user_features[:, -1:]

        # User tower forward pass.
        user_numeric_out = self.user_numeric(numeric_part)
        user_genre_out = self.user_genre(genre_part)
        user_out = self.user_combine(
            torch.cat([user_numeric_out, user_genre_out, num_liked], dim=1)
        )

        # Movie tower forward pass.
        movie_out = self.movie_tower(movie_features)

        # Concatenate both embeddings and predict.
        combined = torch.cat([user_out, movie_out], dim=1)
        return self.rating_head(combined).squeeze()