In [1]:
import os 
from typing import List, Tuple
import pickle
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import pandas as pd
from datetime import datetime
import torch
import torch.nn as nn


In [2]:

class Dice(nn.Module):
    def __init__(self, num_features, epsilon=1e-8):
        super(Dice, self).__init__()
        self.bn = nn.BatchNorm1d(num_features, affine=False)
        self.alpha = nn.Parameter(torch.zeros(num_features))
        self.epsilon = epsilon

    def forward(self, x):
        x_normed = self.bn(x)
        x_p = torch.sigmoid(self.alpha * (x_normed - x_normed.detach()))
        return x * x_p + (1 - x_p) * x_normed


In [3]:
# 自定义的激活函数
class CustomActivation(nn.Module):
    def forward(self, x):
        return torch.clamp(x, min=1, max=5)

In [4]:

class DINModel(nn.Module):
    def __init__(self, uid_num, gender_num, age_num, job_num, mid_num, movie_category_num, movie_title_num, embedding_dim=16, attention_units=32):
        """初始化DIN模型"""
        super(DINModel, self).__init__()
        self.embedding_dim = embedding_dim

        # 用户特征嵌入层
        self.uid_embedding = nn.Embedding(uid_num, embedding_dim)
        self.gender_embedding = nn.Embedding(gender_num, embedding_dim)
        self.age_embedding = nn.Embedding(age_num, embedding_dim)
        self.job_embedding = nn.Embedding(job_num, embedding_dim)

        # 电影特征嵌入层
        self.movie_id_embedding = nn.Embedding(mid_num, embedding_dim)
        self.movie_categories_embedding = nn.Embedding(movie_category_num, embedding_dim)
        self.movie_title_embedding = nn.Embedding(movie_title_num, embedding_dim)

        # 历史电影特征嵌入层
        self.history_movie_embedding = nn.Embedding(mid_num, embedding_dim)

        # 注意力层
        self.attention = nn.Sequential(
            nn.Linear(embedding_dim * 2, attention_units),
            Dice(attention_units),
            nn.Linear(attention_units, attention_units),
            Dice(attention_units),
            nn.Linear(attention_units, 1),
            nn.Sigmoid()
        )

        # 预测层
        self.prediction = nn.Sequential(
            nn.Linear(embedding_dim*8, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            CustomActivation()  # 使用自定义激活函数
        )

    def forward(self, uid, user_gender, user_age, user_job, movie_id, movie_categories, movie_titles, history_movie_ids):
        """前向传播
        Args:
            uid: 用户ID [batch_size] []
            user_gender: 用户性别 [batch_size] []
            user_age: 用户年龄 [batch_size] []
            user_job: 用户职业 [batch_size] []
            movie_id: 候选电影ID [batch_size] []
            movie_categories: 候选电影类别 [batch_size] [18]
            movie_titles: 候选电影标题 [batch_size] [18]
            history_movie_ids: 历史交互电影ID [batch_size, hist_len] [2314]
        """

        # 嵌入用户特征
        uid_embed = self.uid_embedding(uid)
        gender_embed = self.gender_embedding(user_gender)
        age_embed = self.age_embedding(user_age)
        job_embed = self.job_embedding(user_job)

        # 嵌入候选电影特征
        movie_id_embed = self.movie_id_embedding(movie_id)
        movie_categories_embed = self.movie_categories_embedding(movie_categories)
        movie_titles_embed = self.movie_title_embedding(movie_titles)

        # 嵌入历史电影ID特征
        hist_movie_embed = self.history_movie_embedding(history_movie_ids)

        # 注意力机制处理历史交互
        attention_input = torch.cat([
            movie_id_embed.unsqueeze(1),
            hist_movie_embed
        ], dim=1)

        # 扩展最后一个维度
        attention_input_expanded = torch.cat([attention_input, attention_input], dim=-1)

        # 调整其维度
        batch_size, seq_len, _ = attention_input_expanded.shape
        embedding_dim = 16
        attention_input_reshaped = attention_input_expanded.view(-1, embedding_dim * 2)
        attention_weight = self.attention(attention_input_reshaped)

        # 恢复形状
        attention_output = attention_weight.view(batch_size, seq_len, 1)

        # 调整 attention_output 的维度，使其与 hist_movie_embed 匹配
        attention_output = attention_output[:, 1:, :]
        hist_attention = torch.sum(attention_output * hist_movie_embed, dim=1)

        # 这里的特征维度一定要对齐
        movie_categories_embed_mean = torch.mean(movie_categories_embed, dim=1)
        movie_titles_embed_mean = torch.mean(movie_titles_embed, dim=1)


        # print("uid_embed维度:", uid_embed.size())
        # print("gender_embed维度:", gender_embed.size())
        # print("age_embed维度:", age_embed.size())
        # print("job_embed维度:", job_embed.size())
        # print("movie_id_embed维度:", movie_id_embed.size())
        # print("movie_categories_embed_mean维度:", movie_categories_embed_mean.size())
        # print("movie_titles_embed_mean维度:", movie_titles_embed_mean.size())
        # print("hist_attention:", hist_attention.size())
        concat_features = torch.cat([
            uid_embed,  # [32, 16]
            gender_embed, # [32, 16]
            age_embed, # [32, 16]
            job_embed, # [32, 16]
            movie_id_embed,  # [32, 16]
            movie_categories_embed_mean,  # [32, 16]
            movie_titles_embed_mean, # [32, 16]
            hist_attention # [32, 16]
        ], dim=1)  #【32, 128]

        # 输出预测分数
        return self.prediction(concat_features) # [32, 1] 32为一个批次，其中每个值都代表一个预测分数

In [5]:
from torch.utils.data import Dataset, DataLoader

class MovieDataset(Dataset):
    def __init__(self, features, targets):
        self.features = features
        self.targets = targets
        
    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        uid = torch.tensor(self.features[idx][0])
        movie_id = torch.tensor(self.features[idx][1])
        user_gender = torch.tensor(self.features[idx][2])
        user_age = torch.tensor(self.features[idx][3])
        user_job = torch.tensor(self.features[idx][4])
        movie_titles = torch.tensor(self.features[idx][6])
        movie_categories = torch.tensor(self.features[idx][7])
        history_movie_ids = torch.tensor(self.features[idx][8])

        targets = torch.tensor(self.targets[idx]).float()
        return uid, movie_id, user_gender, user_age, user_job, movie_titles, movie_categories, history_movie_ids, targets

In [6]:
import torch
import torch.nn as nn
from sklearn.metrics import mean_squared_error, mean_absolute_error
from typing import List, Tuple
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"使用设备{device}")

class RankModel:
    def __init__(self, uid_num, gender_num, age_num, job_num, mid_num, movie_category_num, movie_title_num):
        """初始化DIN排序模型"""
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = DINModel(uid_num, gender_num, age_num, job_num, mid_num, movie_category_num, movie_title_num).to(self.device)
        self.optimizer = torch.optim.Adam(self.model.parameters())
        self.criterion = nn.MSELoss() 

    def train(self, train_loader, num_epochs=1):
        """训练模型
        Args:
            train_features: 训练集特征
            train_targets: 训练集标签
        """
        print("开始训练DIN排序模型...")
        for epoch in range(num_epochs):
            self.model.train()
            for batch_i, (uid, movie_id, user_gender, user_age, user_job, movie_titles, movie_categories, history_movie_ids, ratings) in enumerate(train_loader):
                self.optimizer.zero_grad()
                uid = uid.to(device) # [32] 32表示batch_size
                user_gender = user_gender.to(device) # [32] 
                user_age = user_age.to(device) # [32]
                user_job = user_job.to(device) # [32]
                movie_id = movie_id.to(device) # [32]
                movie_categories = movie_categories.to(device) # [32, 18] 32表示batch_size，18表示电影的类型
                movie_titles = movie_titles.to(device) # [32, 15] 32表示batch_size, 15表示每个电影的长度
                history_movie_ids = history_movie_ids.to(device).long() # [32, 2314] 但是这里现在是[32]有问题 
                ratings = ratings.float().to(device) # [32]

                # 前向传播
                outputs = self.model(uid, user_gender, user_age, user_job, movie_id, movie_categories, movie_titles, history_movie_ids)
                loss = self.criterion(outputs, ratings)
                # 反向传播和优化
                loss.backward()

                # # 计算梯度范数（检查是否有出现梯度爆炸）
                # total_norm = 0
                # for p in self.model.parameters():
                #     param_norm = p.grad.data.norm(2)
                #     total_norm += param_norm.item() ** 2
                # total_norm = total_norm ** (1. / 2)
                # print(f"Epoch [{epoch + 1}/{num_epochs}], Step [{batch_i + 1}/{len(train_loader)}], Gradient Norm: {total_norm:.4f}")

                
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)

                self.optimizer.step()
                if (batch_i + 1) % 100 == 0:
                    print(f'Epoch [{epoch + 1}/{num_epochs}], Step [{batch_i + 1}/{len(train_loader)}], Loss: {loss.item():.4f}')
        print("训练完成！")

    def evaluate(self, test_loader):
        """评估模型
        Args:
            test_loader: 测试集的DataLoader
        """
        self.model.eval()
        all_predictions = []
        all_ratings = []
        with torch.no_grad():
            for batch_i, (uid, movie_id, user_gender, user_age, user_job, movie_titles, movie_categories, history_movie_ids, ratings) in enumerate(test_loader):
                uid = uid.to(self.device)
                user_gender = user_gender.to(self.device)
                user_age = user_age.to(self.device)
                user_job = user_job.to(self.device)
                movie_id = movie_id.to(self.device)
                movie_categories = movie_categories.to(self.device)
                movie_titles = movie_titles.to(self.device)
                ratings = ratings.to(self.device)
                history_movie_ids=history_movie_ids.to(self.device)
                predictions = self.model(uid, user_gender, user_age, user_job, movie_id, movie_categories, movie_titles)
                all_predictions.extend(predictions.cpu().numpy().flatten())
                all_ratings.extend(ratings.cpu().numpy().flatten())

        mse = mean_squared_error(all_ratings, all_predictions)
        mae = mean_absolute_error(all_ratings, all_predictions)
        print(f"测试集MSE: {mse:.4f}")
        print(f"测试集MAE: {mae:.4f}")

    def predict(self, features: np.ndarray) -> np.ndarray:
        """预测评分
        Args:
            features: 待预测特征
        Returns:
            预测的评分
        """
        self.model.eval()
        with torch.no_grad():
            uid = torch.tensor(features[:, 0].to(self.device))
            user_gender = torch.tensor(features[:, 2].to(self.device))
            user_age = torch.tensor(features[:, 3].to(self.device))
            user_job = torch.tensor(features[:, 4].to(self.device))
            movie_id = torch.tensor(features[:, 1].to(self.device))
            movie_categories = torch.tensor(features[:, 7].to(self.device))
            movie_titles = torch.tensor(features[:, 6].to(self.device))
            history_movie_ids = torch.tensor(features[:, 8].to(self.device))

            predictions = self.model(uid, user_gender, user_age, user_job, movie_id, movie_categories, movie_titles, history_movie_ids)
            return predictions.cpu().numpy()

    def get_recommendations(self, user_features: np.ndarray, recall_movie_features: np.ndarray, top_k: int = 10) -> List[Tuple[int, float]]:
        """获取推荐电影列表
        Args:
            user_features: 用户特征
            recall_movie_features: 召回的候选电影特征
            top_k: 推荐电影数量
        Returns:
            推荐电影列表，每个元素为(电影ID, 预测评分)
        """
        predictions = self.predict(recall_movie_features)
        movie_scores = list(enumerate(predictions))
        movie_scores.sort(key=lambda x: x[1], reverse=True)
        return movie_scores[:top_k]


使用设备cpu


### 处理数据

In [7]:
train_features, train_targets, test_features, test_targets=pickle.load(open('./data/split_dataset.p', 'rb'))
title2int, title_count, title_set, genres2int, genres_map, features_pd, targets_pd, features, targets_values, ratings_df, users_df, movies_df, data = pickle.load(open('./data/preprocess.p', 'rb'))

In [8]:
data.head()

Unnamed: 0,user_id,movie_id,rating,gender,age,occupation,zip_code,title,genres
0,1,1193,5,1,0,10,48067,"[4835, 130, 3852, 2242, 1296, 4153, 3512, 3512...","[4, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ..."
1,1,661,3,1,0,10,48067,"[4809, 4472, 2242, 3869, 1291, 3512, 3512, 351...","[11, 15, 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7..."
2,1,914,3,1,0,10,48067,"[4394, 1320, 1007, 3512, 3512, 3512, 3512, 351...","[0, 5, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ..."
3,1,3408,4,1,0,10,48067,"[1951, 4696, 3512, 3512, 3512, 3512, 3512, 351...","[4, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ..."
4,1,2355,5,1,0,10,48067,"[722, 932, 973, 3512, 3512, 3512, 3512, 3512, ...","[11, 15, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7..."


In [9]:
user_movie_ids = data.groupby('user_id')['movie_id'].apply(list).reset_index()

In [10]:
# 在data后增加一列，用来记录用户的历史观看的电影ID
new_data = pd.merge(data, user_movie_ids, on='user_id', how='left')

# 修改列名
new_data=new_data.rename(columns={'movie_id_y':'history_movie_ids', 'movie_id_x': 'movie_id'})


In [11]:
new_data

Unnamed: 0,user_id,movie_id,rating,gender,age,occupation,zip_code,title,genres,history_movie_ids
0,1,1193,5,1,0,10,48067,"[4835, 130, 3852, 2242, 1296, 4153, 3512, 3512...","[4, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ...","[1193, 661, 914, 3408, 2355, 1197, 1287, 2804,..."
1,1,661,3,1,0,10,48067,"[4809, 4472, 2242, 3869, 1291, 3512, 3512, 351...","[11, 15, 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7...","[1193, 661, 914, 3408, 2355, 1197, 1287, 2804,..."
2,1,914,3,1,0,10,48067,"[4394, 1320, 1007, 3512, 3512, 3512, 3512, 351...","[0, 5, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ...","[1193, 661, 914, 3408, 2355, 1197, 1287, 2804,..."
3,1,3408,4,1,0,10,48067,"[1951, 4696, 3512, 3512, 3512, 3512, 3512, 351...","[4, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ...","[1193, 661, 914, 3408, 2355, 1197, 1287, 2804,..."
4,1,2355,5,1,0,10,48067,"[722, 932, 973, 3512, 3512, 3512, 3512, 3512, ...","[11, 15, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7...","[1193, 661, 914, 3408, 2355, 1197, 1287, 2804,..."
...,...,...,...,...,...,...,...,...,...,...
1000204,6040,1091,1,0,6,6,11106,"[5025, 378, 4350, 3512, 3512, 3512, 3512, 3512...","[6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ...","[573, 589, 1, 2068, 592, 593, 3016, 3017, 2070..."
1000205,6040,1094,5,0,6,6,11106,"[3690, 1768, 4315, 3512, 3512, 3512, 3512, 351...","[4, 5, 12, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,...","[573, 589, 1, 2068, 592, 593, 3016, 3017, 2070..."
1000206,6040,562,5,0,6,6,11106,"[1050, 390, 2242, 314, 3512, 3512, 3512, 3512,...","[6, 4, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ...","[573, 589, 1, 2068, 592, 593, 3016, 3017, 2070..."
1000207,6040,1096,4,0,6,6,11106,"[2220, 766, 3512, 3512, 3512, 3512, 3512, 3512...","[4, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ...","[573, 589, 1, 2068, 592, 593, 3016, 3017, 2070..."


In [12]:
# 填充new_data的history_movie_ids列
# 1. 找到history_movie_ids最长是多少
max_hist_len = max([len(arr) for arr in new_data['history_movie_ids']])
print("最长的历史电影ID数量为:", max_hist_len)

# 2. 对history_movie_ids进行填充操作
def pad_array(arr):
    arr_len = len(arr)
    if arr_len < max_hist_len:
        # 选择数组中的第一个元素进行填充（你也可以按需选择其他元素）
        fill_element = arr[0]
        padding = [fill_element] * (max_hist_len - arr_len)
        arr = arr + padding
    return arr

new_data['history_movie_ids'] = new_data['history_movie_ids'].apply(pad_array)

# 3. 检查填充后的结果
wrong_ids=[len(arr) for arr in new_data['history_movie_ids'] if len(arr)!=max_hist_len]
print("这些数组仍未被填充:", wrong_ids)

最长的历史电影ID数量为: 2314
这些数组仍未被填充: []


### 训练模型

In [13]:
# 从data中划分训练集和测试集
print("开始划分训练集和测试集...")

# 将数据转换为numpy数组
features = np.array(new_data[['user_id', 'movie_id', 'gender', 'age', 'occupation', 'zip_code', 'title', 'genres', 'history_movie_ids']].values)
targets = np.array(new_data['rating'].values)


# 使用train_test_split划分训练集和测试集
from sklearn.model_selection import train_test_split
train_features, test_features, train_targets, test_targets = train_test_split(
    features, targets, test_size=0.2, random_state=42
)

print(f"训练集大小: {len(train_features)}")
print(f"测试集大小: {len(test_features)}")

train_dataset=MovieDataset(train_features, train_targets)
test_dataset=MovieDataset(test_features, test_targets)
train_loader=DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader=DataLoader(test_dataset, batch_size=32, shuffle=False)


开始划分训练集和测试集...
训练集大小: 800167
测试集大小: 200042


In [14]:
embed_dim = 32
# 用户 ID 个数
uid_num = max(features.take(0, 1)) + 1
# 性别个数
gender_num = max(features.take(2, 1)) + 1
# 年龄类别个数
age_num = max(features.take(3, 1)) + 1
# 职业个数
job_num = max(features.take(4, 1)) + 1

# 电影 ID 个数
mid_num = max(features.take(1, 1)) + 1
# 电影类型个数
movie_category_num = max(genres2int.values()) + 1
# 电影名单词个数
movie_title_num = len(title_set)

# 初始化排序模型
rank_model = RankModel(uid_num, gender_num, age_num, job_num, mid_num, movie_category_num, movie_title_num)

# 训练模型
print("开始训练模型...")
rank_model.train(train_loader)

# 评估模型
print("开始评估模型...")
rank_model.evaluate(test_features, test_targets)

# 保存模型
save_dir = os.path.join(os.getcwd(), "model_save")
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

# 获取当前日期作为模型文件名的一部分
from datetime import datetime
today = datetime.today()
formatted_date = today.strftime('%Y%m%d')

开始训练模型...
开始训练DIN排序模型...


  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [1/1], Step [100/25006], Loss: 2.4688
Epoch [1/1], Step [200/25006], Loss: 2.7500
Epoch [1/1], Step [300/25006], Loss: 4.2597
Epoch [1/1], Step [400/25006], Loss: 2.2457
Epoch [1/1], Step [500/25006], Loss: 3.1473
Epoch [1/1], Step [600/25006], Loss: 2.9061
Epoch [1/1], Step [700/25006], Loss: 2.7488
Epoch [1/1], Step [800/25006], Loss: 3.6204
Epoch [1/1], Step [900/25006], Loss: 3.0548
Epoch [1/1], Step [1000/25006], Loss: 4.0913
Epoch [1/1], Step [1100/25006], Loss: 2.3674
Epoch [1/1], Step [1200/25006], Loss: 2.0326
Epoch [1/1], Step [1300/25006], Loss: 3.2326
Epoch [1/1], Step [1400/25006], Loss: 2.2234
Epoch [1/1], Step [1500/25006], Loss: 2.7326
Epoch [1/1], Step [1600/25006], Loss: 2.2675
Epoch [1/1], Step [1700/25006], Loss: 3.9018
Epoch [1/1], Step [1800/25006], Loss: 2.2874


KeyboardInterrupt: 