## 数据预处理

In [3]:
import numpy as np

In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# 1. Load the data.
# NOTE(review): this is a machine-specific absolute path; it is kept unchanged
# here but defined once so it only has to be edited in a single place.
DATA_DIR = '/Users/bytedance/Desktop/MovieLens-Recommendation-System/data/ml-1m'
movies = pd.read_csv(f'{DATA_DIR}/movies.csv')
ratings = pd.read_csv(f'{DATA_DIR}/ratings.csv')
users = pd.read_csv(f'{DATA_DIR}/users.csv')

# 2. Turn explicit ratings into implicit feedback (rating >= 4 is a positive).
ratings['label'] = (ratings['rating'] >= 4).astype(int)

# 3. Expand the multi-valued genre field into one 0/1 column per genre.
movies['genres'] = movies['genres'].str.split('|')
# Sort the vocabulary: iterating a set of strings follows hash order, which
# varies between interpreter runs, so an unsorted list would change the
# meaning of every multi-hot position after a kernel restart.
genre_list = sorted({g for genres in movies['genres'] for g in genres})
for genre in genre_list:
    movies[f'genre_{genre}'] = movies['genres'].apply(lambda x, g=genre: int(g in x))

# 4. Names of the categorical feature columns.
user_features = ['user_id', 'gender', 'age', 'occupation']
movie_features = ['movie_id'] + [f'genre_{g}' for g in genre_list]

# 5. Build each user's behaviour history.
# Sort by user, then by timestamp, so each user's rows are chronological.
ratings = ratings.sort_values(['user_id', 'timestamp'])
# Keep the 50 most recent interactions of every user as the history sequence.
# NOTE(review): this is computed over ALL ratings, so a history can contain
# the sample's own movie and events that happen after it (including rows that
# later land in the validation split) -- confirm whether that leakage is
# acceptable before trusting the validation metrics.
user_hist = ratings.groupby('user_id').tail(50)

## 构建DIN模型

In [5]:
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model

def build_din_model(num_users, num_movies, genre_size, max_seq_length=50,
                    num_genders=2, num_ages=57, num_occupations=21):
    """Build a DIN-style (Deep Interest Network) click/like prediction model.

    Args:
        num_users: ``input_dim`` of the user-id embedding table. Ids are used
            directly as indices, so this must be larger than the largest id.
        num_movies: ``input_dim`` of both movie-id embedding tables; must be
            larger than the largest movie id (index 0 is also what the padded
            history positions contain).
        genre_size: length of the multi-hot genre vector.
        max_seq_length: number of history positions per sample.
        num_genders: ``input_dim`` of the gender embedding.
        num_ages: ``input_dim`` of the age embedding. The previous hard-coded
            value 7 could not index code 7 of a 1..7 encoding; the default 57
            is assumed to also cover the raw MovieLens-1M age codes (largest
            code 56) -- TODO confirm which encoding users.csv uses.
        num_occupations: ``input_dim`` of the occupation embedding.

    Returns:
        An uncompiled ``Model`` with eight named inputs (``user_id``,
        ``gender``, ``age``, ``occupation``, ``movie_id``, ``movie_genres``,
        ``hist_movie_ids``, ``hist_movie_genres``) and one sigmoid output.
    """
    # ---- Inputs -----------------------------------------------------------
    # User profile features.
    user_id_input = Input(shape=(1,), name='user_id')
    gender_input = Input(shape=(1,), name='gender')
    age_input = Input(shape=(1,), name='age')
    occupation_input = Input(shape=(1,), name='occupation')

    # Candidate movie features.
    movie_id_input = Input(shape=(1,), name='movie_id')
    movie_genres_input = Input(shape=(genre_size,), name='movie_genres')

    # Behaviour history features.
    hist_movie_ids_input = Input(shape=(max_seq_length,), name='hist_movie_ids')
    hist_movie_genres_input = Input(shape=(max_seq_length, genre_size), name='hist_movie_genres')

    # ---- Embeddings -------------------------------------------------------
    user_id_embed = Embedding(num_users, 16)(user_id_input)
    gender_embed = Embedding(num_genders, 4)(gender_input)
    age_embed = Embedding(num_ages, 4)(age_input)
    occupation_embed = Embedding(num_occupations, 4)(occupation_input)

    movie_id_embed = Embedding(num_movies, 16)(movie_id_input)              # [B, 1, 16]
    hist_movie_ids_embed = Embedding(num_movies, 16)(hist_movie_ids_input)  # [B, T, 16]

    # ---- User representation: [B, 16 + 4 + 4 + 4] --------------------------
    user_embed = Concatenate()([
        Flatten()(user_id_embed),
        Flatten()(gender_embed),
        Flatten()(age_embed),
        Flatten()(occupation_embed)
    ])

    # ---- Candidate representation: [B, 16 + genre_size] --------------------
    candidate_embed = Concatenate()([
        Flatten()(movie_id_embed),
        movie_genres_input
    ])

    # ---- History representation: [B, T, 16 + genre_size] -------------------
    # Concatenate along the feature axis only. The time axis must survive so
    # the attention layer can weight each history position; flattening the
    # embeddings (as before) collapsed it, and TimeDistributed expects a
    # layer, not a tensor.
    hist_movie_embeds = Concatenate(axis=-1)([
        hist_movie_ids_embed,
        hist_movie_genres_input
    ])

    # ---- Attention pooling of the history w.r.t. the candidate -------------
    attention_output = AttentionLayer()([candidate_embed, hist_movie_embeds])

    # ---- Deep network ------------------------------------------------------
    deep_input = Concatenate()([user_embed, candidate_embed, attention_output])
    dnn_output = Dense(128, activation='relu')(deep_input)
    dnn_output = Dense(64, activation='relu')(dnn_output)

    # ---- Output: probability of a positive label ---------------------------
    output = Dense(1, activation='sigmoid')(dnn_output)

    model = Model(inputs=[
        user_id_input, gender_input, age_input, occupation_input,
        movie_id_input, movie_genres_input,
        hist_movie_ids_input, hist_movie_genres_input
    ], outputs=output)

    return model



In [6]:
# 注意力层
class AttentionLayer(Layer):
    """Bilinear attention pooling of a key sequence against a single query.

    Called on ``[query, keys]`` where ``query`` is ``[B, Dq]`` and ``keys`` is
    ``[B, T, Dk]``. Each key is scored as ``key . (query @ W + b)``, the
    scores are soft-maxed over the ``T`` axis, and the layer returns the
    weighted sum of the keys with shape ``[B, Dk]``.

    NOTE(review): no mask is applied, so zero-padded history positions still
    take part in the softmax.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        query_shape, keys_shape = input_shape
        query_dim = query_shape[-1]
        key_dim = keys_shape[-1]

        # Projects the query into the key feature space.
        self.W = self.add_weight(
            name='attention_weight',
            shape=(query_dim, key_dim),
            initializer='random_normal',
            trainable=True
        )
        # The bias is added to the projected query ([B, 1, Dk]), so it needs
        # one entry per key feature; sizing it by the sequence length (as
        # before) cannot broadcast unless T happens to equal Dk.
        self.b = self.add_weight(
            name='attention_bias',
            shape=(key_dim,),
            initializer='zeros',
            trainable=True
        )
        super().build(input_shape)

    def call(self, inputs):
        # query: candidate item, keys: history items.
        query, keys = inputs
        query = tf.expand_dims(query, 1)                       # [B, 1, Dq]
        projected = tf.matmul(query, self.W) + self.b          # [B, 1, Dk]
        # transpose_b turns the projection into [B, Dk, 1] so the product
        # yields one score per history position.
        scores = tf.matmul(keys, projected, transpose_b=True)  # [B, T, 1]
        weights = tf.nn.softmax(scores, axis=1)
        # Attention-weighted sum over the time axis.
        return tf.reduce_sum(weights * keys, axis=1)           # [B, Dk]

### 数据管道构建

In [7]:
# 构建训练样本
def generate_samples(ratings, user_hist, movies, genre_list, max_seq_length=50, users_df=None):
    """Assemble one model-ready sample per rating row.

    All per-user and per-movie lookups are built once up front; the previous
    version re-filtered whole DataFrames for every rating row, which made the
    run time grow with rows x table size.

    Args:
        ratings: frame with ``user_id``, ``movie_id`` and ``label`` columns.
        user_hist: frame with ``user_id`` and ``movie_id`` columns; the row
            order within a user is taken as the history order.
        movies: frame with ``movie_id`` and ``genres`` columns, where each
            ``genres`` value supports ``in`` for a genre name.
        genre_list: genre vocabulary; fixes the multi-hot column order.
        max_seq_length: history length after truncation / zero padding.
        users_df: frame with ``user_id``, ``gender``, ``age`` and
            ``occupation`` columns. Defaults to the notebook-level ``users``
            frame, which is what this function always read before.

    Returns:
        DataFrame with columns ``user_id``, ``gender`` (0 for ``'F'``, else
        1), ``age``, ``occupation``, ``movie_id``, ``movie_genres``,
        ``hist_movie_ids``, ``hist_movie_genres`` and ``label``. The history
        values of one user are the same objects in every sample of that user,
        so treat them as read-only.

    Raises:
        KeyError: if a rated or history movie id is missing from ``movies``,
            or a rated user id is missing from ``users_df``.

    NOTE(review): the history is used as given and is not restricted to
    events before the sample, so it can contain the candidate movie itself.
    """
    if users_df is None:
        # Backward compatible fallback to the notebook-level frame.
        users_df = users

    n_genres = len(genre_list)

    # movie_id -> multi-hot genre vector. keep='first' mirrors the earlier
    # `.iloc[0]` lookup when an id appears more than once.
    unique_movies = movies.drop_duplicates('movie_id')
    movie_genre_vec = {
        m_id: [int(genre in genres) for genre in genre_list]
        for m_id, genres in zip(unique_movies['movie_id'], unique_movies['genres'])
    }

    # user_id -> (encoded gender, age, occupation)
    unique_users = users_df.drop_duplicates('user_id')
    user_profile = {
        u_id: (0 if gender == 'F' else 1, age, occupation)
        for u_id, gender, age, occupation in zip(
            unique_users['user_id'], unique_users['gender'],
            unique_users['age'], unique_users['occupation'],
        )
    }

    # user_id -> (padded history ids, padded history genre vectors)
    history_by_user = {}
    for u_id, movie_ids in user_hist.groupby('user_id', sort=False)['movie_id']:
        hist_ids = movie_ids.values[-max_seq_length:]
        hist_genres = [movie_genre_vec[m_id] for m_id in hist_ids]
        pad_len = max_seq_length - len(hist_ids)
        if pad_len > 0:
            hist_ids = np.pad(hist_ids, (0, pad_len), 'constant')
            hist_genres = hist_genres + [[0] * n_genres] * pad_len
        history_by_user[u_id] = (hist_ids, hist_genres)

    # Users without any history get an all-zero sequence.
    empty_history = (
        np.zeros(max_seq_length, dtype=user_hist['movie_id'].dtype),
        [[0] * n_genres] * max_seq_length,
    )

    samples = []
    for user_id, movie_id, label in zip(ratings['user_id'], ratings['movie_id'], ratings['label']):
        gender, age, occupation = user_profile[user_id]
        hist_movie_ids, hist_movie_genres = history_by_user.get(user_id, empty_history)
        samples.append({
            'user_id': user_id,
            'gender': gender,
            'age': age,
            'occupation': occupation,
            'movie_id': movie_id,
            'movie_genres': list(movie_genre_vec[movie_id]),
            'hist_movie_ids': hist_movie_ids,
            'hist_movie_genres': hist_movie_genres,
            'label': label
        })

    return pd.DataFrame(samples)

In [8]:
# 构建tensorflow dataset
def create_dataset(df, batch_size=32, seq_length=None, genre_size=None):
    """Wrap a sample DataFrame in a batched, prefetching ``tf.data.Dataset``.

    Args:
        df: frame with the columns ``user_id``, ``gender``, ``age``,
            ``occupation``, ``movie_id``, ``movie_genres``,
            ``hist_movie_ids``, ``hist_movie_genres`` and ``label``.
        batch_size: number of samples per batch.
        seq_length: history length of every sample. When omitted it is read
            from the first row of ``df`` instead of from a notebook global.
        genre_size: length of the multi-hot genre vector. When omitted it is
            read from the first row of ``df``.

    Returns:
        Dataset yielding ``(features, label)`` where ``features`` is a dict
        keyed by the column names above (int32 ids, float32 genre vectors)
        and ``label`` is float32 with shape ``(batch, 1)``.
    """
    # Infer the static shapes from the data so the function does not depend
    # on `max_seq_length` / `genre_list` existing at module level. For an
    # empty frame the dimensions stay unknown (None).
    if len(df) > 0:
        if seq_length is None:
            seq_length = len(df['hist_movie_ids'].iloc[0])
        if genre_size is None:
            genre_size = len(df['movie_genres'].iloc[0])

    def generator():
        if len(df) == 0:
            return
        rows = zip(
            df['user_id'], df['gender'], df['age'], df['occupation'], df['movie_id'],
            df['movie_genres'], df['hist_movie_ids'], df['hist_movie_genres'], df['label'],
        )
        for (user_id, gender, age, occupation, movie_id,
             movie_genres, hist_ids, hist_genres, label) in rows:
            # Cast explicitly so every element matches the declared signature.
            yield (
                {
                    'user_id': np.asarray([user_id], dtype=np.int32),
                    'gender': np.asarray([gender], dtype=np.int32),
                    'age': np.asarray([age], dtype=np.int32),
                    'occupation': np.asarray([occupation], dtype=np.int32),
                    'movie_id': np.asarray([movie_id], dtype=np.int32),
                    'movie_genres': np.asarray(movie_genres, dtype=np.float32),
                    'hist_movie_ids': np.asarray(hist_ids, dtype=np.int32),
                    'hist_movie_genres': np.asarray(hist_genres, dtype=np.float32)
                },
                np.asarray([label], dtype=np.float32)
            )

    id_spec = tf.TensorSpec(shape=(1,), dtype=tf.int32)
    # `output_signature` replaces the deprecated output_types/output_shapes pair.
    output_signature = (
        {
            'user_id': id_spec,
            'gender': id_spec,
            'age': id_spec,
            'occupation': id_spec,
            'movie_id': id_spec,
            'movie_genres': tf.TensorSpec(shape=(genre_size,), dtype=tf.float32),
            'hist_movie_ids': tf.TensorSpec(shape=(seq_length,), dtype=tf.int32),
            'hist_movie_genres': tf.TensorSpec(shape=(seq_length, genre_size), dtype=tf.float32)
        },
        tf.TensorSpec(shape=(1,), dtype=tf.float32)
    )

    dataset = tf.data.Dataset.from_generator(generator, output_signature=output_signature)

    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

### 模型训练与评估

In [9]:
# Chronological hold-out: order all ratings by timestamp, then use the
# earliest 80% for training and the remaining 20% for validation.
ratings = ratings.sort_values(by='timestamp')
train_size = int(len(ratings) * 0.8)

train_ratings, val_ratings = ratings.iloc[:train_size], ratings.iloc[train_size:]
print("训练集大小:", len(train_ratings))
print("验证集大小:", len(val_ratings))

训练集大小: 800167
验证集大小: 200042


In [10]:
# Hyper-parameters
max_seq_length = 50
batch_size = 256
epochs = 10

# Materialise the samples for each split (train first, then validation) ...
train_samples, val_samples = (
    generate_samples(split_ratings, user_hist, movies, genre_list, max_seq_length)
    for split_ratings in (train_ratings, val_ratings)
)

# ... and wrap each sample frame in a batched tf.data pipeline.
train_dataset, val_dataset = (
    create_dataset(split_samples, batch_size)
    for split_samples in (train_samples, val_samples)
)

KeyboardInterrupt: 

In [None]:

# Build the model.
# The raw ids are fed to the Embedding layers as indices (and 0 is used for
# history padding), so each table needs max id + 1 rows. The number of
# distinct ids is too small whenever ids start at 1 or have gaps.
num_users = int(users['user_id'].max()) + 1
num_movies = int(movies['movie_id'].max()) + 1
genre_size = len(genre_list)

model = build_din_model(num_users, num_movies, genre_size, max_seq_length)

# Compile the model.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC()]
)

# Train, stopping once the monitored validation metric has not improved for
# 3 epochs and restoring the best weights seen.
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=epochs,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)
    ]
)