In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
import pickle

In [3]:
movies = pd.read_csv('/Users/althealam/Desktop/Code/RQ-VAE-Recommendation-System/data/preprocessed_data/movies_preprocessed.csv')
users = pd.read_csv('/Users/althealam/Desktop/Code/RQ-VAE-Recommendation-System/data/preprocessed_data/users_preprocessed.csv')
ratings = pd.read_csv('/Users/althealam/Desktop/Code/RQ-VAE-Recommendation-System/data/preprocessed_data/ratings_preprocessed.csv')

In [None]:
with open('/Users/althealam/Desktop/Code/RQ-VAE-Recommendation-System/data/preprocessed_data/genre_to_id.pkl', 'rb') as f:
    genre_to_id = pickle.load(f)

In [9]:
genre_to_id

{'Action': 1,
 'Adventure': 2,
 'Animation': 3,
 "Children's": 4,
 'Comedy': 5,
 'Crime': 6,
 'Documentary': 7,
 'Drama': 8,
 'Fantasy': 9,
 'Film-Noir': 10,
 'Horror': 11,
 'Musical': 12,
 'Mystery': 13,
 'Romance': 14,
 'Sci-Fi': 15,
 'Thriller': 16,
 'War': 17,
 'Western': 18}

In [41]:
# 1. 加载用户数据
data_dir='/Users/althealam/Desktop/Code/RQ-VAE-Recommendation-System/data/ml-1m'

users_cols = ['user_id', 'gender', 'age', 'occupation', 'zip_code']
users = pd.read_csv(
    os.path.join(data_dir, 'users.dat'),
    sep='::',
    names=users_cols,
    engine='python'
)

movies_cols = ['movie_id', 'title', 'genres']
movies = pd.read_csv(
    os.path.join(data_dir, 'movies.dat'),
    sep='::',
    names=movies_cols,
    engine='python',
    encoding='latin-1'
)

ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(
    os.path.join(data_dir, 'ratings.dat'),
    sep='::',
    names=ratings_cols,
    engine='python'
)

In [18]:
users.head()

Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [19]:
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [20]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## ID类特征处理

In [42]:
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

# 拟合编码器
user_encoder.fit(users['user_id'].unique())
movie_encoder.fit(movies['movie_id'].unique())


In [43]:
users['user_idx'] = user_encoder.transform(users['user_id'])
movies['movie_idx'] = movie_encoder.transform(movies['movie_id'])
ratings['user_idx'] = user_encoder.transform(ratings['user_id'])
ratings['movie_idx'] = movie_encoder.transform(ratings['movie_id'])

In [44]:
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')

In [45]:
ratings['user_idx'] = user_encoder.transform(ratings['user_id'])
ratings['movie_idx'] = movie_encoder.transform(ratings['movie_id'])


## 电影特征增强

In [None]:
# 处理电影类型，构建类型到ID的映射


# 提取所有唯一类型并创建映射字典
genres_all = set()
movies['genres'].str.split('|').apply(genres_all.update)
genre_to_id = {genre: idx+1 for idx, genre in enumerate(sorted(genres_all))}  # ID从1开始

print(genre_to_id)
# 输出示例: {'Animation': 1, "Children's": 2, 'Comedy': 3, ...}

{'Action': 1, 'Adventure': 2, 'Animation': 3, "Children's": 4, 'Comedy': 5, 'Crime': 6, 'Documentary': 7, 'Drama': 8, 'Fantasy': 9, 'Film-Noir': 10, 'Horror': 11, 'Musical': 12, 'Mystery': 13, 'Romance': 14, 'Sci-Fi': 15, 'Thriller': 16, 'War': 17, 'Western': 18}


In [47]:
movies['genre_ids'] = movies['genres'].str.split('|').apply(
    lambda x: [genre_to_id[g] for g in x]
)

In [49]:
# 提取电影年份（关键特征）
movies['year'] = movies['title'].str.extract(r'\((\d{4})\)').fillna(0).astype(int)

# 计算电影热度
movie_stats = ratings.groupby('movie_idx').agg(
    avg_rating=('rating', 'mean'),
    rating_count=('rating', 'count')
).reset_index()
movies = movies.merge(movie_stats, on='movie_idx')

In [50]:
movies

Unnamed: 0,movie_id,title,genres,movie_idx,genre_ids,year,avg_rating,rating_count
0,1,Toy Story (1995),Animation|Children's|Comedy,0,"[3, 4, 5]",1995,4.146846,2077
1,2,Jumanji (1995),Adventure|Children's|Fantasy,1,"[2, 4, 9]",1995,3.201141,701
2,3,Grumpier Old Men (1995),Comedy|Romance,2,"[5, 14]",1995,3.016736,478
3,4,Waiting to Exhale (1995),Comedy|Drama,3,"[5, 8]",1995,2.729412,170
4,5,Father of the Bride Part II (1995),Comedy,4,[5],1995,3.006757,296
...,...,...,...,...,...,...,...,...
3701,3948,Meet the Parents (2000),Comedy,3878,[5],2000,3.635731,862
3702,3949,Requiem for a Dream (2000),Drama,3879,[8],2000,4.115132,304
3703,3950,Tigerland (2000),Drama,3880,[8],2000,3.666667,54
3704,3951,Two Family House (2000),Drama,3881,[8],2000,3.900000,40


## 用户特征工程

In [None]:
# 用户行为统计
user_stats = ratings.groupby('user_idx').agg(
    user_avg_rating=('rating', 'mean'),
    user_rating_count=('rating', 'count'),
    last_active=('timestamp', 'max')
).reset_index()
users = users.merge(user_stats, on='user_idx')

# 性别编码
users['gender'] = users['gender'].map({'F':0, 'M':1})

In [53]:
# 年龄分桶：按照分位数来分桶，确保每个桶内大概有20%的数据
users['age_group'] = pd.qcut(
    users['age'], 
    q=5,  # 分为5个桶（ quantiles ）
    labels=False,  # 返回整数标签（0,1,2,3,4）
    duplicates='drop'  # 处理重复分位数的情况
)

In [54]:
users

Unnamed: 0,user_id,gender,age,occupation,zip_code,user_idx,user_avg_rating,user_rating_count,last_active,age_group
0,1,0,1,10,48067,0,4.188679,53,2001-01-06 23:39:11,0
1,2,1,56,16,70072,1,3.713178,129,2000-12-31 22:02:54,4
2,3,1,25,15,55117,2,3.901961,51,2000-12-31 21:35:04,1
3,4,1,45,7,02460,3,4.190476,21,2000-12-31 20:24:42,3
4,5,1,25,20,55455,4,3.146465,198,2000-12-31 07:09:45,1
...,...,...,...,...,...,...,...,...,...,...
6035,6036,0,25,15,32603,6035,3.302928,888,2000-04-26 13:19:56,1
6036,6037,0,45,1,76006,6036,3.717822,202,2000-04-27 02:17:20,3
6037,6038,0,56,1,14706,6037,3.800000,20,2000-04-26 02:46:44,4
6038,6039,0,45,0,01060,6038,3.878049,123,2000-04-26 14:07:09,3


## 构建用户行为序列数据

In [None]:
# 按时间排序后生成序列
ratings_sorted = ratings.sort_values(['user_idx', 'timestamp'])

# 生成用户历史序列
user_sequences = ratings_sorted.groupby('user_idx')['movie_idx'].apply(list).reset_index()
user_sequences['sequence_length'] = user_sequences['movie_idx'].apply(len)

# 过滤短序列（根据需求调整）
user_sequences = user_sequences[user_sequences['sequence_length'] >= 5].copy()

## 存储处理好的数据

In [61]:
users.head()

Unnamed: 0,user_id,gender,age,occupation,zip_code,user_idx,user_avg_rating,user_rating_count,last_active,age_group
0,1,0,1,10,48067,0,4.188679,53,2001-01-06 23:39:11,0
1,2,1,56,16,70072,1,3.713178,129,2000-12-31 22:02:54,4
2,3,1,25,15,55117,2,3.901961,51,2000-12-31 21:35:04,1
3,4,1,45,7,2460,3,4.190476,21,2000-12-31 20:24:42,3
4,5,1,25,20,55455,4,3.146465,198,2000-12-31 07:09:45,1


In [62]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,user_idx,movie_idx
0,1,1193,5,2000-12-31 22:12:40,0,1176
1,1,661,3,2000-12-31 22:35:09,0,655
2,1,914,3,2000-12-31 22:32:48,0,902
3,1,3408,4,2000-12-31 22:04:35,0,3339
4,1,2355,5,2001-01-06 23:38:11,0,2286


In [63]:
movies.head()

Unnamed: 0,movie_id,title,genres,movie_idx,genre_ids,year,avg_rating,rating_count
0,1,Toy Story (1995),Animation|Children's|Comedy,0,"[3, 4, 5]",1995,4.146846,2077
1,2,Jumanji (1995),Adventure|Children's|Fantasy,1,"[2, 4, 9]",1995,3.201141,701
2,3,Grumpier Old Men (1995),Comedy|Romance,2,"[5, 14]",1995,3.016736,478
3,4,Waiting to Exhale (1995),Comedy|Drama,3,"[5, 8]",1995,2.729412,170
4,5,Father of the Bride Part II (1995),Comedy,4,[5],1995,3.006757,296


In [66]:
user_ratings = pd.merge(
    ratings,
    users,
    on='user_idx',
)

# Step 2: 再与电影表合并 (1:1 电影信息扩展)
merged_df = pd.merge(
    user_ratings,
    movies,
    on='movie_idx'
)

In [57]:
users.to_csv('/Users/althealam/Desktop/Code/RQ-VAE-Recommendation-System/data/preprocessed_data/users_preprocessed.csv', index = False)
movies.to_csv('/Users/althealam/Desktop/Code/RQ-VAE-Recommendation-System/data/preprocessed_data/movies_preprocessed.csv', index = False)

In [None]:
users.head()

# user用到的特征为gender, user_idx,  age_group, occupation, user_avg_rating, user_rating_count
# movies用到的特征为title, movie_idx, gender_ids, year, avg_rating, rating_count

Unnamed: 0,user_id,gender,age,occupation,zip_code,user_idx,user_avg_rating,user_rating_count,last_active,age_group
0,1,0,1,10,48067,0,4.188679,53,2001-01-06 23:39:11,0
1,2,1,56,16,70072,1,3.713178,129,2000-12-31 22:02:54,4
2,3,1,25,15,55117,2,3.901961,51,2000-12-31 21:35:04,1
3,4,1,45,7,2460,3,4.190476,21,2000-12-31 20:24:42,3
4,5,1,25,20,55455,4,3.146465,198,2000-12-31 07:09:45,1


In [69]:
movies.head()

Unnamed: 0,movie_id,title,genres,movie_idx,genre_ids,year,avg_rating,rating_count
0,1,Toy Story (1995),Animation|Children's|Comedy,0,"[3, 4, 5]",1995,4.146846,2077
1,2,Jumanji (1995),Adventure|Children's|Fantasy,1,"[2, 4, 9]",1995,3.201141,701
2,3,Grumpier Old Men (1995),Comedy|Romance,2,"[5, 14]",1995,3.016736,478
3,4,Waiting to Exhale (1995),Comedy|Drama,3,"[5, 8]",1995,2.729412,170
4,5,Father of the Bride Part II (1995),Comedy,4,[5],1995,3.006757,296


In [70]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,user_idx,movie_idx
0,1,1193,5,2000-12-31 22:12:40,0,1176
1,1,661,3,2000-12-31 22:35:09,0,655
2,1,914,3,2000-12-31 22:32:48,0,902
3,1,3408,4,2000-12-31 22:04:35,0,3339
4,1,2355,5,2001-01-06 23:38:11,0,2286


In [59]:
ratings.to_csv('/Users/althealam/Desktop/Code/RQ-VAE-Recommendation-System/data/preprocessed_data/ratings_preprocessed.csv', index=False)

In [71]:
from collections import OrderedDict

def build_embedding_feat_dict(movies, users, genre_to_id):
    """构建基于序列建模的特征字典（兼容您的代码风格）"""
    embedding_feat_dict = OrderedDict()
    
    # 1. 稀疏特征（分类特征）
    embedding_feat_dict['sparse'] = {
        'gender': {
            'vocab_size': int(users['gender'].nunique()),
            'embedding_dim': 4  # 建议添加
        },
        'user_idx': {
            'vocab_size': int(users['user_id'].nunique()),
            'embedding_dim': 32
        },
        'occupation': {
            'vocab_size': int(users['occupation'].nunique()),
            'embedding_dim': 8
        },
        'age_group': {
            'vocab_size': int(users['age_group'].nunique()),
            'embedding_dim': 4
        },
        'movie_idx': {
            'vocab_size': int(movies['movie_idx'].nunique()),
            'embedding_dim': 32
        }
    }
    
    # 2. 稠密特征（数值特征）
    embedding_feat_dict['dense'] = {
        'user_avg_rating': {
            'mean': float(users['user_avg_rating'].mean()),
            'std': float(users['user_avg_rating'].std())
        },
        'user_rating_count': {
            'mean': float(users['user_rating_count'].mean()),
            'std': float(users['user_rating_count'].std())
        },
        'year': {
            'mean': float(movies['year'].mean()),
            'std': float(movies['year'].std())
        },
        'avg_rating': {
            'mean': float(movies['avg_rating'].mean()),
            'std': float(movies['avg_rating'].std())
        },
        'rating_count': {
            'mean': float(movies['rating_count'].mean()),
            'std': float(movies['rating_count'].std())
        }
    }
    
    # 3. 序列特征
    embedding_feat_dict['sequence'] = {
        'genre_ids': {
            'vocab_size': len(genre_to_id) + 1,  # +1 for padding
            'max_len': int(movies['genre_ids'].apply(len).max()),
            'embedding_dim': 8
        },
        'title': {
            'type': 'text',
            'max_length': int(movies['title'].str.len().max() * 1.2),  # 动态计算+20%缓冲
            'observed_max': int(movies['title'].str.len().max()),
            'tokenizer': 'bert-base-uncased',  # 指定tokenizer类型
            'embedding_dim': 64
        }
    }
    
    # 4. 元信息
    embedding_feat_dict['meta'] = {
        'num_users': len(users),
        'num_movies': len(movies),
        'generated_at': pd.Timestamp.now().isoformat()
    }
    
    return embedding_feat_dict

In [72]:
build_embedding_feat_dict = build_embedding_feat_dict(movies, users, genre_to_id)

In [73]:
build_embedding_feat_dict

OrderedDict([('sparse',
              {'gender': {'vocab_size': 2, 'embedding_dim': 4},
               'user_idx': {'vocab_size': 6040, 'embedding_dim': 32},
               'occupation': {'vocab_size': 21, 'embedding_dim': 8},
               'age_group': {'vocab_size': 5, 'embedding_dim': 4},
               'movie_idx': {'vocab_size': 3706, 'embedding_dim': 32}}),
             ('dense',
              {'user_avg_rating': {'mean': 3.702704866999724,
                'std': 0.42962208228122745},
               'user_rating_count': {'mean': 165.5975165562914,
                'std': 192.74702906977777},
               'year': {'mean': 1986.0992984349703, 'std': 16.618197740244113},
               'avg_rating': {'mean': 3.2388921779108912,
                'std': 0.672924744098626},
               'rating_count': {'mean': 269.88909875876953,
                'std': 384.0478375720256}}),
             ('sequence',
              {'genre_ids': {'vocab_size': 19,
                'max_len': 6,
      