In [68]:
import pickle
import pandas as pd 

## 读取数据

In [69]:
ratings_df=pd.read_csv('/Users/bytedance/Desktop/MovieLens-Recommendation-System/data/ml-1m/ratings.csv')
movies_df=pd.read_csv('/Users/bytedance/Desktop/MovieLens-Recommendation-System/data/ml-1m/movies.csv')

In [70]:
import json
def read_json_file(filepath):
    """
    读取指定路径的 JSON 文件。

    Args:
        filepath (str): JSON 文件的完整路径。

    Returns:
        dict or list or None: 如果成功读取，则返回 JSON 文件中的数据（通常是字典或列表）。
                                如果文件不存在或读取失败，则返回 None。
    """
    try:
        with open(filepath, 'r') as f:
            data = json.load(f)
        return data
    except FileNotFoundError:
        print(f"错误: 文件 '{filepath}' 未找到。")
        return None
    except json.JSONDecodeError:
        print(f"错误: 文件 '{filepath}' 不是有效的 JSON 格式。")
        return None
    except Exception as e:
        print(f"读取文件 '{filepath}' 时发生未知错误: {e}")
        return None
    
movie_embedding_config=read_json_file('./config/movie_embedding_config.json')
user_embedding_config=read_json_file('./config/user_embedding_config.json')

In [71]:
import pandas as pd
rating_movie_user=pd.read_csv('./data/rating_movie_user.csv')

## 进行正负样本标注

In [72]:
rating_movie_user['label'] = rating_movie_user.apply(
    lambda x: 1 if x['rating'] > x['mean_rating'] else 0, axis=1
)

## 获取正负样本

In [73]:
import random
import pandas as pd
import numpy as np
from tqdm import tqdm


def vectorized_generate_negatives(user_movie_pairs, movies_df, ratings_df):
    """
    为每个用户生成负样本，并控制负样本与正样本的比例为 1:2。

    Args:
        user_movie_pairs (pd.DataFrame): 用户-电影交互对。
        movies_df (pd.DataFrame): 包含电影信息。
        ratings_df (pd.DataFrame): 包含用户评分数据。

    Returns:
        pd.DataFrame: 包含负样本的 DataFrame，列为 'user_id', 'movie_id', 'label'
    """
    # 创建用户-电影交互矩阵
    user_movie_matrix = pd.crosstab(user_movie_pairs['user_id'], user_movie_pairs['movie_id'])

    all_users = user_movie_matrix.index
    all_movies = user_movie_matrix.columns

    # 计算电影平均评分与观看次数
    movie_stats = ratings_df.groupby('movie_id')['rating'].agg(['mean', 'count']).reset_index()
    movie_stats.rename(columns={'mean': 'avg_rating', 'count': 'rating_count'}, inplace=True)

    # 合并电影统计信息
    movies_with_stats = pd.merge(movies_df, movie_stats, on='movie_id', how='left')
    movies_with_stats['rating_count'].fillna(0, inplace=True)

    # 热门高分电影筛选
    popularity_threshold = movies_with_stats['rating_count'].quantile(0.75)
    high_rating_threshold = 4.0  # 高评分阈值
    popular_high_rated_movies = movies_with_stats[
        (movies_with_stats['avg_rating'] >= high_rating_threshold) &
        (movies_with_stats['rating_count'] >= popularity_threshold)
        ]['movie_id'].tolist()

    # 缓存用户已观看电影
    user_history = user_movie_pairs.groupby('user_id')['movie_id'].apply(list).to_dict()

    # 缓存用户评分信息
    user_ratings = ratings_df.groupby('user_id').apply(lambda x: x.set_index('movie_id')['rating'].to_dict()).to_dict()
    movie_avg_rating_map = movie_stats.set_index('movie_id')['avg_rating'].to_dict()

    negative_pairs = []

    for user_id in tqdm(all_users, desc='为每个用户生成负样本'):
        seen_movies = set(user_movie_matrix.columns[user_movie_matrix.loc[user_id] > 0])
        user_watched_movies = user_history.get(user_id, [])
        unseen_movies = list(set(all_movies) - seen_movies)

        # 统计该用户的正样本数量
        positive_count = len(seen_movies)
        num_negatives_to_generate = positive_count * 2  # 负样本数量是正样本的两倍

        # 1. 未看过的随机电影
        num_random_negatives = min(num_negatives_to_generate, len(unseen_movies))  # 最多生成和未看过的电影一样多的数量
        if num_random_negatives > 0:
            random_negative_movies = np.random.choice(unseen_movies, size=num_random_negatives, replace=False).tolist()
            for movie_id in random_negative_movies:
                negative_pairs.append({'user_id': user_id, 'movie_id': movie_id, 'label': 0})
            num_negatives_to_generate -= num_random_negatives

        if num_negatives_to_generate == 0:
            continue

        # 2. 热门高评分负样本
        num_hard_negatives = min(num_negatives_to_generate, len(popular_high_rated_movies))
        hard_negatives = generate_popular_high_rated_negatives(
            user_watched_movies,
            popular_high_rated_movies,
            num_hard_negatives
        )
        for mid in hard_negatives:
            negative_pairs.append({'user_id': user_id, 'movie_id': mid, 'label': 0})
        num_negatives_to_generate -= len(hard_negatives)

        if num_negatives_to_generate == 0:
            continue

        # 3. 看过但评分低于平均值的电影
        user_movie_ratings = user_ratings.get(user_id, {})
        disliked = [mid for mid, rating in user_movie_ratings.items() if rating < movie_avg_rating_map.get(mid, 0)]
        num_disliked_negatives = min(num_negatives_to_generate, len(disliked))
        sampled_disliked = random.sample(disliked, num_disliked_negatives)
        for mid in sampled_disliked:
            negative_pairs.append({'user_id': user_id, 'movie_id': mid, 'label': 0})

    return pd.DataFrame(negative_pairs)


def generate_popular_high_rated_negatives(user_watched_movies, all_popular_high_rated_movies, num_negatives):
    available_negatives = list(set(all_popular_high_rated_movies) - set(user_watched_movies))
    if not available_negatives:
        return []
    return list(np.random.choice(available_negatives, size=min(num_negatives, len(available_negatives)), replace=False))


def create_features(data, user_features, movie_features):
    user_data = user_features[user_features['user_id'].isin(data['user_id'])].set_index('user_id')
    movie_data = movie_features[movie_features['movie_id'].isin(data['movie_id'])].set_index('movie_id')
    
    user_feats = user_data.loc[data['user_id']].values
    movie_feats = movie_data.loc[data['movie_id']].values
    
    return user_feats, movie_feats


In [74]:
negative_pairs=vectorized_generate_negatives(rating_movie_user, movies_df, ratings_df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  movies_with_stats['rating_count'].fillna(0, inplace=True)
  user_ratings = ratings_df.groupby('user_id').apply(lambda x: x.set_index('movie_id')['rating'].to_dict()).to_dict()
为每个用户生成负样本: 100%|██████████| 6040/6040 [00:02<00:00, 2401.39it/s]


In [75]:
negative_pairs

Unnamed: 0,user_id,movie_id,label
0,1,3481,0
1,1,3900,0
2,1,426,0
3,1,3446,0
4,1,9,0
...,...,...,...
1995639,6040,1658,0
1995640,6040,1936,0
1995641,6040,2138,0
1995642,6040,3892,0


In [76]:
positive_samples = rating_movie_user[rating_movie_user['label']==1][['user_id', 'movie_id', 'label']]

In [77]:
pos_neg_samples=pd.concat([positive_samples, negative_pairs])

In [78]:
# 查看所有用户的正负样本数量
# 统计每个用户的正样本数量
positive_counts = pos_neg_samples[pos_neg_samples['label'] == 1].groupby('user_id').size().reset_index(name='positive_count')

# 统计每个用户的负样本数量
negative_counts = pos_neg_samples[pos_neg_samples['label'] == 0].groupby('user_id').size().reset_index(name='negative_count')

# 合并正负样本数量统计结果
user_sample_counts = pd.merge(positive_counts, negative_counts, on='user_id', how='outer').fillna(0)


In [79]:
user_sample_counts

Unnamed: 0,user_id,positive_count,negative_count
0,1,18,106
1,2,73,258
2,3,37,102
3,4,10,42
4,5,82,396
...,...,...,...
6035,6036,399,1776
6036,6037,120,404
6037,6038,13,40
6038,6039,90,246


In [80]:
pos_neg_samples.to_csv('./data/pos_neg_data.csv', index=False)
rating_movie_user.to_csv('./data/rating_movie_user_with_label.csv', index=False)