In [2]:
import pandas as pd
import os
from scipy.sparse import lil_matrix, csr_matrix
import h5py
import gc
import pickle
import numpy as np
from sklearn.model_selection import train_test_split

# 导入数据
folder_path = './ml-25m'
movies_df = pd.read_csv(os.path.join(folder_path, 'movies.csv'))
genome_tags_df = pd.read_csv(os.path.join(folder_path, 'genome-tags.csv'))
genome_scores_df = pd.read_csv(os.path.join(folder_path, 'genome-scores.csv'))

# 1. 列出所有独特的类型
genres_set = set()
for genres in movies_df['genres'].str.split('|'):
    genres_set.update(genres)
genres_list = list(genres_set)

# 2. 列出所有独特的标签
tags_list = genome_tags_df['tagId'].tolist()

# 3. 计算特征的总数（d）和电影的总数（m）
d = len(genres_list) + len(tags_list)
m = len(movies_df)

In [3]:
# 4. 创建稀疏的电影特征矩阵
movie_features_sparse = lil_matrix((m, d))

# 创建电影ID到行索引的映射
movie_id_to_index = {movie_id: idx for idx, movie_id in enumerate(movies_df['movieId'])}

# 保存映射到文件
with open(os.path.join(folder_path, 'movie_id_to_index.pkl'), 'wb') as f:
    pickle.dump(movie_id_to_index, f)

# 创建类型到列索引的映射
genre_to_index = {genre: idx for idx, genre in enumerate(genres_list)}
tag_offset = len(genres_list)
tag_to_index = {tag: idx + tag_offset for idx, tag in enumerate(tags_list)}

# 填充类型特征
for idx, row in movies_df.iterrows():
    movie_id = row['movieId']
    movie_idx = movie_id_to_index[movie_id]
    for genre in row['genres'].split('|'):
        genre_idx = genre_to_index[genre]
        movie_features_sparse[movie_idx, genre_idx] = 1

# 填充标签特征
for idx, row in genome_scores_df.iterrows():
    movie_id = row['movieId']
    movie_idx = movie_id_to_index[movie_id]
    tag_id = row['tagId']
    tag_idx = tag_to_index[tag_id]
    relevance = row['relevance']
    movie_features_sparse[movie_idx, tag_idx] = relevance

# 将稀疏矩阵转换为 CSR 格式以节省内存
movie_features_sparse = movie_features_sparse.tocsr()

# 将稀疏矩阵保存为HDF5文件
output_path = os.path.join(folder_path, 'movie_features.h5')
with h5py.File(output_path, 'w') as f:
    f.create_dataset('data', data=movie_features_sparse.data)
    f.create_dataset('indices', data=movie_features_sparse.indices)
    f.create_dataset('indptr', data=movie_features_sparse.indptr)
    f.attrs['shape'] = movie_features_sparse.shape

print(f"The movie features matrix has been saved to {output_path}.")

# 清理内存
del movies_df, genome_tags_df, genome_scores_df
gc.collect()

The movie features matrix has been saved to ./ml-25m/movie_features.h5.


19

In [4]:
# import pandas as pd
# from sklearn.model_selection import train_test_split

# # 读取ratings.csv文件
# folder_path = './ml-25m'
# ratings_df = pd.read_csv(os.path.join(folder_path, 'ratings.csv'))

# # 分层抽样，将数据分为训练集和测试集
# train_data, test_data = train_test_split(ratings_df, test_size=0.2, stratify=ratings_df['userId'])

# # 保存拆分后的数据集
# train_data.to_csv(os.path.join(folder_path, 'train_ratings.csv'), index=False)
# test_data.to_csv(os.path.join(folder_path, 'test_ratings.csv'), index=False)

# # 打印训练集和测试集的前几行
# print("Train data (first 5 rows):")
# print(train_data.head())

# print("\nTest data (first 5 rows):")
# print(test_data.head())

In [5]:
# 读取ratings.csv文件
ratings_df = pd.read_csv(os.path.join(folder_path, 'ratings.csv'))

# 分层抽样，定义函数用于分批处理
def stratified_split_batch(ratings, test_size=0.2, batch_size=100000):
    with h5py.File(os.path.join(folder_path, 'ratings_split.h5'), 'w') as h5file:
        h5file.create_dataset('train_data', shape=(0, ratings.shape[1]), maxshape=(None, ratings.shape[1]), dtype='float32')
        h5file.create_dataset('test_data', shape=(0, ratings.shape[1]), maxshape=(None, ratings.shape[1]), dtype='float32')
        
        user_ids = ratings['userId'].unique()
        for start in range(0, len(user_ids), batch_size):
            end = min(start + batch_size, len(user_ids))
            batch_user_ids = user_ids[start:end]
            batch_ratings = ratings[ratings['userId'].isin(batch_user_ids)]
            train_batch, test_batch = train_test_split(batch_ratings, test_size=test_size, stratify=batch_ratings['userId'])
            
            h5file['train_data'].resize((h5file['train_data'].shape[0] + train_batch.shape[0]), axis=0)
            h5file['train_data'][-train_batch.shape[0]:] = train_batch.to_numpy()
            
            h5file['test_data'].resize((h5file['test_data'].shape[0] + test_batch.shape[0]), axis=0)
            h5file['test_data'][-test_batch.shape[0]:] = test_batch.to_numpy()

# 执行分层抽样
stratified_split_batch(ratings_df)

# 读取HDF5文件查看结果
with h5py.File(os.path.join(folder_path, 'ratings_split.h5'), 'r') as f:
    train_data = f['train_data'][:]
    test_data = f['test_data'][:]

# 将结果转换回DataFrame并保存为CSV
train_df = pd.DataFrame(train_data, columns=ratings_df.columns)
test_df = pd.DataFrame(test_data, columns=ratings_df.columns)

train_df.to_csv(os.path.join(folder_path, 'train_ratings.csv'), index=False)
test_df.to_csv(os.path.join(folder_path, 'test_ratings.csv'), index=False)

print("Train data (first 5 rows):")
print(train_df.head())

print("\nTest data (first 5 rows):")
print(test_df.head())

del ratings_df
gc.collect()

Train data (first 5 rows):
    userId   movieId  rating     timestamp
0  62451.0     356.0     4.5  1.306214e+09
1  17488.0    1566.0     4.0  1.538273e+09
2  65493.0    4896.0     4.0  1.212679e+09
3  52220.0    7162.0     3.0  1.105518e+09
4  81961.0  174055.0     2.5  1.526697e+09

Test data (first 5 rows):
    userId  movieId  rating     timestamp
0  57840.0   7361.0     4.0  1.111477e+09
1  97994.0   5225.0     3.5  1.405587e+09
2  15560.0   7347.0     2.5  1.328037e+09
3  96671.0   1196.0     5.0  8.663739e+08
4  61988.0  59315.0     4.0  1.414696e+09


0

In [6]:
# 读取电影特征矩阵
with h5py.File(os.path.join(folder_path, 'movie_features.h5'), 'r') as f:
    data = f['data'][:]
    indices = f['indices'][:]
    indptr = f['indptr'][:]
    shape = f.attrs['shape']
    movie_features_sparse = csr_matrix((data, indices, indptr), shape=shape)

# 读取训练集
train_df = pd.read_csv(os.path.join(folder_path, 'train_ratings.csv'))

# 计算用户画像
user_profiles = {}

def calculate_user_profiles(train_df, movie_features_sparse, user_profiles):
    batch_size = 1000  # 分批处理，减少内存占用
    for user_id, group in train_df.groupby('userId'):
        if user_id in user_profiles:
            continue
        user_profile = np.zeros(d)
        total_rating = 0
        for _, row in group.iterrows():
            movie_id = row['movieId']
            rating = row['rating']
            movie_idx = movie_id_to_index[movie_id]
            movie_features = movie_features_sparse[movie_idx].toarray().flatten()
            user_profile += rating * movie_features
            total_rating += rating
        if total_rating > 0:
            user_profiles[user_id] = user_profile / total_rating
        else:
            user_profiles[user_id] = user_profile
        if len(user_profiles) % batch_size == 0:
            print(f"Processed {len(user_profiles)} users")
            gc.collect()
    return user_profiles

user_profiles = calculate_user_profiles(train_df, movie_features_sparse, user_profiles)

# 保存用户画像到文件
with open(os.path.join(folder_path, 'user_profiles.pkl'), 'wb') as f:
    pickle.dump(user_profiles, f)

print("User profiles have been calculated and saved.")

Processed 1000 users
Processed 2000 users
Processed 3000 users
Processed 4000 users
Processed 5000 users
Processed 6000 users
Processed 7000 users
Processed 8000 users
Processed 9000 users
Processed 10000 users
Processed 11000 users
Processed 12000 users
Processed 13000 users
Processed 14000 users
Processed 15000 users
Processed 16000 users
Processed 17000 users
Processed 18000 users
Processed 19000 users
Processed 20000 users
Processed 21000 users
Processed 22000 users
Processed 23000 users
Processed 24000 users
Processed 25000 users
Processed 26000 users
Processed 27000 users
Processed 28000 users
Processed 29000 users
Processed 30000 users
Processed 31000 users
Processed 32000 users
Processed 33000 users
Processed 34000 users
Processed 35000 users
Processed 36000 users
Processed 37000 users
Processed 38000 users
Processed 39000 users
Processed 40000 users
Processed 41000 users
Processed 42000 users
Processed 43000 users
Processed 44000 users
Processed 45000 users
Processed 46000 use

In [7]:
user_profiles

{1.0: array([0.321513  , 0.05437352, 0.76122931, ..., 0.04202955, 0.08381678,
        0.02101537]),
 2.0: array([0.34009662, 0.16811594, 0.49951691, ..., 0.0561756 , 0.09878357,
        0.02390193]),
 3.0: array([0.24871663, 0.08418891, 0.38244353, ..., 0.03511659, 0.12806828,
        0.03779858]),
 4.0: array([0.34441088, 0.10951662, 0.20015106, ..., 0.04094807, 0.13264558,
        0.04568429]),
 5.0: array([0.48859935, 0.0781759 , 0.4495114 , ..., 0.02361564, 0.09479316,
        0.02061156]),
 6.0: array([0.04761905, 0.10714286, 0.63095238, ..., 0.13734821, 0.08203571,
        0.02126488]),
 7.0: array([0.30555556, 0.15277778, 0.51388889, ..., 0.09110764, 0.08594097,
        0.01990972]),
 8.0: array([0.29398664, 0.08240535, 0.43207127, ..., 0.03277784, 0.09767205,
        0.02168207]),
 9.0: array([0.34200743, 0.19702602, 0.36431227, ..., 0.0568592 , 0.09832667,
        0.02480809]),
 10.0: array([0.20350877, 0.05964912, 0.34736842, ..., 0.04029737, 0.14839649,
        0.07824123]),

In [10]:
import pickle
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error

# 读取用户画像和电影特征矩阵
folder_path = './ml-25m'
with open(os.path.join(folder_path, 'user_profiles.pkl'), 'rb') as f:
    user_profiles = pickle.load(f)

with h5py.File(os.path.join(folder_path, 'movie_features.h5'), 'r') as f:
    data = f['data'][:]
    indices = f['indices'][:]
    indptr = f['indptr'][:]
    shape = f.attrs['shape']
    movie_features_sparse = csr_matrix((data, indices, indptr), shape=shape)

# 将用户画像转换为矩阵
user_ids = list(user_profiles.keys())
user_profile_matrix = np.array([user_profiles[user_id] for user_id in user_ids])

# 读取训练集和测试集
train_df = pd.read_csv(os.path.join(folder_path, 'train_ratings.csv'))
test_df = pd.read_csv(os.path.join(folder_path, 'test_ratings.csv'))

# 选择训练集和测试集的一部分数据
train_sample = train_df.sample(n=10000, random_state=42)  # 选择10000条训练数据
test_sample = test_df.sample(n=2000, random_state=42)    # 选择2000条测试数据

In [11]:
# 生成评分预测
def predict_ratings(train_df):
    predicted_ratings = []
    for _, row in train_df.iterrows():
        user_id = row['userId']
        movie_id = row['movieId']
        user_idx = user_ids.index(user_id)
        movie_idx = movie_id_to_index[movie_id]
        predicted_rating = cosine_similarity(
            user_profile_matrix[user_idx].reshape(1, -1),
            movie_features_sparse[movie_idx].reshape(1, -1)
        )[0][0]
        predicted_ratings.append(predicted_rating)
    return predicted_ratings

# 计算评分预测
train_sample['predicted_rating'] = predict_ratings(train_sample)

# 评估模型性能
mse = mean_squared_error(train_sample['rating'], train_sample['predicted_rating'])
print(f"Training MSE: {mse}")

print("Sample of predictions:")
print(train_sample[['userId', 'movieId', 'rating', 'predicted_rating']].head())

Training MSE: 8.697372066698565
Sample of predictions:
            userId  movieId  rating  predicted_rating
6557171    68781.0  62999.0     4.5          0.729368
19862025  134015.0   4226.0     4.0          0.883057
12007453   53783.0   1127.0     4.5          0.843321
6965385     4886.0    586.0     3.5          0.816398
2467440    59232.0   2791.0     4.5          0.805203


In [12]:
# 预测用户对所有电影的评分
def predict_ratings_for_all_movies(user_id, movie_features_sparse):
    user_idx = user_ids.index(user_id)
    user_profile = user_profile_matrix[user_idx].reshape(1, -1)
    predicted_ratings = cosine_similarity(user_profile, movie_features_sparse).flatten()
    return predicted_ratings

# 推荐电影
def recommend_movies(user_id, threshold=2):
    predicted_ratings = predict_ratings_for_all_movies(user_id, movie_features_sparse)
    recommended_movies = np.where(predicted_ratings > threshold)[0]
    return recommended_movies, predicted_ratings[recommended_movies]

# 评估推荐效果
def evaluate_recommendations(test_df, threshold=2):
    total_users = len(user_ids)
    well_recommended = 0
    true_preferences = 0
    
    for user_id in user_ids:
        user_test_df = test_df[test_df['userId'] == user_id]
        true_preferences += len(user_test_df[user_test_df['rating'] > threshold])
        
        recommended_movies, predicted_ratings = recommend_movies(user_id, threshold)
        well_recommended += len(user_test_df[(user_test_df['movieId'].isin(recommended_movies)) & (user_test_df['rating'] > threshold)])
    
    eval_model = well_recommended / true_preferences if true_preferences > 0 else 0
    return eval_model

# 使用测试集的一部分数据进行评估
eval_score = evaluate_recommendations(test_sample, threshold=2)
print(f"Evaluation Score: {eval_score}")

# 示例：推荐给用户1的电影
recommended_movies, predicted_ratings = recommend_movies(1, threshold=2)
print(f"Recommended movies for user 1: {recommended_movies}")
print(f"Predicted ratings for recommended movies: {predicted_ratings}")

Evaluation Score: 0.0
Recommended movies for user 1: []
Predicted ratings for recommended movies: []
