In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from scipy import sparse
import matplotlib.pyplot as plt
import sklearn

In [2]:
file_root = 'D:/Datasets/ml-25m/'
ratings_df = pd.read_csv(file_root + 'ratings.csv')
movies_df = pd.read_csv(file_root + 'movies.csv')
genome_scores_df = pd.read_csv(file_root + 'genome-scores.csv')

## 电影特征处理
将电影本身所属的类型作为电影特征，没被打过分的电影用平均评分填充
再将movie_id处理成二进制编码

In [3]:
# movie feature extraction
# 对 'genres' 列进行 One-Hot 编码
genres_one_hot = movies_df['genres'].str.get_dummies(sep='|')

# 合并编码到一个列中
movies_df = pd.concat([movies_df, genres_one_hot], axis=1)
# 

movies_df.head(5)

Unnamed: 0,movieId,title,genres,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# 求movies_df中所有电影的平均评分
average_rating_df = ratings_df[['movieId', 'rating']].groupby('movieId').mean()
average_rating_df.columns = ['movie_average_rating']
# 用均分填补没有打分的电影
average_rating_df['movie_average_rating'] = average_rating_df['movie_average_rating'].fillna(average_rating_df['movie_average_rating'].mean())
average_rating_df.head(5)

Unnamed: 0_level_0,movie_average_rating
movieId,Unnamed: 1_level_1
1,3.893708
2,3.251527
3,3.142028
4,2.853547
5,3.058434


In [5]:
# Join movie data and average ratings
movies_df = movies_df.merge(average_rating_df, on='movieId', how='left')

In [6]:
# 将Movie_id处理成二进制编码
def convert_to_binary_vector(id_in, max_length):
    binary_representation = format(id_in, 'b')  # 将数字转换为二进制字符串
    binary_vector = [int(digit) for digit in binary_representation]  # 将字符串转换为数字列表
    # 如果长度不足，用0填充
    return [0] * (max_length - len(binary_vector)) + binary_vector

# 确定所需的二进制长度
max_length = len(format(movies_df['movieId'].max(), 'b'))

tqdm.pandas()
# 应用转换
binary_movie_id_df = pd.DataFrame(movies_df['movieId'].apply(lambda x: convert_to_binary_vector(x, max_length)).tolist())
# 为新列命名
binary_movie_id_df.columns = [f'binary_MovieId_{i}' for i in range(max_length)]
# 将新列添加到原始 DataFrame 中
movies_df = pd.concat([movies_df, binary_movie_id_df], axis=1)

In [7]:
movies_df.head(5)

Unnamed: 0,movieId,title,genres,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,binary_MovieId_8,binary_MovieId_9,binary_MovieId_10,binary_MovieId_11,binary_MovieId_12,binary_MovieId_13,binary_MovieId_14,binary_MovieId_15,binary_MovieId_16,binary_MovieId_17
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,1
1,2,Jumanji (1995),Adventure|Children|Fantasy,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,1
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,1


In [8]:
# Join user data and movie data
big_df = ratings_df.merge(movies_df, on='movieId', how='left')

In [9]:
big_df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,(no genres listed),Action,Adventure,Animation,...,binary_MovieId_8,binary_MovieId_9,binary_MovieId_10,binary_MovieId_11,binary_MovieId_12,binary_MovieId_13,binary_MovieId_14,binary_MovieId_15,binary_MovieId_16,binary_MovieId_17
0,1,296,5.0,1147880044,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,0,0,0,0,...,0,1,0,0,1,0,1,0,0,0
1,1,306,3.5,1147868817,Three Colors: Red (Trois couleurs: Rouge) (1994),Drama,0,0,0,0,...,0,1,0,0,1,1,0,0,1,0
2,1,307,5.0,1147868828,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama,0,0,0,0,...,0,1,0,0,1,1,0,0,1,1
3,1,665,5.0,1147878820,Underground (1995),Comedy|Drama|War,0,0,0,0,...,1,0,1,0,0,1,1,0,0,1
4,1,899,3.5,1147868510,Singin' in the Rain (1952),Comedy|Musical|Romance,0,0,0,0,...,1,1,1,0,0,0,0,0,1,1


In [17]:
# big_df = ratings_df.merge(movies_df[['genres', 'movieId']], on='movieId', how='left')

## 用户特征处理
将用户对每个类型电影的平均评分作为用户特征，没打过分的电影用用户的平均评分填充
再将user_id处理成二进制编码

In [18]:
# 计算每个用户的电影平均评分
user_average_ratings = big_df.groupby('userId')['rating'].mean()
user_average_ratings.columns = ['user_average_rating']

In [19]:
from tqdm import tqdm
# 拆分 genres 列
genres_expanded = big_df['genres'].str.get_dummies('|')
ratings_expanded = pd.concat([big_df, genres_expanded], axis=1)
# 计算每个用户对每个类型的平均评分
def calculate_genre_ratings(group):
    genre_ratings = {}
    for genre in genres_expanded.columns:
        genre_movies = group[genre] == 1
        if genre_movies.any():
            genre_ratings[genre] = group.loc[genre_movies, 'rating'].mean()
        else:
            genre_ratings[genre] = None
    return pd.Series(genre_ratings)

tqdm.pandas()
user_df = ratings_expanded.groupby('userId').progress_apply(calculate_genre_ratings)
user_df = user_df.add_prefix('User_')

user_df = user_df.apply(lambda row: row.fillna(user_average_ratings[row.name]), axis=1)
user_df.head(5)

100%|██████████| 162541/162541 [11:56<00:00, 226.78it/s]


Unnamed: 0_level_0,User_(no genres listed),User_Action,User_Adventure,User_Animation,User_Children,User_Comedy,User_Crime,User_Documentary,User_Drama,User_Fantasy,User_Film-Noir,User_Horror,User_IMAX,User_Musical,User_Mystery,User_Romance,User_Sci-Fi,User_Thriller,User_War,User_Western
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,3.814286,4.125,3.727273,4.0,3.833333,3.869565,4.0625,2.0,3.867925,3.6,3.5,3.5,3.814286,3.7,3.5,4.166667,3.7,4.3,3.9,2.5
2,3.630435,3.69697,3.906667,3.617647,3.66,3.31746,3.138889,3.630435,3.571429,3.982759,3.630435,4.0,4.75,3.272727,3.6875,3.161765,4.017857,3.833333,3.433333,2.5
3,3.5,3.640719,3.689394,3.98,3.708333,3.454545,3.886364,3.166667,3.890086,3.737179,4.3,3.544444,3.734568,3.583333,3.891667,3.533333,3.696429,3.692469,3.692308,3.75
4,3.378099,3.186207,3.074561,3.467742,3.214286,3.611111,3.972973,4.1,3.744898,2.833333,3.378099,3.15,2.426829,3.642857,3.805556,3.45,3.164706,3.544643,3.333333,2.833333
5,3.752475,3.722222,3.857143,3.75,3.333333,3.571429,4.142857,3.752475,3.822222,3.5,3.752475,4.666667,4.0,3.714286,4.714286,3.55,4.090909,4.0,4.5,3.0


In [24]:
# 将User_id编码为二进制向量
# 确定所需的二进制长度
# 确定所需的二进制长度
max_length = len(format(user_df.shape[0], 'b'))

tqdm.pandas()
# 应用转换
binary_user_id_df = pd.DataFrame(pd.Series(user_df.index).apply(lambda x: convert_to_binary_vector(x, max_length)).tolist())
# 为新列命名
binary_user_id_df.columns = [f'binary_userId_{i}' for i in range(max_length)]
# 将新列添加到原始 DataFrame 中
user_df = pd.concat([user_df, binary_user_id_df], axis=1)

In [25]:
user_df.head(5)

Unnamed: 0,User_(no genres listed),User_Action,User_Adventure,User_Animation,User_Children,User_Comedy,User_Crime,User_Documentary,User_Drama,User_Fantasy,...,binary_userId_8,binary_userId_9,binary_userId_10,binary_userId_11,binary_userId_12,binary_userId_13,binary_userId_14,binary_userId_15,binary_userId_16,binary_userId_17
1,3.814286,4.125,3.727273,4.0,3.833333,3.869565,4.0625,2.0,3.867925,3.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,3.630435,3.69697,3.906667,3.617647,3.66,3.31746,3.138889,3.630435,3.571429,3.982759,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,3.5,3.640719,3.689394,3.98,3.708333,3.454545,3.886364,3.166667,3.890086,3.737179,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,3.378099,3.186207,3.074561,3.467742,3.214286,3.611111,3.972973,4.1,3.744898,2.833333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
5,3.752475,3.722222,3.857143,3.75,3.333333,3.571429,4.142857,3.752475,3.822222,3.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [26]:
big_df = big_df.merge(user_df, left_on='userId', right_index=True, how='left')

In [28]:
big_df = big_df.merge(movies_df, left_on='userId', right_index=True, how='left')

In [27]:
big_df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp,genres,User_(no genres listed),User_Action,User_Adventure,User_Animation,User_Children,...,binary_userId_8,binary_userId_9,binary_userId_10,binary_userId_11,binary_userId_12,binary_userId_13,binary_userId_14,binary_userId_15,binary_userId_16,binary_userId_17
0,1,296,5.0,1147880044,Comedy|Crime|Drama|Thriller,3.814286,4.125,3.727273,4.0,3.833333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1,306,3.5,1147868817,Drama,3.814286,4.125,3.727273,4.0,3.833333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1,307,5.0,1147868828,Drama,3.814286,4.125,3.727273,4.0,3.833333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1,665,5.0,1147878820,Comedy|Drama|War,3.814286,4.125,3.727273,4.0,3.833333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1,899,3.5,1147868510,Comedy|Musical|Romance,3.814286,4.125,3.727273,4.0,3.833333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
# 过滤掉文本值的列
data_df = big_df.drop(['title', 'genres'], axis=1)

# 展开矢量值的列
for col in data_df.columns:
    if isinstance(data_df[col].iloc[0], list):  # 检查第一个元素是否为列表
        # 展开矢量列
        print(col)
        expanded_df = data_df[col].apply(pd.Series)
        expanded_df.columns = [f'{col}_{i}' for i in range(expanded_df.shape[1])]
        data_df = pd.concat([data_df, expanded_df], axis=1).drop(col, axis=1)


In [None]:
data_df.head(5)

## 使用ML模型进行预测
此处暂时选用的ML模型为线性回归

In [None]:
## 数值归一化
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# 分离特征和目标变量
X = data_df.drop(['rating', 'timestamp', ], axis=1)
y = data_df['rating']

# 归一化特征
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 划分数据集
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=5021)

# 应用线性回归模型
model = LinearRegression()
model.fit(X_train, y_train)

# 在测试集上进行预测
y_pred = model.predict(X_test)

# 计算均方误差
mse = mean_squared_error(y_test, y_pred)
print("均方误差(MSE):", mse)

## 处理genome_scores_df
将genome_scores_df中的relevance列进行矢量化，部分电影没有相匹配的genome_relevence，用平均向量填充缺失值

In [None]:
# 对genome_scores_df进行矢量化
# 按 movieId 分组，并对每个组内的 relevance 进行矢量化
grouped = genome_scores_df.groupby('movieId')['relevance'].apply(list)
# 如果你希望将这个 Series 转换回 DataFrame，并将索引重置为列
vectorized_df = grouped.reset_index(name='relevance_vector')
vectorized_df.head(5)

In [None]:
# Join movie data and genome scores
movies_df = movies_df.merge(vectorized_df, on='movieId', how='left')

In [None]:
# 仅选择有 relevance_vector 的行进行计算
valid_vectors = movies_df[movies_df['relevance_vector'].notna()]['relevance_vector'].tolist()
# 计算平均向量，这里假设所有向量长度相同
average_vector = np.mean(np.array(valid_vectors), axis=0).tolist()
# 使用平均向量填充缺失值
movies_df['relevance_vector'] = movies_df['relevance_vector'].apply(lambda x: x if isinstance(x, list) else average_vector)
# 检查缺失值填充后的结果
print(movies_df.head())