In [97]:
import pandas as pd

In [98]:
movies_df=pd.read_csv('/Users/bytedance/Desktop/MovieLens-Recommendation-System/data/ml-1m/movies.csv')
ratings_df=pd.read_csv('/Users/bytedance/Desktop/MovieLens-Recommendation-System/data/ml-1m/ratings.csv')
users_df=pd.read_csv('/Users/bytedance/Desktop/MovieLens-Recommendation-System/data/ml-1m/users.csv')

In [99]:
users_df.head()

Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [100]:
movies_df.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [101]:
from sklearn.preprocessing import LabelEncoder

genres=movies_df['genres'].str.get_dummies(sep='|')
movies=pd.concat([movies_df, genres], axis=1)

# 合并评分和电影数据
data=pd.merge(ratings_df, movies, on='movie_id')

In [102]:
pd.set_option('display.max_columns', None)
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0


## 数据基本情况

In [103]:
# 有哪些电影没有参与过评分
unrated_movies = movies_df[~movies_df['movie_id'].isin(ratings_df['movie_id'])]
print("没有参与过评分的电影:")
print(unrated_movies)
print("没有参与过评分的电影数量为:", len(unrated_movies))

没有参与过评分的电影:
      movie_id                                title                 genres
50          51                Guardian Angel (1994)  Action|Drama|Thriller
107        109  Headless Body in Topless Bar (1995)                 Comedy
113        115     Happiness Is in the Field (1995)                 Comedy
141        143                         Gospa (1995)                  Drama
281        284                  New York Cop (1996)           Action|Crime
...        ...                                  ...                    ...
3581      3650            Anguish (Angustia) (1986)                 Horror
3681      3750                Boricua's Bond (2000)                  Drama
3759      3829               Mad About Mambo (2000)         Comedy|Romance
3786      3856                  Autumn Heart (1999)                  Drama
3837      3907   Prince of Central Park, The (1999)                  Drama

[177 rows x 3 columns]
没有参与过评分的电影数量为: 177


## 电影特征工程&预处理

In [104]:
movies_df.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


### 处理电影的genres

In [105]:
# 将每个电影的 genres 转换为整数标签
genre_encoder = LabelEncoder()
# 假设 genres 列已经是一个由 | 分隔的类型字符串
all_genres = pd.unique(movies_df['genres'].str.split('|', expand=True).values.ravel())
genre_encoder.fit(all_genres)

# 将每个电影的类型转换为整数
max_genres = movies_df['genres'].apply(lambda x: len(x.split('|'))).max()
print("电影最多的类型数为:", max_genres)
movies_df['genres'] = movies_df['genres'].apply(lambda x: genre_encoder.transform(x.split('|')))

电影最多的类型数为: 6


In [106]:
movies_df

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),"[2, 3, 4]"
1,2,Jumanji (1995),"[1, 3, 8]"
2,3,Grumpier Old Men (1995),"[4, 13]"
3,4,Waiting to Exhale (1995),"[4, 7]"
4,5,Father of the Bride Part II (1995),[4]
...,...,...,...
3878,3948,Meet the Parents (2000),[4]
3879,3949,Requiem for a Dream (2000),[7]
3880,3950,Tigerland (2000),[7]
3881,3951,Two Family House (2000),[7]


In [107]:
def pad_genres(genres, max_length=6):
    genres = list(genres)
    # 如果 genres 列表的长度小于 6，使用第一个元素进行填充
    while len(genres) < max_length:
        genres.append(genres[0])  # 填充时选择第一个元素
    
    # 返回填充后的结果
    return genres[:max_length]  # 确保长度为 6

movies_df['genres'] = movies_df['genres'].apply(pad_genres)


In [108]:
movies_df.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),"[2, 3, 4, 2, 2, 2]"
1,2,Jumanji (1995),"[1, 3, 8, 1, 1, 1]"
2,3,Grumpier Old Men (1995),"[4, 13, 4, 4, 4, 4]"
3,4,Waiting to Exhale (1995),"[4, 7, 4, 4, 4, 4]"
4,5,Father of the Bride Part II (1995),"[4, 4, 4, 4, 4, 4]"


### 电影的统计特征

In [109]:
# 计算电影的基本评分统计
movie_stats=data.groupby('movie_id')['rating'].agg([
    ('movie_mean_rating', 'mean'),
    ('movie_rating_std', 'std'),
    ('movie_rating_count', 'count')
])

# 填充可能存在的NaN值
movie_stats['movie_rating_std']=movie_stats['movie_rating_std'].fillna(0)

# 合并电影原始信息
movie_features_v1=pd.merge(movies_df, movie_stats, on='movie_id')

# 添加电影热度特征
n_quantiles = 4  # 分为 4 个热度等级
movie_features_v1['popularity'] = pd.qcut(
    movie_features_v1['movie_rating_count'],
    q=n_quantiles,
    labels=[int(i) for i in range(n_quantiles)],
    duplicates='drop'  # 避免分位值重复时出错
)

In [110]:
from collections import Counter

# 提取电影发布年份
movie_features_v1['year'] = movie_features_v1['title'].str.extract(r'\((\d{4})\)')
movie_features_v1['year'] = movie_features_v1['year'].fillna('1990').astype(int)

# 是否为老片
movie_features_v1['is_old_movie'] = (movie_features_v1['year'] < 2000).astype(int)

# 计算电影类型纯度（类型数量越少，纯度越高）
movie_features_v1['genre_purity'] = movie_features_v1['genres'].apply(lambda x: len(set(x)) / len(genre_encoder.classes_))

# 年份分桶
n_year_buckets = 6  # 设置分成多少个桶
movie_features_v1['year'] = pd.qcut(
    movie_features_v1['year'],
    q=n_year_buckets,
    labels=False,
    duplicates='drop'  # 如果有重复分位点，自动减少桶数
)

# 创建标题长度特征
movie_features_v1['title_length'] = movie_features_v1['title'].str.len()

In [111]:
movie_features_v1.head()

Unnamed: 0,movie_id,title,genres,movie_mean_rating,movie_rating_std,movie_rating_count,popularity,year,is_old_movie,genre_purity,title_length
0,1,Toy Story (1995),"[2, 3, 4, 2, 2, 2]",4.146846,0.852349,2077,3,3,1,0.157895,16
1,2,Jumanji (1995),"[1, 3, 8, 1, 1, 1]",3.201141,0.983172,701,3,3,1,0.157895,14
2,3,Grumpier Old Men (1995),"[4, 13, 4, 4, 4, 4]",3.016736,1.071712,478,3,3,1,0.105263,23
3,4,Waiting to Exhale (1995),"[4, 7, 4, 4, 4, 4]",2.729412,1.013381,170,2,3,1,0.105263,24
4,5,Father of the Bride Part II (1995),"[4, 4, 4, 4, 4, 4]",3.006757,1.025086,296,2,3,1,0.052632,34


### 处理电影的title

In [112]:
import re
pattern = re.compile(r'^(.*)\((\d+)\)$')

title_map = {val:pattern.match(val).group(1) for ii,val in enumerate(set(movie_features_v1['title']))}
movie_features_v1['title'] = movie_features_v1['title'].map(title_map)


In [113]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

title_set = set()
for val in movie_features_v1['title'].str.split():
    title_set.update(val)

title_set.add('<PAD>')
title2int = {val:ii for ii, val in enumerate(title_set)}

#将电影Title转成等长数字列表，长度是15
title_count = 15
title_map = {val:[title2int[row] for row in val.split()] for ii,val in enumerate(set(movie_features_v1['title']))}

for key in title_map:
    for cnt in range(title_count - len(title_map[key])):
        title_map[key].insert(len(title_map[key]) + cnt,title2int['<PAD>'])

movie_features_v1['title'] = movie_features_v1['title'].map(title_map)


### 预处理

In [114]:
from sklearn.preprocessing import StandardScaler

# 特征归一化
scaler = StandardScaler()
features_to_scale = ['movie_mean_rating', 'movie_rating_std', 'movie_rating_count', 'genre_purity', 'title_length']
movie_features_v1[features_to_scale] = scaler.fit_transform(movie_features_v1[features_to_scale])

In [115]:
movie_features_v1.head()

Unnamed: 0,movie_id,title,genres,movie_mean_rating,movie_rating_std,movie_rating_count,popularity,year,is_old_movie,genre_purity,title_length
0,1,"[705, 1537, 3956, 3956, 3956, 3956, 3956, 3956...","[2, 3, 4, 2, 2, 2]",1.349448,-0.4745,4.706067,3,3,1,1.638668,-0.825885
1,2,"[1694, 3956, 3956, 3956, 3956, 3956, 3956, 395...","[1, 3, 8, 1, 1, 1]",-0.056107,0.011221,1.122696,3,3,1,1.638668,-1.02806
2,3,"[3483, 2891, 2650, 3956, 3956, 3956, 3956, 395...","[4, 13, 4, 4, 4, 4]",-0.330179,0.339952,0.541961,3,3,1,0.405841,-0.118272
3,4,"[3690, 3699, 2387, 3956, 3956, 3956, 3956, 395...","[4, 7, 4, 4, 4, 4]",-0.757216,0.12338,-0.260131,2,3,1,0.405841,-0.017184
4,5,"[2822, 3830, 602, 4246, 2197, 3409, 3956, 3956...","[4, 4, 4, 4, 4, 4]",-0.345011,0.16684,0.067998,2,3,1,-0.826985,0.993691


## 用户特征工程&预处理
* 用户评分统计（均值、方差、最小值、最大值）
* 严格程度（和全局均值的差）
* 评分波动程度（变异系数）
* 用户对各类型的评分偏好
* 用户最喜欢的电影类型及数量
* 活跃度（按照评分数量分段）
* 性别、职业、年龄分桶与编码

In [116]:
users_df.head()

Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


### 统计特征

In [117]:
user_stats=data.groupby('user_id')['rating'].agg([
    ('mean_rating', 'mean'), # 平均评分
    ('rating_std', 'std'), # 评分标准差
    ('rating_count','count'), # 评分次数
    ('rating_min','min'), # 最小评分
    ('rating_max','max') # 最大评分
]).reset_index()


# 计算用户评分严格程度
global_mean_rating=data['rating'].mean()
user_stats['rating_strictness']=global_mean_rating-user_stats['mean_rating']

# 计算用户评分波动程度
user_stats['rating_variability']=user_stats['rating_std']/(user_stats['mean_rating']+1e-5)

# 查看结果
print(user_stats.head())

   user_id  mean_rating  rating_std  rating_count  rating_min  rating_max  \
0        1     4.188679    0.680967            53           3           5   
1        2     3.713178    1.001513           129           1           5   
2        3     3.901961    0.984985            51           1           5   
3        4     4.190476    1.077917            21           1           5   
4        5     3.146465    1.132699           198           1           5   

   rating_strictness  rating_variability  
0          -0.607115            0.162573  
1          -0.131614            0.269718  
2          -0.320396            0.252433  
3          -0.608912            0.257230  
4           0.435100            0.359990  


In [118]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# 获取所有电影类型列
genre_columns=[col for col in data.columns if col not in ['user_id', 'movie_id', 'rating', 'timestamp','title', 'genres']]

# 计算用户对每种类型的评分次数和平均评分（用户ID、电影类型以及评分次数）
user_genre_stats=data.groupby('user_id')[genre_columns].sum().reset_index()

# 计算用户对每种类型的偏好程度（按行处理，将每一个用户对某一类型的评分，除以该用户对所有类型评分的总和）
for genre in genre_columns:
    user_genre_stats[f'{genre}_favorite_degree']=user_genre_stats[genre]/user_genre_stats[genre_columns].sum(axis=1)

# for genre in genre_columns:
#     user_genre_stats[f'{genre}_rating_cnt']=user_genre_stats[genre]

# 计算用户最喜欢的类型（返回最大值所在的索引），axis=1表示按照行操作
user_genre_stats['favorite_genre']=user_genre_stats[genre_columns].idxmax(axis=1)

# 计算用户喜欢的类型数量（评分过的类型数）（得到该行中评分大于0的类型数量）
user_genre_stats['num_liked_genres']=(user_genre_stats[genre_columns]>0).sum(axis=1)

user_genre_stats.drop(columns=genre_columns,inplace=True)

# 合并所有用户特征
user_features_v1=pd.merge(user_stats, user_genre_stats, on='user_id')

# 添加用户活跃度分段特征
user_features_v1['activity_level'] = pd.cut(
    user_features_v1['rating_count'],
    bins=[0, 5, 20, 100, float('inf')],
    labels=['0', '1', '2', '3']
)

# 用户活跃度特征编码
activity_encoder = LabelEncoder()
user_features_v1['activity_level_encoded'] = activity_encoder.fit_transform(user_features_v1['activity_level'])

# 用户最喜欢的类型编码
genre_encoder = LabelEncoder()
user_features_v1['favorite_genre_encoded'] = genre_encoder.fit_transform(user_features_v1['favorite_genre'])

### 获取用户自身的特征

In [119]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

user_features_v2=user_features_v1
# 1. 性别标签编码
encoder_gender = LabelEncoder()
user_features_v2['gender_encoded'] = encoder_gender.fit_transform(users_df['gender'])

In [120]:
pd.set_option('display.max_columns', None)
user_features_v2.head()

Unnamed: 0,user_id,mean_rating,rating_std,rating_count,rating_min,rating_max,rating_strictness,rating_variability,Action_favorite_degree,Adventure_favorite_degree,Animation_favorite_degree,Children's_favorite_degree,Comedy_favorite_degree,Crime_favorite_degree,Documentary_favorite_degree,Drama_favorite_degree,Fantasy_favorite_degree,Film-Noir_favorite_degree,Horror_favorite_degree,Musical_favorite_degree,Mystery_favorite_degree,Romance_favorite_degree,Sci-Fi_favorite_degree,Thriller_favorite_degree,War_favorite_degree,Western_favorite_degree,favorite_genre,num_liked_genres,activity_level,activity_level_encoded,favorite_genre_encoded,gender_encoded
0,1,4.188679,0.680967,53,3,5,-0.607115,0.162573,0.043103,0.043103,0.155172,0.172414,0.12069,0.017241,0.0,0.181034,0.025862,0.0,0.0,0.12069,0.0,0.051724,0.025862,0.025862,0.017241,0.0,Drama,13,2,1,7,0
1,2,3.713178,1.001513,129,1,5,-0.131614,0.269718,0.194444,0.065972,0.0,0.0,0.086806,0.041667,0.0,0.274306,0.003472,0.003472,0.006944,0.0,0.010417,0.083333,0.059028,0.107639,0.052083,0.010417,Drama,14,3,2,7,1
2,3,3.901961,0.984985,51,1,5,-0.320396,0.252433,0.186992,0.203252,0.02439,0.02439,0.243902,0.0,0.0,0.065041,0.01626,0.0,0.02439,0.00813,0.00813,0.04065,0.04878,0.04065,0.01626,0.04878,Comedy,15,2,1,4,1
3,4,4.190476,1.077917,21,1,5,-0.608912,0.25723,0.327586,0.103448,0.0,0.017241,0.0,0.017241,0.0,0.103448,0.034483,0.0,0.051724,0.0,0.0,0.034483,0.155172,0.068966,0.051724,0.034483,Action,12,2,1,0,1
4,5,3.146465,1.132699,198,1,5,0.4351,0.35999,0.088068,0.025568,0.011364,0.017045,0.159091,0.059659,0.017045,0.295455,0.0,0.008523,0.028409,0.008523,0.022727,0.085227,0.042614,0.110795,0.017045,0.002841,Drama,17,3,2,7,1


In [121]:
# 2. 职业标签编码
encoder_occupation=LabelEncoder()
user_features_v2['occupation_encoded'] = encoder_occupation.fit_transform(users_df['occupation'])

In [122]:
# 3. 年龄分桶
n_buckets = 7  # 希望分成的桶的数量
users_df['age_bucket'] = pd.qcut(users_df['age'], q=n_buckets, labels=False, duplicates='drop')

# 如果 qcut 导致桶的数量少于 n_buckets，我们仍然使用 LabelEncoder 保持编码的一致性
encoder_age = LabelEncoder()
user_features_v2['age_encoded'] = encoder_age.fit_transform(users_df['age_bucket'])

In [123]:
genre_to_numeric_mapping = dict(zip(genre_encoder.classes_, range(len(genre_encoder.classes_))))
print(genre_to_numeric_mapping)

{'Action': 0, 'Adventure': 1, 'Animation': 2, "Children's": 3, 'Comedy': 4, 'Crime': 5, 'Documentary': 6, 'Drama': 7, 'Fantasy': 8, 'Film-Noir': 9, 'Horror': 10, 'Musical': 11, 'Mystery': 12, 'Romance': 13, 'Sci-Fi': 14, 'Thriller': 15, 'War': 16, 'Western': 17}


In [124]:
user_features_v2['favorite_genre'] = user_features_v2['favorite_genre'].apply(lambda x: genre_encoder.transform(x.split('|')))

In [125]:
user_features_v2.head()

Unnamed: 0,user_id,mean_rating,rating_std,rating_count,rating_min,rating_max,rating_strictness,rating_variability,Action_favorite_degree,Adventure_favorite_degree,Animation_favorite_degree,Children's_favorite_degree,Comedy_favorite_degree,Crime_favorite_degree,Documentary_favorite_degree,Drama_favorite_degree,Fantasy_favorite_degree,Film-Noir_favorite_degree,Horror_favorite_degree,Musical_favorite_degree,Mystery_favorite_degree,Romance_favorite_degree,Sci-Fi_favorite_degree,Thriller_favorite_degree,War_favorite_degree,Western_favorite_degree,favorite_genre,num_liked_genres,activity_level,activity_level_encoded,favorite_genre_encoded,gender_encoded,occupation_encoded,age_encoded
0,1,4.188679,0.680967,53,3,5,-0.607115,0.162573,0.043103,0.043103,0.155172,0.172414,0.12069,0.017241,0.0,0.181034,0.025862,0.0,0.0,0.12069,0.0,0.051724,0.025862,0.025862,0.017241,0.0,[7],13,2,1,7,0,10,0
1,2,3.713178,1.001513,129,1,5,-0.131614,0.269718,0.194444,0.065972,0.0,0.0,0.086806,0.041667,0.0,0.274306,0.003472,0.003472,0.006944,0.0,0.010417,0.083333,0.059028,0.107639,0.052083,0.010417,[7],14,3,2,7,1,16,4
2,3,3.901961,0.984985,51,1,5,-0.320396,0.252433,0.186992,0.203252,0.02439,0.02439,0.243902,0.0,0.0,0.065041,0.01626,0.0,0.02439,0.00813,0.00813,0.04065,0.04878,0.04065,0.01626,0.04878,[4],15,2,1,4,1,15,1
3,4,4.190476,1.077917,21,1,5,-0.608912,0.25723,0.327586,0.103448,0.0,0.017241,0.0,0.017241,0.0,0.103448,0.034483,0.0,0.051724,0.0,0.0,0.034483,0.155172,0.068966,0.051724,0.034483,[0],12,2,1,0,1,7,3
4,5,3.146465,1.132699,198,1,5,0.4351,0.35999,0.088068,0.025568,0.011364,0.017045,0.159091,0.059659,0.017045,0.295455,0.0,0.008523,0.028409,0.008523,0.022727,0.085227,0.042614,0.110795,0.017045,0.002841,[7],17,3,2,7,1,20,1


### 预处理

In [126]:
from sklearn.preprocessing import StandardScaler

# 特征归一化
scaler = StandardScaler()
features_to_scale = ['rating_count', 'rating_min', 'rating_max', 'rating_std', 'num_liked_genres']
user_features_v2[features_to_scale] = scaler.fit_transform(user_features_v2[features_to_scale])

In [127]:
user_features_v2.drop(columns=['activity_level', 'favorite_genre'], inplace=True)
user_features_v2.head()

Unnamed: 0,user_id,mean_rating,rating_std,rating_count,rating_min,rating_max,rating_strictness,rating_variability,Action_favorite_degree,Adventure_favorite_degree,Animation_favorite_degree,Children's_favorite_degree,Comedy_favorite_degree,Crime_favorite_degree,Documentary_favorite_degree,Drama_favorite_degree,Fantasy_favorite_degree,Film-Noir_favorite_degree,Horror_favorite_degree,Musical_favorite_degree,Mystery_favorite_degree,Romance_favorite_degree,Sci-Fi_favorite_degree,Thriller_favorite_degree,War_favorite_degree,Western_favorite_degree,num_liked_genres,activity_level_encoded,favorite_genre_encoded,gender_encoded,occupation_encoded,age_encoded
0,1,4.188679,-1.615782,-0.584221,3.199606,0.061461,-0.607115,0.162573,0.043103,0.043103,0.155172,0.172414,0.12069,0.017241,0.0,0.181034,0.025862,0.0,0.0,0.12069,0.0,0.051724,0.025862,0.025862,0.017241,0.0,-1.097206,1,7,0,10,0
1,2,3.713178,-0.042568,-0.189889,-0.504394,0.061461,-0.131614,0.269718,0.194444,0.065972,0.0,0.0,0.086806,0.041667,0.0,0.274306,0.003472,0.003472,0.006944,0.0,0.010417,0.083333,0.059028,0.107639,0.052083,0.010417,-0.665949,2,7,1,16,4
2,3,3.901961,-0.123684,-0.594598,-0.504394,0.061461,-0.320396,0.252433,0.186992,0.203252,0.02439,0.02439,0.243902,0.0,0.0,0.065041,0.01626,0.0,0.02439,0.00813,0.00813,0.04065,0.04878,0.04065,0.01626,0.04878,-0.234692,1,4,1,15,1
3,4,4.190476,0.332416,-0.750255,-0.504394,0.061461,-0.608912,0.25723,0.327586,0.103448,0.0,0.017241,0.0,0.017241,0.0,0.103448,0.034483,0.0,0.051724,0.0,0.0,0.034483,0.155172,0.068966,0.051724,0.034483,-1.528462,1,0,1,7,3
4,5,3.146465,0.601283,0.168123,-0.504394,0.061461,0.4351,0.35999,0.088068,0.025568,0.011364,0.017045,0.159091,0.059659,0.017045,0.295455,0.0,0.008523,0.028409,0.008523,0.022727,0.085227,0.042614,0.110795,0.017045,0.002841,0.627821,2,7,1,20,1


## 保存特征数据

In [128]:
user_features_v2.to_csv('/Users/bytedance/Desktop/MovieLens-Recommendation-System/recall_dual_tower/features/user_features.csv', index=False)
movie_features_v1.to_csv('/Users/bytedance/Desktop/MovieLens-Recommendation-System/recall_dual_tower/features/movie_features.csv', index=False)

# 保存为csv的时候，list会变成object

In [129]:
# 保存为 Pickle 格式
import pickle
with open('./features/movie_features.pkl', 'wb') as f:
    pickle.dump(movie_features_v1, f)

with open('./features/user_features.pkl', 'wb') as f:
    pickle.dump(user_features_v1, f)