In [1]:
from modelarts.session import Session
session = Session()
import numpy as np
import pandas as pd
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine,correlation
from sklearn.model_selection import train_test_split


In [2]:
# 创建用户信息表
users_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('./ml-100k/u.user', sep='|', names=users_cols, parse_dates=True)
users.shape

(943, 5)

In [3]:
# 创建电影评分信息表
movie_rating_cols = ['user_id',  'movie_id', 'rating', 'unix_tiemstamp'] # set the table and col_name,define it use_rating_cols
movie_rating = pd.read_csv('./ml-100k/u.data',sep = '\t', names = movie_rating_cols, parse_dates = True)
movie_rating.shape

(100000, 4)

In [4]:
# 创建电影信息表
movie_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
movie = pd.read_csv('./ml-100k/u.item', sep = '|', names = movie_cols, encoding = 'latin-1',usecols = ["movie_id","title","release_date","video_release_date","imdb_url"])
movie.shape

(1682, 5)

In [5]:
# 合并电影和电影评分信息
movie_ratings = pd.merge(movie, movie_rating)
# 在movie_ratings的基础上合并用户信息
dataframe = pd.merge(movie_ratings, users)
dataframe.shape

(100000, 12)

In [6]:
# 清除无效信息
dataframe.drop(dataframe.columns[[3,4,7]], axis = 1, inplace = True)
movie_rating.drop("unix_tiemstamp", inplace = True, axis = 1)
movie.drop(movie.columns[[3,4]], axis = 1, inplace =  True)
dataframe.head()

Unnamed: 0,movie_id,title,release_date,user_id,rating,age,sex,occupation,zip_code
0,1,Toy Story (1995),01-Jan-1995,308,4,60,M,retired,95076
1,4,Get Shorty (1995),01-Jan-1995,308,5,60,M,retired,95076
2,5,Copycat (1995),01-Jan-1995,308,4,60,M,retired,95076
3,7,Twelve Monkeys (1995),01-Jan-1995,308,4,60,M,retired,95076
4,8,Babe (1995),01-Jan-1995,308,5,60,M,retired,95076


In [7]:
# 创建用户评分表
movie_rating_matrix = movie_rating.pivot_table(index = ['movie_id'], columns = ['user_id'],
                                                values = 'rating').reset_index(drop = True)
movie_rating_matrix.fillna(0, inplace = True)
cmu = movie_rating_matrix
cmu.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
0,5.0,4.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,4.0,...,2.0,3.0,4.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0
1,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,4.0,...,5.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
4,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
movie_similarity = pairwise_distances(movie_rating_matrix.values, metric = "euclidean")
np.fill_diagonal(movie_similarity,0)
movie_rating_matrix = pd.DataFrame(movie_similarity)
# 归一化
movie_rating_matrix_sim = ( movie_rating_matrix -  movie_rating_matrix.min()) / ( movie_rating_matrix.max() -  movie_rating_matrix.min())
movie_rating_matrix_sim.head()
# 使用 movie_similarity = 1 / pairwise_distances(movie_rating_matrix.values, metric = "euclidean")以归一化时
# 因为有的矩阵值为0，因此出现了1/0的错误，利用函数进行转换时则在矩阵行列重新赋值上出现错误，需要重新改进，于是使用pandas中的归一方法 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,0.0,0.798145,0.79286,0.838456,0.800474,0.797119,0.789699,0.83349,0.839681,0.799925,...,0.789635,0.791043,0.79083,0.790384,0.789635,0.789636,0.789801,0.789698,0.789511,0.789511
1,0.817693,0.0,0.417626,0.516981,0.40511,0.390031,0.80344,0.650229,0.752656,0.471085,...,0.358191,0.359055,0.358285,0.3576,0.358191,0.356972,0.357818,0.35729,0.356095,0.356095
2,0.842428,0.433127,0.0,0.568357,0.386745,0.326689,0.801822,0.67732,0.727459,0.431215,...,0.289976,0.291035,0.290052,0.289239,0.289076,0.288575,0.289674,0.288988,0.290052,0.287342
3,0.808984,0.486884,0.516113,0.0,0.510885,0.516712,0.772038,0.627864,0.722977,0.548609,...,0.496426,0.497059,0.493923,0.494255,0.495375,0.495381,0.495909,0.495579,0.494977,0.49445
4,0.858113,0.423899,0.390199,0.567624,0.0,0.344746,0.816572,0.660419,0.729573,0.461466,...,0.2981,0.299131,0.298178,0.297384,0.2981,0.296726,0.29779,0.297126,0.298178,0.295543


In [9]:
movie_rating_matrix_sim.shape

(1682, 1682)

In [12]:
# 推荐相似性较高的前5部
user_inp = "Copycat (1995)" # have a blank between cat and (), attention!
inp = movie[movie['title'] == user_inp].index.tolist()
# movie['title'] == user_inp条件
# movie[movie['title'] == user_inp] 条件所在行
# movie[movie['title'] == user_inp].index.tolist() 确定这些行所在列
inp = inp[0]
movie['similarity'] = movie_rating_matrix_sim.iloc[inp]
# iloc 基于索引确定
movie.columns=['movie_id', 'title', 'release_date', 'similarity']
movie.head(5)

Unnamed: 0,movie_id,title,release_date,similarity
0,1,Toy Story (1995),01-Jan-1995,0.858113
1,2,GoldenEye (1995),01-Jan-1995,0.423899
2,3,Four Rooms (1995),01-Jan-1995,0.390199
3,4,Get Shorty (1995),01-Jan-1995,0.567624
4,5,Copycat (1995),01-Jan-1995,0.0
