In [2]:
from modelarts.session import Session
session = Session()
import numpy as np
import pandas as pd
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine,correlation
from sklearn.model_selection import train_test_split


In [3]:
# 创建用户信息表
users_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('./ml-100k/u.user', sep='|', names=users_cols, parse_dates=True)
users.shape

(943, 5)

In [4]:
# 创建电影评分信息表
movie_rating_cols = ['user_id',  'movie_id', 'rating', 'unix_tiemstamp'] # set the table and col_name,define it use_rating_cols
movie_rating = pd.read_csv('./ml-100k/u.data',sep = '\t', names = movie_rating_cols, parse_dates = True)
movie_rating.shape

(100000, 4)

In [5]:
# 创建电影信息表
movie_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
movie = pd.read_csv('./ml-100k/u.item', sep = '|', names = movie_cols, encoding = 'latin-1',usecols = ["movie_id","title","release_date","video_release_date","imdb_url"])
movie.shape

(1682, 5)

In [6]:
# 合并电影和电影评分信息
movie_ratings = pd.merge(movie, movie_rating)
# 在movie_ratings的基础上合并用户信息
dataframe = pd.merge(movie_ratings, users)
dataframe.shape

(100000, 12)

In [7]:
# 清除无效信息
dataframe.drop(dataframe.columns[[3,4,7]], axis = 1, inplace = True)
movie_rating.drop("unix_tiemstamp", inplace = True, axis = 1)
movie.drop(movie.columns[[3,4]], axis = 1, inplace =  True)
dataframe.head()

Unnamed: 0,movie_id,title,release_date,user_id,rating,age,sex,occupation,zip_code
0,1,Toy Story (1995),01-Jan-1995,308,4,60,M,retired,95076
1,4,Get Shorty (1995),01-Jan-1995,308,5,60,M,retired,95076
2,5,Copycat (1995),01-Jan-1995,308,4,60,M,retired,95076
3,7,Twelve Monkeys (1995),01-Jan-1995,308,4,60,M,retired,95076
4,8,Babe (1995),01-Jan-1995,308,5,60,M,retired,95076


In [8]:
# 创建用户评分表
movie_rating_matrix = movie_rating.pivot_table(index = ['movie_id'], columns = ['user_id'],
                                                values = 'rating').reset_index(drop = True)
movie_rating_matrix.fillna(0, inplace = True)
cmu = movie_rating_matrix
cmu.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
0,5.0,4.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,4.0,...,2.0,3.0,4.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0
1,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,4.0,...,5.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
4,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
movie_similarity = pairwise_distances(movie_rating_matrix.values, metric = "euclidean")
np.fill_diagonal(movie_similarity,0)
movie_rating_matrix = pd.DataFrame(movie_similarity)
movie_rating_matrix.head()
# 使用 movie_similarity = 1 / pairwise_distances(movie_rating_matrix.values, metric = "euclidean")以使相似度减小时
# 因为有的矩阵值为0，因此出现了1/0的错误，利用函数进行转换时则在矩阵行列重新赋值上出现错误，需要重新改进
''' movie_simi = pairwise_distances(movie_rating_matrix.values, metric = "euclidean")
    def checkZ(movie_simi):
        if(movie_simi.any() == 0)：
            movie_simi = 0
        else:
            movie_simi = 1 / pairwise_distances(movie_rating_matrix.values, metric = "euclidean")
    movie_similarity = movie_simi
    np.fill_diagonal(movie_similarity,0)
    movie_rating_matrix = pd.DataFrame(movie_similarity)
'''

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,0.0,77.72387,80.074965,76.896034,81.565924,84.734881,71.007042,77.006493,78.625696,82.752643,...,84.723078,84.87049,84.82924,84.799764,84.723078,84.782074,84.82924,84.799764,84.687661,84.687661
1,77.72387,0.0,42.178193,47.413078,41.279535,41.460825,72.242647,60.074953,70.476947,48.733972,...,38.431758,38.522721,38.431758,38.366652,38.431758,38.327536,38.431758,38.366652,38.196859,38.196859
2,80.074965,42.178193,0.0,52.12485,39.408121,34.727511,72.097157,62.577951,68.117545,44.609416,...,31.112698,31.22499,31.112698,31.032241,31.016125,30.983867,31.112698,31.032241,31.112698,30.82207
3,76.896034,47.413078,52.12485,0.0,52.05766,54.927225,69.419018,58.00862,67.697858,56.753854,...,53.263496,53.329167,52.981129,53.028294,53.150729,53.188345,53.263496,53.216539,53.094256,53.037722
4,81.565924,41.279535,39.408121,52.05766,0.0,36.646964,73.42343,61.016391,68.315445,47.738873,...,31.984371,32.093613,31.984371,31.906112,31.984371,31.859065,31.984371,31.906112,31.984371,31.701735


In [12]:
movie_rating_matrix.shape

(1682, 1682)

In [13]:
# 推荐相似性较高的前5部
user_inp = "Copycat (1995)" # have a blank between cat and (), attention!
inp = movie[movie['title'] == user_inp].index.tolist()
# movie['title'] == user_inp条件
# movie[movie['title'] == user_inp] 条件所在行
# movie[movie['title'] == user_inp].index.tolist() 确定这些行所在列
inp = inp[0]
movie['similarity'] = movie_rating_matrix.iloc[inp]
# iloc 基于索引确定
movie.columns=['movie_id', 'title', 'release_date', 'similarity']
movie.head(5)

Unnamed: 0,movie_id,title,release_date,similarity
0,1,Toy Story (1995),01-Jan-1995,81.565924
1,2,GoldenEye (1995),01-Jan-1995,41.279535
2,3,Four Rooms (1995),01-Jan-1995,39.408121
3,4,Get Shorty (1995),01-Jan-1995,52.05766
4,5,Copycat (1995),01-Jan-1995,0.0
