In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine,correlation


In [2]:
data = pd.read_csv('./ml-100k/u.data')
data.head()

Unnamed: 0,196	242	3	881250949
0,186\t302\t3\t891717742
1,22\t377\t1\t878887116
2,244\t51\t2\t880606923
3,166\t346\t1\t886397596
4,298\t474\t4\t884182806


In [3]:
data = pd.read_csv('./ml-100k/u.data',sep = '\t', names = ['user_id','item_id','rating','timstamp'])

In [4]:
data.head()

Unnamed: 0,user_id,item_id,rating,timstamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
user_id     100000 non-null int64
item_id     100000 non-null int64
rating      100000 non-null int64
timstamp    100000 non-null int64
dtypes: int64(4)
memory usage: 3.1 MB


In [6]:
data.describe()

Unnamed: 0,user_id,item_id,rating,timstamp
count,100000.0,100000.0,100000.0,100000.0
mean,462.48475,425.53013,3.52986,883528900.0
std,266.61442,330.798356,1.125674,5343856.0
min,1.0,1.0,1.0,874724700.0
25%,254.0,175.0,3.0,879448700.0
50%,447.0,322.0,4.0,882826900.0
75%,682.0,631.0,4.0,888260000.0
max,943.0,1682.0,5.0,893286600.0


In [7]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(data,test_size = 0.1) 
# 训练集与测试集70:30比例拆分

In [8]:
train_data.describe()

Unnamed: 0,user_id,item_id,rating,timstamp
count,85000.0,85000.0,85000.0,85000.0
mean,462.655776,425.708412,3.531094,883519400.0
std,266.578429,330.802168,1.124984,5340452.0
min,1.0,1.0,1.0,874724700.0
25%,254.0,175.0,3.0,879448600.0
50%,447.0,322.0,4.0,882824900.0
75%,682.0,631.0,4.0,888206000.0
max,943.0,1682.0,5.0,893286600.0


In [9]:
# 此时的data就为用户评分表，即复现中的movie_rating
# 创建用户-电影评分表
rating_matrix = train_data.pivot_table(index = ['item_id'],columns = ['user_id'],
                                values = 'rating').reset_index(drop = True)
rating_matrix.fillna(0, inplace = True)
user_item = rating_matrix
user_item.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
0,5.0,4.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,4.0,...,2.0,3.0,0.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0
1,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,4.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
user_item.shape

(1663, 943)

In [11]:
# 训练集构造见物品相似矩阵
movie_similarity = 1 - pairwise_distances(rating_matrix.values, metric = "cosine")
np.fill_diagonal(movie_similarity,0)
rating_matrix = pd.DataFrame(movie_similarity)
rating_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1653,1654,1655,1656,1657,1658,1659,1660,1661,1662
0,0.0,0.325363,0.268649,0.388008,0.215161,0.085867,0.53009,0.406943,0.419975,0.241605,...,0.035962,0.0,0.0,0.0,0.0,0.038143,0.0,0.0,0.050858,0.050858
1,0.325363,0.0,0.204581,0.415963,0.276896,0.073932,0.314428,0.29325,0.186448,0.119369,...,0.059785,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.084549,0.084549
2,0.268649,0.204581,0.0,0.277023,0.19661,0.087499,0.300503,0.172512,0.247741,0.153123,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.10224
3,0.388008,0.415963,0.277023,0.0,0.303763,0.085834,0.436866,0.467931,0.368093,0.238914,...,0.042367,0.0,0.0,0.09986,0.09986,0.0,0.0,0.0,0.059916,0.0
4,0.215161,0.276896,0.19661,0.303763,0.0,0.044885,0.256256,0.18805,0.253625,0.035021,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.104893


In [12]:
rating_matrix.shape

(1663, 1663)

In [13]:
movie_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
movie = pd.read_csv('./ml-100k/u.item', sep = '|', names = movie_cols, encoding = 'latin-1',usecols = ["movie_id","title","release_date","video_release_date","imdb_url"])
movie.drop(movie.columns[[3,4]], axis = 1, inplace =  True)
movie.head()

Unnamed: 0,movie_id,title,release_date
0,1,Toy Story (1995),01-Jan-1995
1,2,GoldenEye (1995),01-Jan-1995
2,3,Four Rooms (1995),01-Jan-1995
3,4,Get Shorty (1995),01-Jan-1995
4,5,Copycat (1995),01-Jan-1995


In [14]:
# 训练集推荐TOP_N
user_inp = "Four Rooms (1995)" 
inp = movie[movie['title'] == user_inp].index.tolist()
inp = inp[0]
movie['similarity'] = rating_matrix.iloc[inp]
movie.columns=['movie_id', 'title', 'release_date', 'similarity']
movie.head(5)

Unnamed: 0,movie_id,title,release_date,similarity
0,1,Toy Story (1995),01-Jan-1995,0.268649
1,2,GoldenEye (1995),01-Jan-1995,0.204581
2,3,Four Rooms (1995),01-Jan-1995,0.0
3,4,Get Shorty (1995),01-Jan-1995,0.277023
4,5,Copycat (1995),01-Jan-1995,0.19661


In [15]:
# 测试集构造
# 此时的data就为用户评分表，即复现中的movie_rating
# 创建用户-电影评分表
rating_test_matrix = test_data.pivot_table(index = ['item_id'],columns = ['user_id'],
                                values = 'rating').reset_index(drop = True)
rating_test_matrix.fillna(0, inplace = True)
user_item_test = rating_test_matrix
user_item_test.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
user_item_test.shape

(1344, 939)

In [17]:
# 测试集构造见物品相似矩阵
movie_similarity_test = 1 - pairwise_distances(rating_test_matrix.values, metric = "cosine")
np.fill_diagonal(movie_similarity_test,0)
rating_matrix_test = pd.DataFrame(movie_similarity_test)
rating_matrix_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1334,1335,1336,1337,1338,1339,1340,1341,1342,1343
0,0.0,0.013118,0.047892,0.134074,0.036122,0.0,0.09819,0.028913,0.044272,0.121506,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.013118,0.0,0.0,0.173735,0.039527,0.0,0.072338,0.047457,0.015298,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.047892,0.0,0.0,0.056381,0.151523,0.0,0.046606,0.023102,0.130322,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.134074,0.173735,0.056381,0.0,0.0,0.0,0.092705,0.025529,0.057606,0.050486,...,0.0,0.0,0.0,0.0,0.0,0.0,0.111629,0.0,0.0,0.0
4,0.036122,0.039527,0.151523,0.0,0.0,0.0,0.138412,0.182956,0.039493,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
rating_matrix_test.shape

(1344, 1344)

In [19]:
# 测试集top-N推荐
user_inp = "Four Rooms (1995)" 
inp = movie[movie['title'] == user_inp].index.tolist()
inp = inp[0]
movie['similarity'] = rating_matrix_test.iloc[inp]
movie.columns=['movie_id', 'title', 'release_date', 'similarity']
movie.head(5)

Unnamed: 0,movie_id,title,release_date,similarity
0,1,Toy Story (1995),01-Jan-1995,0.047892
1,2,GoldenEye (1995),01-Jan-1995,0.0
2,3,Four Rooms (1995),01-Jan-1995,0.0
3,4,Get Shorty (1995),01-Jan-1995,0.056381
4,5,Copycat (1995),01-Jan-1995,0.151523


In [20]:
# 训练集预测，求得RMSE
# 将dataframe形式的用户物品相似矩阵转为numpy 数组
rating_matrix_array = rating_matrix.values
user_item_pre = rating_matrix_array.dot(movie_similarity) / np.array([np.abs(movie_similarity).sum(axis = 1)])

pre_flatten = user_item_pre
rating_matrix_flatten = rating_matrix_array
from sklearn.metrics import mean_squared_error
from math import sqrt
error_train = sqrt(mean_squared_error(pre_flatten, rating_matrix_flatten))
print(error_train)

0.07056032278349414


In [21]:
# 测试集预测，求得RMSE
rating_test_matrix_array = rating_matrix_test.values
user_item_test_pre = rating_test_matrix_array.dot(movie_similarity_test) / np.array([np.abs(movie_similarity_test).sum(axis = 1)])

test_pre_flatten = user_item_test_pre
test_rating_matrix_flatten = rating_test_matrix_array
error_test = sqrt(mean_squared_error(test_pre_flatten, test_rating_matrix_flatten))

In [22]:
print(error_test)

0.04059048693811386
