In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine,correlation


In [2]:
data = pd.read_csv('./ml-100k/u.data')
data.head()

Unnamed: 0,196	242	3	881250949
0,186\t302\t3\t891717742
1,22\t377\t1\t878887116
2,244\t51\t2\t880606923
3,166\t346\t1\t886397596
4,298\t474\t4\t884182806


In [3]:
data = pd.read_csv('./ml-100k/u.data',sep = '\t', names = ['user_id','item_id','rating','timstamp'])

In [4]:
data.head()

Unnamed: 0,user_id,item_id,rating,timstamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
user_id     100000 non-null int64
item_id     100000 non-null int64
rating      100000 non-null int64
timstamp    100000 non-null int64
dtypes: int64(4)
memory usage: 3.1 MB


In [6]:
data.describe()

Unnamed: 0,user_id,item_id,rating,timstamp
count,100000.0,100000.0,100000.0,100000.0
mean,462.48475,425.53013,3.52986,883528900.0
std,266.61442,330.798356,1.125674,5343856.0
min,1.0,1.0,1.0,874724700.0
25%,254.0,175.0,3.0,879448700.0
50%,447.0,322.0,4.0,882826900.0
75%,682.0,631.0,4.0,888260000.0
max,943.0,1682.0,5.0,893286600.0


In [7]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(data,test_size = 0.2) 
# 训练集与测试集70:30比例拆分

In [8]:
train_data.describe()

Unnamed: 0,user_id,item_id,rating,timstamp
count,80000.0,80000.0,80000.0,80000.0
mean,462.47835,425.68265,3.531125,883519900.0
std,266.754262,331.326184,1.125476,5346875.0
min,1.0,1.0,1.0,874724700.0
25%,254.0,175.0,3.0,879447800.0
50%,446.0,322.0,4.0,882824200.0
75%,682.0,631.0,4.0,888206500.0
max,943.0,1681.0,5.0,893286600.0


In [9]:
# 此时的data就为用户评分表，即复现中的movie_rating
# 创建用户-电影评分表
rating_matrix = train_data.pivot_table(index = ['item_id'],columns = ['user_id'],
                                values = 'rating').reset_index(drop = True)
rating_matrix.fillna(0, inplace = True)
user_item = rating_matrix
user_item.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
0,5.0,4.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,4.0,...,2.0,3.0,4.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0
1,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,4.0,...,5.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
4,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
user_item.shape

(1653, 943)

In [11]:
# 训练集构造见物品相似矩阵
movie_similarity = 1 - pairwise_distances(rating_matrix.values, metric = "cosine")
np.fill_diagonal(movie_similarity,0)
rating_matrix = pd.DataFrame(movie_similarity)
rating_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1643,1644,1645,1646,1647,1648,1649,1650,1651,1652
0,0.0,0.29502,0.257755,0.375809,0.244381,0.079282,0.474922,0.387328,0.420855,0.240923,...,0.0,0.0,0.038821,0.0,0.0,0.0,0.038821,0.0,0.0,0.0
1,0.29502,0.0,0.179476,0.379244,0.22292,0.03772,0.308107,0.2591,0.210892,0.114269,...,0.0,0.060758,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.085925
2,0.257755,0.179476,0.0,0.275472,0.146916,0.088198,0.283946,0.17615,0.247357,0.145872,...,0.0,0.0,0.0,0.0,0.0,0.0,0.035007,0.0,0.0,0.0
3,0.375809,0.379244,0.275472,0.0,0.24178,0.093721,0.383565,0.359082,0.343314,0.217048,...,0.0,0.0,0.0,0.0,0.104921,0.104921,0.041968,0.0,0.0,0.0
4,0.244381,0.22292,0.146916,0.24178,0.0,0.050246,0.289706,0.212446,0.240609,0.0451,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
rating_matrix.shape

(1653, 1653)

In [13]:
movie_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
movie = pd.read_csv('./ml-100k/u.item', sep = '|', names = movie_cols, encoding = 'latin-1',usecols = ["movie_id","title","release_date","video_release_date","imdb_url"])
movie.drop(movie.columns[[3,4]], axis = 1, inplace =  True)
movie.head()

Unnamed: 0,movie_id,title,release_date
0,1,Toy Story (1995),01-Jan-1995
1,2,GoldenEye (1995),01-Jan-1995
2,3,Four Rooms (1995),01-Jan-1995
3,4,Get Shorty (1995),01-Jan-1995
4,5,Copycat (1995),01-Jan-1995


In [14]:
# 训练集推荐TOP_N
user_inp = "Four Rooms (1995)" 
inp = movie[movie['title'] == user_inp].index.tolist()
inp = inp[0]
movie['similarity'] = rating_matrix.iloc[inp]
movie.columns=['movie_id', 'title', 'release_date', 'similarity']
movie.head(5)

Unnamed: 0,movie_id,title,release_date,similarity
0,1,Toy Story (1995),01-Jan-1995,0.257755
1,2,GoldenEye (1995),01-Jan-1995,0.179476
2,3,Four Rooms (1995),01-Jan-1995,0.0
3,4,Get Shorty (1995),01-Jan-1995,0.275472
4,5,Copycat (1995),01-Jan-1995,0.146916


In [15]:
# 测试集构造
# 此时的data就为用户评分表，即复现中的movie_rating
# 创建用户-电影评分表
rating_test_matrix = test_data.pivot_table(index = ['item_id'],columns = ['user_id'],
                                values = 'rating').reset_index(drop = True)
rating_test_matrix.fillna(0, inplace = True)
user_item_test = rating_test_matrix
user_item_test.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
user_item_test.shape

(1408, 943)

In [17]:
# 测试集构造见物品相似矩阵
movie_similarity_test = 1 - pairwise_distances(rating_test_matrix.values, metric = "cosine")
np.fill_diagonal(movie_similarity_test,0)
rating_matrix_test = pd.DataFrame(movie_similarity_test)
rating_matrix_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1398,1399,1400,1401,1402,1403,1404,1405,1406,1407
0,0.0,0.098176,0.057578,0.179906,0.074565,0.0,0.085373,0.131421,0.090497,0.056024,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.098176,0.0,0.047695,0.0725,0.063081,0.0,0.166615,0.072013,0.02751,0.011976,...,0.0,0.190117,0.190117,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.057578,0.047695,0.0,0.077952,0.04162,0.0,0.079729,0.053452,0.0,0.084285,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.179906,0.0725,0.077952,0.0,0.046863,0.0,0.137108,0.08727,0.121396,0.064059,...,0.0,0.0,0.0,0.0,0.0,0.169485,0.0,0.0,0.0,0.0
4,0.074565,0.063081,0.04162,0.046863,0.0,0.0,0.070299,0.086406,0.038409,0.059221,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
rating_matrix_test.shape

(1408, 1408)

In [19]:
# 测试集top-N推荐
user_inp = "Four Rooms (1995)" 
inp = movie[movie['title'] == user_inp].index.tolist()
inp = inp[0]
movie['similarity'] = rating_matrix_test.iloc[inp]
movie.columns=['movie_id', 'title', 'release_date', 'similarity']
movie.head(5)

Unnamed: 0,movie_id,title,release_date,similarity
0,1,Toy Story (1995),01-Jan-1995,0.057578
1,2,GoldenEye (1995),01-Jan-1995,0.047695
2,3,Four Rooms (1995),01-Jan-1995,0.0
3,4,Get Shorty (1995),01-Jan-1995,0.077952
4,5,Copycat (1995),01-Jan-1995,0.04162


In [20]:
# 训练集预测，求得RMSE
# 将dataframe形式的用户物品相似矩阵转为numpy 数组
rating_matrix_array = rating_matrix.values
user_item_pre = rating_matrix_array.dot(movie_similarity) / np.array([np.abs(movie_similarity).sum(axis = 1)])

pre_flatten = user_item_pre
rating_matrix_flatten = rating_matrix_array
from sklearn.metrics import mean_squared_error
from math import sqrt
error_train = sqrt(mean_squared_error(pre_flatten, rating_matrix_flatten))
print(error_train)

0.06818551177939682


In [21]:
# 测试集预测，求得RMSE
rating_test_matrix_array = rating_matrix_test.values
user_item_test_pre = rating_test_matrix_array.dot(movie_similarity_test) / np.array([np.abs(movie_similarity_test).sum(axis = 1)])

test_pre_flatten = user_item_test_pre
test_rating_matrix_flatten = rating_test_matrix_array
error_test = sqrt(mean_squared_error(test_pre_flatten, test_rating_matrix_flatten))

In [22]:
print(error_test)

0.042863154834767984
