# == 목차 == 
# 1. pre-processing for movie-lens data (ml-latest-small, 2.4M)
# 2. 모델 정의
# 3. 테스트
# 4. cross validation
# 5. 실제 구현

## =====================================================================

In [1]:
import sys
sys.path.insert(0, '..')
import recsys_movielens as rm

## =====================================================================
# 1. pre-processing for movie-lens data (ml-latest-small, 2.4M)
## users: 671, movies: 9066

## ● 기본 데이터셋 생성

In [2]:
import pandas as pd
import numpy as np

In [3]:
file_path_movie = '/Users/morulabs/dev/source/git_hub/recsys/movie-lens/ml-latest-small/movies.csv'
file_path_rating = '/Users/morulabs/dev/source/git_hub/recsys/movie-lens/ml-latest-small/ratings.csv'

### 영화 데이터프레임

In [4]:
header_movie = ['item_id', 'title', 'genres']
df_movies = pd.read_csv(file_path_movie, header=0, names=header_movie)
df_movies

Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


### 평점 데이터프레임

In [5]:
header_rating = ['user_id', 'item_id', 'rating', 'timestamp']
df_ratings = pd.read_csv(file_path_rating, header=0, names=header_rating)
df_ratings

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


### 영화 id의 최대 숫자가 163949로 sparse matrix 를 생성했을 때 용량이 매우 커지게 된다.
### 이 문제를 해결하기 위해서 영화별 unique id 를 생성하여 mapping table로 관리한다.
### unique id는 영화별 1씩 증가하는 숫자

In [6]:
from recsys_movielens.preprocessing import get_mapping_table

df_mapping = get_mapping_table(df_ratings, 'item_id', 'movie_unique_id')
df_mapping

Unnamed: 0,movie_unique_id,item_id
0,1,1
1,2,2
2,3,3
3,4,4
4,5,5
5,6,6
6,7,7
7,8,8
8,9,9
9,10,10


### 위에서 만든 mapping table로 df_ratings의 item_id를 movie_unique_id로 바꾼다.

In [7]:
df_join = pd.merge(df_ratings, df_mapping, on=['item_id'])

del df_join['item_id']
columns_new = ['user_id', 'movie_unique_id', 'rating', 'timestamp']
df_join = df_join[columns_new]
df = df_join.rename(columns={'movie_unique_id': 'item_id'})

df

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,31,2.5,1260759144
1,7,31,3.0,851868750
2,31,31,4.0,1273541953
3,32,31,4.0,834828440
4,36,31,3.0,847057202
5,39,31,3.0,832525157
6,73,31,3.5,1255591860
7,88,31,3.0,1239755559
8,96,31,2.5,1223256331
9,110,31,4.0,840100695


## ● 모델의 성능평가를 위해 위에서 만든 기본 데이터셋을 훈련/테스트 셋으로 나눈다.
### create train / test split (randomly)
#### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;무작위로 데이터를 train set과 test set으로 나눈다. ratio_test=0.25면 train:test (0.75:0.25)



### randomly create train / test split

In [8]:
from recsys_movielens.model_selection import split_train_test_random
from recsys_movielens.preprocessing import convert_df_spar

df_train, df_test = split_train_test_random(df, ratio_test=0.25)
df_spar_train = convert_df_spar(df_train, df.item_id.max())
df_spar_test = convert_df_spar(df_test, df.item_id.max())

print('shape of train: {}'.format(df_spar_train.shape))
print('shape of test: {}'.format(df_spar_test.shape))

shape of train: (671, 9066)
shape of test: (671, 9066)


## =====================================================================
# 2. 모델 정의

## ● User-based

#### 특정 유저의 특정 영화에 대한 평점 예측 ( user:movie -> 1:1 )

In [9]:
from recsys_movielens.model import predict_userbased_user_item

#### 특정 유저의 모든 영화에 대한 평점 예측 ( user:movie -> 1:all )

In [10]:
from recsys_movielens.model import predict_userbased_user_allitems

#### Model 생성 (recommend_userbased)

In [11]:
from recsys_movielens.model import recommend_userbased

## ● Item-based

#### 특정 영화의 특정 유저에 대한 평점 예측 (user:movie -> 1:1)

In [12]:
from recsys_movielens.model import predict_itembased_user_item

#### 특정 영화의 모든 유저에 대한 평점 예측 (user:movie -> all:1) 

In [13]:
from recsys_movielens.model import predict_itembased_item_allusers

#### Model 생성 (recommend_itembased)

In [14]:
from recsys_movielens.model import recommend_itembased

## ● Item-based adj-cosine

#### 유사도 매트릭스 (Item-based adj cosine에만 적용)

#### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;user_A = [0 0 0 5 6 0]
#### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;user_B = [0 2 3 4 0 0]
#### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;두 유저에 대한 아이템 평가가 위와 같이 있을 때 adj cosine 을 적용하는 방법
#### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;case 1. 모든 평점 값에(0 포함) 평점평균값을 빼준 후 벡터간의 cosine similarity를 계산
#### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;case 2. 평점이 있는 값들만(0 제외) 평점평균값을 빼준 후 벡터간의 cosine similarity를 계산
#### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;case 1과 2를 각각 돌려보고 rmse 값의 차이를 확인한다.




#### 특정 유저의 특정 영화에 대한 평점 예측 (user:movie -> 1:1)

In [15]:
from recsys_movielens.model import predict_itembased_user_item_adjcos

#### 특정 영화의 모든 유저에 대한 평점 예측 (user:movie -> all:1) 

In [16]:
from recsys_movielens.model import predict_itembased_item_allusers_adjcos                                                             

#### 모델 생성 (recommend_itembased_adjcosine)

#### case 1.

In [17]:
from recsys_movielens.model import recommend_itembased_adjcosine_all

#### case 2.

In [18]:
from recsys_movielens.model import recommend_itembased_adjcosine_exist

## ● SVD

#### 모델 생성 (SVD)

In [19]:
from recsys_movielens.model import model_svd

## =====================================================================
# 3. 테스트

In [22]:
import time
from scipy import sparse
from recsys_movielens.model_selection import get_rmse

## ● User-based

#### 특정 유저의 모든 영화에 대한 평점을 예측하는 데 걸리는 시간 (user : movie -> 1 : all) &nbsp;&nbsp;(영화수: 9066)

In [23]:
time_start = time.time()

pred = predict_userbased_user_allitems(43, df_spar_train)

time_end = time.time()
print('process time: {} seconds'.format(time_end - time_start))

process time: 0.9879601001739502 seconds


#### Multi-core test

In [24]:
np_multicore_result = recommend_userbased(df_spar_train, cores='*')

recommend_userbased process time is: 124.60884308815002 secconds


#### Multi-core RMSE

In [25]:
rmse_user = get_rmse(sparse.csr_matrix(df_spar_test), sparse.csr_matrix(np_multicore_result))
print('rmse for user-based is {}'.format(rmse_user))

rmse for user-based is 2.274529593302882


## ● Item-based

#### 특정 영화의 모든 유저에 대한 평점 예측하는 데 걸리는 시간 (user : movie -> 1 : all) &nbsp;&nbsp;(유저수: 671)

In [26]:
time_start = time.time()

pred = predict_itembased_item_allusers(9063, df_spar_train, metric='cosine', k=5)

time_end = time.time()
print('process time: {} seconds'.format(time_end - time_start))

process time: 0.08015823364257812 seconds


#### Multi-core test

In [27]:
np_multicore_result = recommend_itembased(df_spar_train, metric='cosine', k=5, cores='*')

recommend_itembased process time is: 208.63938307762146 secconds


#### Multi-core RMSE

In [28]:
rmse_user = get_rmse(sparse.csr_matrix(df_spar_test), sparse.csr_matrix(np_multicore_result))
print('rmse for user-based is {}'.format(rmse_user))

rmse for user-based is 2.637040468749335


## ● Item-based adj-cosine

In [29]:
from recsys_movielens.model import get_sim_matrix_sub_all, get_sim_matrix_sub_exist

#### case 1.

In [30]:
time_start = time.time()

sim_matrix_all = get_sim_matrix_sub_all(df_spar_train)
predict_itembased_item_allusers_adjcos(9063, df_spar_train, sim_matrix_all, k=5)

time_end = time.time()
print('process time (subtract all): {} seconds'.format(time_end - time_start))

process time (subtract all): 30.892136096954346 seconds


#### case 2.

In [31]:
time_start = time.time()

sim_matrix_exist = get_sim_matrix_sub_exist(df_spar_train)
predict_itembased_item_allusers_adjcos(9063, df_spar_train, sim_matrix_exist, k=5)

time_end = time.time()
print('process time (subtract exist): {} seconds'.format(time_end - time_start))

process time (subtract exist): 30.00233817100525 seconds


#### test

#### case 1.

In [32]:
np_multicore_result_T_all = recommend_itembased_adjcosine_all(df_spar_train, k=5, cores='*')

recommend_itembased_adjcosine_all process time is: 218.09147691726685 secconds


#### case 2.

In [33]:
np_multicore_result_T_exist = recommend_itembased_adjcosine_exist(df_spar_train, k=5, cores='*')

recommend_itembased_adjcosine_exist process time is: 208.12700510025024 secconds


#### RMSE

#### case 1.

In [34]:
rmse_user = get_rmse(sparse.csr_matrix(df_spar_test), sparse.csr_matrix(np_multicore_result_T_all))
print('rmse for user-based is {}'.format(rmse_user))

rmse for user-based is 2.5872578460884013


#### case 2.

In [35]:
rmse_user = get_rmse(sparse.csr_matrix(df_spar_test), sparse.csr_matrix(np_multicore_result_T_exist))
print('rmse for user-based is {}'.format(rmse_user))

rmse for user-based is 3.258188811215428


## ● SVD

#### 모든 유저의 모든 영화에 대한 평점을 예측하는 데 걸리는 시간 (user : movie -> all : all) &nbsp;&nbsp;(유저수: 671, 영화수: 9066)

In [36]:
np_preds_svd = model_svd(df_spar_train, 100, 3.4)

model_svd process time is: 1.8330838680267334 secconds


In [37]:
np_preds_svd.shape

(671, 9066)

#### RMSE

In [38]:
rmse_user = get_rmse(sparse.csr_matrix(df_spar_test), sparse.csr_matrix(np_preds_svd))
print('rmse for user-based is {}'.format(rmse_user))

rmse for user-based is 1.1533147373238206


## =====================================================================
# 4. Cross validation

In [40]:
from recsys_movielens.model_selection import cross_validation

## ● User-based

In [41]:
# cross_validation(df, model, dict_args, k_fold=5)

dict_args_user = {'metric': 'cosine', 'k': 5, 'cores': '*'}
list_validations, avg_rmse = cross_validation(df, recommend_userbased, dict_args_user, k_fold=5)

print('1. list of validations: {}'.format(list_validations))
print()
print('2. average of rmse: {}'.format(avg_rmse))

recommend_userbased process time is: 124.21095395088196 secconds
recommend_userbased process time is: 133.85060095787048 secconds
recommend_userbased process time is: 133.53828406333923 secconds
recommend_userbased process time is: 138.51878595352173 secconds
recommend_userbased process time is: 193.17075300216675 secconds
1. list of validations: [3.341583468971939, 3.0617905575408875, 2.856158364897267, 2.9008954790835704, 3.208277987008117]

2. average of rmse: 3.073741171500356


## ● Item-based

In [42]:
# cross_validation(df, model, dict_args, k_fold=5)

dict_args_item = {'metric': 'cosine', 'k': 5, 'cores': '*'}
list_validations, avg_rmse =  cross_validation(df, recommend_itembased, dict_args_item, k_fold=5)

print('1. list of validations: \n{}'.format(list_validations))
print()
print('2. average of rmse: \n{}'.format(avg_rmse))

recommend_itembased process time is: 204.9301359653473 secconds
recommend_itembased process time is: 220.7637858390808 secconds
recommend_itembased process time is: 213.6071240901947 secconds
recommend_itembased process time is: 207.6651690006256 secconds
recommend_itembased process time is: 202.8849470615387 secconds
1. list of validations: 
[3.518695442697611, 3.338093421496395, 3.158124584825143, 3.17273485524054, 3.3970437005896743]

2. average of rmse: 
3.3169384009698724


## ● Item-based adj-cosine

#### case 1.

In [43]:
# cross_validation(df, model, dict_args, k_fold=5)

dict_args_item_adj_all = {'k': 5, 'cores': '*'}
list_validations_all, avg_rmse_all = \
            cross_validation(df, recommend_itembased_adjcosine_all, dict_args_item_adj_all, k_fold=5)

print('1. list of validations: \n{}'.format(list_validations_all))
print()
print('2. average of rmse: \n{}'.format(avg_rmse_all))

recommend_itembased_adjcosine_all process time is: 219.4983880519867 secconds
recommend_itembased_adjcosine_all process time is: 211.79837894439697 secconds
recommend_itembased_adjcosine_all process time is: 208.5556378364563 secconds
recommend_itembased_adjcosine_all process time is: 202.695059299469 secconds
recommend_itembased_adjcosine_all process time is: 207.2712230682373 secconds
1. list of validations: 
[3.5278897045892808, 3.320317177113818, 3.1285993062283506, 3.1511320254275925, 3.3893478288468537]

2. average of rmse: 
3.3034572084411793


#### case 2.

In [44]:
# cross_validation(df, model, dict_args, k_fold=5)

dict_args_item_adj_exist = {'k': 5, 'cores': '*'}
list_validations_exist, avg_rmse_exist = \
        cross_validation(df, recommend_itembased_adjcosine_exist, dict_args_item_adj_exist, k_fold=5)

print('1. list of validations: \n{}'.format(list_validations_exist))
print()
print('2. average of rmse: \n{}'.format(avg_rmse_exist))

recommend_itembased_adjcosine_exist process time is: 213.06494808197021 secconds
recommend_itembased_adjcosine_exist process time is: 199.80396175384521 secconds
recommend_itembased_adjcosine_exist process time is: 202.3062698841095 secconds
recommend_itembased_adjcosine_exist process time is: 216.8268756866455 secconds
recommend_itembased_adjcosine_exist process time is: 202.84531617164612 secconds
1. list of validations: 
[3.5350181047124107, 3.4535655010734514, 3.3885471229058197, 3.4082471170122823, 3.518039849369206]

2. average of rmse: 
3.460683539014634


## ● SVD

In [45]:
# cross_validation(df, model, dict_args, k_fold=5)
# dict_args_svd = {'k_input': 50, 'val_adj': 3}
# list_validations, avg_rmse = cross_validation(df, model_svd, dict_args_svd, k_fold=5)

dict_args_svd = {'k_input': 100, 'val_adj': 3.4}
list_validation, avg_rmse = cross_validation(df, model_svd, dict_args_svd, k_fold=5)

print('1. list of validations: {}'.format(list_validations))
print()
print('2. average of rmse: {}'.format(avg_rmse))
        

model_svd process time is: 1.7764520645141602 secconds
model_svd process time is: 1.7971959114074707 secconds
model_svd process time is: 1.7911689281463623 secconds
model_svd process time is: 1.8110136985778809 secconds
model_svd process time is: 1.8042759895324707 secconds
1. list of validations: [3.518695442697611, 3.338093421496395, 3.158124584825143, 3.17273485524054, 3.3970437005896743]

2. average of rmse: 1.0862551849010251


## =====================================================================
# 5. 실제 추천

In [20]:
import recsys_movielens as rm

#### 특정 유저에게 영화를 추천
###### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;1) 3점 이상의 예측 평점을 가진 영화 중 보지 않은 영화를 선별
###### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;2) 1)의 결과 중 예측 평점이 높은 순서, movie_id 가 빠른 순서대로 top 5의 영화를 추천

In [21]:
#### 3번 유저가 본 영화목록을 보여준다

rm.get_seen_movie(df, df_movies, 3)

Unnamed: 0,item_id,title,genres
0,57,Home for the Holidays (1995),Drama
1,101,Bottle Rocket (1996),Adventure|Comedy|Crime|Romance
2,220,Castle Freak (1995),Horror
3,240,Hideaway (1995),Thriller
4,267,Major Payne (1995),Comedy
5,285,Beyond Bedlam (1993),Drama|Horror
6,321,Strawberry and Chocolate (Fresa y chocolate) (...,Drama
7,322,Swimming with Sharks (1995),Comedy|Drama
8,342,Muriel's Wedding (1994),Comedy
9,473,In the Army Now (1994),Comedy|War


## ● User-based

#### 추천 시스템에 의해서 추천된 영화목록을 보여준다 (예측평점 상위 5개)

In [21]:
# dict_args로 유사도 알고리즘(metric), k 값 가능

dict_args = {'metric': 'cosine', 'k': 5}
rm.get_recomm_movie(df, df_movies, 3, flag='user-based', dict_args=dict_args)

get_recomm_movie process time is: 1.228332757949829 secconds


Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,10,GoldenEye (1995),Action|Adventure|Thriller
2,16,Casino (1995),Crime|Drama
3,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
4,46,How to Make an American Quilt (1995),Drama|Romance


## ● Item-based

#### 추천 시스템에 의해서 추천된 영화목록을 보여준다 (예측평점 상위 5개)

In [22]:
# dict_args로 k 값 가능
# 유사도 알고리즘(metric) cosine only

dict_args = {'k': 5}
rm.get_recomm_movie(df, df_movies, 3, flag='item-based', dict_args=dict_args)

get_recomm_movie process time is: 39.16015386581421 secconds


Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,7,Sabrina (1995),Comedy|Romance
2,16,Casino (1995),Crime|Drama
3,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
4,43,Restoration (1995),Drama


## ● Item-based adj-cosine

#### 추천 시스템에 의해서 추천된 영화목록을 보여준다 (예측평점 상위 5개)

In [23]:
# dict_args로 k 값 가능
# 유사도 알고리즘(metric) adj_cosine only

dict_args = {'k': 5}
rm.get_recomm_movie(df, df_movies, 3, flag='item-based-adjall', dict_args=dict_args)

get_recomm_movie process time is: 41.72283220291138 secconds


Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,16,Casino (1995),Crime|Drama
2,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
3,46,How to Make an American Quilt (1995),Drama|Romance
4,49,When Night Is Falling (1995),Drama|Romance


In [24]:
# dict_args로 k 값 가능
# 유사도 알고리즘(metric) adj_cosine only

dict_args = {'k': 5}
rm.get_recomm_movie(df, df_movies, 3, flag='item-based-adjexist', dict_args=dict_args)

get_recomm_movie process time is: 39.53447484970093 secconds


Unnamed: 0,item_id,title,genres
0,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
1,46,How to Make an American Quilt (1995),Drama|Romance
2,49,When Night Is Falling (1995),Drama|Romance
3,130,Angela (1995),Drama
4,233,Exotica (1994),Drama


## ● SVD

#### 추천 시스템에 의해서 추천된 영화목록을 보여준다 (예측평점 상위 5개)

In [25]:
# dict_args로 k_input, val_adj 값 가능. 
# k_input: dimension of sigma matrix
# val_adj: 보정값

dict_args = {'k_input': 100, 'val_adj': 3.4}
rm.get_recomm_movie(df, df_movies, 3, flag='svd', dict_args=dict_args)

model_svd process time is: 1.8108718395233154 secconds
get_recomm_movie process time is: 2.7257678508758545 secconds


Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
