# 목차
## &nbsp;&nbsp;&nbsp;&nbsp;1. Pre-processing for movie-lens data (ml-latest-small, 2.4M)
## &nbsp;&nbsp;&nbsp;&nbsp;2. User-based (cosine)
## &nbsp;&nbsp;&nbsp;&nbsp;3. Item-based
## &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;3-1. Item-based (cosine)
## &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;3-2. Item-based (adjusted cosine)
## &nbsp;&nbsp;&nbsp;&nbsp;4. SVD

## ● 1번은 전처리 과정이므로 각각의 알고리즘 수행을 위해 아래를 참고한다.
### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;User-based CF 돌리기 위해서 1번을 수행한 후 2번의 과정을 수행 ( 1 -> 2 )
### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Item-based CF 돌리기 위해서 1번을 수행한 후 3번의 과정을 수행 ( 1 -> 3 )
### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;SVD 돌리기 위해서 1번을 수행한 후 4번의 과정을 수행 ( 1 -> 4 )
## =====================================================================

# 1. pre-processing for movie-lens data (ml-latest-small, 2.4M)

## ● 기본 데이터셋 생성

In [21]:
import pandas as pd
import numpy as np

In [22]:
file_path_movie = '/Users/morulabs/dev/source/git_hub/recsys/movie-lens/ml-latest-small/movies.csv'
file_path_rating = '/Users/morulabs/dev/source/git_hub/recsys/movie-lens/ml-latest-small/ratings.csv'

In [23]:
header_movie = ['item_id', 'title', 'genres']
df_movies = pd.read_csv(file_path_movie, header=0, names=header_movie)
df_movies

Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [24]:
header_rating = ['user_id', 'item_id', 'rating', 'timestamp']
df_ratings = pd.read_csv(file_path_rating, header=0, names=header_rating)
df_ratings

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [25]:
i = df_ratings['item_id']
np_movie_unique = np.unique(np.array(i), axis=0)

list_movie = []
for i, movie_id in enumerate(np_movie_unique):
    list_movie.append((i+1, movie_id))
    
df_mapping = pd.DataFrame(list_movie, columns=['movie_unique_id', 'item_id'])
df_mapping

Unnamed: 0,movie_unique_id,item_id
0,1,1
1,2,2
2,3,3
3,4,4
4,5,5
5,6,6
6,7,7
7,8,8
8,9,9
9,10,10


In [26]:
df_join = pd.merge(df_ratings, df_mapping, on=['item_id'])

del df_join['item_id']
columns_new = ['user_id', 'movie_unique_id', 'rating', 'timestamp']
df_join = df_join[columns_new]
df = df_join.rename(columns={'movie_unique_id': 'item_id'})

df

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,31,2.5,1260759144
1,7,31,3.0,851868750
2,31,31,4.0,1273541953
3,32,31,4.0,834828440
4,36,31,3.0,847057202
5,39,31,3.0,832525157
6,73,31,3.5,1255591860
7,88,31,3.0,1239755559
8,96,31,2.5,1223256331
9,110,31,4.0,840100695


In [27]:
import scipy
from scipy import sparse

r = df['rating'].astype(float)
u = df['user_id'].astype(int)
i = df['item_id'].astype(int)

mat_spar_rating = sparse.csr_matrix(
        (r, (u, i)),
        dtype=np.float
)

mat_spar_rating.shape

(672, 9067)

## ● 모델의 성능평가를 위해 위에서 만든 기본 데이터셋을 훈련/테스트 셋으로 나눈다.
###  실제 train / test 데이터를 반환하는 부분인 1)과 교차검증 함수를 정의한 2)로 나뉜다.
### 1) create train / test split (randomly)
#### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;무작위로 데이터를 train set과 test set으로 나눈다. ratio=0.25면 train:test (0.75:0.25)
### 2) create function for checking cross validation
#### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;cross validation check 를 위한 함수 정의 
#### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;k_fold 이 10이면 train:test (0.9:0.1) 비율로 테스트를 10번 수행하고 평균값을 냄


### 1) randomly create train / test split

In [28]:
# split data to train(0.75) / test(0.25) by movies ranked for all users
# and make it sparse matrix

def convert_df_mat(df):
    r = df['rating'].astype(float)
    u = df['user_id'].astype(int)
    i = df['item_id'].astype(int)
    
    mat_spar = sparse.csr_matrix(
            (r, (u, i)),
            dtype=np.float
    )
    
    return mat_spar


def make_df_from_mat(mat_spar, num_movie):
    df_spar = pd.DataFrame(mat_spar.toarray())
    df_spar = df_spar.drop(df_spar.index[0], axis=0)
    del df_spar[0]
    
    if df_spar.shape[1] < num_movie:
        num_row = df_spar.shape[0]
        num_col = df_spar.shape[1]
        num_diff = num_movie - num_col
        
        np_add = np.zeros([num_row, num_diff])
        df_add = pd.DataFrame(np_add, index=np.arange(num_row) + 1, 
                              columns=(num_col + np.arange(num_diff) + 1))
        
        df_spar = pd.concat([df_spar, df_add], axis=1)
        
    return df_spar


def split_mat_random(mat_spar, ratio_test=0.25):
    n_1 = mat_spar.nonzero()[0]
    n_2 = mat_spar.nonzero()[1]
    np_sparse = np.dstack((n_1, n_2))[0, :, :]
    
    df_sparse_index = pd.DataFrame(np_sparse, columns=['user', 'movie'])
    df_group = df_sparse_index.groupby('user')['movie'].apply(np.array).reset_index(name='movies')
    
    list_movie_2d = [np_movie for np_movie in df_group['movies']]
    
    list_test = list(map(lambda x: np.random.choice(x, size=int(len(x) / int(1/ratio_test))), list_movie_2d))
    list_train = []
    for i, movies_all in enumerate(list_movie_2d):
        movies_test = list_test[i]
        movies_train = list(set(movies_all) - set(movies_test))
        
        list_train.append(movies_train)
        
    # np_test = list(map(lambda x: np.array_split(x, 4)[0], np_movie_2d))
    # np_train = list(map(lambda x: np.concatenate(np.array_split(x, 4)[1:]), np_movie_2d))
    
    lens_test = [len(movies) for movies in list_test]
    df_test = pd.DataFrame({'user_id': np.repeat(df_group['user'], lens_test),
                            'item_id': np.concatenate(list_test)})
    
    lens_train = [len(movies) for movies in list_train]
    df_train = pd.DataFrame({'user_id': np.repeat(df_group['user'], lens_train),
                             'item_id': np.concatenate(list_train)})
    
    df_train = pd.merge(df_train, df, on=['user_id', 'item_id'])
    df_test = pd.merge(df_test, df, on=['user_id', 'item_id'])
    
    mat_spar_train = convert_df_mat(df_train)
    mat_spar_test = convert_df_mat(df_test)
    
    return mat_spar_train, mat_spar_test
    

In [29]:
mat_spar_train, mat_spar_test = split_mat_random(mat_spar_rating)
df_spar_train = make_df_from_mat(mat_spar_train, mat_spar_rating.shape[1] - 1)
df_spar_test = make_df_from_mat(mat_spar_train, mat_spar_rating.shape[1] - 1)

In [30]:
print('shape of train: {}'.format(df_spar_train.shape))
print('shape of test: {}'.format(df_spar_test.shape))

shape of train: (671, 9066)
shape of test: (671, 9066)


### 2) creating function for checking cross validation

In [31]:
def convert_df_mat(df):
    r = df['rating'].astype(float)
    u = df['user_id'].astype(int)
    i = df['item_id'].astype(int)
    
    mat_spar = sparse.csr_matrix(
            (r, (u, i)),
            dtype=np.float
    )
    
    return mat_spar

def make_df_from_mat(mat_spar, num_movie):
    df_spar = pd.DataFrame(mat_spar.toarray())
    df_spar = df_spar.drop(df_spar.index[0], axis=0)
    del df_spar[0]
    
    if df_spar.shape[1] < num_movie:
        num_row = df_spar.shape[0]
        num_col = df_spar.shape[1]
        num_diff = num_movie - num_col
        
        np_add = np.zeros([num_row, num_diff])
        df_add = pd.DataFrame(np_add, index=np.arange(num_row) + 1, 
                              columns=(num_col + np.arange(num_diff) + 1))
        
        df_spar = pd.concat([df_spar, df_add], axis=1)
        
    return df_spar

def get_dataset(df, k_fold, offset):
    mat_spar = convert_df_mat(df)
    n_1 = mat_spar.nonzero()[0]
    n_2 = mat_spar.nonzero()[1]
    np_sparse = np.dstack((n_1, n_2))[0, :, :]
    
    df_sparse_index = pd.DataFrame(np_sparse, columns=['user', 'movie'])
    df_group = df_sparse_index.groupby('user')['movie'].apply(np.array).reset_index(name='movies')
    
    list_movie_2d = [np_movie for np_movie in df_group['movies']]
    
    list_test = list(map(lambda x: np.array_split(x, k_fold)[offset], list_movie_2d))
    list_train = []
    for i, movies_all in enumerate(list_movie_2d):
        movies_test = list_test[i]
        movies_train = list(set(movies_all) - set(movies_test))
        
        list_train.append(movies_train)
    
    lens_train = [len(movies) for movies in list_train]
    df_train_without_rating = pd.DataFrame({'user_id': np.repeat(df_group['user'], lens_train),
                             'item_id': np.concatenate(list_train)})

    lens_test = [len(movies) for movies in list_test]
    df_test_without_rating = pd.DataFrame({'user_id': np.repeat(df_group['user'], lens_test),
                            'item_id': np.concatenate(list_test)})
    
    df_train = pd.merge(df_train_without_rating, df, on=['user_id', 'item_id'])
    df_test = pd.merge(df_test_without_rating, df, on=['user_id', 'item_id'])
    
    mat_spar_train = convert_df_mat(df_train)
    mat_spar_test = convert_df_mat(df_test)

    num_items = mat_spar.shape[1]
    df_spar_train = make_df_from_mat(mat_spar_train, num_items - 1)
    df_spar_test = make_df_from_mat(mat_spar_test, num_items - 1)

    return df_spar_train, df_spar_test

def get_rmse(mat_spar_test, mat_spar_predict):
    def rmse(actual, prediction):
        tu_new = actual.nonzero()[0] - 1, actual.nonzero()[1] - 1
        pred = prediction[tu_new]
        actu = actual[tu_new]
        
        return sqrt(mean_squared_error(actu, pred))

    rmse = rmse(mat_spar_test, mat_spar_predict)
    print('rmse for item-based is {}'.format(rmse))

    return rmse


In [32]:
def cross_validation(df, model, k_fold=5, cores=4):
    list_validations = []
    for offset in range(k_fold):
        df_train, df_validation = get_dataset(df, k_fold, offset)
        np_result = model(df_train, cores)

        mat_spar_test = sparse.csr_matrix(df_validation)
        mat_spar_predict = sparse.csr_matrix(np_result)

        list_validations.append(get_rmse(mat_spar_test, mat_spar_predict))
        
    avg_rmse = np.mean(list_validations)
    return list_validations, avg_rmse


## =====================================================================
# 2. User-based CF (similarity algorithm = cosine)

## ● 모델 정의

### KNN

In [33]:
from sklearn.neighbors import NearestNeighbors

def findksimilarusers(user_id, df_spar, metric='cosine', k=5):
    similarities = []
    indices=[]
    model_knn = NearestNeighbors(metric=metric, algorithm='brute') 
    model_knn.fit(df_spar)

    distances, indices = model_knn.kneighbors(df_spar.iloc[user_id - 1, :].values.reshape(1, -1), n_neighbors = k)
    similarities = 1 - distances.flatten()

    return similarities, indices

### 특정 유저의 특정 영화에 대한 평점 예측 ( user:movie -> 1:1 )

In [34]:
def predict_userbased_user_item(user_id, item_id, df_spar, metric='cosine', k=5):
    prediction = 0
    similarities, indices = findksimilarusers(user_id, df_spar, metric, k) # similar users based on cosine similarity
    mean_rating = df_spar.iloc[user_id - 1, :].mean() # to adjust for zero based indexing
    sum_wt = np.sum(similarities) - 1
    product = 1
    wtd_sum = 0
    
    indices_flat = indices.flatten()
    for i, indice in enumerate(indices_flat):
        if indice + 1 == user_id:
            continue
        else:
            ratings_diff = df_spar.iloc[indice, item_id - 1] - np.mean(df_spar.iloc[indice, :])
            product = ratings_diff * (similarities[i])
            wtd_sum = wtd_sum + product
    if sum_wt == 0:
        prediction = 0
    else:
        prediction = int(round(mean_rating + (wtd_sum / sum_wt)))
    
    return prediction

### 특정 유저의 모든 영화에 대한 평점 예측 ( user:movie -> 1:all )

In [35]:
def predict_userbased_user_allitems(user_id, df_spar, metric='cosine', k=5):
    similarities, indices = findksimilarusers(user_id, df_spar, metric, k)
    mean_rating = df_spar.iloc[user_id - 1, :].mean()
    
    list_pred_user = []
    indices_flat = indices.flatten()
    for item_id in range(1, df_spar.shape[1] + 1):
        sum_wt = np.sum(similarities) - 1
        product = 1
        wtd_sum = 0
        prediction = 0
        for i, indice in enumerate(indices_flat):
            if indice + 1 == user_id:
                continue
            else:
                rating_diff = df_spar.iloc[indice, item_id - 1] - np.mean(df_spar.iloc[indice, :])
                product = rating_diff * (similarities[i])
                wtd_sum = wtd_sum + product

        if sum_wt == 0:
            prediction = 0
        else:
            prediction = int(round(mean_rating + (wtd_sum / sum_wt)))
            
        list_pred_user.append(prediction)
    
    return list_pred_user

### Model 생성 (recommend_userbased)

In [42]:
def work(df_spar, id_user):
        return id_user, predict_userbased_user_allitems(id_user, df_spar)

def recommend_userbased(df_spar, cores=4):
    from multiprocessing import Pool
    from functools import partial
    import time
 
    time_start = time.time()
    
    p = Pool(processes=cores)
    iterable = list(range(1, df_spar.shape[0] + 1))
    func = partial(work, df_spar)
    result_pred = p.map(func, iterable)
    p.close()
    
    time_end = time.time()
    
    time_process = time_end - time_start
    print('processing time is: {} seconds'.format(time_process))
    
    list_pred_ratings_all = sorted(result_pred, key=lambda x: x[0])
    np_pred_ratings_all = np.array(list(map(lambda x: x[1], list_pred_ratings_all)))

    return np_pred_ratings_all

## ● 테스트 (using random data set created in step 1.)

### 특정 유저의 모든 영화에 대한 평점을 예측하는 데 걸리는 시간 (user : movie -> 1 : all) &nbsp;&nbsp;(영화수: 9066)

In [37]:
import time

time_start = time.time()
pred = predict_userbased_user_allitems(43, df_spar_train)
time_end = time.time()

print('한 유저의 모든 영화에 대한 평점을 예측하는 데 걸리는 시간: {} seconds'.format(time_end - time_start))
print('영화 수: {}'.format(len(pred)))

한 유저의 모든 영화에 대한 평점을 예측하는 데 걸리는 시간: 8.041028261184692 seconds
영화 수: 9066


### 모델을 돌려서 예측 값 매트릭스를 도출
#### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;싱글코어로 671 유저에 대한 평점을 예측하는 데 시간이 오래 걸림
#### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;싱글코어와 멀티코어 두가지 방법을 사용해서 시간을 측정하고 rmse 값을 비교 (동일한 결과 증명)

### Single-core

In [38]:
import time

time_start = time.time()

list_result = []
for i in range(1, df_spar_train.shape[0] + 1):
    list_pred_user = predict_userbased_user_allitems(i, df_spar_train)
    list_result.append(list_pred_user)

time_end = time.time()
time_process = time_end - time_start

print('process time for single core is: {} seconds'.format(time_process))
    
np_singlecore_result = np.array(list_result)
    

process time for single core is: 4336.494698047638 seconds


### Single-core RMSE

In [39]:
from sklearn.metrics import mean_squared_error
from math import sqrt
from scipy import sparse

def rmse(actual, prediction):
    tu_new = actual.nonzero()[0] - 1, actual.nonzero()[1] - 1
    pred = prediction[tu_new]
    actu = actual[tu_new]
    
    return sqrt(mean_squared_error(actu, pred))

rmse_user = rmse(sparse.csr_matrix(df_spar_test), sparse.csr_matrix(np_singlecore_result))
print('rmse for user-based is {}'.format(rmse_user))


rmse for user-based is 0.5851852641824478


### Multi-core

In [43]:
from multiprocessing import Pool
from functools import partial
import time

np_multicore_result = recommend_userbased(df_spar_train, cores=12)


processing time is: 1354.1436290740967 seconds


### Multi-core RMSE

In [130]:
from sklearn.metrics import mean_squared_error
from math import sqrt
from scipy import sparse

def rmse(actual, prediction):
    tu_new = actual.nonzero()[0], actual.nonzero()[1]
    pred = prediction[tu_new]
    actu = actual[tu_new]
    
    return sqrt(mean_squared_error(actu, pred))

rmse_user = rmse(sparse.csr_matrix(df_spar_test), sparse.csr_matrix(np_multicore_result))
print('rmse for user-based is {}'.format(rmse_user))

rmse for user-based is 2.4195351583498956


## ● Cross validation

In [45]:
# cross_validation(df, model, k_fold=5, cores=4)

list_validations, avg_rmse = cross_validation(df, recommend_userbased, k_fold=5, cores=12)
print('1. list of validations: {}'.format(list_validations))
print()
print('2. average of rmse: {}'.format(avg_rmse))

processing time is: 1430.518856048584 seconds
rmse for item-based is 0.8700279067086393
processing time is: 1462.0760560035706 seconds
rmse for item-based is 0.7050301048042926
processing time is: 1359.1902298927307 seconds
rmse for item-based is 0.5678812610317454
processing time is: 1349.71630525589 seconds
rmse for item-based is 0.45828062822050847
processing time is: 1334.771852016449 seconds
rmse for item-based is 0.46762767690326396
1. list of validations: [0.8700279067086393, 0.7050301048042926, 0.5678812610317454, 0.45828062822050847, 0.46762767690326396]

2. average of rmse: 0.6137695155336899


## ● 실제 추천
### 특정 유저에게 영화를 추천 (유저 기반)
#### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;1) 3점 이상의 예측 평점을 가진 영화 중 보지 않은 영화를 선별
#### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;2) 1)의 결과 중 예측 평점이 높은 순서, movie_id 가 빠른 순서대로 top 5의 영화를 추천

### 함수 정의

In [85]:
def get_list_seen_movie_id(df, id_user):
    mat_spar = convert_df_mat(df)
    sparse_nonzero = mat_spar.nonzero()
    list_id_movie_seen = []
    for i, id_user_exist_rating in enumerate(sparse_nonzero[0]):
        if id_user_exist_rating == id_user:
            list_id_movie_seen.append(sparse_nonzero[1][i])    

    return list_id_movie_seen

def get_df_movie_from_id(list_id_movie):
    np_insert = np.array(list_id_movie).reshape(-1, 1)
    df_id_movie = pd.DataFrame(np_insert, columns=['movie_unique_id'])
    df_joined = pd.merge(df_id_movie, df_mapping, on=['movie_unique_id'])
    df_movie = pd.merge(df_joined, df_movies, on=['item_id']).drop(columns=['movie_unique_id'])
    
    return df_movie

def recommend_item(df, id_user):
    mat_spar = convert_df_mat(df)
    df_spar = make_df_from_mat(mat_spar, 9066)
    list_pred_rank = predict_userbased_user_allitems(id_user, df_spar) 
    
    list_recommendation = []
    for i, rank in enumerate(list_pred_rank):
        if rank >= 3:
            if i + 1 not in get_list_seen_movie_id(df, id_user):
                list_recommendation.append(i+1)
    
    return list_recommendation

def get_seen_movie(df, id_user):
    return get_df_movie_from_id(get_list_seen_movie_id(df, id_user))

def get_recomm_movie(df, id_user):
    import time
    
    time_start = time.time()
    list_recomm = recommend_item(df, id_user)[:5]
    time_end = time.time()
    print('process time: {}'.format(time_end - time_start))

    return get_df_movie_from_id(list_recomm)

### 해당 유저가 본 영화목록을 보여준다

In [86]:
get_seen_movie(df, 3)

Unnamed: 0,item_id,title,genres
0,60,"Indian in the Cupboard, The (1995)",Adventure|Children|Fantasy
1,110,Braveheart (1995),Action|Drama|War
2,247,Heavenly Creatures (1994),Crime|Drama
3,267,Major Payne (1995),Comedy
4,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
5,318,"Shawshank Redemption, The (1994)",Crime|Drama
6,355,"Flintstones, The (1994)",Children|Comedy|Fantasy
7,356,Forrest Gump (1994),Comedy|Drama|Romance|War
8,377,Speed (1994),Action|Romance|Thriller
9,527,Schindler's List (1993),Drama|War


### 추천 시스템에 의해서 추천된 영화목록을 보여준다 (예측평점 상위 5개)

In [87]:
get_recomm_movie(df, 3)

process time: 10.486234188079834


Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,150,Apollo 13 (1995),Adventure|Drama|IMAX
2,457,"Fugitive, The (1993)",Thriller
3,480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller
4,4306,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Ro...


## =====================================================================
# 3. Item-based CF

# 3-1. Item-based (cosine)

## ● 모델 정의

### KNN

In [49]:
from sklearn.neighbors import NearestNeighbors

def find_similar_items(item_id, df_spar, metric='cosine', k=5):
    df_T = df_spar.T
    model_knn = NearestNeighbors(metric=metric, algorithm='brute') 
    model_knn.fit(df_T)

    similarities, indices = model_knn.kneighbors(df_T.iloc[item_id - 1, :].values.reshape(1, -1), n_neighbors = k)
    similarities = 1 - similarities.flatten()

    return similarities, indices

### 특정 유저의 특정 영화에 대한 평점 예측 (user:movie -> 1:1)

In [50]:
def predict_itembased_user_item(user_id, item_id, df_spar, metric='cosine', k=5):
    prediction = 0
    similarities, indices = find_similar_items(item_id, df_spar, metric, k) 
    sum_wt = np.sum(similarities) - 1
    product = 1
    wtd_sum = 0
    
    indices_flat = indices.flatten()
    for i, indice in enumerate(indices_flat):
        if indice + 1 == item_id:
            continue
        else:
            product = df_spar.iloc[user_id - 1, indice] * (similarities[i])
            wtd_sum = wtd_sum + product

    if sum_wt == 0:
        prediction = 0
    else:
        prediction = int(round(wtd_sum / sum_wt))
            
    return prediction

### 특정 영화의 모든 유저에 대한 평점 예측 (user:movie -> all:1) 

In [51]:
def predict_itembased_item_allusers(item_id, df_spar, metric='cosine', k=5):
    similarities, indices = find_similar_items(item_id, df_spar, metric, k)
    
    list_pred_all_rating = []
    indices_flat = indices.flatten()
    for user_id in range(1, df_spar.shape[0] + 1):
        prediction = 0
        sum_wt = np.sum(similarities) - 1
        product = 1
        wtd_sum = 0
    
        indices_flat = indices.flatten()
        for i, indice in enumerate(indices_flat):
            if indice + 1 == item_id:
                continue
            else:
                product = df_spar.iloc[user_id - 1, indice] * (similarities[i])
                wtd_sum = wtd_sum + product

        if sum_wt == 0:
            prediction = 0
        else:
            prediction = int(round(wtd_sum / sum_wt))
        
        list_pred_all_rating.append(prediction)
    
    return list_pred_all_rating
    
    

### Model 생성 (recommend_itembased)

In [99]:
def work(df_spar, id_item):
        return id_item, predict_itembased_item_allusers(id_item, df_spar)

def recommend_itembased(df_spar, cores=4):
    from multiprocessing import Pool
    from functools import partial
    import time
 
    time_start = time.time()
    
    p = Pool(processes=cores)
    iterable = list(range(1, df_spar.shape[1] + 1))
    func = partial(work, df_spar)
    result_pred = p.map(func, iterable)
    p.close()
    
    time_end = time.time()
    
    time_process = time_end - time_start
    print('processing time is: {} seconds'.format(time_process))
    
    list_pred_ratings_all = sorted(result_pred, key=lambda x: x[0])
    np_pred_ratings_all = np.array(list(map(lambda x: x[1], list_pred_ratings_all)))

    return np_pred_ratings_all.T

## ● 테스트 (using random data set created in step 1.)

### 특정 영화의 모든 유저에 대한 평점 예측하는 데 걸리는 시간 (user : movie -> 1 : all) &nbsp;&nbsp;(유저수: 671)

In [53]:
import time

time_start = time.time()
predict_itembased_item_allusers(9063, df_spar_train)
time_end = time.time()

print('process time: {} seconds'.format(time_end - time_start))

process time: 0.14527487754821777 seconds


### Multi-core

In [55]:
from multiprocessing import Pool
from functools import partial
import time

np_multicore_result = recommend_itembased(df_spar_train, cores=4)


processing time is: 331.95966124534607 seconds


### Multi-core RMSE

In [131]:
from sklearn.metrics import mean_squared_error
from math import sqrt
from scipy import sparse

def rmse(actual, prediction):
    tu_new = actual.nonzero()[0], actual.nonzero()[1]
    pred = prediction[tu_new]
    actu = actual[tu_new]
    
    return sqrt(mean_squared_error(actu, pred))

rmse_item = rmse(sparse.csr_matrix(df_spar_test), sparse.csr_matrix(np_multicore_result))
print('rmse for item-based is {}'.format(rmse_item))

rmse for item-based is 2.4195351583498956


## ● Cross validation

#### k=5

In [57]:
list_validations, avg_rmse =  cross_validation(df, recommend_itembased, k_fold=5, cores=12)

processing time is: 279.9190180301666 seconds
rmse for item-based is 0.847609255168369
processing time is: 277.76744771003723 seconds
rmse for item-based is 0.6655370086749495
processing time is: 270.00314688682556 seconds
rmse for item-based is 0.5043305935573018
processing time is: 271.35660910606384 seconds
rmse for item-based is 0.4355942653565045
processing time is: 261.707701921463 seconds
rmse for item-based is 0.41940115885727797


In [58]:
print('1. list of validations: \n{}'.format(list_validations))
print()
print('2. average of rmse: \n{}'.format(avg_rmse))

1. list of validations: 
[0.847609255168369, 0.6655370086749495, 0.5043305935573018, 0.4355942653565045, 0.41940115885727797]

2. average of rmse: 
0.5744944563228807


## ● 실제 추천
### 특정 유저에게 영화를 추천 (아이템 기반)
#### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;1) 3점 이상의 예측 평점을 가진 영화 중 보지 않은 영화를 선별
#### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;2) 1)의 결과 중 예측 평점이 높은 순서, movie_id 가 빠른 순서대로 top 5의 영화를 추천

### 함수 정의

In [104]:
def get_list_seen_movie_id(df, id_user):
    mat_spar = convert_df_mat(df)
    sparse_nonzero = mat_spar.nonzero()
    list_id_movie_seen = []
    for i, id_user_exist_rating in enumerate(sparse_nonzero[0]):
        if id_user_exist_rating == id_user:
            list_id_movie_seen.append(sparse_nonzero[1][i])    

    return list_id_movie_seen

def get_df_movie_from_id(list_id_movie):
    np_insert = np.array(list_id_movie).reshape(-1, 1)
    df_id_movie = pd.DataFrame(np_insert, columns=['movie_unique_id'])
    df_joined = pd.merge(df_id_movie, df_mapping, on=['movie_unique_id'])
    df_movie = pd.merge(df_joined, df_movies, on=['item_id']).drop(columns=['movie_unique_id'])
    
    return df_movie

def recommend_item(df, id_user):
    mat_spar = convert_df_mat(df)
    df_spar = make_df_from_mat(mat_spar, 9066)
    np_multicore_result = recommend_itembased(df_spar, cores=12)
    
    list_recommendation = []
    for i, rank in enumerate(np_multicore_result[id_user - 1]):
        if rank >= 3:
            if i + 1 not in get_list_seen_movie_id(df, id_user):
                list_recommendation.append(i+1)
    
    return list_recommendation

def get_seen_movie(df, id_user):
    return get_df_movie_from_id(get_list_seen_movie_id(df, id_user))

def get_recomm_movie(df, id_user):
    import time
    
    time_start = time.time()
    list_recomm = recommend_item(df, id_user)[:5]
    time_end = time.time()
    print('process time: {}'.format(time_end - time_start))

    return get_df_movie_from_id(list_recomm)

### 해당 유저가 본 영화목록을 보여준다

In [105]:
get_seen_movie(df, 3)

Unnamed: 0,item_id,title,genres
0,60,"Indian in the Cupboard, The (1995)",Adventure|Children|Fantasy
1,110,Braveheart (1995),Action|Drama|War
2,247,Heavenly Creatures (1994),Crime|Drama
3,267,Major Payne (1995),Comedy
4,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
5,318,"Shawshank Redemption, The (1994)",Crime|Drama
6,355,"Flintstones, The (1994)",Children|Comedy|Fantasy
7,356,Forrest Gump (1994),Comedy|Drama|Romance|War
8,377,Speed (1994),Action|Romance|Thriller
9,527,Schindler's List (1993),Drama|War


### 추천 시스템에 의해서 추천된 영화목록을 보여준다 (예측평점 상위 5개)

In [106]:
get_recomm_movie(df, 3)

processing time is: 268.2513756752014 seconds
process time: 270.41912293434143


Unnamed: 0,item_id,title,genres
0,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
1,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
2,293,Léon: The Professional (a.k.a. The Professiona...,Action|Crime|Drama|Thriller
3,2329,American History X (1998),Crime|Drama


## =====================================================================

# 3-2. Item-based (adjusted cosine)

#### user_A = [0 0 0 5 6 0]
#### user_B = [0 2 3 4 0 0]
#### 두 유저에 대한 아이템 평가가 위와 같이 있을 때 adj cosine 을 적용하는 방법
#### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;case 1. 모든 평점 값에(0 포함) 평점평균값을 빼준 후 벡터간의 cosine similarity를 계산
#### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;case 2. 평점이 있는 값들만(0 제외) 평점평균값을 빼준 후 벡터간의 cosine similarity를 계산




### case 1과 2를 각각 돌려보고 rmse 값의 차이를 확인한다.

## ● 모델 정의

### KNN

In [59]:
def find_similar_items_adjcos(sim_matrix, id_item, k=5):
    np_top_k = sim_matrix[id_item-1].sort_values(ascending=False)[:k+1]
    similarities = np_top_k.values
    indices = np_top_k.index

    return similarities, indices



### 특정 유저의 특정 영화에 대한 평점 예측 (user:movie -> 1:1)

In [60]:
def predict_itembased_user_item_adjcos(sim_matrix, id_user, id_item, k=5):
    prediction = wtd_sum = 0
    product = 1
    similarities, indices = find_similar_items_adjcos(sim_matrix, id_item, k) 
    sum_wt = np.sum(similarities) - 1
    
    for i, indice in enumerate(indices):
        if indice + 1 == item_id:
            continue
        else:
            product = df_spar.iloc[id_user - 1, indice] * (similarities[i])
            wtd_sum = wtd_sum + product
            
    if sum_wt == 0:
        prediction = 0
    else:
        prediction = int(round(wtd_sum / sum_wt))

    return prediction

### 특정 영화의 모든 유저에 대한 평점 예측 (user:movie -> all:1) 

In [61]:
def predict_itembased_item_all_users_adjcos(df_spar, sim_matrix, id_item, k=5):
    similarities, indices = find_similar_items_adjcos(sim_matrix, id_item, k)
    
    list_pred_all_rating = []
    for id_user in range(1, df_spar.shape[0] + 1):
        prediction = 0
        sum_wt = np.sum(similarities) - 1
        product = 1
        wtd_sum = 0
        
        for i, indice in enumerate(indices):
            if indice + 1 == id_item:
                continue
            else:
                product = df_spar.iloc[id_user - 1, indice] * (similarities[i])
                wtd_sum = wtd_sum + product
        
        if sum_wt == 0:
            prediction = 0
        else:
            prediction = int(round(wtd_sum / sum_wt))
            
        list_pred_all_rating.append(prediction)
        
    return list_pred_all_rating


### adj cosine 유사도 매트릭스

#### case 1.

In [62]:
import time
from scipy.spatial.distance import pdist, squareform

def get_sim_matrix_sub_all(df_spar):
    M = np.array(df_spar)
    
    M_u = M.mean(axis=1)
    M_adj = M - M_u[:, None]
    
    time_start = time.time()
    sim_matrix_sub_all = 1 - squareform(pdist(M_adj.T, 'cosine'))
    time_end = time.time()
    
    print(time_end - time_start)
    sim_matrix_sub_all = pd.DataFrame(sim_matrix_sub_all)
    sim_matrix_sub_all
    
    return sim_matrix_sub_all

#### case 2.

In [63]:
import time
from scipy.spatial.distance import pdist, squareform

def get_sim_matrix_sub_exist(df_spar):
    M = np.array(df_spar)
    list_mean = []
    for rating_user in M:
        np_idx_nonzero = rating_user.nonzero()[0]
        mean = np.mean(rating_user[np_idx_nonzero])
        list_mean.append(mean)
    M_u = np.array(list_mean)
    
    M_adj = np.zeros([df_spar.shape[0], df_spar.shape[1]])

    for row, np_ratings in enumerate(M):
        np_idx_nonzero = np_ratings.nonzero()[0]
        M_adj[row][np_idx_nonzero] = np_ratings[np_idx_nonzero] - M_u[row]
    
    time_start = time.time()
    sim_matrix_sub_exist = 1 - squareform(pdist(M_adj.T, 'cosine'))
    time_end = time.time()
    print(time_end - time_start)
    
    mask = np.isnan(sim_matrix_sub_exist)
    sim_matrix_sub_exist[mask] = 0
    
    sim_matrix_sub_exist = pd.DataFrame(sim_matrix_sub_exist)
    
    return sim_matrix_sub_exist

### 모델 생성 (recommend_itembased_adjcosine)

#### case 1.

In [64]:
def work_all(df_spar, sim_matrix, id_item):
    return id_item, predict_itembased_item_all_users_adjcos(df_spar, sim_matrix, id_item)

def recommend_itembased_adjcosine_all(df_spar, cores=4):
    from multiprocessing import Pool
    from functools import partial
    import time
 
    time_start = time.time()

    sim_matrix = get_sim_matrix_sub_all(df_spar)
    p = Pool(processes=cores)
    iterable = list(range(1, df_spar.shape[1] + 1))
    func = partial(work_all, df_spar, sim_matrix)
    result_pred = p.map(func, iterable)
    p.close()
    
    time_end = time.time()
    
    time_process = time_end - time_start
    print('processing time is: {} seconds'.format(time_process))
    
    list_pred_ratings_all = sorted(result_pred, key=lambda x: x[0])
    np_pred_ratings_all = np.array(list(map(lambda x: x[1], list_pred_ratings_all)))

    return np_pred_ratings_all.T

#### case 2.

In [65]:
def work_exist(df_spar, sim_matrix, id_item):
    return id_item, predict_itembased_item_all_users_adjcos(df_spar, sim_matrix, id_item)

def recommend_itembased_adjcosine_exist(df_spar, cores=4):
    from multiprocessing import Pool
    from functools import partial
    import time
 
    time_start = time.time()

    sim_matrix = get_sim_matrix_sub_exist(df_spar)
    p = Pool(processes=cores)
    iterable = list(range(1, df_spar.shape[1] + 1))
    func = partial(work_exist, df_spar, sim_matrix)
    result_pred = p.map(func, iterable)
    p.close()
    
    time_end = time.time()
    
    time_process = time_end - time_start
    print('processing time is: {} seconds'.format(time_process))
    
    list_pred_ratings_exist = sorted(result_pred, key=lambda x: x[0])
    list_pred_ratings_exist = np.array(list(map(lambda x: x[1], list_pred_ratings_exist)))

    return list_pred_ratings_exist.T

## ● 테스트 (using random data set created in step 1.)

### 특정 영화의 모든 유저에 대한 평점 예측하는 데 걸리는 시간 (user : movie -> 1 : all) &nbsp;&nbsp;(유저수: 671)

#### case 1.

In [66]:
import time

time_start = time.time()
sim_matrix = get_sim_matrix_sub_all(df_spar_train)
predict_itembased_item_all_users_adjcos(df_spar_train, sim_matrix, 9063, 5)
time_end = time.time()

print('process time (subtract all): {} seconds'.format(time_end - time_start))

30.034446001052856
process time (subtract all): 30.118995904922485 seconds


#### case 2.

In [67]:
import time

time_start = time.time()
sim_matrix = get_sim_matrix_sub_exist(df_spar_train)
predict_itembased_item_all_users_adjcos(df_spar_train, sim_matrix, 9063, 5)
time_end = time.time()

print('process time (subtract exist): {} seconds'.format(time_end - time_start))

30.06711173057556
process time (subtract exist): 30.573616981506348 seconds


### Multi-core

#### case 1.

In [68]:
np_multicore_result_T_all = recommend_itembased_adjcosine_all(df_spar_train, cores=12)

29.984957218170166
processing time is: 221.62176704406738 seconds


#### case 2.

In [69]:
np_multicore_result_T_exist = recommend_itembased_adjcosine_exist(df_spar_train, cores=12)

30.09826922416687
processing time is: 215.94884490966797 seconds


### Multi-core RMSE

#### case 1.

In [132]:
from sklearn.metrics import mean_squared_error
from math import sqrt
from scipy import sparse

def rmse(actual, prediction):
    tu_new = actual.nonzero()[0], actual.nonzero()[1]
    pred = prediction[tu_new]
    actu = actual[tu_new]
    
    return sqrt(mean_squared_error(actu, pred))

rmse_user = rmse(sparse.csr_matrix(df_spar_test), sparse.csr_matrix(np_multicore_result_T_all))
print('rmse for item-based is {}'.format(rmse_user))

rmse for item-based is 2.3175792296557756


#### case 2.

In [71]:
from sklearn.metrics import mean_squared_error
from math import sqrt
from scipy import sparse

def rmse(actual, prediction):
    tu_new = actual.nonzero()[0] - 1, actual.nonzero()[1] - 1
    pred = prediction[tu_new]
    actu = actual[tu_new]
    
    return sqrt(mean_squared_error(actu, pred))

rmse_user = rmse(sparse.csr_matrix(df_spar_test), sparse.csr_matrix(np_multicore_result_T_exist))
print('rmse for item-based is {}'.format(rmse_user))

rmse for item-based is 0.5609812522739072


## ● Cross validation

#### case 1.

In [72]:
list_validations_all, avg_rmse_all =  cross_validation(df, recommend_itembased_adjcosine_all, k_fold=5, cores=12)

29.942795038223267
processing time is: 217.62825417518616 seconds
rmse for item-based is 0.8463269188407647
29.985151052474976
processing time is: 233.05463695526123 seconds
rmse for item-based is 0.6633676679308442
29.8596351146698
processing time is: 217.72955083847046 seconds
rmse for item-based is 0.5111734590666968
29.808252096176147
processing time is: 218.79511213302612 seconds
rmse for item-based is 0.43738114889734137
29.63261079788208
processing time is: 211.875155210495 seconds
rmse for item-based is 0.4183133730399872


In [73]:
print('1. list of validations: \n{}'.format(list_validations_all))
print()
print('2. average of rmse: \n{}'.format(avg_rmse_all))

1. list of validations: 
[0.8463269188407647, 0.6633676679308442, 0.5111734590666968, 0.43738114889734137, 0.4183133730399872]

2. average of rmse: 
0.5753125135551269


#### case 2.

In [74]:
list_validations_exist, avg_rmse_exist =  cross_validation(df, recommend_itembased_adjcosine_exist, k_fold=5, cores=12)

29.929625034332275
processing time is: 219.63015699386597 seconds
rmse for item-based is 0.7728787281559889
29.999388933181763
processing time is: 224.47297191619873 seconds
rmse for item-based is 0.5648685642303076
29.545958995819092
processing time is: 218.4144651889801 seconds
rmse for item-based is 0.3987024648176661
29.759747982025146
processing time is: 218.1958041191101 seconds
rmse for item-based is 0.3339847540400054
29.713716983795166
processing time is: 215.2415030002594 seconds
rmse for item-based is 0.336712315658973


In [75]:
print('1. list of validations: \n{}'.format(list_validations_exist))
print()
print('2. average of rmse: \n{}'.format(avg_rmse_exist))

1. list of validations: 
[0.7728787281559889, 0.5648685642303076, 0.3987024648176661, 0.3339847540400054, 0.336712315658973]

2. average of rmse: 
0.48142936538058817


## ● 실제 추천
### 특정 유저에게 영화를 추천 (adj cosine 아이템 기반)
#### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;1) 3점 이상의 예측 평점을 가진 영화 중 보지 않은 영화를 선별
#### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;2) 1)의 결과 중 예측 평점이 높은 순서, movie_id 가 빠른 순서대로 top 5의 영화를 추천

### 함수 정의

In [112]:
def get_list_seen_movie_id(df, id_user):
    mat_spar = convert_df_mat(df)
    sparse_nonzero = mat_spar.nonzero()
    list_id_movie_seen = []
    for i, id_user_exist_rating in enumerate(sparse_nonzero[0]):
        if id_user_exist_rating == id_user:
            list_id_movie_seen.append(sparse_nonzero[1][i])    

    return list_id_movie_seen

def get_df_movie_from_id(list_id_movie):
    np_insert = np.array(list_id_movie).reshape(-1, 1)
    df_id_movie = pd.DataFrame(np_insert, columns=['movie_unique_id'])
    df_joined = pd.merge(df_id_movie, df_mapping, on=['movie_unique_id'])
    df_movie = pd.merge(df_joined, df_movies, on=['item_id']).drop(columns=['movie_unique_id'])
    
    return df_movie

def recommend_item_all(df, id_user):
    mat_spar = convert_df_mat(df)
    df_spar = make_df_from_mat(mat_spar, 9066)
    np_multicore_result = recommend_itembased_adjcosine_all(df_spar, cores=12)
    
    list_recommendation = []
    for i, rank in enumerate(np_multicore_result[id_user - 1]):
        if rank >= 3:
            if i + 1 not in get_list_seen_movie_id(df, id_user):
                list_recommendation.append(i+1)
    
    return list_recommendation

def recommend_item_exist(df, id_user):
    mat_spar = convert_df_mat(df)
    df_spar = make_df_from_mat(mat_spar, 9066)
    np_multicore_result = recommend_itembased_adjcosine_exist(df_spar, cores=12)
    
    list_recommendation = []
    for i, rank in enumerate(np_multicore_result[id_user - 1]):
        if rank >= 3:
            if i + 1 not in get_list_seen_movie_id(df, id_user):
                list_recommendation.append(i+1)
    
    return list_recommendation

def get_seen_movie(df, id_user):
    return get_df_movie_from_id(get_list_seen_movie_id(df, id_user))

def get_recomm_movie_all(df, id_user):
    import time
    
    time_start = time.time()
    list_recomm = recommend_item_all(df, id_user)[:5]
    time_end = time.time()
    print('process time: {}'.format(time_end - time_start))

    return get_df_movie_from_id(list_recomm)

def get_recomm_movie_exist(df, id_user):
    import time
    
    time_start = time.time()
    list_recomm = recommend_item_exist(df, id_user)[:5]
    time_end = time.time()
    print('process time: {}'.format(time_end - time_start))

    return get_df_movie_from_id(list_recomm)

### 해당 유저가 본 영화목록을 보여준다

In [113]:
get_seen_movie(df, 3)

Unnamed: 0,item_id,title,genres
0,60,"Indian in the Cupboard, The (1995)",Adventure|Children|Fantasy
1,110,Braveheart (1995),Action|Drama|War
2,247,Heavenly Creatures (1994),Crime|Drama
3,267,Major Payne (1995),Comedy
4,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
5,318,"Shawshank Redemption, The (1994)",Crime|Drama
6,355,"Flintstones, The (1994)",Children|Comedy|Fantasy
7,356,Forrest Gump (1994),Comedy|Drama|Romance|War
8,377,Speed (1994),Action|Romance|Thriller
9,527,Schindler's List (1993),Drama|War


### 추천 시스템에 의해서 추천된 영화목록을 보여준다 (예측평점 상위 5개)

In [114]:
get_recomm_movie_all(df, 3)

30.023032188415527
processing time is: 217.78400373458862 seconds
process time: 219.9251048564911


Unnamed: 0,item_id,title,genres
0,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
1,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
2,293,Léon: The Professional (a.k.a. The Professiona...,Action|Crime|Drama|Thriller
3,2329,American History X (1998),Crime|Drama


In [115]:
get_recomm_movie_exist(df, 3)

30.033227682113647
processing time is: 220.1394009590149 seconds
process time: 221.13793992996216


Unnamed: 0,item_id,title,genres


# 4. SVD

## ● 모델 정의

In [147]:
def model_svd(df_spar, k_input, val_adj):
    np_spar = np.array(df_spar)
    mean_user_ratings = np.mean(np_spar, axis = 1)
    np_train_demeaned = np_spar - mean_user_ratings.reshape(-1, 1)
    
    from scipy.sparse.linalg import svds
    u, sigma, vt = svds(np_train_demeaned, k = k_input)
    
    sigma = np.diag(sigma)
    
    np_pred_ratings_all_users = np.dot(np.dot(u, sigma), vt) + mean_user_ratings.reshape(-1, 1) + val_adj
    
    return np_pred_ratings_all_users

## ● 테스트

In [155]:
np_preds = model_svd(df_spar_train, 1, 3.2)
np_preds.shape
        

(671, 9066)

## ● RMSE

In [156]:
from sklearn.metrics import mean_squared_error
from math import sqrt
from scipy import sparse

def rmse(actual, prediction):
    tu_new = actual.nonzero()[0], actual.nonzero()[1]
    pred = prediction[tu_new]
    actu = actual[tu_new]
    
    return sqrt(mean_squared_error(actu, pred))

rmse_user = rmse(sparse.csr_matrix(df_spar_test), sparse.csr_matrix(np_preds))
print('rmse for item-based is {}'.format(rmse_user))

rmse for item-based is 1.1928929520976053


## ● Cross validation

In [159]:
def cv_svd(df, k_fold=5, k_svd=50, value_adj=3):
    list_validations = []
    for offset in range(k_fold):
        df_train, df_validation = get_dataset(df, k_fold, offset)
        np_result = model_svd(df_train, k_svd, value_adj)
    
        mat_spar_test = sparse.csr_matrix(df_validation)
        mat_spar_predict = sparse.csr_matrix(np_result)
    
        list_validations.append(rmse(mat_spar_test, mat_spar_predict))
        
    avg_rmse = np.mean(list_validations)
    
    return list_validations, avg_rmse

list_validations, avg_rmse = cv_svd(df, 5, 50, 3.2)

print('1. list of validations: \n{}'.format(list_validations))
print()
print('2. average of rmse: \n{}'.format(avg_rmse))

1. list of validations: 
[1.1348973455581102, 1.1106192569260647, 1.0938477221618759, 1.0865253776808304, 1.0732549123961361]

2. average of rmse: 
1.0998289229446034


### k 값 증가시켜 가며 테스트 1 ~ 100

In [162]:
np_val_adj = np.array(list(range(10, 41))) / 10
k_best = float('inf')
for k in range(1, 101):
    for val_adj in np_val_adj:
        avg_rmse = cv_svd(df, k_fold=5, k_svd=k, value_adj=val_adj)[1]
        if avg_rmse <= k_best:
            k_best = avg_rmse
            i_best = k, val_adj
            
        
print('best k is: {}'.format(i_best))

best k is: (100, 3.4)


## ● 실제 추천
### 특정 유저에게 영화를 추천 (svd)
#### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;1) 3점 이상의 예측 평점을 가진 영화 중 보지 않은 영화를 선별
#### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;2) 1)의 결과 중 예측 평점이 높은 순서, movie_id 가 빠른 순서대로 top 5의 영화를 추천

### 함수 정의

In [122]:
def get_list_seen_movie_id(df, id_user):
    mat_spar = convert_df_mat(df)
    sparse_nonzero = mat_spar.nonzero()
    list_id_movie_seen = []
    for i, id_user_exist_rating in enumerate(sparse_nonzero[0]):
        if id_user_exist_rating == id_user:
            list_id_movie_seen.append(sparse_nonzero[1][i])    

    return list_id_movie_seen

def get_df_movie_from_id(list_id_movie):
    np_insert = np.array(list_id_movie).reshape(-1, 1)
    df_id_movie = pd.DataFrame(np_insert, columns=['movie_unique_id'])
    df_joined = pd.merge(df_id_movie, df_mapping, on=['movie_unique_id'])
    df_movie = pd.merge(df_joined, df_movies, on=['item_id']).drop(columns=['movie_unique_id'])
    
    return df_movie

def recommend_item(df, id_user):
    mat_spar = convert_df_mat(df)
    df_spar = make_df_from_mat(mat_spar, 9066)
    np_multicore_result = model_svd(df_spar, 1)
    
    list_recommendation = []
    for i, rank in enumerate(np_multicore_result[id_user - 1]):
        if rank >= 3:
            if i + 1 not in get_list_seen_movie_id(df, id_user):
                list_recommendation.append(i+1)
    
    return list_recommendation

def get_seen_movie(df, id_user):
    return get_df_movie_from_id(get_list_seen_movie_id(df, id_user))

def get_recomm_movie(df, id_user):
    import time
    
    time_start = time.time()
    list_recomm = recommend_item(df, id_user)[:5]
    time_end = time.time()
    print('process time: {}'.format(time_end - time_start))

    return get_df_movie_from_id(list_recomm)

### 해당 유저가 본 영화목록을 보여준다

In [123]:
get_seen_movie(df, 3)

Unnamed: 0,item_id,title,genres
0,60,"Indian in the Cupboard, The (1995)",Adventure|Children|Fantasy
1,110,Braveheart (1995),Action|Drama|War
2,247,Heavenly Creatures (1994),Crime|Drama
3,267,Major Payne (1995),Comedy
4,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
5,318,"Shawshank Redemption, The (1994)",Crime|Drama
6,355,"Flintstones, The (1994)",Children|Comedy|Fantasy
7,356,Forrest Gump (1994),Comedy|Drama|Romance|War
8,377,Speed (1994),Action|Romance|Thriller
9,527,Schindler's List (1993),Drama|War


### 추천 시스템에 의해서 추천된 영화목록을 보여준다 (예측평점 상위 5개)

In [124]:
get_recomm_movie(df, 3)

process time: 0.3055093288421631


Unnamed: 0,item_id,title,genres
