# == 목차 == 
# 1. pre-processing for movie-lens data (ml-latest-small, 2.4M)
# 2. 모델 정의
# 3. 테스트
# 4. cross validation
# 5. 실제 구현

## =====================================================================
# 1. pre-processing for movie-lens data (ml-latest-small, 2.4M)
## users: 671, movies: 9066

## ● 기본 데이터셋 생성

In [1]:
import pandas as pd
import numpy as np

In [2]:
file_path_movie = '/Users/morulabs/dev/source/git_hub/recsys/movie-lens/ml-latest-small/movies.csv'
file_path_rating = '/Users/morulabs/dev/source/git_hub/recsys/movie-lens/ml-latest-small/ratings.csv'

### 영화 데이터프레임

In [3]:
header_movie = ['item_id', 'title', 'genres']
df_movies = pd.read_csv(file_path_movie, header=0, names=header_movie)
df_movies

Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


### 평점 데이터프레임

In [4]:
header_rating = ['user_id', 'item_id', 'rating', 'timestamp']
df_ratings = pd.read_csv(file_path_rating, header=0, names=header_rating)
df_ratings

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


### 영화 id의 최대 숫자가 163949로 sparse matrix 를 생성했을 때 용량이 매우 커지게 된다.
### 이 문제를 해결하기 위해서 영화별 unique id 를 생성하여 mapping table로 관리한다.
### unique id는 영화별 1씩 증가하는 숫자

In [5]:
i = df_ratings['item_id']
np_movie_unique = np.unique(np.array(i), axis=0)

list_movie = []
for i, movie_id in enumerate(np_movie_unique):
    list_movie.append((i+1, movie_id))
    
df_mapping = pd.DataFrame(list_movie, columns=['movie_unique_id', 'item_id'])
df_mapping

Unnamed: 0,movie_unique_id,item_id
0,1,1
1,2,2
2,3,3
3,4,4
4,5,5
5,6,6
6,7,7
7,8,8
8,9,9
9,10,10


### 위에서 만든 mapping table로 df_ratings의 item_id를 movie_unique_id로 바꾼다.

In [6]:
df_join = pd.merge(df_ratings, df_mapping, on=['item_id'])

del df_join['item_id']
columns_new = ['user_id', 'movie_unique_id', 'rating', 'timestamp']
df_join = df_join[columns_new]
df = df_join.rename(columns={'movie_unique_id': 'item_id'})

df

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,31,2.5,1260759144
1,7,31,3.0,851868750
2,31,31,4.0,1273541953
3,32,31,4.0,834828440
4,36,31,3.0,847057202
5,39,31,3.0,832525157
6,73,31,3.5,1255591860
7,88,31,3.0,1239755559
8,96,31,2.5,1223256331
9,110,31,4.0,840100695


In [7]:
import scipy
from scipy import sparse

r = df['rating'].astype(float)
u = df['user_id'].astype(int)
i = df['item_id'].astype(int)

mat_spar_rating = sparse.csr_matrix(
        (r, (u, i)),
        dtype=np.float
)

mat_spar_rating.shape

(672, 9067)

## ● 모델의 성능평가를 위해 위에서 만든 기본 데이터셋을 훈련/테스트 셋으로 나눈다.
###  실제 train / test 데이터를 반환하는 부분인 1)과 교차검증 함수를 정의한 2)로 나뉜다.
### 1) create train / test split (randomly)
#### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;무작위로 데이터를 train set과 test set으로 나눈다. ratio_test=0.25면 train:test (0.75:0.25)
### 2) create function for checking cross validation
#### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;cross validation check 를 위한 함수 정의 
#### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;k_fold 이 5이면 train:test (0.8:0.2) 비율로 테스트를 5번 수행하고 평균값을 냄
#### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;k_fold 이 10이면 train:test (0.9:0.1) 비율로 테스트를 10번 수행하고 평균값을 냄


In [8]:
def convert_df_mat(df):
    r = df['rating'].astype(float)
    u = df['user_id'].astype(int)
    i = df['item_id'].astype(int)
    
    mat_spar = sparse.csr_matrix(
            (r, (u, i)),
            dtype=np.float
    )
    
    return mat_spar


def delete_zero_idx_mat(mat_spar):
    df_spar = pd.DataFrame(mat_spar.toarray())
    df_spar = df_spar.drop(df_spar.index[0], axis=0)
    df_spar = df_spar.drop(df_spar.columns[0], axis=1)
    
    return sparse.csr_matrix(df_spar)


def make_df_from_mat(mat_spar, num_movie):
    df_spar = pd.DataFrame(mat_spar.toarray())
    
    if df_spar.shape[1] < num_movie:
        num_row = df_spar.shape[0]
        num_col = df_spar.shape[1]
        num_diff = num_movie - num_col
        
        np_add = np.zeros([num_row, num_diff])
        df_add = pd.DataFrame(np_add, index=np.arange(num_row), 
                              columns=(num_col + np.arange(num_diff)))
        
        df_spar = pd.concat([df_spar, df_add], axis=1)
        
    return df_spar

def convert_df_spar(df, max_id_item):
    mat = delete_zero_idx_mat(convert_df_mat(df))
    df_spar = make_df_from_mat(mat, max_id_item)
    
    return df_spar

### 1) randomly create train / test split

In [9]:
# split data to train(0.75) / test(0.25) by movies ranked for all users
# and make it sparse matrix

def split_train_test_random(df, ratio_test=0.25):
    mat_spar = convert_df_mat(df)
    n_1 = mat_spar.nonzero()[0]
    n_2 = mat_spar.nonzero()[1]
    np_sparse = np.dstack((n_1, n_2))[0, :, :]
    
    df_sparse_nonzero_index = pd.DataFrame(np_sparse, columns=['user', 'movie'])
    df_group = df_sparse_nonzero_index.groupby('user')['movie'].apply(np.array).reset_index(name='movies')
    
    list_movie_2d = [np_movie for np_movie in df_group['movies']]
    
    list_test = list(map(lambda x: np.random.choice(
        x, size=int(len(x) / int(1/ratio_test)), replace=False), list_movie_2d))
    list_train = []
    for i, movies_all in enumerate(list_movie_2d):
        movies_test = list_test[i]
        movies_train = list(set(movies_all) - set(movies_test))
        
        list_train.append(movies_train)
    
    lens_test = [len(movies) for movies in list_test]
    df_test = pd.DataFrame({'user_id': np.repeat(df_group['user'], lens_test),
                            'item_id': np.concatenate(list_test)})
    
    lens_train = [len(movies) for movies in list_train]
    df_train = pd.DataFrame({'user_id': np.repeat(df_group['user'], lens_train),
                             'item_id': np.concatenate(list_train)})
    
    df_train = pd.merge(df_train, df, on=['user_id', 'item_id'])
    df_test = pd.merge(df_test, df, on=['user_id', 'item_id'])
    
    return df_train, df_test
    

In [10]:
df_train, df_test = split_train_test_random(df, ratio_test=0.25)
df_spar_train = convert_df_spar(df_train, df.item_id.max())
df_spar_test = convert_df_spar(df_test, df.item_id.max())


In [11]:
print('shape of train: {}'.format(df_spar_train.shape))
print('shape of test: {}'.format(df_spar_test.shape))

shape of train: (671, 9066)
shape of test: (671, 9066)


### 2) creating function for checking cross validation

In [12]:
def get_dataset(df, k_fold, offset):
    mat_spar = convert_df_mat(df)
    n_1 = mat_spar.nonzero()[0]
    n_2 = mat_spar.nonzero()[1]
    np_sparse = np.dstack((n_1, n_2))[0, :, :]
    
    df_sparse_nonzero_index = pd.DataFrame(np_sparse, columns=['user', 'movie'])
    df_group = df_sparse_nonzero_index.groupby('user')['movie'].apply(np.array).reset_index(name='movies')
    
    list_movie_2d = [np_movie for np_movie in df_group['movies']]
    
    list_test = list(map(lambda x: np.array_split(x, k_fold)[offset], list_movie_2d))
    list_train = []
    for i, movies_all in enumerate(list_movie_2d):
        movies_test = list_test[i]
        movies_train = list(set(movies_all) - set(movies_test))
        
        list_train.append(movies_train)
    
    lens_train = [len(movies) for movies in list_train]
    df_train_without_rating = pd.DataFrame({'user_id': np.repeat(df_group['user'], lens_train),
                             'item_id': np.concatenate(list_train)})

    lens_test = [len(movies) for movies in list_test]
    df_test_without_rating = pd.DataFrame({'user_id': np.repeat(df_group['user'], lens_test),
                            'item_id': np.concatenate(list_test)})
    
    df_train = pd.merge(df_train_without_rating, df, on=['user_id', 'item_id'])
    df_test = pd.merge(df_test_without_rating, df, on=['user_id', 'item_id'])

    return df_train, df_test


In [13]:
def get_rmse(mat_spar_test, mat_spar_predict):
    def rmse(actual, prediction):
        tu_new = actual.nonzero()[0], actual.nonzero()[1]
        pred = prediction[tu_new]
        actu = actual[tu_new]
        
        return sqrt(mean_squared_error(actu, pred))

    rmse = rmse(mat_spar_test, mat_spar_predict)
    # print('rmse for item-based is {}'.format(rmse))

    return rmse

def cross_validation(df, model, dict_args, k_fold=5):
    list_validations = []
    max_id_movies = df.item_id.max()
    for offset in range(k_fold):
        df_train_set, df_validation_set = get_dataset(df, k_fold, offset)
        df_spar_train_set = convert_df_spar(df_train_set, max_id_movies)
        df_spar_validation_set = convert_df_spar(df_validation_set, max_id_movies)
        np_result = model(df_spar_train_set, **dict_args)

        mat_spar_test = sparse.csr_matrix(df_spar_validation_set)
        mat_spar_predict = sparse.csr_matrix(np_result)

        list_validations.append(get_rmse(mat_spar_test, mat_spar_predict))
        
    avg_rmse = np.mean(list_validations)
    return list_validations, avg_rmse

## =====================================================================
# 2. 모델 정의

## ● User-based

#### KNN

In [14]:
from sklearn.neighbors import NearestNeighbors

def findksimilarusers(id_user, df_spar, metric='cosine', k=5):
    model_knn = NearestNeighbors(metric=metric, algorithm='brute') 
    model_knn.fit(df_spar)

    distances, indices = model_knn.kneighbors(df_spar.iloc[id_user - 1, :].values.reshape(1, -1), n_neighbors = k + 1)
    similarities = 1 - distances.flatten()

    return similarities, indices.flatten()

#### 특정 유저의 특정 영화에 대한 평점 예측 ( user:movie -> 1:1 )

In [15]:
def predict_userbased_user_item(id_user, id_item, df_spar, metric='cosine', k=5):
    similarities, indices = findksimilarusers(id_user, df_spar, metric, k)
    series_au = df_spar.iloc[id_user - 1, :]
    mean_rating_au = series_au.sum() / len(series_au.nonzero()[0])
    
    sum_sim = sum_wtd= prediction = 0
    for i, indice in enumerate(indices):
        if indice + 1 == id_user:
            continue
        else:
            rating_ui = df_spar.iloc[indice, id_item - 1]
            if rating_ui != 0:
                series_u = df_spar.iloc[indice, :]
                sim = similarities[i]

                mean_rating_u = sum(series_u) / len(series_u.nonzero()[0])
                rating_diff = rating_ui - mean_rating_u 
                product = rating_diff * sim
                sum_wtd += product
                sum_sim += sim

    if sum_sim == 0:
        prediction = 0
    else:
        prediction = mean_rating_au + (sum_wtd / sum_sim)
    
    return prediction

#### 특정 유저의 모든 영화에 대한 평점 예측 ( user:movie -> 1:all )

In [16]:
def predict_userbased_user_allitems(id_user, df_spar, metric='cosine', k=5):
    similarities, indices = findksimilarusers(id_user, df_spar, metric, k)
    series_au = df_spar.iloc[id_user - 1, :]
    mean_rating_au = series_au.sum() / len(series_au.nonzero()[0])
    
    print(id_user, df_spar.shape[1] + 1)
    list_pred_user = []
    for id_item in range(1, df_spar.shape[1] + 1):
        sum_sim = sum_wtd = prediction = 0
        for i, indice in enumerate(indices):
            if indice + 1 == id_user:
                continue
            else:
                rating_ui = df_spar.iloc[indice, id_item - 1]
                if rating_ui != 0:
                    series_u = df_spar.iloc[indice, :]
                    sim = similarities[i]

                    mean_rating_u = sum(series_u) / len(series_u.nonzero()[0])
                    rating_diff = rating_ui - mean_rating_u 
                    product = rating_diff * sim
                    sum_wtd += product
                    sum_sim += sim

        if sum_sim == 0:
            prediction = 0
        else:
            prediction = mean_rating_au + (sum_wtd / sum_sim)
            
        list_pred_user.append(prediction)
    
    return list_pred_user

#### Model 생성 (recommend_userbased)

In [17]:
def work(df_spar, metric, k, id_user):
        return id_user, predict_userbased_user_allitems(id_user, df_spar, metric=metric, k=k)

def recommend_userbased(df_spar, metric='cosine', k=5, cores=4):
    import multiprocessing
    from multiprocessing import Pool
    from functools import partial
    import time
    
    if cores == '*':
        cores = multiprocessing.cpu_count()
 
    time_start = time.time()
    
    p = Pool(processes=cores)
    iterable = list(range(1, df_spar.shape[0] + 1))
    func = partial(work, df_spar, metric, k)
    result_pred = p.map(func, iterable)
    p.close()
    
    time_end = time.time()
    
    time_process = time_end - time_start
    print('processing time is: {} seconds'.format(time_process))
    
    list_pred_ratings_all = sorted(result_pred, key=lambda x: x[0])
    np_pred_ratings_all = np.array(list(map(lambda x: x[1], list_pred_ratings_all)))

    return np_pred_ratings_all

## ● Item-based

#### KNN

In [18]:
from sklearn.neighbors import NearestNeighbors

def find_similar_items(id_item, df_spar, metric='cosine', k=5):
    df_T = df_spar.T
    model_knn = NearestNeighbors(metric=metric, algorithm='brute') 
    model_knn.fit(df_T)

    similarities, indices = model_knn.kneighbors(df_T.iloc[id_item - 1, :].values.reshape(1, -1), n_neighbors = k + 1)
    similarities = 1 - similarities.flatten()

    return similarities, indices.flatten()

#### 특정 영화의 특정 유저에 대한 평점 예측 (user:movie -> 1:1)

In [19]:
def predict_itembased_user_item(id_user, id_item, df_spar, metric='cosine', k=5):
    similarities, indices = find_similar_items(id_item, df_spar, metric, k) 
    
    prediction = sum_wtd = sum_sim = 0
    for i, indice in enumerate(indices):
        if indice + 1 == id_item:
            continue
        else:
            rating_ui = df_spar.iloc[id_user - 1, indice]
            if rating_ui != 0:
                sim_item = similarities[i]

                product = rating_ui * sim_item
                sum_wtd += product
                sum_sim += sim_item

    if sum_sim == 0:
        prediction = 0
    else:
        prediction = (sum_wtd / sum_sim)
            
    return prediction

#### 특정 영화의 모든 유저에 대한 평점 예측 (user:movie -> all:1) 

In [20]:
def predict_itembased_item_allusers(id_item, df_spar, metric='cosine', k=5):
    similarities, indices = find_similar_items(id_item, df_spar, metric, k)
    
    list_pred_all_rating = []
    for id_user in range(1, df_spar.shape[0] + 1):
        prediction = sum_wtd = sum_sim = 0
    
        for i, indice in enumerate(indices):
            if indice + 1 == id_item:
                continue
            else:
                rating_ui = df_spar.iloc[id_user - 1, indice]
                if rating_ui != 0:
                    sim_item = similarities[i]

                    product = rating_ui * sim_item
                    sum_wtd += product
                    sum_sim += sim_item
                    
        if sum_sim == 0:
            prediction = 0
        else:
            prediction = (sum_wtd / sum_sim)
        
        list_pred_all_rating.append(prediction)
    
    return list_pred_all_rating
    
    

#### Model 생성 (recommend_itembased)

In [21]:
def work(df_spar, metric, k, id_item):
        return id_item, predict_itembased_item_allusers(id_item, df_spar, metric, k=k)

def recommend_itembased(df_spar, metric='cosine', k=5, cores=4):
    import multiprocessing
    from multiprocessing import Pool
    from functools import partial
    import time
    
    if cores == '*':
        cores = multiprocessing.cpu_count()
 
    time_start = time.time()
    
    p = Pool(processes=cores)
    iterable = list(range(1, df_spar.shape[1] + 1))
    func = partial(work, df_spar, metric, k)
    result_pred = p.map(func, iterable)
    p.close()
    
    time_end = time.time()
    
    time_process = time_end - time_start
    print('processing time is: {} seconds'.format(time_process))
    
    list_pred_ratings_all = sorted(result_pred, key=lambda x: x[0])
    np_pred_ratings_all = np.array(list(map(lambda x: x[1], list_pred_ratings_all)))

    return np_pred_ratings_all.T

## ● Item-based adj-cosine

#### 유사도 매트릭스 (Item-based adj cosine에만 적용)

#### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;user_A = [0 0 0 5 6 0]
#### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;user_B = [0 2 3 4 0 0]
#### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;두 유저에 대한 아이템 평가가 위와 같이 있을 때 adj cosine 을 적용하는 방법
#### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;case 1. 모든 평점 값에(0 포함) 평점평균값을 빼준 후 벡터간의 cosine similarity를 계산
#### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;case 2. 평점이 있는 값들만(0 제외) 평점평균값을 빼준 후 벡터간의 cosine similarity를 계산
#### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;case 1과 2를 각각 돌려보고 rmse 값의 차이를 확인한다.




#### case 1.

In [22]:
import time
from scipy.spatial.distance import pdist, squareform

def get_sim_matrix_sub_all(df_spar):
    M = np.array(df_spar)
    M_u = M.mean(axis=1)
    M_adj = M - M_u[:, None]
    
    time_start = time.time()
    sim_matrix_sub_all = 1 - squareform(pdist(M_adj.T, 'cosine'))
    time_end = time.time()
    print(time_end - time_start)
    
    sim_matrix_sub_all = pd.DataFrame(sim_matrix_sub_all)
    
    return sim_matrix_sub_all

#### case 2.

In [23]:
import time
from scipy.spatial.distance import pdist, squareform

def get_sim_matrix_sub_exist(df_spar):
    M = np.array(df_spar)
    list_mean = []
    for rating_user in M:
        np_idx_nonzero = rating_user.nonzero()[0]
        mean = np.mean(rating_user[np_idx_nonzero])
        list_mean.append(mean)
    M_u = np.array(list_mean)
    
    M_adj = np.zeros([df_spar.shape[0], df_spar.shape[1]])

    for row, np_ratings in enumerate(M):
        np_idx_nonzero = np_ratings.nonzero()[0]
        M_adj[row][np_idx_nonzero] = np_ratings[np_idx_nonzero] - M_u[row]
    
    time_start = time.time()
    sim_matrix_sub_exist = 1 - squareform(pdist(M_adj.T, 'cosine'))
    time_end = time.time()
    print(time_end - time_start)
    
    mask = np.isnan(sim_matrix_sub_exist)
    sim_matrix_sub_exist[mask] = 0
    
    sim_matrix_sub_exist = pd.DataFrame(sim_matrix_sub_exist)
    
    return sim_matrix_sub_exist

#### KNN

In [24]:
def find_similar_k_items_adjcos(sim_matrix, id_item, k=5):
    np_top_k = sim_matrix[id_item-1].sort_values(ascending=False)[:k+1]
    similarities = np_top_k.values
    indices = np.array(np_top_k.index)

    return similarities, indices

#### 특정 유저의 특정 영화에 대한 평점 예측 (user:movie -> 1:1)

In [25]:
def predict_itembased_user_item_adjcos(id_user, id_item, df_spar, sim_matrix, k=5):
    similarities, indices = find_similar_k_items_adjcos(sim_matrix, id_item, k) 

    prediction = sum_wtd = sum_sim = 0
    for i, indice in enumerate(indices):
        if indice + 1 == id_item:
            continue
        else:
            rating_ui = df_spar.iloc[id_user - 1, indice]
            if rating_ui != 0:
                sim_item = similarities[i]

                product = rating_ui * sim_item
                sum_wtd += product
                sum_sim += sim_item
            
    if sum_sim == 0:
        prediction = 0
    else:
        prediction = (sum_wtd / sum_sim)

    return prediction

#### 특정 영화의 모든 유저에 대한 평점 예측 (user:movie -> all:1) 

In [26]:
def predict_itembased_item_allusers_adjcos(id_item, df_spar, sim_matrix, k=5):
    similarities, indices = find_similar_k_items_adjcos(sim_matrix, id_item, k)
    
    list_pred_all_rating = []
    for id_user in range(1, df_spar.shape[0] + 1):
        prediction = sum_wtd = sum_sim = 0
        
        for i, indice in enumerate(indices):
            if indice + 1 == id_item:
                continue
            else:
                rating_ui = df_spar.iloc[id_user - 1, indice]
                if rating_ui != 0:
                    sim_item = similarities[i]

                    product = rating_ui * sim_item
                    sum_wtd += product
                    sum_sim += sim_item
        
        if sum_sim == 0:
            prediction = 0
        else:
            prediction = (sum_wtd / sum_sim)
            
        list_pred_all_rating.append(prediction)
        
    return list_pred_all_rating

#### 모델 생성 (recommend_itembased_adjcosine)

#### case 1.

In [27]:
def work_all(df_spar, sim_matrix, k, id_item):
    return id_item, predict_itembased_item_allusers_adjcos(id_item, df_spar, sim_matrix, k=k)

def recommend_itembased_adjcosine_all(df_spar, sim_matrix, k=5, cores=4):
    import multiprocessing
    from multiprocessing import Pool
    from functools import partial
    import time
    
    if cores == '*':
        cores = multiprocessing.cpu_count()
 
    time_start = time.time()

    sim_matrix = get_sim_matrix_sub_all(df_spar)
    p = Pool(processes=cores)
    iterable = list(range(1, df_spar.shape[1] + 1))
    func = partial(work_all, df_spar, sim_matrix, k)
    result_pred = p.map(func, iterable)
    p.close()
    
    time_end = time.time()
    
    time_process = time_end - time_start
    print('processing time is: {} seconds'.format(time_process))
    
    list_pred_ratings_all = sorted(result_pred, key=lambda x: x[0])
    np_pred_ratings_all = np.array(list(map(lambda x: x[1], list_pred_ratings_all)))

    return np_pred_ratings_all.T

#### case 2.

In [28]:
def work_exist(df_spar, sim_matrix, k, id_item):
    return id_item, predict_itembased_item_allusers_adjcos(id_item, df_spar, sim_matrix, k=k)

def recommend_itembased_adjcosine_exist(df_spar, sim_matrix, k=5, cores=4):
    import multiprocessing
    from multiprocessing import Pool
    from functools import partial
    import time
    
    if cores == '*':
        cores = multiprocessing.cpu_count()
 
    time_start = time.time()

    sim_matrix = get_sim_matrix_sub_exist(df_spar)
    p = Pool(processes=cores)
    iterable = list(range(1, df_spar.shape[1] + 1))
    func = partial(work_exist, df_spar, sim_matrix, k)
    result_pred = p.map(func, iterable)
    p.close()
    
    time_end = time.time()
    
    time_process = time_end - time_start
    print('processing time is: {} seconds'.format(time_process))
    
    list_pred_ratings_exist = sorted(result_pred, key=lambda x: x[0])
    list_pred_ratings_exist = np.array(list(map(lambda x: x[1], list_pred_ratings_exist)))

    return list_pred_ratings_exist.T

## ● SVD

#### 모델 생성 (SVD)

In [29]:
def model_svd(df_spar, k_input, val_adj):
    np_spar = np.array(df_spar)
    mean_user_ratings = np.mean(np_spar, axis = 1)
    np_train_demeaned = np_spar - mean_user_ratings.reshape(-1, 1)
    
    from scipy.sparse.linalg import svds
    u, sigma, vt = svds(np_train_demeaned, k = k_input)
    
    sigma = np.diag(sigma)
    
    np_pred_ratings_all_users = np.dot(np.dot(u, sigma), vt) + mean_user_ratings.reshape(-1, 1) + val_adj
    
    return np_pred_ratings_all_users

## =====================================================================
# 3. 테스트

## ● User-based

#### 특정 유저의 모든 영화에 대한 평점을 예측하는 데 걸리는 시간 (user : movie -> 1 : all) &nbsp;&nbsp;(영화수: 9066)

In [30]:
import time

time_start = time.time()
pred = predict_userbased_user_allitems(43, df_spar_train)
time_end = time.time()

print('한 유저의 모든 영화에 대한 평점을 예측하는 데 걸리는 시간: {} seconds'.format(time_end - time_start))
print('영화 수: {}'.format(len(pred)))

43 9067
한 유저의 모든 영화에 대한 평점을 예측하는 데 걸리는 시간: 1.089385986328125 seconds
영화 수: 9066


#### 모델을 돌려서 예측 값 매트릭스를 도출
##### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;싱글코어로 671 유저에 대한 평점을 예측하는 데 시간이 오래 걸림
##### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;싱글코어와 멀티코어 두가지 방법을 사용해서 시간을 측정하고 rmse 값을 비교 (동일한 결과 증명)

#### Single-core test

In [None]:
import time

time_start = time.time()

list_result = []
for i in range(1, df_spar_train.shape[0] + 1):
    list_pred_user = predict_userbased_user_allitems(i, df_spar_train)
    list_result.append(list_pred_user)

time_end = time.time()
time_process = time_end - time_start

print('process time for single core is: {} seconds'.format(time_process))
    
np_singlecore_result = np.array(list_result)
    

#### Single-core RMSE

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt
from scipy import sparse

def rmse(actual, prediction):
    tu_new = actual.nonzero()[0] , actual.nonzero()[1]
    pred = prediction[tu_new]
    actu = actual[tu_new]
    
    return sqrt(mean_squared_error(actu, pred))

rmse_user = rmse(sparse.csr_matrix(df_spar_test), sparse.csr_matrix(np_singlecore_result))
print('rmse for user-based is {}'.format(rmse_user))


#### Multi-core test

In [36]:
df_spar_train.shape

(671, 9066)

In [31]:
np_multicore_result = recommend_userbased(df_spar_train, cores='*')

processing time is: 45.41520094871521 seconds


In [33]:
np_multicore_result.shape

(671, 671)

In [35]:
len(np_multicore_result[0])

671

#### Multi-core RMSE

In [36]:
from sklearn.metrics import mean_squared_error
from math import sqrt
from scipy import sparse

def rmse(actual, prediction):
    tu_new = actual.nonzero()[0], actual.nonzero()[1]
    pred = prediction[tu_new]
    actu = actual[tu_new]
    
    return sqrt(mean_squared_error(actu, pred))

rmse_user = rmse(sparse.csr_matrix(df_spar_test), sparse.csr_matrix(np_multicore_result))
print('rmse for user-based is {}'.format(rmse_user))

IndexError: index (9065) out of range

## ● Item-based

#### 특정 영화의 모든 유저에 대한 평점 예측하는 데 걸리는 시간 (user : movie -> 1 : all) &nbsp;&nbsp;(유저수: 671)

In [35]:
import time

time_start = time.time()
predict_itembased_item_allusers(9063, df_spar_train, metric='cosine', k=5)
time_end = time.time()

print('process time: {} seconds'.format(time_end - time_start))

process time: 0.12210988998413086 seconds


#### test

In [36]:
np_multicore_result = recommend_itembased(df_spar_train, metric='cosine', k=5, cores='*')

processing time is: 293.2637848854065 seconds


#### RMSE

In [37]:
from sklearn.metrics import mean_squared_error
from math import sqrt
from scipy import sparse

def rmse(actual, prediction):
    tu_new = actual.nonzero()[0], actual.nonzero()[1]
    pred = prediction[tu_new]
    actu = actual[tu_new]
    
    return sqrt(mean_squared_error(actu, pred))

rmse_item = rmse(sparse.csr_matrix(df_spar_test), sparse.csr_matrix(np_multicore_result))
print('rmse for item-based is {}'.format(rmse_item))

rmse for item-based is 2.641884937004804


## ● Item-based adj-cosine

#### case 1.

In [38]:
import time

time_start = time.time()
sim_matrix = get_sim_matrix_sub_all(df_spar_train)
predict_itembased_item_allusers_adjcos(9063, df_spar_train, sim_matrix, k=5)
time_end = time.time()

print('process time (subtract all): {} seconds'.format(time_end - time_start))

25.129395246505737
process time (subtract all): 25.21860384941101 seconds


#### case 2.

In [40]:
import time

time_start = time.time()
sim_matrix = get_sim_matrix_sub_exist(df_spar_train)
predict_itembased_item_allusers_adjcos(9063, df_spar_train, sim_matrix, k=5)
time_end = time.time()

print('process time (subtract exist): {} seconds'.format(time_end - time_start))

20.033955574035645
process time (subtract exist): 20.198989391326904 seconds


#### test

#### case 1.

In [41]:
sim_matrix_all = get_sim_matrix_sub_all(df_spar_train)
np_multicore_result_T_all = recommend_itembased_adjcosine_all(df_spar_train, sim_matrix_all, k=5, cores='*')

24.540088176727295
19.925617694854736
processing time is: 230.65127539634705 seconds


#### case 2.

In [45]:
sim_matrix_exist = get_sim_matrix_sub_exist(df_spar_train)
np_multicore_result_T_exist = recommend_itembased_adjcosine_exist(df_spar_train, sim_matrix_exist, k=5, cores='*')

20.199281454086304
19.911203145980835
processing time is: 228.51010656356812 seconds


#### RMSE

#### case 1.

In [43]:
from sklearn.metrics import mean_squared_error
from math import sqrt
from scipy import sparse

def rmse(actual, prediction):
    tu_new = actual.nonzero()[0], actual.nonzero()[1]
    pred = prediction[tu_new]
    actu = actual[tu_new]
    
    return sqrt(mean_squared_error(actu, pred))

rmse_user = rmse(sparse.csr_matrix(df_spar_test), sparse.csr_matrix(np_multicore_result_T_all))
print('rmse for item-based is {}'.format(rmse_user))

rmse for item-based is 2.599106081924889


#### case 2.

In [46]:
from sklearn.metrics import mean_squared_error
from math import sqrt
from scipy import sparse

def rmse(actual, prediction):
    tu_new = actual.nonzero()[0], actual.nonzero()[1]
    pred = prediction[tu_new]
    actu = actual[tu_new]
    
    return sqrt(mean_squared_error(actu, pred))

rmse_user = rmse(sparse.csr_matrix(df_spar_test), sparse.csr_matrix(np_multicore_result_T_exist))
print('rmse for item-based is {}'.format(rmse_user))

rmse for item-based is 3.398406129171345


## ● SVD

#### 모든 유저의 모든 영화에 대한 평점을 예측하는 데 걸리는 시간 (user : movie -> all : all) &nbsp;&nbsp;(유저수: 671, 영화수: 9066)

In [30]:
import time

time_start = time.time()
np_preds = model_svd(df_spar_train, 50, 3)
time_end = time.time()

print('모든 유저의 모든 영화에 대한 평점을 예측하는 데 걸리는 시간: {} seconds'.format(time_end - time_start))
print('영화 수: {}'.format(len(np_preds)))


모든 유저의 모든 영화에 대한 평점을 예측하는 데 걸리는 시간: 1.1176979541778564 seconds
영화 수: 671


#### RMSE

In [31]:
from sklearn.metrics import mean_squared_error
from math import sqrt
from scipy import sparse

def rmse(actual, prediction):
    tu_new = actual.nonzero()[0], actual.nonzero()[1]
    pred = prediction[tu_new]
    actu = actual[tu_new]
    
    return sqrt(mean_squared_error(actu, pred))

rmse_user = rmse(sparse.csr_matrix(df_spar_test), sparse.csr_matrix(np_preds))
print('rmse for item-based is {}'.format(rmse_user))

rmse for item-based is 1.1573615600637281


## =====================================================================
# 4. Cross validation

## ● User-based

In [None]:
# cross_validation(df, model, dict_args, k_fold=5)

dict_args_user = {'metric': 'cosine', 'k': 5, 'cores': '*'}
list_validations, avg_rmse = cross_validation(df, recommend_userbased, dict_args_user, k_fold=5)

print('1. list of validations: {}'.format(list_validations))
print()
print('2. average of rmse: {}'.format(avg_rmse))

## ● Item-based

In [None]:
# cross_validation(df, model, dict_args, k_fold=5)

dict_args_item = {'metric': 'cosine', 'k': 5, 'cores': '*'}
list_validations, avg_rmse =  cross_validation(df, recommend_itembased, dict_args_item, k_fold=5)

print('1. list of validations: \n{}'.format(list_validations))
print()
print('2. average of rmse: \n{}'.format(avg_rmse))

## ● Item-based adj-cosine

In [None]:
df_spar = convert_df_spar(df, df.item_id.max())

#### case 1.

In [None]:
# cross_validation(df, model, dict_args, k_fold=5)

sim_matrix_all = get_sim_matrix_sub_all(df_spar)
dict_args_item_adj_all = {'sim_matrix': sim_matrix_all, 'k': 5, 'cores': '*'}
list_validations_all, avg_rmse_all = \
        cross_validation(df, recommend_itembased_adjcosine_all, dict_args_item_adj_all, k_fold=5)

print('1. list of validations: \n{}'.format(list_validations_all))
print()
print('2. average of rmse: \n{}'.format(avg_rmse_all))

#### case 2.

In [None]:
# cross_validation(df, model, dict_args, k_fold=5)

sim_matrix_exist = get_sim_matrix_sub_exist(df_spar)
dict_args_item_adj_exist = {'sim_matrix': sim_matrix_exist, 'k': 5, 'cores': '*'}
list_validations_all, avg_rmse_all = \
        cross_validation(df, recommend_itembased_adjcosine_exist, dict_args_item_adj_exist, k_fold=5)

print('1. list of validations: \n{}'.format(list_validations_all))
print()
print('2. average of rmse: \n{}'.format(avg_rmse_all))

## ● SVD

In [None]:
# cross_validation(df, model, dict_args, k_fold=5)

dict_args_svd = {'k_input': 50, 'val_adj': 3}
list_validations, avg_rmse = cross_validation(df, model_svd, dict_args, k_fold=5)

print('1. list of validations: {}'.format(list_validations))
print()
print('2. average of rmse: {}'.format(avg_rmse))

## =====================================================================
# 5. 실제 추천

#### 특정 유저에게 영화를 추천
###### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;1) 3점 이상의 예측 평점을 가진 영화 중 보지 않은 영화를 선별
###### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;2) 1)의 결과 중 예측 평점이 높은 순서, movie_id 가 빠른 순서대로 top 5의 영화를 추천

#### 함수 정의

In [34]:
################### seen movie
def get_list_seen_movie_id(df, id_user):
    mat_spar = convert_df_mat(df)
    sparse_nonzero = mat_spar.nonzero()
    list_id_movie_seen = []
    for i, id_user_exist_rating in enumerate(sparse_nonzero[0]):
        if id_user_exist_rating == id_user:
            list_id_movie_seen.append(sparse_nonzero[1][i])    

    return list_id_movie_seen

def get_df_movie_from_id(list_id_movie):
    np_insert = np.array(list_id_movie).reshape(-1, 1)
    df_id_movie = pd.DataFrame(np_insert, columns=['movie_unique_id'])
    df_joined = pd.merge(df_id_movie, df_mapping, on=['movie_unique_id'])
    df_movie = pd.merge(df_joined, df_movies, on=['item_id']).drop(columns=['movie_unique_id'])
    
    return df_movie

def get_seen_movie(df, id_user):
    return get_df_movie_from_id(get_list_seen_movie_id(df, id_user))



################### recommend item
def get_recomm_list_user(df_spar, id_user, flag, cores):
    if flag == 'user-based':
        list_pred_rank = predict_userbased_user_allitems(id_user, df_spar) 
    elif flag == 'item-based':
        np_multicore_result = recommend_itembased(df_spar, cores='*')
        list_pred_rank = np_multicore_result[id_user - 1]
    elif flag == 'item-based-adjall':
        sim_matrix = get_sim_matrix_sub_all(df_spar)
        np_multicore_result = recommend_itembased_adjcosine_all(df_spar, sim_matrix, cores='*')
        list_pred_rank = np_multicore_result[id_user - 1]
    elif flag == 'item-based-adjexist':
        sim_matrix = get_sim_matrix_sub_exist(df_spar)
        np_multicore_result = recommend_itembased_adjcosine_exist(df_spar, sim_matrix, cores='*')
        list_pred_rank = np_multicore_result[id_user - 1]
    elif flag == 'svd':
        np_multicore_result = model_svd(df_spar, 50, 3)
        list_pred_rank = np_multicore_result[id_user - 1]
        
    return list_pred_rank


def recommend_item(df, id_user, flag, cores):
    df_spar = convert_df_spar(df, df.item_id.max())
    list_pred_rank = get_recomm_list_user(df_spar, id_user, flag, cores) 
    list_seen_movie = get_list_seen_movie_id(df, id_user)
    
    list_recommendation = []
    for i, rank in enumerate(list_pred_rank):
        if rank >= 3:
            if i + 1 not in list_seen_movie:
                list_recommendation.append(i + 1)
    
    return list_recommendation


def get_recomm_movie(df, id_user, flag='user-based', cores='*'):
    import time
    
    time_start = time.time()
    list_recomm = recommend_item(df, id_user, flag, cores)[:5]
    time_end = time.time()
    print('process time: {}'.format(time_end - time_start))

    return get_df_movie_from_id(list_recomm)

In [40]:
#### 해당 유저가 본 영화목록을 보여준다

get_seen_movie(df, 3)

Unnamed: 0,item_id,title,genres
0,60,"Indian in the Cupboard, The (1995)",Adventure|Children|Fantasy
1,110,Braveheart (1995),Action|Drama|War
2,247,Heavenly Creatures (1994),Crime|Drama
3,267,Major Payne (1995),Comedy
4,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
5,318,"Shawshank Redemption, The (1994)",Crime|Drama
6,355,"Flintstones, The (1994)",Children|Comedy|Fantasy
7,356,Forrest Gump (1994),Comedy|Drama|Romance|War
8,377,Speed (1994),Action|Romance|Thriller
9,527,Schindler's List (1993),Drama|War


## ● User-based

#### 추천 시스템에 의해서 추천된 영화목록을 보여준다 (예측평점 상위 5개)

In [41]:
get_recomm_movie(df, 3, flag='user-based')

process time: 33.851152181625366


Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,10,GoldenEye (1995),Action|Adventure|Thriller
2,16,Casino (1995),Crime|Drama
3,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
4,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller


## ● Item-based

#### 추천 시스템에 의해서 추천된 영화목록을 보여준다 (예측평점 상위 5개)

In [42]:
get_recomm_movie(df, 3, flag='item-based')

processing time is: 292.9621195793152 seconds
process time: 319.21243357658386


Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,7,Sabrina (1995),Comedy|Romance
2,16,Casino (1995),Crime|Drama
3,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
4,44,Mortal Kombat (1995),Action|Adventure|Fantasy


## ● Item-based adj-cosine

#### 추천 시스템에 의해서 추천된 영화목록을 보여준다 (예측평점 상위 5개)

In [52]:
get_recomm_movie(df, 3, flag='item-based-adjall')

24.897565603256226
20.11640501022339
processing time is: 230.7624499797821 seconds
process time: 284.28092551231384


Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,16,Casino (1995),Crime|Drama
2,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
3,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [53]:
get_recomm_movie(df, 3, flag='item-based-adjexist')

22.817161083221436
20.17572331428528
processing time is: 230.19850277900696 seconds
process time: 258.6686682701111


Unnamed: 0,item_id,title,genres
0,104,Happy Gilmore (1996),Comedy
1,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
2,780,Independence Day (a.k.a. ID4) (1996),Action|Adventure|Sci-Fi|Thriller
3,1089,Reservoir Dogs (1992),Crime|Mystery|Thriller
4,1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi


## ● SVD

#### 추천 시스템에 의해서 추천된 영화목록을 보여준다 (예측평점 상위 5개)

In [35]:
get_recomm_movie(df, 3, flag='svd')

process time: 1.686265230178833


Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,4,Waiting to Exhale (1995),Comedy|Drama|Romance
3,5,Father of the Bride Part II (1995),Comedy
4,6,Heat (1995),Action|Crime|Thriller
