In [33]:
# Created or modified on Sep 2022
# author: 임일
# Simple CF with EachMovie

import numpy as np
import pandas as pd

In [34]:
# Load the ratings from the CSV file.
# NOTE(review): the path is an absolute, machine-specific Windows path; adjust it
# to wherever the data lives locally.
ratings = pd.read_csv('C:/RecoSys/Data/EM_ratings.csv', encoding='utf-8')
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,5,911,3.0
1,5,52,2.0
2,5,609,1.0
3,5,946,2.0
4,5,342,2.0


In [35]:
# Split the ratings into train/test sets, then turn the train set into a
# full user x movie matrix.
from sklearn.model_selection import train_test_split

# The split is stratified on user_id (y), so the 25% test share is drawn
# per user rather than from the pooled ratings.
x = ratings.copy()
y = ratings['user_id']
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    stratify=y,
    random_state=12,
)

# Rows are users, columns are movies, cells are ratings (NaN where unrated)
rating_matrix = x_train.pivot(index='user_id', columns='movie_id', values='rating')
rating_matrix.head()

movie_id,2,3,4,5,6,7,8,9,10,11,...,1624,1625,1626,1627,1628,1631,1634,1635,1638,1648
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,,,,,,,,,,,...,,,,,,,,,1.0,
17,,,,,,,,,,,...,,,,,,,,,,
23,,,,4.0,,,,,,,...,,,,,,,,,,
27,,,,,,,,,,,...,,,,,,,,,,
33,,,,,,,,,,,...,,,,,,,,,,


In [36]:
# (number of users, number of movies) in the training rating matrix
rating_matrix.shape

(5759, 1512)

In [37]:
# Compute the cosine similarity between every pair of users in the train set
from sklearn.metrics.pairwise import cosine_similarity

# Unrated cells become 0 so that each user row is a dense numeric vector
matrix_dummy = rating_matrix.fillna(0)

# With a single argument, cosine_similarity compares the rows with themselves;
# the result is labelled with user ids on both axes.
user_similarity = pd.DataFrame(
    cosine_similarity(matrix_dummy),
    index=rating_matrix.index,
    columns=rating_matrix.index,
)

user_similarity.head()

user_id,5,17,23,27,33,71,119,130,160,162,...,74338,74352,74353,74364,74397,74404,74406,74409,74413,74418
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,1.0,0.020016,0.076249,0.0,0.0548,0.033218,0.0,0.0,0.090722,0.0,...,0.0,0.0,0.0,0.0,0.071067,0.092848,0.073828,0.0,0.062177,0.0
17,0.020016,1.0,0.146516,0.0,0.175499,0.0,0.069043,0.0,0.058108,0.0,...,0.212066,0.110697,0.0,0.189703,0.113798,0.0,0.0,0.0,0.0,0.109847
23,0.076249,0.146516,1.0,0.0,0.125353,0.072947,0.0,0.160539,0.0,0.0,...,0.180323,0.0,0.0,0.0,0.086701,0.0,0.072056,0.108399,0.0,0.07846
27,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.110208,0.066111,0.0,0.058606,0.0,0.0,0.0
33,0.0548,0.175499,0.125353,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.116637,0.0,0.0,0.0,0.0,0.0,0.077678,0.0,0.109033,0.140971


In [38]:
# Helper for computing the RMSE
def RMSE(y_true, y_pred):
    """Return the root-mean-square error between observed and predicted values.

    Args:
        y_true: Observed values (any array-like accepted by ``np.asarray``).
        y_pred: Predicted values, broadcastable against ``y_true``.

    Returns:
        The square root of the mean squared difference, as a NumPy float.
    """
    # Cast to float before subtracting: narrow integer arrays (e.g. int8)
    # would otherwise wrap around silently and produce a wrong error value.
    errors = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean(errors ** 2))

def score(model):
    """Evaluate a rating predictor on the held-out test set.

    Args:
        model: Callable ``model(user_id, movie_id)`` returning a predicted rating.

    Returns:
        The RMSE between the ratings stored in ``x_test`` and the predictions
        of ``model`` for the same (user, movie) pairs.
    """
    predicted = []
    for user, movie in zip(x_test['user_id'], x_test['movie_id']):
        predicted.append(model(user, movie))

    actual = np.array(x_test['rating'])
    return RMSE(actual, np.array(predicted))

In [39]:
# Predict a rating as the weighted mean of the ratings given to the movie.
# The weights are the similarities (user_similarity) between the given user
# and each user who rated the movie.
def cf_simple(user_id, movie_id, default_rating=3.0):
    """Predict ``user_id``'s rating of ``movie_id`` with simple user-based CF.

    Reads the module-level ``rating_matrix`` (users x movies, NaN where
    unrated) and ``user_similarity`` (users x users); both are expected to be
    indexed by the same set of user ids.

    Args:
        user_id: Id of the user to predict for.
        movie_id: Id of the movie to predict.
        default_rating: Value returned when no prediction can be made.

    Returns:
        The similarity-weighted mean of the known ratings of the movie, or
        ``default_rating`` when the movie or the user is unknown, or when the
        similarities to the movie's raters do not sum to a positive value.
    """
    # Cold start: an unknown movie has no ratings to average, and an unknown
    # user has no similarity column (looking it up would raise KeyError).
    if movie_id not in rating_matrix or user_id not in user_similarity:
        return default_rating

    # Ratings of this movie, restricted to the users who actually rated it
    movie_ratings = rating_matrix[movie_id].dropna()

    # Similarities between the current user and exactly those raters,
    # in the same order as movie_ratings
    sim_scores = user_similarity[user_id].loc[movie_ratings.index]

    # Weighted mean; fall back when there is no positive total weight
    weight_total = sim_scores.sum()
    if weight_total > 0:
        return np.dot(sim_scores, movie_ratings) / weight_total
    return default_rating

In [40]:
# Compute the accuracy: RMSE of cf_simple on the test set
score(cf_simple)

1.2098104587832816

#### 추천하기

In [45]:
# Load the data used for recommending (the full data set is needed here,
# not just the training split). This rebinds rating_matrix, which cf_simple
# reads as a global.
ratings = pd.read_csv('C:/RecoSys/Data/EM_ratings.csv', encoding='utf-8')
rating_matrix = ratings.pivot(index='user_id', columns='movie_id', values='rating')

# Build a movie_id -> title lookup table from the movie metadata file
movies = (
    pd.read_csv('C:/RecoSys/Data/movie.csv', encoding='utf-8')[['ID', 'Name']]
    .rename(columns={'ID': 'movie_id', 'Name': 'title'})
    .set_index('movie_id')
)

In [42]:
# Preview the movie lookup table (indexed by movie_id, one 'title' column)
movies.head()

Unnamed: 0_level_0,title
movie_id,Unnamed: 1_level_1
1,Toy Story
2,Jumanji
3,Grumpier Old Men
4,Waiting to Exhale
5,Father of the Bride Part II


In [43]:
# Compute the user-user cosine similarities on the current rating_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Missing ratings are replaced by 0 before the similarity computation
matrix_dummy = rating_matrix.fillna(0)
similarity_values = cosine_similarity(matrix_dummy)
user_ids = rating_matrix.index
user_similarity = pd.DataFrame(similarity_values, index=user_ids, columns=user_ids)

In [44]:
# Recommend movies
def recommender(user, n_items=10):
    """Return the titles of the unrated movies with the highest predicted rating.

    Reads the module-level ``rating_matrix`` and ``movies`` and predicts each
    candidate movie with ``cf_simple``.

    Args:
        user: Id of the user to recommend for; must be a row label of
            ``rating_matrix`` (otherwise the lookup raises ``KeyError``).
        n_items: Maximum number of movies to return.

    Returns:
        A Series of titles indexed by movie_id, ordered from highest to
        lowest predicted rating.
    """
    user_ratings = rating_matrix.loc[user]

    # A movie counts as already rated when it has any non-null rating.
    # Testing "> 0" instead would treat a rating of 0 as unrated and
    # recommend a movie the user has already seen.
    candidates = user_ratings[user_ratings.isna()].index

    # Predicted rating for every movie the user has not rated yet
    predictions = [cf_simple(user, item) for item in candidates]

    recommendations = pd.Series(data=predictions, index=candidates, dtype=float)
    # Keep the n_items highest predictions (positional slice after sorting)
    recommendations = recommendations.sort_values(ascending=False).iloc[:n_items]

    return movies.loc[recommendations.index, 'title']

# Call the recommender: top 10 movies for user 5
recommender(5, 10)

movie_id
1648                              The Game (1997)
1604                        In the Company of Men
142                                Shadows (1988)
1351                               Blood and Wine
1567            The Last Time I Committed Suicide
1572                          Contempt (Le M?ris)
1109    Charms Zwischenfaelle (Charm's Incidents)
526              Savage Nights (Les Nuits fauves)
947                         My Man Godfrey (1936)
936                              Ninotchka (1939)
Name: title, dtype: object