# 추천 - 협력 필터링 (Collaborative Filtering)

### 추천을 위한 파이썬 패키지 사용

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import surprise

# 설치 에러 해결 - 선생님과 함께

In [2]:
# 데이터 준비 1.
from surprise import Dataset
data = Dataset.load_builtin("ml-100k")

In [3]:
# 데이터 준비 2.
from surprise import Reader     # 데이터를 읽을 때 읽는 방법을 설정하는 클래스

ratings_small = pd.read_csv('data-files/ml-latest-small/ratings.csv')
data2 = Dataset.load_from_df(ratings_small[['userId', 'movieId', 'rating']], 
                             Reader(rating_scale=(0.5, 5)))

In [4]:
ratings_small['rating'].agg(['min', 'max'])

min    0.5
max    5.0
Name: rating, dtype: float64

In [5]:
# 데이터 준비 3.
from surprise import Reader     # 데이터를 읽을 때 읽는 방법을 설정하는 클래스

data3 = Dataset.load_from_file('data-files/ml-latest-small/ratings.csv', 
                                Reader(rating_scale=(0.5, 5), sep=',', skip_lines=1))

In [6]:
# from sklearn.model_selection import train_test_split
from surprise.model_selection import train_test_split

In [7]:
trainset, testset = train_test_split(data3, random_state=42)

In [8]:
from surprise import KNNBasic   # 기본: 코사인 유사도

knn_model = KNNBasic(sim_options={'name':"cosine", 'user_based':False})
# 유사도는 코사인, 사용자기반 false = 아이템 기반

In [9]:
knn_model.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x206082e3590>

In [10]:
# 개별 값 예측

print( testset[:3] )
print( knn_model.predict('140', '6765') )   # est=3.425 예측 값
print( knn_model.predict('603', '290') )

[('140', '6765', 3.5), ('603', '290', 4.0), ('438', '5055', 4.0)]
user: 140        item: 6765       r_ui = None   est = 3.42   {'actual_k': 40, 'was_impossible': False}
user: 603        item: 290        r_ui = None   est = 3.55   {'actual_k': 40, 'was_impossible': False}


In [11]:
# 다수 예측
predictions = knn_model.test(testset)
predictions[:3]
# 'was_impossible': 계산 못함 False > 계산 해냈다는 뜻

[Prediction(uid='140', iid='6765', r_ui=3.5, est=3.425, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='603', iid='290', r_ui=4.0, est=3.55, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='438', iid='5055', r_ui=4.0, est=3.1625, details={'actual_k': 40, 'was_impossible': False})]

In [12]:
from surprise import accuracy

print( accuracy.mae(predictions = predictions) )
print( accuracy.rmse(predictions = predictions) )

MAE:  0.7615
0.7614697475628528
RMSE: 0.9800
0.97995665962038


In [13]:
movies = pd.read_csv('data-files/ml-latest-small/movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [14]:
all_movie_id = ratings_small['movieId'].unique()
# print(all_movie_id)
# print(ratings_small['userId'].unique())
# print((ratings_small['userId'] == 42).sum())
rated_movies = ratings_small[ratings_small['userId'] == 42]['movieId'].values
# 42번 사용자가 평점 매긴 영화만
# len(rated_movies)

# 42번 사용자가 추천하지 않은 모든 영화에 대한 예상 평점 계산
predictions_of_user42 = []
for movie_id in all_movie_id:
    if movie_id not in rated_movies:
        prediction = knn_model.predict("42", str(movie_id))
        predictions_of_user42.append(prediction)

In [15]:
# 리스트 정렬
a = [10, 4, 7, 22, 9]
a.sort()    # 리스트를 sort하면 정렬해 줌 (오름차순 정렬)
a.reverse() # 그 정렬을 뒤집기 (내림차순 정렬)
a   

[22, 10, 9, 7, 4]

In [16]:
len(predictions_of_user42)
# 위 결과를 평점 기준 정렬 (내림차순)
predictions_of_user42.sort()
# Prediction uid iid r_ui est 이 중 뭘로 정렬할지 문제 발생, 단일 값만 있으면 문제 없다

In [17]:
predictions_of_user42[-10:]
# 정렬 기준이 여러개라서 우리가 원하는데로 정렬되지 않음

[Prediction(uid='42', iid='99750', r_ui=None, est=3.95, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='42', iid='99764', r_ui=None, est=4.275, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='42', iid='998', r_ui=None, est=3.675, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='42', iid='99813', r_ui=None, est=3.625, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='42', iid='99846', r_ui=None, est=3.7, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='42', iid='99853', r_ui=None, est=3.909090909090909, details={'actual_k': 22, 'was_impossible': False}),
 Prediction(uid='42', iid='999', r_ui=None, est=3.475, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='42', iid='99910', r_ui=None, est=4.05, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='42', iid='99917', r_ui=None, est=3.625, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid

In [18]:
# 위 결과를 평점 기준 정렬 (내림차순)
predictions_of_user42.sort(key = lambda v: v.est, reverse=True)
# 단일 값이 아닌 경우 sort함수에 기준(est) 지정해야 함.
# 정렬할 때 뒤집기 옵션 True로 해서 내림차순 정렬

In [19]:
# predictions_of_user42[:10]
# est 기준 상위 10개

top_10_movies = []
for p in predictions_of_user42[:10]:
    movie = movies[movies['movieId'] == int(p.iid)]
    # 아까 movieid를 str로 바꿨는데, 형이 안 맞으면 empty 됨
    top_10_movies.append(movie['title'].values[0])

top_10_movies

['One I Love, The (2014)',
 'Laggies (2014)',
 'Annabelle (2014)',
 'Delirium (2014)',
 'Deathgasm (2015)',
 'A Street Cat Named Bob (2016)',
 'Alvarez Kelly (1966)',
 'Jungle Book 2, The (2003)',
 'Cinderella (1997)',
 'Young Victoria, The (2009)']

In [52]:
# 데이터 준비 2.
from surprise import Reader     # 데이터를 읽을 때 읽는 방법을 설정하는 클래스

movies_small = pd.read_csv('data-files/ml-latest-small/movies.csv')
ratings_small = pd.read_csv('data-files/ml-latest-small/ratings.csv')
data2 = Dataset.load_from_df(ratings_small[['userId', 'movieId', 'rating']], 
                             Reader(rating_scale=(0.5, 5)))

In [53]:
trainset = data.build_full_trainset()
testset = trainset.build_testset

In [54]:
len(testset), testset[0]

TypeError: object of type 'method' has no len()

In [None]:
# 모델 훈련 (학습)
from surprise import SVD

svd = SVD(n_factors = 100, n_epochs = 20, random_state=42) # 잠재요인 몇개로 할까?

svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x20622737d40>

In [None]:
# 26번 사용자가 시청하지 않은 영화 평점 예측

print( ratings_small[ratings_small['userId'] == 26].shape )

uid_mask = ratings_small['userId'] == 26
ratings_small[uid_mask][['userId', 'movieId']] # 26번 사용자가 평점 부여한 영화 조회
t = ratings_small[~uid_mask][['userId', 'movieId']] # 26번 사용자가 평점 부여하지 않은 영화 조회
t['movieId'].value_counts() # 영화마다 평점 매겨진 횟수 조회

(21, 4)


movieId
356       328
318       317
296       306
593       278
2571      278
         ... 
86279       1
86922       1
5962        1
87660       1
163981      1
Name: count, Length: 9724, dtype: int64

In [62]:
def select_unrated_movies(user_id):
    all_movie_id = ratings['movieId'].unique() # 모든 영화 id
    uid_mask = ratings_small['userId'] == user_id
    rated_movies_ids = ratings[uid_mask]['movieId'].values # 평가한 영화 id
    unrated_movies = [ mid for mid in all_movie_id if mid not in rated_movies_ids ]
    return unrated_movies


In [63]:
print( select_unrated_movies(ratings_small, 26)[:10] )

TypeError: select_unrated_movies() takes 1 positional argument but 2 were given

In [None]:
movies_small[movies_small['movieId'] == 1]

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [None]:
svd.predict(26, 1)

Prediction(uid=26, iid=1, r_ui=None, est=3.52986, details={'was_impossible': False})

In [64]:
def get_movie_title(MOVIES, movie_id):
    movie_id_mask = movies['movieId']== movie_id
    return movies[movie_id_mask]['title'].values[0]

def recommend_movies(ratings, movies, uid, top_n=10):
    unrated_movie_ids = select_unrated_movies(ratings, user_id)
    # for m in unrated_movies:
    #     svd.predict(user_id, m)
    prdictions = [ svd.predict(user_id, movie_id) for movie_id in unrated_movie_ids ]
    prdictions.sort(key=lambda p: p.est, reverse=True) # 예상평점기준 내림차순 정렬

    top_n_predcitions = prdictions[:top_n]
    recommendations = [ (p.iid, get_movie_title(movies, p.iid)) for p in top_n_predcitions ]

    return recommendations

In [None]:
recommend_movies(ratings_small, movies_small, 26)

NameError: name 'user_id' is not defined

In [65]:
recommend_movies(ratings_small, movies_small, 26)

NameError: name 'user_id' is not defined

In [71]:
uid_mask = ratings_small['userId'] == 26
rated_movie_ids = ratings_small[uid_mask]['movieId']
[ movies_small[movies_small['movieId'] == movie_id]['title'] for movie_id in rated_movie_ids ]

[9    GoldenEye (1995)
 Name: title, dtype: object,
 32    Babe (1995)
 Name: title, dtype: object,
 43    Seven (a.k.a. Se7en) (1995)
 Name: title, dtype: object,
 123    Apollo 13 (1995)
 Name: title, dtype: object,
 126    Batman Forever (1995)
 Name: title, dtype: object,
 138    Die Hard: With a Vengeance (1995)
 Name: title, dtype: object,
 156    Net, The (1995)
 Name: title, dtype: object,
 176    Waterworld (1995)
 Name: title, dtype: object,
 192    Disclosure (1994)
 Name: title, dtype: object,
 249    Natural Born Killers (1994)
 Name: title, dtype: object,
 257    Pulp Fiction (1994)
 Name: title, dtype: object,
 260    Quiz Show (1994)
 Name: title, dtype: object,
 302    Ace Ventura: Pet Detective (1994)
 Name: title, dtype: object,
 307    Clear and Present Danger (1994)
 Name: title, dtype: object,
 314    Forrest Gump (1994)
 Name: title, dtype: object,
 337    True Lies (1994)
 Name: title, dtype: object,
 378    Cliffhanger (1993)
 Name: title, dtype: object,
 395  