# [추천 시스템 입문편] Content-Based Recommendation 1

## 데이터 전처리

In [1]:
# Import the required packages
import pandas as pd
import numpy as np
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides any pandas warnings raised by later cells
# (e.g. when columns are assigned on the train/test splits) - confirm that
# a blanket filter is really intended.
warnings.filterwarnings('ignore')

In [2]:
# Load the data.
# The three input files live in the same folder; keeping that folder in a
# single constant makes relocating the dataset a one-line change.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/추천 시스템 입문편/data/ml-latest-small'

movies = pd.read_csv(f'{DATA_DIR}/movies.csv')
# NOTE: unpickling can execute arbitrary code - only load pickle files that
# you created yourself or otherwise trust.
ratings = pd.read_pickle(f'{DATA_DIR}/ratings_update.p')
genres = pd.read_pickle(f'{DATA_DIR}/genres.p')

In [3]:
# Peek at one randomly chosen row of the ratings table
ratings.sample()

Unnamed: 0,userId,movieId,rating,timestamp
41832,285,1193,5.0,2015-12-26 17:32:33


In [4]:
# Peek at one randomly chosen row of the genres table
genres.sample()

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
6390,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [5]:
# Attach the genre indicator columns to every rating.
# Inner join: ratings.movieId is matched against the index of `genres`
# (movieId), so only ratings whose movie has a genre row are kept.
ratings = pd.merge(
    ratings,
    genres,
    how='inner',
    left_on='movieId',
    right_index=True,
)
ratings.sample()

Unnamed: 0,userId,movieId,rating,timestamp,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
34662,232,61250,2.5,2010-08-30 01:59:28,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Turn the 0 entries of the genre indicator columns into NaN, so that later
# mean() calls skip the genres a movie does not belong to.
# Only the genre columns are touched: replacing 0 across the whole frame
# would also blank out a userId, movieId or rating that happened to be 0.
genre_flag_cols = genres.columns
ratings[genre_flag_cols] = ratings[genre_flag_cols].replace(0, np.nan)
ratings.sample()

Unnamed: 0,userId,movieId,rating,timestamp,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
63239,414,2804,4.0,2000-06-20 15:58:13,,,,,1.0,1.0,...,,,,,,,,,,


In [7]:
# Import the helper used to split the data into a set for training the model
# and a set for testing its performance
from sklearn.model_selection import train_test_split

In [8]:
# Hold out 10% of the rated rows as the test set; the fixed seed keeps the
# random split reproducible across runs.
train, test = train_test_split(ratings, test_size=0.1, random_state=42)

In [9]:
# Report (rows, columns) of the training split, then of the test split
for split in (train, test):
    print(split.shape)

(90755, 24)
(10084, 24)


In [10]:
# Build the item profile (identical to the genres table):
# each row shows which genres an item has.
# Genres of Harry Potter (movieId 4896)
genres.loc[4896]

(no genres listed)    0
Action                0
Adventure             1
Animation             0
Children              1
Comedy                0
Crime                 0
Documentary           0
Drama                 0
Fantasy               1
Film-Noir             0
Horror                0
IMAX                  0
Musical               0
Mystery               0
Romance               0
Sci-Fi                0
Thriller              0
War                   0
Western               0
Name: 4896, dtype: int64

## 유저 프로필 만들기

In [11]:
# Collect the genre names (the column labels of the genres table)
genre_cols = genres.columns
genre_cols

Index(['(no genres listed)', 'Action', 'Adventure', 'Animation', 'Children',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
       'War', 'Western'],
      dtype='object')

In [13]:
# For every rated movie, turn each genre indicator (1.0 or NaN) into the
# rating the user gave: genres the movie belongs to now hold the rating,
# all other genres stay NaN.
#
# Work on an explicit copy first: `train` comes out of train_test_split, and
# writing columns into such a result can trigger pandas' chained-assignment
# warning, which the global warning filter would silently hide.
train = train.copy()

# One vectorized, row-aligned multiplication replaces the per-column loop.
train[genre_cols] = train[genre_cols].mul(train['rating'], axis=0)

In [14]:
# Show the first five rows of the training data
train.head()

Unnamed: 0,userId,movieId,rating,timestamp,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
25794,177,112290,2.5,2015-07-02 02:35:34,,,,,,,...,,,,,,,,,,
49800,318,127114,4.0,2015-11-18 18:24:36,,,,,,,...,,,,,,,,,,
49156,318,2580,4.0,2012-01-06 17:53:15,,,,,,4.0,...,,,,,,,,,,
100240,610,52328,4.0,2016-11-19 07:54:45,,,4.0,,,,...,,,,,,,4.0,4.0,,
16435,105,5785,3.5,2015-11-06 01:27:03,,3.5,,,,3.5,...,,,,,,,,,,


In [15]:
# Group by user and average one genre column.
# NaN entries are skipped by mean(), so this is each user's average rating
# over the Action items they rated.
train['Action'].groupby(train['userId']).mean()

userId
1       4.303797
2       4.125000
3       3.571429
4       3.428571
5       3.111111
          ...   
607     3.666667
608     3.338710
609     3.100000
610     3.607219
1000    5.000000
Name: Action, Length: 611, dtype: float64

In [19]:
# User profile: one row per user and one column per genre, holding the
# per-user mean of that genre column in the training data.
ratings_by_user = train.groupby('userId')
user_profile = ratings_by_user[genre_cols].mean()
user_profile.sample()

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
478,,3.0,2.75,3.0,3.25,2.714286,3.5,,2.25,3.25,,0.5,4.5,3.75,2.5,2.5,2.5,1.625,1.5,1.5


In [20]:
# Look up my own user profile (userId 1000)
user_profile.loc[1000]

(no genres listed)    NaN
Action                5.0
Adventure             5.0
Animation             NaN
Children              NaN
Comedy                NaN
Crime                 NaN
Documentary           NaN
Drama                 5.0
Fantasy               NaN
Film-Noir             NaN
Horror                NaN
IMAX                  5.0
Musical               NaN
Mystery               NaN
Romance               NaN
Sci-Fi                5.0
Thriller              5.0
War                   NaN
Western               NaN
Name: 1000, dtype: float64

## 샘플 유저의 평점 예측

In [26]:
# Take the test row with index label 8313; its userId is used as the
# sample user below
sample = test.loc[8313]

In [27]:
# The sample user's id (user 57)
sample_user = sample['userId']
sample_user

57

In [30]:
# Fetch the profile of the sample user (user 57).
# It shows the user's movie taste, genre by genre.
sample_user_profile = user_profile.loc[sample_user]
sample_user_profile

(no genres listed)         NaN
Action                3.250000
Adventure             3.430894
Animation             3.866667
Children              3.435897
Comedy                3.407609
Crime                 3.459016
Documentary           4.000000
Drama                 3.535484
Fantasy               3.500000
Film-Noir             4.090909
Horror                2.769231
IMAX                  3.750000
Musical               3.840000
Mystery               3.806452
Romance               3.389474
Sci-Fi                3.227848
Thriller              3.150442
War                   4.129032
Western               3.625000
Name: 57, dtype: float64

In [34]:
# Genres of the sample movie (Godzilla): print its movieId, then show its
# genre indicator values
print(sample['movieId'])
sample[genre_cols]

1882


(no genres listed)    NaN
Action                1.0
Adventure             NaN
Animation             NaN
Children              NaN
Comedy                NaN
Crime                 NaN
Documentary           NaN
Drama                 NaN
Fantasy               NaN
Film-Noir             NaN
Horror                NaN
IMAX                  NaN
Musical               NaN
Mystery               NaN
Romance               NaN
Sci-Fi                1.0
Thriller              1.0
War                   NaN
Western               NaN
Name: 8313, dtype: object

In [33]:
# Look up movieId 1882 in the movies table
movies[movies['movieId'] == 1882]

Unnamed: 0,movieId,title,genres
1373,1882,Godzilla (1998),Action|Sci-Fi|Thriller


In [36]:
# Towards user 57's expected rating for one specific movie (Godzilla):
# multiply the user's per-genre profile by the movie's genre indicators.
# Genres the movie does not have are NaN and therefore stay NaN.
sample_user_profile * sample[genre_cols]

(no genres listed)         NaN
Action                    3.25
Adventure                  NaN
Animation                  NaN
Children                   NaN
Comedy                     NaN
Crime                      NaN
Documentary                NaN
Drama                      NaN
Fantasy                    NaN
Film-Noir                  NaN
Horror                     NaN
IMAX                       NaN
Musical                    NaN
Mystery                    NaN
Romance                    NaN
Sci-Fi                3.227848
Thriller              3.150442
War                        NaN
Western                    NaN
dtype: object

In [37]:
# Predicted rating of user 57 for Godzilla: the mean of the non-NaN
# products, i.e. of the user's profile values for the movie's genres
(sample_user_profile * sample[genre_cols]).mean()

3.2094301930473095

## 전체 유저의 평점 예측

In [38]:
# Extend the approach to the whole test set.
# First print only the first (index, row) pair produced by iterrows() to see
# what each iteration yields, then stop.
for idx, row in test.iterrows():
    print(idx, row)
    break

8313 userId                                 57
movieId                              1882
rating                                2.0
timestamp             2000-08-09 05:30:10
(no genres listed)                    NaN
Action                                1.0
Adventure                             NaN
Animation                             NaN
Children                              NaN
Comedy                                NaN
Crime                                 NaN
Documentary                           NaN
Drama                                 NaN
Fantasy                               NaN
Film-Noir                             NaN
Horror                                NaN
IMAX                                  NaN
Musical                               NaN
Mystery                               NaN
Romance                               NaN
Sci-Fi                                1.0
Thriller                              1.0
War                                   NaN
Western                               NaN
Name: 8313, dtype: object

In [40]:
# Progress bar for monitoring long-running for loops.
# `tqdm.tqdm_notebook` is a deprecated wrapper; import the notebook progress
# bar class directly and keep the old name so existing calls work unchanged.
from tqdm.notebook import tqdm as tqdm_notebook

In [44]:
# Predict a rating for every test row in one vectorized pass instead of a
# Python-level iterrows() loop.
#
# reindex() fetches the profile of each row's user, in test-row order.
# Unlike user_profile.loc[user], it yields an all-NaN profile (rather than
# raising KeyError) for a user that has no row in user_profile.
test_profiles = user_profile.reindex(index=test['userId'], columns=genre_cols)

# Element-wise product of the user's profile and the movie's genre
# indicators: only the genres the movie belongs to keep a value. Plain arrays
# are used so the two tables are combined by position, not by their
# differing index labels.
genre_scores = (
    test_profiles.to_numpy(dtype=float)
    * test[genre_cols].to_numpy(dtype=float)
)

# Row-wise mean over the non-NaN genres; a row without any usable genre
# stays NaN. Kept as a plain list with one value per test row, in test order.
predict = pd.DataFrame(genre_scores, index=test.index).mean(axis=1).tolist()

0it [00:00, ?it/s]

In [45]:
# Store the predictions next to the true ratings.
# assign() returns a new DataFrame instead of writing a column into `test`
# itself; `test` is a train_test_split result, and in-place column writes on
# it can trigger pandas' chained-assignment warning.
test = test.assign(predict=predict)

In [46]:
# Show the predicted ratings
test['predict']

8313      3.209430
3424      3.229751
33637     4.052521
100232    3.665410
77733     3.786194
            ...   
39614     3.272912
49446     3.745655
48014     2.803571
44225     2.853662
64367     3.375285
Name: predict, Length: 10084, dtype: float64

In [47]:
# Cold-start cases exist: list the test rows for which no prediction could
# be computed (predict is NaN)
test[test['predict'].isnull()]

Unnamed: 0,userId,movieId,rating,timestamp,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,predict
36355,248,122896,4.0,2018-08-18 11:34:18,1.0,,,,,,...,,,,,,,,,,
36369,248,171011,3.5,2018-08-22 15:15:54,,,,,,,...,,,,,,,,,,
76223,479,5643,3.0,2002-12-08 17:11:09,,,,,,,...,,,,,,,,,,
60265,388,1203,1.0,2013-12-10 16:48:52,,,,,,,...,,,,,,,,,,
72418,467,2394,2.0,1999-02-22 08:02:39,,,,1.0,,,...,,,1.0,,,,,,,
48086,312,1289,5.0,2003-01-21 19:26:40,,,,,,,...,,,,,,,,,,
14881,95,162,5.0,2003-01-23 16:45:11,,,,,,,...,,,,,,,,,,
96017,602,246,3.0,1996-08-24 08:43:48,,,,,,,...,,,,,,,,,,
3624,21,122896,4.0,2016-01-06 03:49:23,1.0,,,,,,...,,,,,,,,,,
22163,147,596,3.0,2008-02-17 17:02:10,,,,1.0,1.0,,...,,,1.0,,,,,,,


In [48]:
# Cold-start fallback: wherever no prediction could be made (NaN), use the
# global mean rating of the training set instead.
global_mean = train['rating'].mean()
test['predict'] = test['predict'].fillna(global_mean)

In [49]:
# Check again: list the rows whose prediction is still NaN
test[test['predict'].isnull()]

Unnamed: 0,userId,movieId,rating,timestamp,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,predict


## 모델 평가

In [50]:
# Evaluate with RMSE (root mean squared error)
from sklearn.metrics import mean_squared_error

In [51]:
# RMSE: square root of the mean squared error between the true test ratings
# and the predicted ones.
mse = mean_squared_error(y_true=test['rating'], y_pred=test['predict'])
rmse = np.sqrt(mse)
rmse

0.9078973553755864