# Surprise 모듈

In [1]:
!pip install scikit-surprise



You should consider upgrading via the 'c:\users\mylaptop\anaconda3\python.exe -m pip install --upgrade pip' command.


In [2]:
import pandas as pd
# SVD 말그대로 SVD를 통한 행렬 분해를 시키는 함수 -> 객체는 predict, fit, test
# Dataset은 surprise 모듈에서 사용가능하도록 데이터를 불러오는 함수
# accuracy는 정확도를 구하는 함수
# Reader는 데이터를 읽을 때 어떻게 읽을지를 명시하는 함수 
from surprise import SVD, Dataset, accuracy, Reader
from surprise.model_selection import train_test_split

In [3]:
ratings = pd.read_csv('./data/ratings.csv')
# print(ratings)
reader = Reader(rating_scale=(1.0, 5.0))
data = Dataset.load_from_df(df=ratings[['userId', 'movieId', 'rating']], reader=reader)
print(data)

<surprise.dataset.DatasetAutoFolds object at 0x000002177A23C588>


In [4]:
train, test = train_test_split(data, test_size=0.2, shuffle=True)

# 분해한 행렬을 연산했을 때 trainset 데이터 (평점) 에 가깝도록 행렬을 분해
algo = SVD(n_factors=50, n_epochs=20)
algo.fit(trainset=train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2177a23cc08>

In [5]:
pred = algo.test(testset=test)
print(pred[0:3])
accuracy.rmse(predictions=pred)

[Prediction(uid=457, iid=133295, r_ui=2.5, est=2.5523303190675755, details={'was_impossible': False}), Prediction(uid=15, iid=1186, r_ui=4.5, est=3.269564033754299, details={'was_impossible': False}), Prediction(uid=185, iid=2054, r_ui=3.0, est=2.8671077314509965, details={'was_impossible': False})]
RMSE: 0.8929


0.8928863264486003

In [6]:
# 분해된 행렬을 연산을 해서 예측값을 도출해냄
pred = algo.predict('10', '200')
print(pred)

user: 10         item: 200        r_ui = None   est = 3.54   {'was_impossible': False}


In [7]:
# 10번 유저에 대해서 모든 영화에 부여할 평점을 예측하고,
# 그 평점을 높은 순으로 정렬
preds = []
for i in ratings['movieId'].unique():
    pred = algo.predict('10', i)
    preds.append((pred.est, pred.iid))
print(preds)

[(3.3169302587287137, 31), (3.725483611974276, 1029), (3.5100566579147965, 1061), (3.516912550184294, 1129), (4.251897910377387, 1172), (3.9372820351207807, 1263), (4.0996473126014354, 1287), (4.081480056421271, 1293), (3.47217208953, 1339), (3.8760515077198057, 1343), (3.11997700616169, 1371), (3.3091616064886726, 1405), (4.022551996345319, 1953), (3.601903706210456, 2105), (3.740262327197306, 2150), (3.3660015078677645, 2193), (3.3002103776285856, 2294), (3.490648364961256, 2455), (3.8298208943480696, 2968), (4.048945397557032, 3671), (3.434036409319785, 10), (3.910292473933878, 17), (3.6421238791893282, 39), (4.023834630520165, 47), (4.277117684031811, 50), (3.602700710933811, 52), (3.759224585680357, 62), (3.9472392681113706, 110), (3.4755041061984824, 144), (3.8896098296006745, 150), (2.879176252679574, 153), (3.7668489931450826, 161), (3.473487262234772, 165), (3.288568048457486, 168), (3.2034911063958935, 185), (2.972944486278049, 186), (2.875326148285284, 208), (3.8523205270689

In [8]:
preds.sort(reverse=True)
print(preds)

[(4.481338288948747, 318), (4.431617418126759, 969), (4.425304471736006, 3462), (4.396731220920688, 858), (4.3920675419709525, 913), (4.382269094105942, 905), (4.377487553223131, 899), (4.369915195663625, 1945), (4.365709752291098, 904), (4.360323293438823, 1203), (4.344355993183172, 1254), (4.337804418564238, 1221), (4.3371640602055175, 2064), (4.332107954132847, 1228), (4.329895007605258, 7502), (4.326278216887134, 1219), (4.321495546548946, 908), (4.31933611806053, 926), (4.31686216449069, 898), (4.306373679981026, 2019), (4.3038014216707845, 527), (4.300585633878477, 48516), (4.295894358938502, 1193), (4.295464012520689, 6016), (4.294980514358085, 2318), (4.292927795125166, 1060), (4.292486137999258, 1299), (4.2901635838151435, 912), (4.28692525774883, 1212), (4.286520145664559, 2692), (4.281496753174796, 1252), (4.277117684031811, 50), (4.270292209767036, 2542), (4.270146561671668, 3035), (4.262316666607426, 750), (4.259409066406543, 608), (4.259122589869785, 745), (4.254892165874

In [9]:
movies = pd.read_csv('./data/movies.csv')
print(movies[movies['movieId']==858])

     movieId                  title       genres
695      858  Godfather, The (1972)  Crime|Drama


# 유튜브 추천 시스템

In [10]:
# x값으로 사용자 히스토리 (시청한 내역, 좋아요, 싫어요) 를 넣으면,
# y값으로 해당 사용자가 다음으로 볼 영상의 인덱스
import pandas as pd
movies = pd.read_csv('./data/movies.csv')
ratings = pd.read_csv('./data/ratings.csv')

movie_ratings = pd.merge(movies, ratings, on='movieId')
print(movie_ratings)

        movieId                                              title  \
0             1                                   Toy Story (1995)   
1             1                                   Toy Story (1995)   
2             1                                   Toy Story (1995)   
3             1                                   Toy Story (1995)   
4             1                                   Toy Story (1995)   
...         ...                                                ...   
99999    161944              The Last Brickmaker in America (2001)   
100000   162376                                    Stranger Things   
100001   162542                                      Rustom (2016)   
100002   162672                                Mohenjo Daro (2016)   
100003   163949  The Beatles: Eight Days a Week - The Touring Y...   

                                             genres  userId  rating  \
0       Adventure|Animation|Children|Comedy|Fantasy       7     3.0   
1       Adventure

In [11]:
import numpy as np
movie_ratings['movie_type'] = np.where(movie_ratings['rating'] >= 3, 'like', 'dislike')
print(movie_ratings)

        movieId                                              title  \
0             1                                   Toy Story (1995)   
1             1                                   Toy Story (1995)   
2             1                                   Toy Story (1995)   
3             1                                   Toy Story (1995)   
4             1                                   Toy Story (1995)   
...         ...                                                ...   
99999    161944              The Last Brickmaker in America (2001)   
100000   162376                                    Stranger Things   
100001   162542                                      Rustom (2016)   
100002   162672                                Mohenjo Daro (2016)   
100003   163949  The Beatles: Eight Days a Week - The Touring Y...   

                                             genres  userId  rating  \
0       Adventure|Animation|Children|Comedy|Fantasy       7     3.0   
1       Adventure

In [12]:
# userId, movieId 를 정수 인코딩
user_ids = movie_ratings["userId"].unique().tolist()
user_to_index = {x: i for i, x in enumerate(user_ids)}
index_to_user = {i: x for i, x in enumerate(user_ids)}

movie_ids = movie_ratings["movieId"].unique().tolist()
movie_to_index = {x: i for i, x in enumerate(movie_ids)}
index_to_movie = {i: x for i, x in enumerate(movie_ids)}

print(user_to_index)
print(movie_to_index)

{7: 0, 9: 1, 13: 2, 15: 3, 19: 4, 20: 5, 23: 6, 26: 7, 30: 8, 37: 9, 43: 10, 44: 11, 47: 12, 48: 13, 55: 14, 56: 15, 63: 16, 67: 17, 68: 18, 69: 19, 70: 20, 72: 21, 73: 22, 75: 23, 77: 24, 79: 25, 84: 26, 86: 27, 87: 28, 89: 29, 90: 30, 91: 31, 92: 32, 93: 33, 94: 34, 97: 35, 99: 36, 100: 37, 106: 38, 112: 39, 119: 40, 120: 41, 121: 42, 122: 43, 124: 44, 125: 45, 126: 46, 128: 47, 130: 48, 134: 49, 136: 50, 138: 51, 142: 52, 146: 53, 149: 54, 150: 55, 152: 56, 153: 57, 154: 58, 157: 59, 163: 60, 164: 61, 165: 62, 168: 63, 169: 64, 173: 65, 175: 66, 176: 67, 177: 68, 178: 69, 179: 70, 184: 71, 185: 72, 187: 73, 193: 74, 200: 75, 201: 76, 205: 77, 212: 78, 213: 79, 215: 80, 219: 81, 220: 82, 224: 83, 232: 84, 237: 85, 240: 86, 241: 87, 242: 88, 247: 89, 252: 90, 253: 91, 261: 92, 262: 93, 268: 94, 272: 95, 273: 96, 275: 97, 282: 98, 283: 99, 284: 100, 285: 101, 287: 102, 292: 103, 293: 104, 294: 105, 306: 106, 311: 107, 312: 108, 313: 109, 314: 110, 318: 111, 321: 112, 324: 113, 328: 114

In [13]:
movie_ratings['userId'] = movie_ratings['userId'].map(user_to_index)
movie_ratings['movieId'] = movie_ratings['movieId'].map(movie_to_index)
print(movie_ratings)

        movieId                                              title  \
0             0                                   Toy Story (1995)   
1             0                                   Toy Story (1995)   
2             0                                   Toy Story (1995)   
3             0                                   Toy Story (1995)   
4             0                                   Toy Story (1995)   
...         ...                                                ...   
99999      9061              The Last Brickmaker in America (2001)   
100000     9062                                    Stranger Things   
100001     9063                                      Rustom (2016)   
100002     9064                                Mohenjo Daro (2016)   
100003     9065  The Beatles: Eight Days a Week - The Touring Y...   

                                             genres  userId  rating  \
0       Adventure|Animation|Children|Comedy|Fantasy       0     3.0   
1       Adventure

In [14]:
# 사용자 별로 어떤 영상을 좋아했고, 싫어했고, 시청했는지
movie_list = movie_ratings.groupby(['userId', 'movie_type'])['movieId'].apply(list).reset_index()
title_list = movie_ratings.groupby(['userId'])['movieId'].apply(list).reset_index()
print(movie_list)
print(title_list)

      userId movie_type                                            movieId
0          0    dislike         [174, 282, 610, 617, 618, 650, 1113, 1143]
1          0       like  [0, 9, 20, 30, 32, 38, 96, 100, 102, 122, 130,...
2          1    dislike                      [615, 1231, 1815, 1936, 2035]
3          1       like  [0, 16, 25, 34, 45, 284, 444, 461, 472, 478, 5...
4          2    dislike                                   [45, 1019, 3427]
...      ...        ...                                                ...
1281     668       like  [856, 941, 1154, 1185, 1269, 1285, 1288, 1289,...
1282     669    dislike         [1454, 1472, 1473, 1568, 1587, 1685, 1741]
1283     669       like  [1448, 1450, 1453, 1456, 1458, 1462, 1480, 148...
1284     670    dislike               [1765, 1817, 1836, 1865, 1894, 1895]
1285     670       like  [1763, 1798, 1802, 1825, 1832, 1833, 1842, 184...

[1286 rows x 3 columns]
     userId                                            movieId
0         0 

In [15]:
# 사용자 별로 어떤 영상을 좋아했고, 싫어했는지
user_movie_list = movie_list.pivot(index='userId', columns='movie_type', values='movieId').reset_index()
print(user_movie_list)

movie_type  userId                                            dislike  \
0                0         [174, 282, 610, 617, 618, 650, 1113, 1143]   
1                1                      [615, 1231, 1815, 1936, 2035]   
2                2                                   [45, 1019, 3427]   
3                3  [0, 1, 10, 13, 18, 21, 34, 37, 49, 58, 65, 87,...   
4                4  [22, 60, 145, 151, 156, 158, 185, 238, 248, 33...   
..             ...                                                ...   
666            666                           [1302, 1906, 2161, 2181]   
667            667                [954, 2884, 4066, 4157, 4336, 4378]   
668            668                           [1197, 1257, 1332, 1354]   
669            669         [1454, 1472, 1473, 1568, 1587, 1685, 1741]   
670            670               [1765, 1817, 1836, 1865, 1894, 1895]   

movie_type                                               like  
0           [0, 9, 20, 30, 32, 38, 96, 100, 102, 122, 130,.

In [16]:
# 어떤 사용자가 아무런 영상도 보지 않았다면? -> NaN
# 결측치 값을 채워줄건데 -> dummy 영화id -> 9066
# 리스트 형태로 행들을 맞춰줄건데... fillna 함수는 list 형태로 채워줄 수가 없음...
# 결측치 값을 9066, lambda를 써서 [9066]
user_movie_list.fillna(9066, inplace=True)
user_movie_list['like'] = user_movie_list['like'].apply(lambda x: x if type(x) is list else [x])
user_movie_list['dislike'] = user_movie_list['dislike'].apply(lambda x: x if type(x) is list else [x])

In [17]:
user_final_list = pd.merge(user_movie_list, title_list, how='left')
print(user_final_list)

     userId                                            dislike  \
0         0         [174, 282, 610, 617, 618, 650, 1113, 1143]   
1         1                      [615, 1231, 1815, 1936, 2035]   
2         2                                   [45, 1019, 3427]   
3         3  [0, 1, 10, 13, 18, 21, 34, 37, 49, 58, 65, 87,...   
4         4  [22, 60, 145, 151, 156, 158, 185, 238, 248, 33...   
..      ...                                                ...   
666     666                           [1302, 1906, 2161, 2181]   
667     667                [954, 2884, 4066, 4157, 4336, 4378]   
668     668                           [1197, 1257, 1332, 1354]   
669     669         [1454, 1472, 1473, 1568, 1587, 1685, 1741]   
670     670               [1765, 1817, 1836, 1865, 1894, 1895]   

                                                  like  \
0    [0, 9, 20, 30, 32, 38, 96, 100, 102, 122, 130,...   
1    [0, 16, 25, 34, 45, 284, 444, 461, 472, 478, 5...   
2    [0, 100, 249, 266, 284, 321,

In [18]:
# x값(유저 히스토리) -> y값(유저가 다음으로 볼 영상의 인덱스값)
# 사용자가 마지막으로 좋아요를 누른 영화를 빼내서 y값으로 지정
user_final_list['predict_labels'] = user_final_list['like'].apply(lambda x: (x[-1]))
user_final_list['like'] = user_final_list['like'].apply(lambda x: (x[:-1]))
print(user_final_list)

     userId                                            dislike  \
0         0         [174, 282, 610, 617, 618, 650, 1113, 1143]   
1         1                      [615, 1231, 1815, 1936, 2035]   
2         2                                   [45, 1019, 3427]   
3         3  [0, 1, 10, 13, 18, 21, 34, 37, 49, 58, 65, 87,...   
4         4  [22, 60, 145, 151, 156, 158, 185, 238, 248, 33...   
..      ...                                                ...   
666     666                           [1302, 1906, 2161, 2181]   
667     667                [954, 2884, 4066, 4157, 4336, 4378]   
668     668                           [1197, 1257, 1332, 1354]   
669     669         [1454, 1472, 1473, 1568, 1587, 1685, 1741]   
670     670               [1765, 1817, 1836, 1865, 1894, 1895]   

                                                  like  \
0    [0, 9, 20, 30, 32, 38, 96, 100, 102, 122, 130,...   
1    [0, 16, 25, 34, 45, 284, 444, 461, 472, 478, 5...   
2    [0, 100, 249, 266, 284, 321,

In [19]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dense, Concatenate, BatchNormalization, ReLU
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [20]:
# 패딩해줍시다!
pm = pad_sequences(user_final_list['movieId'])
pl = pad_sequences(user_final_list['like'])
pd = pad_sequences(user_final_list['dislike'])

In [21]:
# 패딩된 길이대로 shape 지정
input_title = Input(shape=(2391, ))
input_liked = Input(shape=(1819, ))
input_disliked = Input(shape=(843, ))

# 영화 제목은 9067개 (movieId 0~9065, 결측치값 9066)
# 좋아하거나 싫어할 수 있는 영화도 9067개 (movieId 0~9065, 결측치값 9066)
# 혹시나 이것이 다른 경우가 있을 수도 있기에 임베딩 레이어를 각각 만들어줬습니다!
features_embedding_layer = Embedding(9067, 16, mask_zero=True)
labels_embedding_layer = Embedding(9067, 16, mask_zero=True)

dense_1 = Dense(units=48, activation='relu')
dense_2 = Dense(units=48, activation='relu')
dense_3 = Dense(units=48, activation='relu')

# 총 9067개의 y값이 존재할 수 있으므로 (movieId 0~9065, 결측치값 9066)
dense_output = Dense(9067, activation='softmax')

In [22]:
# reduce mean axis=1 을 하면 데이터길이 축을 따라서 평균을 낸다
# 그러면 (671, 2391, 16) = (데이터개수, 데이터길이, 임베딩된 차원)
# 에서 (671, 16) = 각 데이터들이 16짜리 임베딩 벡터로 표현된 것
# 각 유저에 대해서 좋아하는 영화끼리 평균 / 싫어하는 영화끼리 평균 / 시청한 영화끼리 평균한 것
features_embeddings = features_embedding_layer(input_title)
avg_features = tf.reduce_mean(features_embeddings, axis=1)

labels_liked_embeddings = labels_embedding_layer(input_liked)
avg_liked = tf.reduce_mean(labels_liked_embeddings, axis=1)

labels_disliked_embeddings = labels_embedding_layer(input_disliked)
avg_disliked = tf.reduce_mean(labels_disliked_embeddings, axis=1)

# 세 개 다 (671, 16) 형태로 reduce_mean 되었으니 결합하면 (671, 48) 이 될 것!
concat_inputs = Concatenate()([avg_features, avg_liked, avg_disliked])

dense_1_out = dense_1(concat_inputs)
dense_2_out = dense_2(dense_1_out)
dense_3_out = dense_3(dense_2_out)
dense_3_batch_norm = BatchNormalization()(dense_3_out)
outputs = dense_output(dense_3_batch_norm)

In [23]:
model = Model([input_title, input_liked, input_disliked], outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])
model.fit([pm, pl, pd], user_final_list['predict_labels'].values, epochs=100)

Train on 671 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100


Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x21774c5c348>

In [24]:
# 모델 구조도
tf.keras.utils.plot_model(model, show_shapes=True, show_layer_names=True)

Failed to import pydot. You must install pydot and graphviz for `pydotprint` to work.
