In [11]:
import pickle
import pandas as pd
import numpy as np
import os, sys, gc 
from plotnine import *
import plotnine

from tqdm.notebook import tqdm as tqdm_notebook
import seaborn as sns
import warnings
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import matplotlib as mpl
from matplotlib import rc
import re
from matplotlib.ticker import PercentFormatter
import datetime
from math import log # IDF 계산을 위해

## Data

In [22]:
movie = pd.read_csv("./ratings.csv")
movie.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [10]:
movie["rating"].unique()

array([2.5, 3. , 2. , 4. , 3.5, 1. , 5. , 4.5, 1.5, 0.5])

In [19]:
user2idx = {}
for i, l in enumerate(movie['userId'].unique()):
    user2idx[l] = i

movie2idx = {}
for i, l in enumerate(movie['movieId'].unique()):
    movie2idx[l] = i

idx2user = {i: user for user, i in user2idx.items()}
idx2movie = {i: item for item, i in movie2idx.items()}
#user2idx.items()

In [11]:
movie["movieId"].unique()

array([  31, 1029, 1061, ...,  129, 4736, 6425], dtype=int64)

In [13]:
for i, j in enumerate(movie['userId'].unique()):
    print(i, j)

0 1
1 2
2 3
3 4
4 5
5 6
6 7
7 8
8 9
9 10
10 11
11 12
12 13
13 14
14 15
15 16
16 17
17 18
18 19
19 20
20 21
21 22
22 23
23 24
24 25
25 26
26 27
27 28
28 29
29 30
30 31
31 32
32 33
33 34
34 35
35 36
36 37
37 38
38 39
39 40
40 41
41 42
42 43
43 44
44 45
45 46
46 47
47 48
48 49
49 50
50 51
51 52
52 53
53 54
54 55
55 56
56 57
57 58
58 59
59 60
60 61
61 62
62 63
63 64
64 65
65 66
66 67
67 68
68 69
69 70
70 71
71 72
72 73
73 74
74 75
75 76
76 77
77 78
78 79
79 80
80 81
81 82
82 83
83 84
84 85
85 86
86 87
87 88
88 89
89 90
90 91
91 92
92 93
93 94
94 95
95 96
96 97
97 98
98 99
99 100
100 101
101 102
102 103
103 104
104 105
105 106
106 107
107 108
108 109
109 110
110 111
111 112
112 113
113 114
114 115
115 116
116 117
117 118
118 119
119 120
120 121
121 122
122 123
123 124
124 125
125 126
126 127
127 128
128 129
129 130
130 131
131 132
132 133
133 134
134 135
135 136
136 137
137 138
138 139
139 140
140 141
141 142
142 143
143 144
144 145
145 146
146 147
147 148
148 149
149 150
150 151
151 152
15

In [14]:
useridx = movie['useridx'] = movie['userId'].apply(lambda x: user2idx[x]).values
movieidx = movie['movieidx'] = movie['movieId'].apply(lambda x: movie2idx[x]).values
rating = movie['rating'].values

In [20]:
movie.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [23]:
n_users = movie['userId'].nunique()
n_items = movie['movieId'].nunique()
print(n_users, n_items)

671 9066


In [26]:
import scipy
ratings = scipy.sparse.csr_matrix((rating, (useridx, movieidx)), shape=(len(set(useridx)), len(set(movieidx))))

NameError: name 'rating' is not defined

## Model

In [24]:
import torch
import torch.nn.functional as F
from torch import nn
import torch.nn.init as weight_init

class MatrixFactorization(nn.Module):
    def __init__(self,R, n_users, n_items, n_factors=20):
        super().__init__() # 부모 클래스(torch.nn.Module)의 init을 불러옴 
        self.user_factors = nn.Embedding(n_users, n_factors)
        self.item_factors = nn.Embedding(n_items, n_factors)
        
        # weight 초기화 
        weight_init.xavier_uniform_(self.user_factors.weight)
        weight_init.xavier_uniform_(self.item_factors.weight)
       
        # original Matrix 
        self.R = R
        
    def forward(self, user, item):
        pred = (self.user_factors(user) * self.item_factors(item)).sum(1)
        return pred
    
    def complete_matrix(self):
        return torch.matmul(self.user_factors.weight, self.item_factors.weight.T)

In [25]:
model = MatrixFactorization(ratings, n_users, n_items, n_factors=20)

NameError: name 'ratings' is not defined

Batch를 사용하지 않은 Matrix Factorization

In [19]:
dev = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
optimizer = torch.optim.SGD(model.parameters(), lr=5e-3)  # learning rate
loss_func = torch.nn.MSELoss()

In [5]:
rows, cols = ratings.nonzero()

nb_epochs = 10
for epoch in tqdm_notebook(range(nb_epochs)):
    train_loss = 0
    for row, col in zip(*(rows, cols)):
        # gradient 값을 0으로 설정 
        optimizer.zero_grad()

        # 데이터를 Tensor형태로 변환 
        rating = torch.FloatTensor([ratings[row, col]])
        row = torch.LongTensor([row])
        col = torch.LongTensor([col])

        # 예측값을 만들고 Loss를 계산 
        prediction = model(row, col)
        loss = loss_func(prediction, rating)
        train_loss += loss.item()
        
        # 역전파 
        loss.backward()

        # 파라미터를 갱신
        optimizer.step()
    cost_ = model.cost()
    print('Epoch {:4d}/{} Loss: {:.6f}'.format(epoch+1, nb_epochs, train_loss/len(rows)))

NameError: name 'ratings' is not defined

## Recommend 

In [6]:
idx2rec = {}
for u in useridx.key():
    item_rec = np.argsort(-torch.matmul(model.user_factors.weight[user2idx[u]], model.item_factors.weight.T).detach().numpy())[0:200]
    # 추천에서 제외해야할 항목
    item_rec = [idx2movie[x[0]] for x in item_rec if x not in movie[movie['useridx']==u]['movieidx'].unique()][0:100]
    idx2rec[idx2user[u]] = item_rec   

NameError: name 'useridx' is not defined

In [7]:
idx2rec[0]

KeyError: 0