# Collaborative Filtering
- 샘플 데이터셋으로 Collaborative Filtering을 실습하며 코드를 작성해 봅니다.
- data: MovieLens dataset ([MovieLens Dataset](https://grouplens.org/datasets/movielens/))

In [1]:
# !wget http://files.grouplens.org/datasets/movielens/ml-latest-small.zip ./data/

## MovieLens dataset

In [31]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [2]:
# Load the ratings table of the ml-latest-small dataset and preview the first rows.
ratings = pd.read_csv('./data/ml-latest-small/ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
# Load the movie metadata table of the ml-latest-small dataset and preview the first rows.
movies = pd.read_csv('./data/ml-latest-small/movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### Train, test data split

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rating rows as the test set; random_state=0 makes the split reproducible.
train_set, test_set = train_test_split(ratings, test_size=0.2, random_state=0)
train_set.head()

Unnamed: 0,userId,movieId,rating,timestamp
77701,483,8529,4.0,1215545278
94477,599,33437,2.5,1498518389
36246,247,5349,2.0,1467645405
17483,111,7361,3.5,1516140853
100300,610,57504,4.5,1493847901


### Encoding
- userId 와 movieId 는 각각 카테고리컬 변수이다. 이를 각각 인코딩 해준다.

In [6]:
def column_to_index(column, train_column=None):
    """Map the raw ids in ``column`` to contiguous integer indices.

    The vocabulary comes from ``train_column`` when it is given and from
    ``column`` itself otherwise. Ids are numbered in the order in which
    ``unique()`` returns them.

    Args:
        column: Series of raw ids to encode (must provide ``unique()``
            when ``train_column`` is omitted).
        train_column: Optional series that defines the vocabulary.

    Returns:
        A tuple ``(id_to_index, encoded, n_unique)``: the id -> index dict,
        a NumPy array holding one index per entry of ``column`` (``-1`` for
        ids that are missing from the vocabulary), and the vocabulary size.
    """
    vocabulary_source = column if train_column is None else train_column
    vocabulary = vocabulary_source.unique()
    id_to_index = dict(zip(vocabulary, range(len(vocabulary))))
    # Ids absent from the vocabulary are flagged with -1 for the caller.
    encoded = np.array([id_to_index.get(raw_id, -1) for raw_id in column])
    return id_to_index, encoded, len(vocabulary)

In [7]:
def encode_data(df, train=None):
    """Return a copy of ``df`` with ``userId`` and ``movieId`` index-encoded.

    Each id column is replaced by the indices produced by
    ``column_to_index``. When ``train`` is given, its columns define the
    vocabulary, and rows whose id does not occur in ``train`` (encoded as
    ``-1``) are dropped.

    Args:
        df: DataFrame containing ``userId`` and ``movieId`` columns.
        train: Optional DataFrame whose id columns define the vocabulary.

    Returns:
        A new DataFrame; the input ``df`` is left untouched.
    """
    encoded = df.copy()
    for col_name in ("userId", "movieId"):
        reference = None if train is None else train[col_name]
        _, codes, _ = column_to_index(encoded[col_name], reference)
        encoded[col_name] = codes
        # Copy after filtering so the next column assignment writes to an
        # independent frame instead of a slice (avoids the chained-assignment
        # SettingWithCopyWarning on the second pass).
        encoded = encoded[encoded[col_name] >= 0].copy()
    return encoded

In [8]:
# Train and test datasets before encoding (raw userId / movieId values).
print(train_set.head())
print(test_set.head())

        userId  movieId  rating   timestamp
77701      483     8529     4.0  1215545278
94477      599    33437     2.5  1498518389
36246      247     5349     2.0  1467645405
17483      111     7361     3.5  1516140853
100300     610    57504     4.5  1493847901
       userId  movieId  rating   timestamp
41008     276      780     5.0   858350384
94274     599     7624     2.5  1519235950
77380     483     1320     2.5  1215895327
29744     202     3448     3.0   974924072
40462     274    60291     4.0  1296947017


In [9]:
# Train and test datasets after encoding.
# The test set is encoded with the training set's id vocabulary; encode_data
# drops test rows whose userId or movieId does not appear in the training set.
train_set_encoded = encode_data(train_set)
test_set_encoded = encode_data(test_set, train_set)
print(train_set_encoded.head())
print(test_set_encoded.head())

        userId  movieId  rating   timestamp
77701        0        0     4.0  1215545278
94477        1        1     2.5  1498518389
36246        2        2     2.0  1467645405
17483        3        3     3.5  1516140853
100300       4        4     4.5  1493847901
       userId  movieId  rating   timestamp
41008     119      135     5.0   858350384
94274       1     7243     2.5  1519235950
77380       0     1830     2.5  1215895327
29744      35     1666     3.0   974924072
40462     105     8418     4.0  1296947017


### Make Batch Iterator
- Batch 별로 Iterate 해주기 위한, Data Iterator

In [32]:
import math
class DataIterator:
    """Single-pass iterator that yields ``(X, y)`` mini-batches.

    ``X`` and ``y`` are converted to NumPy arrays and, when ``shuffle`` is
    true, permuted once at construction time with the same permutation.
    Batches are consecutive slices of ``batch_size`` rows; the final batch
    may be shorter when the number of rows is not a multiple of
    ``batch_size``.

    ``__iter__`` returns the object itself, so an instance can be consumed
    only once.

    Args:
        X: Array-like of inputs, indexed along the first axis.
        y: Array-like of targets, aligned with ``X`` along the first axis.
        batch_size: Maximum number of rows per batch.
        shuffle: Whether to shuffle the rows before batching.
    """

    def __init__(self, X, y, batch_size=32, shuffle=True):
        X, y = np.asarray(X), np.asarray(y)
        if shuffle:
            # One shared permutation keeps every row of X paired with its y.
            index = np.random.permutation(X.shape[0])
            X, y = X[index], y[index]

        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle
        # True division before ceil: floor division here would silently drop
        # the trailing partial batch.
        self.n_batches = math.ceil(X.shape[0] / batch_size)
        self._current = 0

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def next(self):
        """Return the next ``(X_batch, y_batch)`` pair or raise StopIteration."""
        if self._current >= self.n_batches:
            raise StopIteration()
        k = self._current
        self._current += 1
        bs = self.batch_size

        return self.X[k*bs:(k+1)*bs], self.y[k*bs:(k+1)*bs]
        
        

In [33]:
def generate_batches(X, y, bs=32, shuffle=True):
    """Yield mini-batches of ``(X, y)`` as PyTorch tensors.

    Args:
        X: Array-like of integer inputs.
        y: Array-like of targets aligned with ``X``.
        bs: Batch size handed to ``DataIterator``.
        shuffle: Whether ``DataIterator`` shuffles the rows first.

    Yields:
        ``(inputs, targets)`` where ``inputs`` is a ``LongTensor`` and
        ``targets`` is a ``FloatTensor`` reshaped to a single column.
    """
    batch_source = DataIterator(X, y, bs, shuffle)
    for features, targets in batch_source:
        feature_tensor = torch.LongTensor(features)
        target_tensor = torch.FloatTensor(targets)
        # view(-1, 1) turns the targets into one column, one row per sample.
        yield feature_tensor, target_tensor.view(-1, 1)

### Matrix Factorization Model using Embedding Layer

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [11]:
class MatrixFactorization(nn.Module):
    """Matrix factorization scorer built from two embedding tables.

    The score of a (user, item) pair is the dot product of the user's and
    the item's embedding vectors. Both tables keep the default
    ``nn.Embedding`` initialisation.

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        embedding_size: Length of every embedding vector.
    """

    def __init__(self, num_users, num_items, embedding_size=100):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.item_embedding = nn.Embedding(num_items, embedding_size)

    def forward(self, user, item):
        """Return one score per (user, item) index pair."""
        user_vectors = self.user_embedding(user)
        item_vectors = self.item_embedding(item)
        # Element-wise product summed over dim 1 == row-wise dot product.
        return torch.sum(user_vectors * item_vectors, dim=1)

### Training Model

In [12]:
# Count the distinct encoded users and movies in the training set; these
# counts are used below as the sizes of the model's embedding tables.
num_users = len(train_set_encoded['userId'].unique())
num_items = len(train_set_encoded['movieId'].unique())
print(num_users, num_items)

610 8975


In [13]:
# Matrix factorization model with 100-dimensional user and item embeddings.
mf_model = MatrixFactorization(num_users, num_items, embedding_size=100)

In [14]:
def train_model(model, epochs=10, lr=0.01, wd=0.0, unsqueeze=False):
    """Fit ``model`` on the encoded training ratings with full-batch Adam.

    Every epoch is one optimisation step over the whole of
    ``train_set_encoded``; the MSE training loss is printed after each step.
    A fresh Adam optimizer is created on every call, so calling this
    function repeatedly on the same model restarts the optimizer state.
    After training, ``test_loss`` is called, which also switches the model
    to eval mode.

    Args:
        model: Module called as ``model(users, items)``.
        epochs: Number of full-batch optimisation steps.
        lr: Adam learning rate.
        wd: Adam weight decay.
        unsqueeze: If true, reshape the targets to a column (``(N, 1)``)
            for models whose output has that shape.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

    # The training tensors do not change between epochs, so build them once.
    users = torch.LongTensor(train_set_encoded['userId'].values)
    items = torch.LongTensor(train_set_encoded['movieId'].values)
    # Read the targets from the same encoded frame as the ids (as test_loss
    # does) so ids and ratings are guaranteed to stay row-aligned.
    ratings = torch.FloatTensor(train_set_encoded['rating'].values)
    if unsqueeze:
        ratings = ratings.unsqueeze(1)

    model.train()
    for _ in range(epochs):
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(loss.item())
    test_loss(model, unsqueeze)
    
def test_loss(model, unsqueeze=False):
    """Print the MSE of ``model`` on the encoded test set.

    The model is switched to eval mode and is left in that mode.

    Args:
        model: Module called as ``model(users, items)``.
        unsqueeze: If true, reshape the targets to a column (``(N, 1)``)
            for models whose output has that shape.
    """
    model.eval()
    users = torch.LongTensor(test_set_encoded['userId'].values)
    items = torch.LongTensor(test_set_encoded['movieId'].values)
    ratings = torch.FloatTensor(test_set_encoded['rating'].values)
    if unsqueeze:
        ratings = ratings.unsqueeze(1)
    # Evaluation needs no gradients; skip building the autograd graph.
    with torch.no_grad():
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
    print("test loss: {0:.3f}".format(loss.item()))

In [15]:
# First run: 10 full-batch epochs at lr=0.1.
train_model(mf_model, epochs=10, lr=0.1)

112.4322738647461
63.018524169921875
35.95763397216797
22.013687133789062
14.920804023742676
11.190710067749023
9.11557674407959
7.861675262451172
6.988626956939697
6.261568546295166
test loss: 46.418


In [16]:
# Continue training the same model: default 10 epochs at lr=0.05.
train_model(mf_model, lr=0.05)

5.580074787139893
2.839423894882202
1.6660510301589966
1.1793171167373657
0.9820349216461182
0.8887496590614319
0.8185961842536926
0.7408943772315979
0.652494490146637
0.5622036457061768
test loss: 31.243


In [17]:
# Continue training the same model: 15 more epochs at lr=0.05.
train_model(mf_model, epochs=15, lr=0.05)

0.4794389605522156
0.5453332662582397
0.27274689078330994
0.2827778160572052
0.2757742404937744
0.20973117649555206
0.15962329506874084
0.1408979892730713
0.1299412101507187
0.11288973689079285
0.09474336355924606
0.0820499062538147
0.07524076849222183
0.0701921358704567
0.0635247603058815
test loss: 25.021


In [18]:
# Continue training the same model: 15 epochs at the lower lr=0.01.
train_model(mf_model, epochs=15, lr=0.01)

0.055700261145830154
0.0283324234187603
0.021986456587910652
0.021038152277469635
0.019089234992861748
0.01672755554318428
0.015100487507879734
0.014193456619977951
0.013453289866447449
0.0124736949801445
0.011277560144662857
0.010084757581353188
0.009070792235434055
0.008271683938801289
0.007653325330466032
test loss: 24.786


In [19]:
# Continue training the same model: 10 more epochs at lr=0.01.
train_model(mf_model, epochs=10, lr=0.01)

0.007174388971179724
0.02144342102110386
0.010863028466701508
0.011516516096889973
0.011108600534498692
0.008899148553609848
0.007382851094007492
0.007020988501608372
0.006790283601731062
0.006282983813434839
test loss: 24.782


### Initializing the Embedding Matrices with a Uniform Distribution

In [20]:
class MatrixFactorization_uniform(nn.Module):
    """Matrix factorization scorer with uniformly initialised embeddings.

    Same scoring rule as a plain dot-product matrix factorization, but both
    embedding tables are re-initialised with values drawn uniformly from
    the range 0 to 0.05.

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        embedding_size: Length of every embedding vector.
    """

    def __init__(self, num_users, num_items, embedding_size=100):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.item_embedding = nn.Embedding(num_items, embedding_size)
        # Overwrite the default initialisation, user table first, then items.
        for table in (self.user_embedding, self.item_embedding):
            nn.init.uniform_(table.weight, 0, 0.05)

    def forward(self, user, item):
        """Return one score per (user, item) index pair."""
        user_vectors = self.user_embedding(user)
        item_vectors = self.item_embedding(item)
        # Element-wise product summed over dim 1 == row-wise dot product.
        return torch.sum(user_vectors * item_vectors, dim=1)

In [21]:
# Same architecture as before, but with uniformly initialised 100-dimensional embeddings.
mf_uniform_model = MatrixFactorization_uniform(num_users, num_items, embedding_size=100)

In [22]:
# First run for the uniformly initialised model: 10 full-batch epochs at lr=0.1.
train_model(mf_uniform_model, epochs=10, lr=0.1)

12.919435501098633
4.933681488037109
2.4307403564453125
3.229398250579834
0.8534629344940186
1.7833770513534546
2.6639256477355957
2.1696536540985107
1.0906656980514526
0.9177945256233215
test loss: 1.928


In [23]:
# Continue training the same model: default 10 epochs at lr=0.05.
train_model(mf_uniform_model, lr=0.05)

1.6422327756881714
1.4307241439819336
1.74745774269104
1.054850459098816
0.73503577709198
1.0960299968719482
1.137143850326538
0.7800192832946777
0.6460190415382385
0.8131081461906433
test loss: 1.150


In [24]:
# Continue training the same model: 15 more epochs at lr=0.05.
train_model(mf_uniform_model, epochs=15, lr=0.05)

0.9471482634544373
2.5225729942321777
0.7070363163948059
0.9123658537864685
1.502804160118103
1.3970897197723389
0.9053442478179932
0.6238166689872742
0.7975934147834778
0.9980548024177551
0.8657450079917908
0.6443963646888733
0.6120921969413757
0.7133476734161377
0.7606694102287292
test loss: 1.000


In [25]:
# Continue training the same model: 15 epochs at the lower lr=0.01.
train_model(mf_uniform_model, epochs=15, lr=0.01)

0.683856189250946
0.5094601511955261
0.5330264568328857
0.5602948069572449
0.5288777351379395
0.48965221643447876
0.47636333107948303
0.4834703803062439
0.48810875415802
0.47780880331993103
0.45691660046577454
0.4371405243873596
0.4266369938850403
0.42426589131355286
0.42207497358322144
test loss: 0.809


In [26]:
# Continue training the same model: 10 more epochs at lr=0.01.
train_model(mf_uniform_model, epochs=10, lr=0.01)

0.41360846161842346
0.4417349696159363
0.39798808097839355
0.3782435953617096
0.38461095094680786
0.3701026737689972
0.34544897079467773
0.3323555588722229
0.3279605507850647
0.3180648386478424
test loss: 0.809
