# Collaborative Filtering
- 샘플 데이터셋으로 Collaborative Filtering을 실습하며 코드를 작성해 봅니다.
- data: [MovieLens dataset](https://grouplens.org/datasets/movielens/)

In [1]:
# !wget -P ./data/ http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
# !unzip ./data/ml-latest-small.zip -d ./data/

## MovieLens dataset

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the MovieLens "ml-latest-small" ratings table from the local data directory.
data = pd.read_csv('./data/ml-latest-small/ratings.csv')
# Preview the first rows (columns: userId, movieId, rating, timestamp).
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


### Train, test data split

In [4]:
# Hold out 20% of the ratings as a test set; the fixed random_state keeps the
# split the same across runs.
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(data, test_size=0.2, random_state=0)
# Preview the first rows of the training split.
train_set.head()

Unnamed: 0,userId,movieId,rating,timestamp
77701,483,8529,4.0,1215545278
94477,599,33437,2.5,1498518389
36246,247,5349,2.0,1467645405
17483,111,7361,3.5,1516140853
100300,610,57504,4.5,1493847901


### Encoding
- userId와 movieId는 모두 범주형(categorical) 변수이므로, 각각 0부터 시작하는 정수 인덱스로 인코딩한다.

In [5]:
# Peek at the first (position, userId) pair that enumerate() yields over the
# unique training user ids, then stop. This is the pairing the encoder below
# turns into an id -> index mapping.
for position, user_id in enumerate(train_set['userId'].unique()):
    print(position, user_id)
    break

0 483


In [6]:
def column_to_index(column, train_column=None):
    """Map the raw ids in ``column`` to contiguous integer indices.

    Args:
        column: Series of raw ids to encode.
        train_column: Optional Series whose unique values define the
            vocabulary. When omitted, the vocabulary comes from ``column``.

    Returns:
        A 3-tuple ``(id_to_index, encoded, vocabulary_size)``:
        the id -> index dict (indices follow order of first appearance in
        the vocabulary source), a NumPy array with one index per element of
        ``column`` (-1 for ids missing from the vocabulary), and the number
        of distinct ids in the vocabulary.
    """
    vocabulary_source = column if train_column is None else train_column
    unique = vocabulary_source.unique()
    id_to_index = dict(zip(unique, range(len(unique))))
    # Ids that never occur in the vocabulary get the sentinel -1 so callers
    # can filter them out.
    encoded = [id_to_index.get(raw_id, -1) for raw_id in column]
    return id_to_index, np.array(encoded), len(unique)

In [7]:
def encode_data(df, train=None):
    """Return a copy of ``df`` with ``userId``/``movieId`` replaced by indices.

    Args:
        df: DataFrame containing ``userId`` and ``movieId`` columns.
        train: Optional DataFrame whose ``userId``/``movieId`` values define
            the id -> index mapping. When omitted, the mapping is built from
            ``df`` itself.

    Returns:
        A new DataFrame in which both id columns hold the indices produced by
        ``column_to_index``. Rows whose user or movie id is missing from the
        mapping (encoded as -1) are dropped; the original row index labels
        are kept.
    """
    df = df.copy()
    for col_name in ["userId", "movieId"]:
        train_col = None if train is None else train[col_name]
        _, encoded, _ = column_to_index(df[col_name], train_col)
        df[col_name] = encoded
    # Filter once, after both columns are encoded, instead of filtering inside
    # the loop: assigning a column into an already-filtered frame can trigger
    # pandas' SettingWithCopyWarning. The surviving rows are the same either way.
    known = (df["userId"] >= 0) & (df["movieId"] >= 0)
    return df[known].copy()

In [8]:
# Train and test datasets before encoding (raw userId / movieId values).
print(train_set.head())
print(test_set.head())

        userId  movieId  rating   timestamp
77701      483     8529     4.0  1215545278
94477      599    33437     2.5  1498518389
36246      247     5349     2.0  1467645405
17483      111     7361     3.5  1516140853
100300     610    57504     4.5  1493847901
       userId  movieId  rating   timestamp
41008     276      780     5.0   858350384
94274     599     7624     2.5  1519235950
77380     483     1320     2.5  1215895327
29744     202     3448     3.0   974924072
40462     274    60291     4.0  1296947017


In [9]:
# Train and test datasets after encoding.
# The test set is encoded with the mapping built from train_set; rows whose
# userId or movieId does not occur in train_set are dropped by encode_data.
train_set_encoded = encode_data(train_set)
test_set_encoded = encode_data(test_set, train_set)
print(train_set_encoded.head())
print(test_set_encoded.head())

        userId  movieId  rating   timestamp
77701        0        0     4.0  1215545278
94477        1        1     2.5  1498518389
36246        2        2     2.0  1467645405
17483        3        3     3.5  1516140853
100300       4        4     4.5  1493847901
       userId  movieId  rating   timestamp
41008     119      135     5.0   858350384
94274       1     7243     2.5  1519235950
77380       0     1830     2.5  1215895327
29744      35     1666     3.0   974924072
40462     105     8418     4.0  1296947017


### Matrix Factorization Model using Embedding Layer

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [11]:
class MatrixFactorization(nn.Module):
    """Matrix factorization scored as the dot product of two embeddings.

    Each user and each item owns a learned vector of length
    ``embedding_size``; the predicted rating for a (user, item) pair is the
    dot product of the two vectors. Embedding weights keep the default
    ``nn.Embedding`` initialisation.
    """

    def __init__(self, num_users, num_items, embedding_size=100):
        """Create the user and item embedding tables.

        Args:
            num_users: Number of rows in the user embedding table.
            num_items: Number of rows in the item embedding table.
            embedding_size: Length of every embedding vector.
        """
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.item_embedding = nn.Embedding(num_items, embedding_size)

    def forward(self, user, item):
        """Score (user, item) index pairs.

        Args:
            user: Integer tensor of user indices.
            item: Integer tensor of item indices, paired with ``user``.

        Returns:
            The element-wise product of the looked-up vectors summed over
            dimension 1; for 1-D index tensors that is one score per pair.
        """
        user_vectors = self.user_embedding(user)
        item_vectors = self.item_embedding(item)
        return (user_vectors * item_vectors).sum(dim=1)

### Training Model

In [12]:
# Embedding table sizes: the number of distinct encoded user and movie ids
# in the training split.
num_users, num_items = (
    len(train_set_encoded[id_column].unique())
    for id_column in ('userId', 'movieId')
)
print(num_users, num_items)

610 8975


In [13]:
# Instantiate the factorization model with 100-dimensional user and item embeddings.
mf_model = MatrixFactorization(num_users, num_items, embedding_size=100)

In [14]:
def train_model(model, epochs=10, lr=0.01, wd=0.0, unsqueeze=False):
    """Train ``model`` on the encoded training set with full-batch Adam.

    Reads the module-level ``train_set_encoded`` frame, prints the training
    MSE once per epoch, and finishes by calling ``test_loss`` to print the
    test-set MSE.

    Args:
        model: Module called as ``model(users, items)`` that returns
            predicted ratings.
        epochs: Number of optimisation steps; each step uses the whole
            training set.
        lr: Adam learning rate.
        wd: Adam weight decay.
        unsqueeze: When True, add a trailing dimension to the ratings tensor
            before computing the loss (also forwarded to ``test_loss``).
    """
    # A new optimizer is created on every call, so Adam's moment estimates
    # do not carry over between successive train_model() calls.
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    model.train()

    # The training tensors never change between epochs, so build them once.
    # All three columns come from the same encoded frame so users, items and
    # ratings are guaranteed to stay row-aligned.
    users = torch.LongTensor(train_set_encoded['userId'].values)
    items = torch.LongTensor(train_set_encoded['movieId'].values)
    ratings = torch.FloatTensor(train_set_encoded['rating'].values)
    if unsqueeze:
        ratings = ratings.unsqueeze(1)

    for _ in range(epochs):
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(loss.item())
    test_loss(model, unsqueeze)
    
def test_loss(model, unsqueeze=False):
    """Print the mean squared error of ``model`` on the encoded test set.

    Reads the module-level ``test_set_encoded`` frame and leaves the model in
    eval mode.

    Args:
        model: Module called as ``model(users, items)`` that returns
            predicted ratings.
        unsqueeze: When True, add a trailing dimension to the ratings tensor
            before computing the loss.
    """
    model.eval()
    users = torch.LongTensor(test_set_encoded['userId'].values)
    items = torch.LongTensor(test_set_encoded['movieId'].values)
    ratings = torch.FloatTensor(test_set_encoded['rating'].values)
    if unsqueeze:
        ratings = ratings.unsqueeze(1)
    # Evaluation needs no gradients; skipping autograd tracking avoids
    # building a graph over the whole test set.
    with torch.no_grad():
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
    print("test loss: {0:.3f}".format(loss.item()))

In [17]:
# Initial run: 10 full-batch Adam epochs at lr=0.1, followed by a test-set MSE report.
train_model(mf_model, epochs=10, lr=0.1)

112.70184326171875
63.00698471069336
35.89173126220703
21.98044204711914
14.95960521697998
11.285816192626953
9.227702140808105
7.960115909576416
7.062285900115967
6.309814929962158
test loss: 46.518


In [18]:
# Continue training the same model at lr=0.05 for the default 10 epochs.
# train_model creates a new Adam optimizer on each call, so optimizer state
# is not carried over from the previous run.
train_model(mf_model, lr=0.05)

5.602383136749268
2.82804799079895
1.669775128364563
1.1921473741531372
0.9990464448928833
0.9065325260162354
0.8331282138824463
0.7519912719726562
0.6618367433547974
0.5699629783630371
test loss: 31.496


In [19]:
# 15 more epochs at lr=0.05 on the same model.
train_model(mf_model, epochs=15, lr=0.05)

0.48468807339668274
0.5511645078659058
0.2787975072860718
0.2851133644580841
0.2790704071521759
0.213246151804924
0.16214504837989807
0.14261886477470398
0.13167637586593628
0.11433210968971252
0.09560929238796234
0.08301843702793121
0.07671365141868591
0.07192128151655197
0.06493764370679855
test loss: 25.210


In [20]:
# 15 more epochs with the learning rate lowered to 0.01.
train_model(mf_model, epochs=15, lr=0.01)

0.05660627782344818
0.0290948785841465
0.02249792590737343
0.02166203036904335
0.019849905744194984
0.017429808154702187
0.01568765938282013
0.0146896131336689
0.013876928947865963
0.012871747836470604
0.011670161038637161
0.01047046110033989
0.00944160670042038
0.00861365720629692
0.007955825887620449
test loss: 24.924


In [21]:
# Final 10 epochs at lr=0.01 on the same model.
train_model(mf_model, epochs=10, lr=0.01)

0.007437265943735838
0.021590830758213997
0.010936557315289974
0.011654525063931942
0.0112707344815135
0.008976185694336891
0.00746501050889492
0.007118956185877323
0.006899690721184015
0.006358754821121693
test loss: 24.912


### Initializing the Embedding Matrices with a Uniform Distribution

In [24]:
class MatrixFactorization_uniform(nn.Module):
    """Dot-product matrix factorization with small uniform initial weights.

    Both embedding tables are re-initialised by sampling uniformly between
    0 and 0.05, so every initial prediction is a small non-negative number.
    """

    def __init__(self, num_users, num_items, embedding_size=100):
        """Create both embedding tables and overwrite their initial weights.

        Args:
            num_users: Number of rows in the user embedding table.
            num_items: Number of rows in the item embedding table.
            embedding_size: Length of every embedding vector.
        """
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.item_embedding = nn.Embedding(num_items, embedding_size)
        # Overwrite the default initialisation in place, user table first.
        for table in (self.user_embedding, self.item_embedding):
            table.weight.data.uniform_(0, 0.05)

    def forward(self, user, item):
        """Score (user, item) index pairs.

        Args:
            user: Integer tensor of user indices.
            item: Integer tensor of item indices, paired with ``user``.

        Returns:
            The element-wise product of the looked-up vectors summed over
            dimension 1; for 1-D index tensors that is one score per pair.
        """
        user_vectors = self.user_embedding(user)
        item_vectors = self.item_embedding(item)
        return (user_vectors * item_vectors).sum(dim=1)

In [25]:
# Instantiate the variant whose 100-dimensional embeddings are initialised uniformly between 0 and 0.05.
mf_uniform_model = MatrixFactorization_uniform(num_users, num_items, embedding_size=100)

In [26]:
# Initial run for the uniform-init model: 10 full-batch Adam epochs at lr=0.1,
# followed by a test-set MSE report.
train_model(mf_uniform_model, epochs=10, lr=0.1)

12.918964385986328
4.932332515716553
2.4280447959899902
3.2279775142669678
0.8523130416870117
1.782036304473877
2.661961078643799
2.167083263397217
1.0884267091751099
0.9171753525733948
test loss: 1.930


In [27]:
# Continue training the uniform-init model at lr=0.05 for the default 10 epochs.
train_model(mf_uniform_model, lr=0.05)

1.642383098602295
1.4289661645889282
1.7477048635482788
1.0553420782089233
0.7338014841079712
1.0947291851043701
1.1368614435195923
0.7798032164573669
0.6455827355384827
0.8126983642578125
test loss: 1.151


In [28]:
# 15 more epochs at lr=0.05 on the uniform-init model.
train_model(mf_uniform_model, epochs=15, lr=0.05)

0.9467490315437317
2.5234551429748535
0.7070660591125488
0.9120100736618042
1.5028444528579712
1.397505760192871
0.9052100777626038
0.6227729320526123
0.796795129776001
0.998343825340271
0.865805983543396
0.6432095170021057
0.6104221343994141
0.7122362852096558
0.7603340744972229
test loss: 1.002


In [29]:
# 15 more epochs with the learning rate lowered to 0.01.
train_model(mf_uniform_model, epochs=15, lr=0.01)

0.6835158467292786
0.5082820057868958
0.5317350625991821
0.5594138503074646
0.5279513597488403
0.48834744095802307
0.4747507572174072
0.4818984568119049
0.4867956340312958
0.47664275765419006
0.4556306004524231
0.43555617332458496
0.424803227186203
0.4224257171154022
0.420448899269104
test loss: 0.809


In [30]:
# Final 10 epochs at lr=0.01 on the uniform-init model.
train_model(mf_uniform_model, epochs=10, lr=0.01)

0.4121911823749542
0.4401232898235321
0.39664122462272644
0.37656882405281067
0.38303276896476746
0.36874154210090637
0.34401237964630127
0.3308502435684204
0.3265977203845978
0.31687480211257935
test loss: 0.811
