# Collaborative Filtering: MovieLens
- PyTorch를 사용하여 Collaborative Filtering 모델을 구현해 봅니다.
- 사용할 데이터셋은 MovieLens Dataset(ml-latest-small)입니다.

In [1]:
import math
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from itertools import zip_longest

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim.lr_scheduler import _LRScheduler

- 실행할 때마다 같은 결과를 재현할 수 있도록 random seed를 고정합니다.

In [2]:
def set_random_seed(state=1):
    """Seed the NumPy and PyTorch random number generators with one value.

    Args:
        state: Seed passed, in this order, to ``np.random.seed``,
            ``torch.manual_seed`` and ``torch.cuda.manual_seed``.
    """
    np.random.seed(state)
    torch.manual_seed(state)
    torch.cuda.manual_seed(state)


# Single seed value used for this notebook run.
RANDOM_STATE = 20
set_random_seed(RANDOM_STATE)

## Data Preparation
- Rating 데이터와 각 영화의 정보가 담긴 Movie DataFrame을 로드하고, 학습에 적합하도록 encoding합니다. MovieLens dataset은 userId와 movieId가 모두 숫자로 되어 있지만, 실제 추천 시스템에서 다루는 데이터는 id가 문자열이나 hash 값인 경우가 많으므로, dataset에 등장하는 id를 기준으로 0부터 시작하는 연속된 정수 index로 encoding해 주어야 합니다.
- One-hot encoding의 sparsity 문제를 피하기 위해, 값이 1인 원소의 위치를 나타내는 정수(index)로 encoding합니다.

In [3]:
# Load the ratings table and the movie table from the local
# ml-latest-small data directory.
ratings_df = pd.read_csv('./data/ml-latest-small/ratings.csv')
movies_df = pd.read_csv('./data/ml-latest-small/movies.csv')

In [4]:
# Preview the first rows of the raw ratings table.
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
# (number of rows, number of columns) of the ratings table.
ratings_df.shape

(100836, 4)

In [6]:
# encoding
def encoding(ratings_df):
    """Re-index user and movie ids as contiguous integers starting at 0.

    Each distinct id is numbered in order of first appearance in
    ``ratings_df``. The input frame is left untouched; the encoded ids are
    written to a copy, so the caller keeps the raw ids alongside the
    returned lookup tables.

    Args:
        ratings_df: DataFrame with ``userId`` and ``movieId`` columns.

    Returns:
        A tuple ``(encoded_df, user_to_index, movie_to_index,
        num_unique_users, num_unique_movies)`` where ``encoded_df`` is a copy
        of the input with both id columns replaced by their indices, the two
        dicts map raw id -> index, and the two counts are the number of
        distinct users and movies.
    """
    # Copy first: assigning into the argument would silently overwrite the
    # caller's raw ids (and every other reference to the same frame).
    encoded_df = ratings_df.copy()

    unique_users = encoded_df['userId'].unique()
    unique_movies = encoded_df['movieId'].unique()

    user_to_index = {user_id: index for index, user_id in enumerate(unique_users)}
    movie_to_index = {movie_id: index for index, movie_id in enumerate(unique_movies)}

    encoded_df['userId'] = encoded_df['userId'].map(user_to_index)
    encoded_df['movieId'] = encoded_df['movieId'].map(movie_to_index)

    # The unique-id counts are returned for use as embedding dimensions.
    num_unique_users = len(unique_users)
    num_unique_movies = len(unique_movies)

    return encoded_df, user_to_index, movie_to_index, num_unique_users, num_unique_movies

In [7]:
# Rebind ratings_df to the encoded frame and keep the id -> index lookup
# tables together with the unique-id counts.
ratings_df, user_to_index, movie_to_index, num_unique_users, num_unique_movies = encoding(ratings_df)

print("number of unique users: {}".format(num_unique_users))
print("number of uniquee movies: {}".format(num_unique_movies))
# Preview the ratings table after encoding.
ratings_df.head()

number of unique users: 610
number of uniquee movies: 9724


Unnamed: 0,userId,movieId,rating,timestamp
0,0,0,4.0,964982703
1,0,1,4.0,964981247
2,0,2,4.0,964982224
3,0,3,5.0,964983815
4,0,4,5.0,964982931


## Batch Generator

In [8]:
class DataIterator:
    """Single-pass iterator yielding ``(features, targets)`` mini-batches.

    Both inputs are converted with ``np.asarray``. When ``shuffle`` is true
    they are permuted once, at construction, with the same
    ``np.random.permutation`` index. Batches are consecutive slices along
    axis 0; the last batch is shorter than ``batch_size`` when the number of
    rows is not a multiple of it. The iterator is exhausted after one pass.

    Args:
        features: Array-like whose first axis indexes the samples.
        targets: Array-like indexed with the same permutation and slices as
            ``features``; expected to have the same length (not checked).
        batch_size: Maximum number of rows per batch.
        shuffle: Whether to permute the rows before batching.
    """

    def __init__(self, features, targets, batch_size=32, shuffle=True):
        features, targets = np.asarray(features), np.asarray(targets)

        if shuffle:
            index = np.random.permutation(features.shape[0])
            features, targets = features[index], targets[index]

        self.features = features
        self.targets = targets
        self.batch_size = batch_size
        self.shuffle = shuffle
        # True division before ceil: flooring first (``//``) made the ceil a
        # no-op and silently dropped the final partial batch.
        self.n_batches = math.ceil(features.shape[0] / batch_size)
        self._current = 0

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next ``(features, targets)`` slice or stop when done."""
        if self._current >= self.n_batches:
            raise StopIteration
        start = self._current * self.batch_size
        stop = start + self.batch_size
        self._current += 1
        # Slicing past the end is clamped, which yields the short last batch.
        return self.features[start:stop], self.targets[start:stop]

In [9]:
def generate_batches(features, targets, batch_size=32, shuffle=True):
    """Yield mini-batches from ``DataIterator`` converted to torch tensors.

    Args:
        features: Passed through to ``DataIterator``.
        targets: Passed through to ``DataIterator``.
        batch_size: Passed through to ``DataIterator``.
        shuffle: Passed through to ``DataIterator``.

    Yields:
        ``(feature_batch, target_batch)`` where the features are a
        ``torch.LongTensor`` and the targets a ``torch.FloatTensor`` reshaped
        to a single column via ``view(-1, 1)``.
    """
    batch_source = DataIterator(features, targets, batch_size, shuffle)
    for raw_features, raw_targets in batch_source:
        index_tensor = torch.LongTensor(raw_features)
        rating_column = torch.FloatTensor(raw_targets).view(-1, 1)
        yield index_tensor, rating_column

In [10]:
# Model inputs are the (userId, movieId) column pair; the target is the rating.
features = ratings_df[['userId', 'movieId']]
targets = ratings_df['rating']

# Sanity check: print only the first mini-batch, then stop iterating.
for x_batch, y_batch in generate_batches(features, targets, batch_size=4):
    print(x_batch)
    print(y_batch)
    break

tensor([[  56, 1491],
        [ 364,  915],
        [ 272,  556],
        [ 455,  522]])
tensor([[2.],
        [3.],
        [5.],
        [3.]])


## Modeling

In [11]:
def get_list(n):
    """Normalise a layer setting to a list.

    Args:
        n: A single ``int``/``float``, or any object exposing ``__iter__``.

    Returns:
        ``[n]`` for a single number, otherwise ``list(n)``.

    Raises:
        TypeError: If ``n`` is neither a number nor iterable.
    """
    if isinstance(n, (int, float)):
        return [n]
    if not hasattr(n, '__iter__'):
        raise TypeError("layers configuration should be a single number or a list of numbers")
    return list(n)
      
    
class CollabFilter_Network(nn.Module):
    """Embedding-based feed-forward network producing one score per (user, movie) pair.

    User and movie indices are looked up in two separate embedding tables.
    The two vectors are concatenated, passed through dropout and a stack of
    ``Linear -> ReLU [-> Dropout]`` layers, and reduced to one sigmoid output
    by a final linear layer.

    Args:
        n_users: Number of rows in the user embedding table.
        n_movies: Number of rows in the movie embedding table.
        embedding_size: Width of each embedding vector.
        embedding_dropout: Dropout rate applied to the concatenated embeddings.
        hidden: Hidden layer width, or an iterable of widths (one per layer).
        dropouts: Dropout rate, or an iterable of rates paired with the hidden
            layers in order. A layer with no rate, or a rate ``<= 0``, gets no
            dropout. Must not contain more entries than ``hidden``.
    """

    def __init__(self, n_users, n_movies, embedding_size=100, embedding_dropout=0.03,
                hidden=10, dropouts=0.2):
        super().__init__()
        hidden = get_list(hidden)
        dropouts = get_list(dropouts)
        n_last = hidden[-1]
        assert len(dropouts) <= len(hidden), "more dropout rates than hidden layers"

        def gen_layers(n_in):
            # zip_longest pads missing dropout rates with None, so trailing
            # hidden layers simply get no Dropout module.
            for n_out, rate in zip_longest(hidden, dropouts):
                yield nn.Linear(n_in, n_out)
                yield nn.ReLU()
                if rate is not None and rate > 0.:
                    yield nn.Dropout(rate)
                n_in = n_out

        self.u = nn.Embedding(n_users, embedding_size)
        self.m = nn.Embedding(n_movies, embedding_size)
        self.drop = nn.Dropout(embedding_dropout)
        # The first hidden layer sees the user and movie vectors side by side.
        self.hidden = nn.Sequential(*gen_layers(embedding_size * 2))
        self.fc = nn.Linear(n_last, 1)
        self._init()

    def forward(self, users, movies, minmax=None):
        """Score each (user, movie) pair.

        Args:
            users: Integer index tensor into the user embedding table;
                assumed 1-D, since the embeddings are concatenated on dim 1.
            movies: Integer index tensor into the movie embedding table, with
                the same length as ``users``.
            minmax: Optional ``(min_rating, max_rating)`` pair. When given,
                the sigmoid output is rescaled to the open interval
                ``(min_rating - 0.5, max_rating + 0.5)``.

        Returns:
            Tensor with one column holding the score of each pair.
        """
        features = torch.cat([self.u(users), self.m(movies)], dim=1)
        x = self.drop(features)
        x = self.hidden(x)
        out = torch.sigmoid(self.fc(x))
        if minmax is not None:
            min_rating, max_rating = minmax
            # Half a point of margin on each side, so the extreme ratings are
            # reachable without the sigmoid having to saturate.
            out = out * (max_rating - min_rating + 1) + min_rating - 0.5
        return out

    def _init(self):
        """Initialise embeddings uniformly and linear layers with Xavier-uniform weights."""
        def init(m):
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                nn.init.constant_(m.bias, 0.01)

        nn.init.uniform_(self.u.weight, -0.05, 0.05)
        nn.init.uniform_(self.m.weight, -0.05, 0.05)
        self.hidden.apply(init)
        init(self.fc)
        
    

In [12]:
# Hold out 20% of the rows for validation, seeded with RANDOM_STATE.
X_train, X_valid, y_train, y_valid = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=RANDOM_STATE,
)
# Per-phase (inputs, targets) pairs and their row counts.
datasets = {
    'train': (X_train, y_train),
    'val': (X_valid, y_valid),
}
dataset_sizes = {phase: len(inputs) for phase, (inputs, _) in datasets.items()}

In [13]:
# (lowest, highest) rating present in the data, as floats.
minmax = ratings_df['rating'].min().astype(float), ratings_df['rating'].max().astype(float)
minmax

(0.5, 5.0)