# Collaborative Filtering model on MovieLens

Download the 20m [movielens dataset](http://files.grouplens.org/datasets/movielens/ml-20m.zip)

You can use the aria2c or wget to download

In [1]:
# %cd /data
# !!aria2c -x5 http://files.grouplens.org/datasets/movielens/ml-20m.zip
# !!unzip ml-20m.zip

In [2]:
import pandas as pd
import numpy as np
import os
import torch
from p3self.matchbox import Trainer

In [3]:
DATA = "/data/ml-20m/"
BS = 2000
DIM = 50
CUDA = torch.cuda.is_available()

In [4]:
files = os.listdir(DATA)
files

['genome-scores.csv',
 'genome-tags.csv',
 'links.csv',
 'movies.csv',
 'ratings.csv',
 'README.txt',
 'tags.csv']

In [5]:
data = dict()
for f in files:
    if f[-3:]=="csv":
        data[f.split(".")[0]] = pd.read_csv(DATA+f)

### Check Data

In [6]:
from IPython.display import display
list(display(k,v.sample(5)) for k,v in data.items())

'genome-scores'

Unnamed: 0,movieId,tagId,relevance
6101408,6325,57,0.0135
5329477,5368,806,0.31725
101458,92,1067,0.0195
3344844,3360,325,0.10775
10081264,73431,329,0.038


'genome-tags'

Unnamed: 0,tagId,tag
391,392,figure skating
558,559,iran
460,461,graphic design
33,34,africa
373,374,family


'links'

Unnamed: 0,movieId,imdbId,tmdbId
1672,1732,118715,115.0
19829,97966,1438173,118957.0
24655,116602,106763,57789.0
8003,8686,59470,42737.0
13260,64957,421715,4922.0


'movies'

Unnamed: 0,movieId,title,genres
4457,4552,"Tetsuo, the Ironman (Tetsuo) (1988)",Action|Horror|Sci-Fi|Thriller
24746,116947,Stand Off (2012),Comedy|Drama
20518,100487,Beautiful Creatures (2013),Drama|Fantasy|Romance
24093,114242,Sharknado 2: The Second One (2014),Horror|Sci-Fi|Thriller
15442,78696,"Moment After 2, The: The Awakening (2006)",Drama|Sci-Fi|Thriller


'ratings'

Unnamed: 0,userId,movieId,rating,timestamp
4850205,33344,247,4.0,974700982
10647891,73686,367,1.5,1135953524
15585267,107769,4713,3.0,1265650675
791647,5284,3176,4.0,1215019568
18428692,127548,1635,4.5,1101762672


'tags'

Unnamed: 0,userId,movieId,tag,timestamp
342001,103379,1193,depressing,1338750737
152693,43354,1173,stylized,1290553289
92345,23364,1027,romance,1315150995
4113,1678,4326,racism,1413310959
219107,66455,60469,Christianity,1301559053


[None, None, None, None, None, None]

## Model on rating

In [7]:
data["ratings"].sample(5)

Unnamed: 0,userId,movieId,rating,timestamp
13422041,92757,4896,3.5,1276129129
5843040,40202,5292,4.0,1028334306
11683662,80680,736,3.0,859224474
9805859,67776,592,4.0,1318540169
5566378,38260,6870,3.5,1134416481


In [8]:
len(data["ratings"])

20000263

In [9]:
userId = list(set(data["ratings"]["userId"]))
movieId = list(set(data["ratings"]["movieId"]))
print(len(userId),len(movieId))

138493 26744


### Mapping
user to index, movie to index, index to user, index to movie

In [10]:
u2i = dict((v,k) for k,v in enumerate(userId))
m2i = dict((v,k) for k,v in enumerate(movieId))
i2u = dict((k,v) for k,v in enumerate(userId))
i2m = dict((k,v) for k,v in enumerate(movieId))

In [11]:
from torch.utils.data import DataLoader,Dataset

### Separate train/valid dataset

In [13]:
train_pick = np.random.rand(len(data["ratings"]))>.2
valid_pick = ~train_pick

In [14]:
train_pick,valid_pick

(array([False,  True,  True, ...,  True,  True,  True]),
 array([ True, False, False, ..., False, False, False]))

In [15]:
train_df = data["ratings"][train_pick].reset_index()
valid_df = data["ratings"][valid_pick].reset_index()

### Data generator

In [16]:
class reco_data(Dataset):
    def __init__(self,df):
        self.df=df
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self,idx):
        row = self.df.loc[idx]
        return u2i[int(row["userId"])],m2i[int(row["movieId"])],row["rating"]/5

In [17]:
train = reco_data(train_df)
valid = reco_data(valid_df)

## Basic Cross Filtering

In [18]:
from torch import nn

In [19]:
class embeddings(nn.Module):
    def __init__(self):
        super(embeddings,self).__init__()
        self.emb_u = nn.Embedding(len(userId), DIM)
        self.emb_m = nn.Embedding(len(movieId), DIM)
        
    def forward(self,u,m):
        return self.emb_u(u),self.emb_m(m)

In [20]:
class cf(nn.Module):
    def __init__(self):
        super(cf,self).__init__()
        self.ebd = embeddings()
    
    def forward(self,u,m):
        u_vec,m_vec = self.ebd(u,m)
        return u_vec * m_vec
    
class cfnn(nn.Module):
    def __init__(self):
        super(cfnn,self).__init__()
        self.cf = cf()
        self.fcb = nn.Sequential(*[nn.Linear(DIM,512,bias=False),
                                   nn.BatchNorm1d(512),
                                   nn.LeakyReLU(inplace=True),
                                   nn.Linear(512,1,bias=False),
                                   nn.BatchNorm1d(1),
                                   nn.Sigmoid()
                                  ],
                                )
    
    def forward(self,u,m):
        x = self.cf(u,m)
        return self.fcb(x)

In [21]:
cfmodel = cfnn()

In [22]:
from torch.optim import Adam
mse = nn.MSELoss()
opt = Adam(cfmodel.parameters(),amsgrad=True)
if CUDA:
    cfmodel.cuda()

Step function for train and valid

In [None]:
def action(*args,**kwargs):
    u,m,y = args[0]
    opt.zero_grad()
    if CUDA:
        u,m,y  = u.cuda(),m.cuda(),y.cuda()
        
    y_ = cfmodel(u,m) # prediction
    
    loss = mse(y_,y.unsqueeze(-1).float())
    
    loss.backward()
    opt.step()
    
    return {"mse":loss.item()}

def val_action(*args,**kwargs):
    u,m,y = args[0]
    y_ = cfmodel(u,m)
    loss = mse(y_,y)
    
    return {"mse":loss.item()}

In [25]:
trainer = Trainer(train, val_dataset=valid, batch_size=BS, print_on = 5)

trainer.action = action
trainer.val_action = val_action

In [None]:
trainer.train(1)

⭐[ep_0_i_674]	mse	0.050:   8%|▊         | 675/8000 [11:06<2:00:29,  1.01it/s]