In [10]:
import csv
import os

import numpy as np
import pandas as pd
import torch

# Both CSVs are read with ';' as the separator and latin-1 encoding.
books_df, ratings_df = (
    pd.read_csv(csv_path, sep=';', encoding="latin-1")
    for csv_path in (
        'dataset/reviews/BX_Books.csv',
        'dataset/reviews/BX-Book-Ratings.csv',
    )
)

# Quick sanity check on the (rows, columns) of each frame.
print(books_df.shape, ratings_df.shape)

(271379, 8) (1149780, 3)


In [8]:
# Preview the first five rows of the book metadata.
books_df.head(5)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [9]:
# Preview the first five rows of the ratings table.
ratings_df.head(5)

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [11]:
# ISBN -> title lookup built from the book metadata.
book_names = books_df.set_index('ISBN')['Book-Title'].to_dict()

# Distinct users / books that appear in the ratings table.
user_num = len(pd.unique(ratings_df['User-ID']))
book_num = len(pd.unique(ratings_df['ISBN']))

print("Total Unique Users:", user_num)
print("Total Unique Books:", book_num)
print(
    "Dimensions of Rating Matrix:",
    user_num, "*", book_num, "=", user_num * book_num,
)
print("-" * 10)
# Share of (user, book) cells that have a row in ratings_df.
print(
    "Percentage of matrix filled:",
    100 * len(ratings_df) / (user_num * book_num),
    "%",
)


Total Unique Users: 105283
Total Unique Books: 340556
Dimensions of Rating Matrix: 105283 * 340556 = 35854757348
----------
Percentage of matrix filled: 0.0032067711094526078 %


In [12]:
from torch.autograd import variable
from tqdm.notebook import tqdm

class MatrixFactorization(torch.nn.Module):
    """Dot-product matrix factorization over user and item embeddings.

    Every user and every item owns a learnable vector of length
    ``n_factors``; the score of a (user, item) pair is the dot product of
    the two vectors.

    Args:
        n_users: Number of rows in the user embedding table.
        n_items: Number of rows in the item embedding table.
        n_factors: Length of each embedding vector.
    """

    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()

        # Embedding modules containing n_users/n_items tensors of size n_factors
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)

        # Set weights between 0 and 0.05
        torch.nn.init.uniform_(self.user_factors.weight, 0, 0.05)
        torch.nn.init.uniform_(self.item_factors.weight, 0, 0.05)

    def forward(self, data):
        """Score a batch of (user index, item index) pairs.

        Args:
            data: Integer tensor whose column 0 holds user indices and whose
                column 1 holds item indices.

        Returns:
            1-D tensor with one dot-product score per row of ``data``.
        """
        users, items = data[:, 0], data[:, 1]
        return (self.user_factors(users) * self.item_factors(items)).sum(1)

    def predict(self, user, item):
        """Score user/item index pairs given as two separate arguments.

        Args:
            user: A user index, or a sequence/tensor of user indices.
            item: An item index, or a sequence/tensor of item indices, with
                the same number of entries as ``user``.

        Returns:
            1-D tensor of scores, one per (user, item) pair. Gradients are
            tracked exactly as in ``forward``.
        """
        # forward() expects a single (N, 2) index tensor, so pack the two
        # arguments into that layout on the same device as the embeddings.
        device = self.user_factors.weight.device
        users = torch.as_tensor(user, dtype=torch.long, device=device).reshape(-1)
        items = torch.as_tensor(item, dtype=torch.long, device=device).reshape(-1)
        return self.forward(torch.stack((users, items), dim=1))


In [17]:
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader

class Loader(Dataset):
    """Dataset of (user index, book index) pairs with their ratings.

    Raw ``User-ID`` and ``ISBN`` values are remapped to contiguous integer
    indices (in order of first appearance) so they can address embedding
    tables directly.

    Args:
        ratings: DataFrame with ``User-ID``, ``ISBN`` and ``Book-Rating``
            columns. When omitted, the notebook-level ``ratings_df`` is used,
            so ``Loader()`` behaves as before.

    Attributes set by the constructor:
        userid2idx / ISBN2idx: dicts from raw id to contiguous index.
        x: long tensor with one row per rating: [user index, book index].
        y: long tensor holding the rating of each row.
    """

    def __init__(self, ratings=None):
        # Fall back to the notebook-level frame so existing Loader() calls work.
        if ratings is None:
            ratings = ratings_df
        self.ratings = ratings.copy()

        users = ratings['User-ID'].unique()
        books = ratings['ISBN'].unique()

        # Assign unique indices to users and books
        self.userid2idx = {o: i for i, o in enumerate(users)}
        self.ISBN2idx = {o: i for i, o in enumerate(books)}

        self.ratings['User-ID'] = self.ratings['User-ID'].map(self.userid2idx)
        self.ratings['ISBN'] = self.ratings['ISBN'].map(self.ISBN2idx)

        # Pick the index columns by name: x[:, 0] must be the user and
        # x[:, 1] the book regardless of column order or extra columns.
        self.x = torch.tensor(
            self.ratings[['User-ID', 'ISBN']].values, dtype=torch.long
        )
        self.y = torch.tensor(
            self.ratings['Book-Rating'].values, dtype=torch.long
        )

        print(self.x.min(), self.x.max())
        print(self.x.shape)

    def __getitem__(self, index):
        """Return the ``(x, y)`` pair stored at ``index``."""
        return (self.x[index], self.y[index])

    def __len__(self):
        """Return the number of rating rows."""
        return len(self.ratings)
    

In [18]:
# Train on the GPU when torch can see one, otherwise on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
print("Device", device)

num_epochs = 128

print("Using CUDA:", use_cuda)

# One embedding row per distinct user / book found in the ratings table.
model = MatrixFactorization(user_num, book_num, n_factors=8)
print(model)

# Moving to `device` leaves the model where it is when running on the CPU.
model = model.to(device)

loss_func = torch.nn.MSELoss()

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Shuffled mini-batches of 128 ratings each.
train_set = Loader()
train_loader = DataLoader(train_set, batch_size=128, shuffle=True)


Device cuda
Using CUDA: True
MatrixFactorization(
  (user_factors): Embedding(105283, 8)
  (item_factors): Embedding(340556, 8)
)
tensor(0) tensor(340555)
torch.Size([1149780, 2])


In [19]:
# Train for num_epochs passes over train_loader, reporting the mean batch
# loss after each pass.
for i in tqdm(range(num_epochs)):
    losses = []
    for x, y in train_loader:
        # Move the batch to wherever the model lives. The optimisation step
        # below must run on CPU as well as on GPU, so it is NOT nested under
        # a CUDA check.
        x = x.to(device)
        y = y.to(device)

        optimizer.zero_grad()
        # model(x) is already 1-D (one score per row); squeezing it would
        # turn a size-1 batch into a 0-d tensor that no longer matches y.
        outputs = model(x)
        loss = loss_func(outputs, y.type(torch.float32))
        losses.append(loss.item())
        loss.backward()
        optimizer.step()

    print(f"iter #{i}", "Loss:", sum(losses) / len(losses))

# Create the target directory first so a missing folder cannot throw away
# the trained weights at the very end of the run.
os.makedirs("models", exist_ok=True)
torch.save(model.state_dict(), "models/collaborative_filtering.pth")

  0%|          | 0/128 [00:00<?, ?it/s]

iter #0 Loss: 22.041282627586
iter #1 Loss: 19.520278633722388
iter #2 Loss: 17.563149325980067
iter #3 Loss: 16.059341162280596
iter #4 Loss: 14.875256335670295
iter #5 Loss: 13.912052115260208
iter #6 Loss: 13.120258874382538
iter #7 Loss: 12.452303497002754
iter #8 Loss: 11.892102404425302
iter #9 Loss: 11.405688240389297
iter #10 Loss: 10.982458861171702
iter #11 Loss: 10.605055359627322
iter #12 Loss: 10.267695257555705
iter #13 Loss: 9.948846753158747
iter #14 Loss: 9.65083348845289
iter #15 Loss: 9.354555454390042
iter #16 Loss: 9.059713491469333
iter #17 Loss: 8.750403120803579
iter #18 Loss: 8.434027153731959
iter #19 Loss: 8.115668919122818
iter #20 Loss: 7.811565162370349
iter #21 Loss: 7.519286037990965
iter #22 Loss: 7.243432227320174
iter #23 Loss: 6.976235695250459
iter #24 Loss: 6.719543127974861
iter #25 Loss: 6.471540992594131
iter #26 Loss: 6.231936946075
iter #27 Loss: 6.003171642758487
iter #28 Loss: 5.783072447997085
iter #29 Loss: 5.573446093775522
iter #30 Loss: