# Book recommendation exercise

In [1]:
import torch 
from torch import nn, optim
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [2]:
# Load in the dataset
# The three BX-*.csv files are all read with the same delimiter, quoting and encoding.
csv_options = dict(sep=";", quotechar='"', escapechar="\\", encoding="iso-8859-1")

df_ratings = pd.read_csv("BX-Book-Ratings.csv", **csv_options)
df_users = pd.read_csv("BX-Users.csv", **csv_options)
# Only columns 0, 1, 2 and 4 of the books file are loaded.
df_books = pd.read_csv("BX-Books.csv", usecols=[0, 1, 2, 4], **csv_options)

In [3]:
# Column dtypes and non-null counts of the users table
df_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 0 to 278857
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   User-ID   278858 non-null  int64  
 1   Location  278858 non-null  object 
 2   Age       168096 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.4+ MB


In [4]:
# Column dtypes and non-null counts of the books table
df_books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271379 entries, 0 to 271378
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   ISBN         271379 non-null  object
 1   Book-Title   271379 non-null  object
 2   Book-Author  271377 non-null  object
 3   Publisher    271377 non-null  object
dtypes: object(4)
memory usage: 8.3+ MB


In [5]:
# Column dtypes and non-null counts of the ratings table
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [6]:
# Preview the first rows of the users table
df_users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [7]:
# Preview the first rows of the books table
df_books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,W. W. Norton &amp; Company


In [8]:
# Turn userID and Books ID to categorical label
# Each distinct ISBN / User-ID is replaced by its integer category code,
# stored in a new "...-ID2" column next to the original identifier.
df_books["Book-ID2"] = df_books["ISBN"].astype("category").cat.codes
df_users["User-ID2"] = df_users["User-ID"].astype("category").cat.codes

In [9]:
# Map the IDs in the ratings table to the new IDs
# Lookup series: original identifier (index) -> integer code (value).
index_user_map = df_users.set_index("User-ID")["User-ID2"]
index_book_map = df_books.set_index("ISBN")["Book-ID2"]

# new ratings column -> (ratings key column, lookup series).
# Keys that are missing from a lookup come back as NaN.
for new_col, (key_col, lookup) in {
    "User-ID2": ("User-ID", index_user_map),
    "Book-ID2": ("ISBN", index_book_map),
}.items():
    df_ratings[new_col] = df_ratings[key_col].map(lookup)

In [10]:
# Number of distinct ISBNs referenced by the ratings vs. present in the books table
df_ratings["ISBN"].nunique(dropna=False), df_books["ISBN"].nunique(dropna=False)

(340556, 271379)

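The ratings table references more distinct ISBNs (340,556) than the books table contains (271,379), so some ratings cannot be mapped to a `Book-ID2` and end up as NaN. I decided to **remove** those unmapped ratings.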

In [11]:
# Drop ratings with any missing value (e.g. an ISBN that could not be mapped),
# keep only the modelling columns, and store the book code as int32.
dataset = (
    df_ratings
    .dropna()
    .loc[:, ["Book-Rating", "User-ID2", "Book-ID2"]]
    .astype({"Book-ID2": "int32"})
)

In [12]:
# Constant subtracted from every rating before training.
# NOTE(review): the choice of 2.5 is not explained in this notebook -- confirm it
# matches the rating scale of this dataset.
RATING_OFFSET = 2.5

# Recompute from the raw ratings (aligned on the row index) instead of shifting
# `dataset` in place, so re-running this cell does not subtract the offset twice.
dataset["Book-Rating"] = df_ratings.loc[dataset.index, "Book-Rating"] - RATING_OFFSET

In [13]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1031175 entries, 0 to 1149778
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   Book-Rating  1031175 non-null  float64
 1   User-ID2     1031175 non-null  int32  
 2   Book-ID2     1031175 non-null  int32  
dtypes: float64(1), int32(2)
memory usage: 23.6 MB


In [14]:
# Shuffle and split the data
# Fixed seed so that the train/test partition is the same on every run.
SPLIT_SEED = 42
df_train, df_test = train_test_split(dataset, test_size=0.2, random_state=SPLIT_SEED)

In [15]:
# Data generator
def data_generator(data: pd.DataFrame, batch_size=128):
    """Yield shuffled mini-batches of ``(users, books, ratings)`` tensors.

    Args:
        data: Frame with the columns ``"User-ID2"``, ``"Book-ID2"`` and
            ``"Book-Rating"``.
        batch_size: Maximum number of rows per batch. The final batch is
            smaller when ``len(data)`` is not a multiple of ``batch_size``.

    Yields:
        Tuples ``(user_batch, book_batch, rating_batch)`` of 1-D tensors with
        dtypes int32, int32 and float32. Every row of ``data`` appears in
        exactly one batch; an empty frame yields nothing.
    """
    num_rows = len(data)

    # One random permutation per call, i.e. a fresh shuffle for every epoch.
    order = np.random.permutation(num_rows)

    # Convert each column once, outside the batch loop; indexing with `order`
    # both shuffles the rows and produces a fresh array for torch to wrap.
    users = torch.from_numpy(data["User-ID2"].to_numpy(dtype=np.int32)[order])
    books = torch.from_numpy(data["Book-ID2"].to_numpy(dtype=np.int32)[order])
    ratings = torch.from_numpy(data["Book-Rating"].to_numpy(dtype=np.float32)[order])

    # Stepping by batch_size covers the trailing partial batch too; slicing
    # past the end is clamped, so no explicit min() is needed.
    for start in range(0, num_rows, batch_size):
        stop = start + batch_size
        yield users[start:stop], books[start:stop], ratings[start:stop]

In [16]:
# Alias for data generator
def train_generator():
    """Return a fresh batch generator over the training split."""
    return data_generator(df_train)


def test_generator():
    """Return a fresh batch generator over the test split."""
    return data_generator(df_test)

### Build the model

A simple feed-forward network with two embedding layers: one for the users and one for the books.

In [17]:
class BookRecommender(nn.Module):
    """Rating predictor built from user and book embeddings plus a small MLP.

    The user and book indices are embedded separately, the two embedding
    vectors are concatenated, and a two-layer dense head maps them to a
    single predicted rating per (user, book) pair.

    Args:
        num_users: Size of the user embedding table.
        num_books: Size of the book embedding table.
        num_embed: Dimension of each embedding vector.
        num_hiddens: Width of the hidden dense layer.
        drop_out: Dropout probability applied after the hidden layer.
    """

    def __init__(self, num_users, num_books, num_embed, num_hiddens, drop_out):
        super().__init__()

        self.V1 = num_users
        self.V2 = num_books
        self.E = num_embed
        self.H = num_hiddens
        self.p = drop_out

        # Embedding layers
        self.user_embed = nn.Embedding(self.V1, self.E)
        self.book_embed = nn.Embedding(self.V2, self.E)

        # Dense layers: the input is the concatenation of both embeddings (2 * E)
        self.fc = nn.Sequential(
            nn.Linear(2 * self.E, self.H),
            nn.ReLU(),
            nn.Dropout(self.p),
            nn.Linear(self.H, 1)
        )

    def forward(self, users, books):
        """Predict one rating per (user, book) pair.

        Args:
            users: 1-D integer tensor of user indices.
            books: 1-D integer tensor of book indices, same length as ``users``.

        Returns:
            Tensor of shape ``(len(users), 1)`` with the predicted ratings.
        """
        # Each ID kind has its own table; books must go through book_embed,
        # otherwise book_embed is never trained and book indices are looked up
        # in (and bounded by) the user table.
        users_embed = self.user_embed(users)
        books_embed = self.book_embed(books)

        # Concat the embedding vectors along the feature axis
        out = torch.cat((users_embed, books_embed), dim=1)

        # Pass through dense layer
        return self.fc(out)

In [18]:
def configure(model: nn.Module, device: torch.device, lr, momentum, optim_medthod):
    """Move the model to ``device`` and create its loss function and optimizer.

    Args:
        model: Network whose parameters will be optimised.
        device: Device the model is moved to.
        lr: Learning rate for the optimizer.
        momentum: Momentum factor; only used by the SGD optimizer.
        optim_medthod: ``"adam"`` selects Adam; any other value selects SGD.

    Returns:
        Tuple ``(criterion, optimizer)`` where ``criterion`` is an ``nn.MSELoss``.
    """
    model.to(device)
    criterion = nn.MSELoss()
    if optim_medthod == "adam":
        optimizer = optim.Adam(model.parameters(), lr=lr)
    else:
        # Must bind the same name as the Adam branch so the return below works.
        optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)

    return criterion, optimizer

In [29]:
def batch_gd(model:nn.Module, criterion, optimizer, device, train_loader, test_loader, num_epochs):
    """Train ``model`` with mini-batch gradient descent and record per-epoch losses.

    Args:
        model: Network called as ``model(users, books)``.
        criterion: Loss function called as ``criterion(outputs, ratings)``.
        optimizer: Optimizer bound to ``model``'s parameters.
        device: Device every batch is moved to before the forward pass.
        train_loader: Zero-argument callable returning an iterable of
            ``(users, books, ratings)`` training batches; called once per epoch.
        test_loader: Same as ``train_loader`` but for the evaluation batches.
        num_epochs: Number of passes over the training data.

    Returns:
        Tuple ``(loss_his, test_his)`` of NumPy arrays of length ``num_epochs``
        holding the mean batch loss of each epoch on the train and test data.
    """
    loss_his, test_his = np.zeros(num_epochs), np.zeros(num_epochs)
    for epoch in range(num_epochs):
        # Training
        model.train()
        train_losses = []
        for users, books, ratings in train_loader():
            # Move items to device; ratings become a column to match the model output
            users, books = users.to(device), books.to(device)
            ratings = ratings.to(device).view(-1, 1)

            # Zero the gradient
            optimizer.zero_grad()

            # Forward pass
            outs = model(users, books)
            train_loss = criterion(outs, ratings)

            # Update parameters
            train_loss.backward()
            optimizer.step()

            train_losses.append(train_loss.item())

        loss_his[epoch] = np.mean(train_losses)

        # Evaluate
        model.eval()
        test_losses = []
        # No gradients are needed for evaluation.
        with torch.no_grad():
            for users, books, ratings in test_loader():
                # Same reshape as in training: without it the (N, 1) outputs
                # broadcast against the (N,) ratings and the loss is wrong.
                users, books = users.to(device), books.to(device)
                ratings = ratings.to(device).view(-1, 1)

                # Forward pass
                outs = model(users, books)
                test_loss = criterion(outs, ratings)
                test_losses.append(test_loss.item())

        test_his[epoch] = np.mean(test_losses)

    return loss_his, test_his

### Train the model

In [32]:
# Hyperparameters

# -- optimisation --
optim_medthod = 'adam'   # "adam" selects Adam in configure(); anything else selects SGD
lr = 0.01                # learning rate passed to the optimizer
momentum = 0.99          # only used by the SGD branch of configure()
num_epochs = 16          # passes over the training data

# -- model --
num_embed = 1            # dimension of each embedding vector
num_hiddens = 256        # width of the hidden dense layer
drop_out = 0.2           # dropout probability after the hidden layer

In [33]:
# Size the embedding tables by the number of rows in the users / books tables.
model = BookRecommender(
    num_users=len(df_users),
    num_books=len(df_books),
    num_embed=num_embed,
    num_hiddens=num_hiddens,
    drop_out=drop_out,
)
# Use the first GPU when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
criterion, optimizer = configure(
    model=model, device=device, lr=lr, momentum=momentum, optim_medthod=optim_medthod
)

In [25]:
from time import perf_counter

In [34]:
# Train the model
# Wall-clock time of the whole training run is shown as the cell output.
start = perf_counter()
loss_his, test_his = batch_gd(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
    train_loader=train_generator,
    test_loader=test_generator,
    num_epochs=num_epochs,
)
end = perf_counter()
"{:.4f}s".format(end - start)

KeyboardInterrupt: 

### Evaluate the model

In [None]:
# Plot the losses overtime
# One curve per split, indexed by epoch.
fig, ax = plt.subplots()
ax.plot(loss_his, label="train loss")
ax.plot(test_his, label="test loss")
ax.set(title="Loss over time", xlabel="epoch", ylabel="loss")
ax.legend()
plt.show()

In [None]:
# Do some prediction
# test_generator() returns a generator, which cannot be indexed with [...];
# next() pulls a single (users, books, ratings) batch from it instead.
test_data = next(test_generator())

In [None]:
# NOTE(review): third-party import placed at the end of the notebook; consider
# moving it to the import cell at the top so dependencies are visible up front.
from torchsummary import summary

# Print a layer-by-layer summary of the model for one user input and one book input.
# NOTE(review): torchsummary appears to probe the model with random *float* tensors
# and to expect `device` as a string ("cuda"/"cpu"). If so, this call fails here:
# nn.Embedding needs integer indices and `device` is a torch.device -- TODO confirm
# against the installed torchsummary version.
summary(model, [(1,), (1,)], device=device)