# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

#### Loading User interaction Dataset as pandas dataframe

In [2]:
user_interaction_df = pd.read_csv("data/user_interaction.csv")
user_interaction_df

Unnamed: 0,user_id,pratilipi_id,read_percent,updated_at
0,5506791961876448,1377786228262109,100.0,2022-03-22 10:29:57.291
1,5506791971543560,1377786223038206,40.0,2022-03-19 13:49:25.660
2,5506791996468218,1377786227025240,100.0,2022-03-21 17:28:47.288
3,5506791978752866,1377786222398208,65.0,2022-03-21 07:39:25.183
4,5506791978962946,1377786228157051,100.0,2022-03-22 17:32:44.777
...,...,...,...,...
2499995,5506791965506371,1377786228243175,100.0,2022-03-19 04:37:10.568
2499996,5506791966125995,1377786221431279,100.0,2022-03-21 17:24:46.089
2499997,5506791964496442,1377786226829597,100.0,2022-03-22 08:04:29.219
2499998,5506791968781083,1377786226056467,100.0,2022-03-21 06:41:54.083


#### Loading metadata as pandas dataframe

In [3]:
metadata_df = pd.read_csv("data/metadata.csv")
metadata_df

Unnamed: 0,author_id,pratilipi_id,category_name,reading_time,updated_at,published_at
0,-3418949279741297,1025741862639304,translation,0,2020-08-19 15:26:13,2016-09-30 10:37:04
1,-2270332351871840,1377786215601277,translation,171,2021-01-21 16:27:07,2018-06-11 13:17:48
2,-2270332352037261,1377786215601962,translation,92,2020-09-29 12:33:57,2018-06-12 04:19:12
3,-2270332352521845,1377786215640994,translation,0,2019-10-17 09:03:37,2019-09-26 14:58:53
4,-2270332349665658,1377786215931338,translation,47,2020-05-05 11:33:41,2018-11-25 12:28:23
...,...,...,...,...,...,...
954496,-2270332337845247,1377786228358627,Horror-Marathon,304,2022-03-22 17:40:22,2022-03-22 17:40:22
954497,-2270332334263077,1377786228362002,Horror-Marathon,588,2022-03-22 11:44:39,2022-03-22 11:44:39
954498,-2270332350350076,1377786228362682,Horror-Marathon,359,2022-03-22 12:39:41,2022-03-22 12:38:40
954499,-2270332337845247,1377786228375726,Horror-Marathon,310,2022-03-23 15:55:11,2022-03-23 15:55:11


#### Converting to datetime

In [4]:
user_interaction_df['updated_at'] = pd.to_datetime(user_interaction_df['updated_at'])
metadata_df['updated_at'] = pd.to_datetime(metadata_df['updated_at'])
metadata_df['published_at'] = pd.to_datetime(metadata_df['published_at'])

In [5]:
user_interaction_df  = user_interaction_df.sort_values('updated_at')
user_interaction_df

Unnamed: 0,user_id,pratilipi_id,read_percent,updated_at
1033131,5506791954036110,1377786225804654,100.0,2022-03-18 15:14:41.827
1415300,5506791980439899,1377786228150074,100.0,2022-03-18 15:14:42.120
2318259,5506791979182708,1377786218415632,100.0,2022-03-18 15:14:42.134
952322,5506791996330389,1377786219497547,100.0,2022-03-18 15:14:42.170
2114134,5506791961370166,1377786224952303,100.0,2022-03-18 15:14:42.282
...,...,...,...,...
624343,5506791959874343,1377786225302354,99.0,2022-03-23 00:08:09.845
446414,5506791959279525,1377786225901639,100.0,2022-03-23 00:08:16.603
561039,5506791996088677,1377786223947072,84.0,2022-03-23 00:08:22.177
426598,5506791980825783,1377786227076616,100.0,2022-03-23 00:08:24.364


In [6]:
metadata_df

Unnamed: 0,author_id,pratilipi_id,category_name,reading_time,updated_at,published_at
0,-3418949279741297,1025741862639304,translation,0,2020-08-19 15:26:13,2016-09-30 10:37:04
1,-2270332351871840,1377786215601277,translation,171,2021-01-21 16:27:07,2018-06-11 13:17:48
2,-2270332352037261,1377786215601962,translation,92,2020-09-29 12:33:57,2018-06-12 04:19:12
3,-2270332352521845,1377786215640994,translation,0,2019-10-17 09:03:37,2019-09-26 14:58:53
4,-2270332349665658,1377786215931338,translation,47,2020-05-05 11:33:41,2018-11-25 12:28:23
...,...,...,...,...,...,...
954496,-2270332337845247,1377786228358627,Horror-Marathon,304,2022-03-22 17:40:22,2022-03-22 17:40:22
954497,-2270332334263077,1377786228362002,Horror-Marathon,588,2022-03-22 11:44:39,2022-03-22 11:44:39
954498,-2270332350350076,1377786228362682,Horror-Marathon,359,2022-03-22 12:39:41,2022-03-22 12:38:40
954499,-2270332337845247,1377786228375726,Horror-Marathon,310,2022-03-23 15:55:11,2022-03-23 15:55:11


#### Creating user and item mappings

In [7]:
user_mapping = dict(enumerate(user_interaction_df['user_id'].unique()))
user_mapping = {v: k for k, v in user_mapping.items()}

item_mapping = dict(enumerate(user_interaction_df['pratilipi_id'].unique()))
item_mapping = {v: k for k, v in item_mapping.items()}

user_indices = [user_mapping[user] for user in user_interaction_df['user_id']]
item_indices = [item_mapping[item] for item in user_interaction_df['pratilipi_id']]

ratings = user_interaction_df['read_percent'].values

### Creating pytorch dataset

In [8]:
from torch.utils.data import Dataset,DataLoader

In [9]:
class ReadingDataset(Dataset):
    def __init__(self,user_indices,item_indices,ratings):
        self.user_indices = torch.tensor(user_indices,dtype=torch.long)
        self.item_indices = torch.tensor(item_indices,dtype=torch.long)
        self.ratings = torch.tensor(ratings,dtype=torch.float)/100 
    
    def __len__(self):
        return len(self.ratings)
    
    def __getitem__(self, index):
        return (
            self.user_indices[index],
            self.item_indices[index],
            self.ratings[index]
        )


## Neural Collaborative Filtering

![image](https://miro.medium.com/v2/resize:fit:1100/format:webp/1*aP-Mx266ExwoWZPSdHtYpA.png)

In [10]:
class NeuralCollaborativeFilter(nn.Module):
    """
        Neural Collaborative Filtering\n
        Inputs: num_users, num_items, embedding_dim=100
    """
    def __init__(self,num_users,num_items,embedding_dim=100):
       
        super(NeuralCollaborativeFilter,self).__init__()

        self.user_embedding = nn.Embedding(num_users,embedding_dim)
        self.item_embedding = nn.Embedding(num_items,embedding_dim)
        
        self.network = nn.Sequential(
            nn.Linear(2*embedding_dim,128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )
    
    def forward(self,user_input,item_input):
        x = self.user_embedding(user_input)
        y = self.user_embedding(item_input)

        input_vector = torch.cat([x,y],dim=1)
        preds = self.network(input_vector)
        return preds.squeeze()


In [11]:
def evaluate_model(model,test_dataloader,device):
    """Evaluates Model\n Returns batch_loss"""
    model.eval()
    criterion = nn.MSELoss()
    batch_loss = 0
    with torch.no_grad():
        for users,items,ratings in test_dataloader:
            users = users.to(device)
            items = items.to(device)
            ratings = ratings.to(device)
            preds = model(users,items)
            loss = criterion(preds,ratings)
            batch_loss += loss.item()
    return batch_loss/len(test_dataloader)

In [12]:
train_size = int(0.75 * len(user_indices))
train_size

1875000

#### Creating training and validation dataset

In [13]:
train_dataset = ReadingDataset(
    user_indices[:train_size],
    item_indices[:train_size],
    ratings[:train_size]
)
val_dataset = ReadingDataset(
    user_indices[train_size:],
    item_indices[train_size:],
    ratings[train_size:]
)

In [14]:
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=256)

#### Finetuning parameters

In [15]:
EPOCHS = 1
LR = 0.001
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
num_users = len(user_mapping)
num_items = len(item_mapping)

In [16]:
model = NeuralCollaborativeFilter(num_users,num_items)
model

NeuralCollaborativeFilter(
  (user_embedding): Embedding(243606, 100)
  (item_embedding): Embedding(241405, 100)
  (network): Sequential(
    (0): Linear(in_features=200, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=128, out_features=64, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
    (6): Linear(in_features=64, out_features=32, bias=True)
    (7): ReLU()
    (8): Linear(in_features=32, out_features=1, bias=True)
    (9): Sigmoid()
  )
)

In [17]:
def train(
         model,
         train_loader,
         val_loader,
         EPOCHS,
         LR,
         DEVICE
         ):
     model = model.to(DEVICE)
     criterion = nn.MSELoss()
     optimizer = torch.optim.Adam(
        model.parameters(),
        lr=LR,
        weight_decay = 1e-5
     )
     scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,'min',patience=2)
     max_loss = float('inf')

     for epoch in range(EPOCHS):
          model.train()
          train_loss = 0
          for batch_idx, (users, items, ratings) in enumerate(train_loader):
               users = users.to(DEVICE)
               items = items.to(DEVICE)
               ratings = ratings.to(DEVICE)

               predictions = model(users, items)
               loss = criterion(predictions, ratings)

               optimizer.zero_grad()
               loss.backward()
               torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
               optimizer.step()

               train_loss += loss.item()

               if batch_idx % 100 == 0:
                    print(
                    f'Epoch {epoch+1}/{EPOCHS} - Batch {batch_idx}/{len(train_loader)} - Loss: {loss.item():.4f}')
     
     val_loss = evaluate_model(model, val_loader, DEVICE)
     scheduler.step(val_loss)

     print(f'Epoch {epoch+1}/{EPOCHS}:')
     print(f'Training Loss: {train_loss/len(train_loader):.4f}')
     print(f'Validation Loss: {val_loss:.4f}')

     if val_loss < max_loss:
          max_loss = val_loss
          torch.save(model.state_dict(), 'ncf_weights.pth')

In [18]:
train(model=model,train_loader=train_loader,val_loader=val_loader,EPOCHS=EPOCHS,LR=LR,DEVICE=DEVICE)

Epoch 1/1 - Batch 0/7325 - Loss: 0.2487
Epoch 1/1 - Batch 100/7325 - Loss: 0.0448
Epoch 1/1 - Batch 200/7325 - Loss: 0.0388
Epoch 1/1 - Batch 300/7325 - Loss: 0.0593
Epoch 1/1 - Batch 400/7325 - Loss: 0.0483
Epoch 1/1 - Batch 500/7325 - Loss: 0.0454
Epoch 1/1 - Batch 600/7325 - Loss: 0.0472
Epoch 1/1 - Batch 700/7325 - Loss: 0.0599
Epoch 1/1 - Batch 800/7325 - Loss: 0.0355
Epoch 1/1 - Batch 900/7325 - Loss: 0.0497
Epoch 1/1 - Batch 1000/7325 - Loss: 0.0397
Epoch 1/1 - Batch 1100/7325 - Loss: 0.0567
Epoch 1/1 - Batch 1200/7325 - Loss: 0.0438
Epoch 1/1 - Batch 1300/7325 - Loss: 0.0417
Epoch 1/1 - Batch 1400/7325 - Loss: 0.0363
Epoch 1/1 - Batch 1500/7325 - Loss: 0.0437
Epoch 1/1 - Batch 1600/7325 - Loss: 0.0525
Epoch 1/1 - Batch 1700/7325 - Loss: 0.0506
Epoch 1/1 - Batch 1800/7325 - Loss: 0.0585
Epoch 1/1 - Batch 1900/7325 - Loss: 0.0234
Epoch 1/1 - Batch 2000/7325 - Loss: 0.0580
Epoch 1/1 - Batch 2100/7325 - Loss: 0.0519
Epoch 1/1 - Batch 2200/7325 - Loss: 0.0285
Epoch 1/1 - Batch 2300/

In [19]:
def get_recommendations(model,user_id,num_recommendations,device):
    """
    Return recommendations for a user id\n
    Input : model,user_id,num_recommendations,device
    """
    model.eval()
    recommendations = []
    with torch.no_grad():
        user_index = user_mapping[user_id]
        user_input = torch.tensor([user_index],dtype=torch.long,device=device)
        items_list = list(item_mapping.values())
        items_input = torch.tensor(items_list,dtype=torch.long,device=device)

        user_input = user_input.expand(len(items_input))

        preds = model(user_input,items_input)

        top_recommendations = torch.topk(preds,num_recommendations)
        reverse_item_mapping = {v: k for k, v in item_mapping.items()}
        
        for i in top_recommendations.indices:
            recommendations.append(reverse_item_mapping[i.item()])
    
    return recommendations


In [20]:
def inference(user_id,num_user,num_items,weight_path,device):
    model = NeuralCollaborativeFilter(num_users=num_user,num_items=num_items)
    model.load_state_dict(torch.load(weight_path))
    model.to(device)
    recommendations = get_recommendations(model,user_id,num_recommendations=5,device=device)
    print("----- Recommended Pratilipis -----\n")
    for r in recommendations:
        print(f"ID:{r}")

In [21]:
user_id = 5506791961876448
inference(user_id=user_id,num_user=num_users,num_items=num_items,weight_path="ncf_weights.pth",device=DEVICE)

----- Recommended Pratilipis -----

ID:1377786228240538
ID:1377786228211516
ID:1377786228204335
ID:1377786226447185
ID:1377786228253938
