In [8]:
%matplotlib inline
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [9]:
# Folder holding the preprocessed CSV exports (relative to the working directory).
PATH = "./preprocessed_data/"

# Load the user-behavior events and the article table from that folder.
behaviors, articles = (
    pd.read_csv(PATH + filename)
    for filename in ("user_behaviors.csv", "articles.csv")
)

In [11]:
# Inspect which columns the behaviors table provides.
behaviors.columns

Index(['Unnamed: 0', 'eventId', 'city', 'activeTime', 'url', 'region', 'time',
       'userId', 'canonicalUrl', 'publishtime', 'keywords', 'id', 'title',
       'author', 'category1'],
      dtype='object')

### Preprocessing
1. Transform the weird (long string) userId into an integer ID using LabelEncoder <br>
2. Count how many times each user appears in the df (groupby) and sort the rows by user <br>
3. Remove users that appear only 1 time <br>
4. Remove unimportant columns <br>
5. Reset the row index


In [37]:
def preprocess(df):
    """Prepare the raw behaviors table for the recommender.

    The input frame is not modified; all work happens on a copy of ``df``.
    Integer ids are assigned *before* single-event users are removed, so the
    ids that remain in the result are not necessarily contiguous.

    Args:
        df (pd.DataFrame): Event table. The columns ``userId``, ``id``,
            ``title``, ``author`` and ``time`` are read.

    Returns:
        pd.DataFrame: Only the rows of users that appear more than once,
        sorted by encoded user id, with a fresh 0..n-1 row index and the
        columns ``user``, ``userId``, ``userFreq``, ``articleId``, ``title``,
        ``author``, ``id`` and ``time``.
    """
    # 1. Encode the string ids as integers. Copy the *argument*: the earlier
    #    version copied the notebook-level `behaviors` frame and ignored `df`.
    temp = df.copy()
    user_enc = LabelEncoder()
    temp["user"] = user_enc.fit_transform(temp["userId"].values)

    article_enc = LabelEncoder()
    temp["articleId"] = article_enc.fit_transform(temp["id"].values)

    n_users = temp["user"].nunique()
    n_articles = temp["articleId"].nunique()
    print("Number of unique users before removal: ", n_users)
    print("Number of unique articles before removal", n_articles)

    # 2. Attach the number of events per user to every row, then sort by user.
    temp["userFreq"] = temp.groupby("user")["user"].transform("count")
    temp = temp.sort_values("user")

    # 3. Drop users that appear only once.
    trimmed = temp[temp["userFreq"] > 1]
    n_users = trimmed["user"].nunique()
    n_articles = trimmed["articleId"].nunique()
    print("Number of unique users after removal: ", n_users)
    print("Number of unique articles after removal", n_articles)

    # 4. Keep only the columns used downstream.
    trimmed = trimmed[["user", "userId", "userFreq", "articleId", "title", "author", "id", "time"]]

    # 5. Renumber the rows. Returning a new frame (instead of inplace=True on a
    #    filtered slice) avoids mutating an object pandas may treat as a copy.
    return trimmed.reset_index(drop=True)
# Run the preprocessing on the loaded events and preview the first rows.
behaviors_prepped = preprocess(behaviors)
behaviors_prepped.head()

Number of unique users before removal:  164365
Number of unique articles before removal 4641
Number of unique users before removal:  84045
Number of unique articles before removal 2555


Unnamed: 0,user,userId,userFreq,articleId,title,author,id,time
0,0,cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov,3,2787,Bil av veien i Meråker,,9d615dd08d92c8e9670fb72b5c78cbc6b52501c4,1483305801
1,0,cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov,3,3181,To av tre Northug-brødre tilbake for fullt: Ti...,ole k. sagbakken,b28e7c163c39941aa1cbd0b7b3a821576771f893,1483305738
2,0,cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov,3,1346,Kvinne omkommet etter ulykke på E39,,4c1af3633d74ba9eb77b2c07c7a8171ba27b10f7,1483305786
3,2,cx:02b046b0db0d02e5dadcdb2d4c3d7e53:24ik5bzhn1crw,3,1861,Her koker det over for Tønseth. Så stakk han f...,birger løfaldli,68d1503c73ad169dcfff48214fd0274c4d612e63,1483260265
4,2,cx:02b046b0db0d02e5dadcdb2d4c3d7e53:24ik5bzhn1crw,3,3181,To av tre Northug-brødre tilbake for fullt: Ti...,ole k. sagbakken,b28e7c163c39941aa1cbd0b7b3a821576771f893,1483302497


## Leave-one-out 
Using the leave-one-out methodology for the train-test split. For each user, the most recent click event is used as the test set, while the rest are used as training data. 

In [20]:
# Work on a copy so behaviors_prepped keeps its original columns.
test = behaviors_prepped.copy()

# Rank every user's events from newest (rank 1) to oldest; method="first"
# breaks timestamp ties by row order, so ranks within a user are distinct.
test["rank_latest"] = (
    test.groupby(["user"])["time"]
    .rank(method="first", ascending=False)
)

# The rank-1 event of each user is held out; all other events are training data.
train_ratings = test.loc[test["rank_latest"].ne(1)]
test_ratings = test.loc[test["rank_latest"].eq(1)]

train_ratings.head()

Unnamed: 0,user,userId,userFreq,articleId,title,author,id,time,rank_latest
1,0,cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov,3,3181,To av tre Northug-brødre tilbake for fullt: Ti...,ole k. sagbakken,b28e7c163c39941aa1cbd0b7b3a821576771f893,1483305738,3.0
2,0,cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov,3,1346,Kvinne omkommet etter ulykke på E39,,4c1af3633d74ba9eb77b2c07c7a8171ba27b10f7,1483305786,2.0
3,2,cx:02b046b0db0d02e5dadcdb2d4c3d7e53:24ik5bzhn1crw,3,1861,Her koker det over for Tønseth. Så stakk han f...,birger løfaldli,68d1503c73ad169dcfff48214fd0274c4d612e63,1483260265,3.0
5,2,cx:02b046b0db0d02e5dadcdb2d4c3d7e53:24ik5bzhn1crw,3,2284,Nå må du punge ut for piggdekkene,martin andersen,7fee41607635cb686671c95fa0dabe16a2a128f5,1483271454,2.0
6,4,cx:0527c024027f6ed8f9ceb9823c11575c:1tl0o9i5sreol,3,2787,Bil av veien i Meråker,,9d615dd08d92c8e9670fb72b5c78cbc6b52501c4,1483309383,2.0


In [21]:
test_ratings.head()

Unnamed: 0,user,userId,userFreq,articleId,title,author,id,time,rank_latest
0,0,cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov,3,2787,Bil av veien i Meråker,,9d615dd08d92c8e9670fb72b5c78cbc6b52501c4,1483305801,1.0
4,2,cx:02b046b0db0d02e5dadcdb2d4c3d7e53:24ik5bzhn1crw,3,3181,To av tre Northug-brødre tilbake for fullt: Ti...,ole k. sagbakken,b28e7c163c39941aa1cbd0b7b3a821576771f893,1483302497,1.0
8,4,cx:0527c024027f6ed8f9ceb9823c11575c:1tl0o9i5sreol,3,2241,Kalla slet med å holde tilbake tårene: – Jeg s...,halvor ekeland,7dbbd7f3a7ec287bdcbdaf8b8f042732074bf2bd,1483309520,1.0
10,5,cx:05b69c4d92618d7000dacd2ca2eceb06:3q2kyszi5628a,2,1861,Her koker det over for Tønseth. Så stakk han f...,birger løfaldli,68d1503c73ad169dcfff48214fd0274c4d612e63,1483234977,1.0
15,13,cx:0d6120e0df4899ed1f18e5377c62644a:liav87wp9vf6,9,3050,Hvem syns du er Årets trønder?,espen rasmussen,ac6aacb71fb09db2bb79554e6bc5ecdb95103ea2,1483295183,1.0


In [22]:
# Keep only the columns needed downstream. The explicit .copy() makes both
# frames independent of `test`, so later in-place assignments (such as setting
# "time" to 1) neither touch the parent frame nor raise SettingWithCopyWarning.
train_ratings = train_ratings[["user", "articleId", "time"]].copy()
test_ratings = test_ratings[["user", "articleId", "time"]].copy()

In [23]:
# Overwrite the timestamp with the constant 1 for every training row
# (presumably marking each event as a positive interaction — TODO confirm;
# the sampling code in this notebook only reads "user" and "articleId").
train_ratings.loc[:, "time"] = 1

In [27]:
# Build training triples: each observed (user, article) pair is a positive
# sample, followed by 4 randomly drawn articles the user has no pair with.

# Every article id present after preprocessing
all_articleId = behaviors_prepped["articleId"].unique()

# Output lists, filled in lock-step (same position = same sample)
users, items, labels = [], [], []

# Observed (user, article) pairs of the training split
user_item_set = set(zip(train_ratings["user"], train_ratings["articleId"]))

# Negatives drawn per positive (4:1 ratio)
num_negatives = 4

for (u, i) in user_item_set:
    # the observed pair itself -> label 1
    users.append(u)
    items.append(i)
    labels.append(1)

    drawn = 0
    while drawn < num_negatives:
        # draw a candidate and reject it if the user already has this pair
        negative_item = np.random.choice(all_articleId)
        if (u, negative_item) in user_item_set:
            continue
        # accepted candidate -> label 0
        users.append(u)
        items.append(negative_item)
        labels.append(0)
        drawn += 1

## Pytorch

In [29]:
import torch
from torch.utils.data import Dataset

In [49]:

class AddressaTrainDataset(Dataset):
    """Addressa PyTorch Dataset for training.

    Every observed (user, article) pair becomes one positive sample (label 1),
    followed by ``num_negatives`` randomly drawn articles for which the user
    has no observed pair (label 0).

    Args:
        ratings (pd.DataFrame): Dataframe containing the click events; the
            columns ``user`` and ``articleId`` are read.
        all_movieIds (array-like): All article ids that negatives may be drawn
            from. The parameter keeps its historical name so existing keyword
            callers continue to work.
        num_negatives (int): Negatives sampled per positive. Defaults to 4.
    """

    def __init__(self, ratings, all_movieIds, num_negatives=4):
        # Use the argument that was passed in. The earlier version read a
        # notebook-level global `all_articleIds` here, which fails with a
        # NameError when the dataset is built before that global exists.
        self.users, self.items, self.labels = self.get_dataset(
            ratings, all_movieIds, num_negatives
        )

    def __len__(self):
        """Return the total number of samples (positives plus negatives)."""
        return len(self.users)

    def __getitem__(self, idx):
        """Return the ``(user, item, label)`` tensors stored at position ``idx``."""
        return self.users[idx], self.items[idx], self.labels[idx]

    def get_dataset(self, ratings, all_articleIds, num_negatives=4):
        """Build the user, item and label tensors.

        Args:
            ratings (pd.DataFrame): Frame with ``user`` and ``articleId`` columns.
            all_articleIds (array-like): Ids to sample negatives from.
            num_negatives (int): Negatives sampled per positive. Defaults to 4.

        Returns:
            tuple: ``(users, items, labels)`` as three tensors of equal length.

        Note:
            Candidates are redrawn until one is found that the user has no
            observed pair with, so the loop cannot finish for a user who has
            a pair with every id in ``all_articleIds``.
        """
        users, items, labels = [], [], []
        user_item_set = set(zip(ratings['user'], ratings['articleId']))

        for u, i in user_item_set:
            # Observed pair -> positive sample.
            users.append(u)
            items.append(i)
            labels.append(1)
            for _ in range(num_negatives):
                # Rejection sampling: redraw while the candidate is a known positive.
                negative_item = np.random.choice(all_articleIds)
                while (u, negative_item) in user_item_set:
                    negative_item = np.random.choice(all_articleIds)
                users.append(u)
                items.append(negative_item)
                labels.append(0)

        return torch.tensor(users), torch.tensor(items), torch.tensor(labels)

In [66]:
import torch.nn as nn
import pytorch_lightning as pl
from torch.utils.data import DataLoader

class NCF(pl.LightningModule):
    """Neural Collaborative Filtering (NCF).

    User and item ids are embedded (8 dimensions each), concatenated and passed
    through two ReLU-activated dense layers and a sigmoid output unit.

    Args:
        num_users (int): Size of the user embedding table; must be larger
            than the largest user id fed to the model.
        num_items (int): Size of the item embedding table; must be larger
            than the largest article id fed to the model.
        ratings (pd.DataFrame): Dataframe containing the click events used
            for training; handed to ``AddressaTrainDataset``.
        all_articleIds (list): Article ids that negatives are sampled from.
        batch_size (int): Samples per training batch. Defaults to 512.
            Previously the DataLoader was created without a batch size, which
            falls back to 1 and makes every optimizer step see a single sample.
    """

    def __init__(self, num_users, num_items, ratings, all_articleIds, batch_size=512):
        super().__init__()
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=8)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=8)
        # 16 = user embedding (8) + item embedding (8) after concatenation
        self.fc1 = nn.Linear(in_features=16, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.output = nn.Linear(in_features=32, out_features=1)
        self.ratings = ratings
        self.all_articleIds = all_articleIds
        self.batch_size = batch_size

    def forward(self, user_input, item_input):
        """Return the sigmoid-squashed score for each (user, item) pair."""
        # Pass through embedding layers
        user_embedded = self.user_embedding(user_input)
        item_embedded = self.item_embedding(item_input)

        # Concat the two embedding layers
        vector = torch.cat([user_embedded, item_embedded], dim=-1)

        # Pass through dense layers (functional activations; no module is
        # re-created on every call)
        vector = torch.relu(self.fc1(vector))
        vector = torch.relu(self.fc2(vector))

        # Output layer
        pred = torch.sigmoid(self.output(vector))

        return pred

    def training_step(self, batch, batch_idx):
        """Compute the binary cross-entropy loss for one batch."""
        user_input, item_input, labels = batch
        predicted_labels = self(user_input, item_input)
        # Labels are reshaped to a column and cast to float to match the prediction.
        loss = nn.BCELoss()(predicted_labels, labels.view(-1, 1).float())
        return loss

    def configure_optimizers(self):
        """Use Adam with its default settings on all parameters."""
        return torch.optim.Adam(self.parameters())

    def train_dataloader(self):
        """Build a new training dataset (negatives are re-sampled) and wrap it in a DataLoader."""
        return DataLoader(
            AddressaTrainDataset(self.ratings, self.all_articleIds),
            batch_size=self.batch_size,
        )

In [71]:
# Embedding tables are indexed by the encoded ids, so they must cover the
# largest id still present (ids were assigned before single-event users were
# removed, hence max + 1 rather than the number of remaining users/articles).
num_users, num_items = (
    behaviors_prepped[column].max() + 1 for column in ("user", "articleId")
)

# Pool of article ids that negative samples are drawn from.
all_articleIds = behaviors_prepped["articleId"].unique()

In [72]:
#num_users = 164365

164365

In [73]:
# Build the model on the training split; negatives are sampled from all_articleIds.
model = NCF(num_users, num_items, train_ratings, all_articleIds)

In [74]:
# Train for 5 epochs without GPUs, logger or checkpoints.
trainer = pl.Trainer(
    max_epochs=5,
    gpus=0,
    # train_dataloader() builds a new AddressaTrainDataset on each call, so
    # reloading every epoch re-samples the negatives.
    reload_dataloaders_every_epoch=True,
    progress_bar_refresh_rate=50,
    logger=False,
    checkpoint_callback=False,
)

trainer.fit(model)

GPU available: False, used: False
TPU available: None, using: 0 TPU cores

  | Name           | Type      | Params
---------------------------------------------
0 | user_embedding | Embedding | 1.3 M 
1 | item_embedding | Embedding | 37.1 K
2 | fc1            | Linear    | 1.1 K 
3 | fc2            | Linear    | 2.1 K 
4 | output         | Linear    | 33    
---------------------------------------------
1.4 M     Trainable params
0         Non-trainable params
1.4 M     Total params


Training: |          | 0/? [00:00<?, ?it/s]



1