In [40]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import json

In [41]:
# Report the versions of the imported libraries, one per line, so the
# environment behind the recorded outputs is visible.
print(
    f'Tensorflow version: {tf.__version__}',
    f'Pandas version: {pd.__version__}',
    f'Numpy version: {np.__version__}',
    sep='\n',
)

Tensorflow version: 2.19.0
Pandas version: 2.2.3
Numpy version: 2.0.2


In [42]:
# Only these fields are used downstream; any other field of a record is
# dropped while parsing so it never has to be held in memory.
REVIEW_COLUMNS = ['reviewerID', 'asin', 'overall', 'unixReviewTime']

data = []
# The file is read as JSON Lines: one JSON object per line. The encoding is
# given explicitly so decoding does not depend on the platform default.
with open('../json/reviews_Kindle_Store_5.json', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines such as a trailing newline
        record = json.loads(line)
        # A field missing from a record becomes None and is removed by dropna().
        data.append({column: record.get(column) for column in REVIEW_COLUMNS})

# Build the frame in one step and drop incomplete rows without mutating a
# derived frame in place. The original row index is kept (no reset).
df = pd.DataFrame(data, columns=REVIEW_COLUMNS).dropna()

In [43]:
from sklearn.preprocessing import LabelEncoder

# One encoder per identifier column; each stays available after fitting so
# integer codes can be mapped back to the original identifiers.
user_enc, item_enc = LabelEncoder(), LabelEncoder()

# Placeholder frame; it is reassigned when the ratings are filtered.
clean_df = pd.DataFrame()

# Add integer-coded 'user' and 'item' columns next to the raw identifiers.
for code_col, raw_col, encoder in (
    ('user', 'reviewerID', user_enc),
    ('item', 'asin', item_enc),
):
    df[code_col] = encoder.fit_transform(df[raw_col])

# Expose the 'overall' score under the shorter name used by later cells.
df['rating'] = df['overall']


In [44]:
# Keep only the rows rated 3.0 or higher, restricted to the encoded
# user/item columns and the rating itself.
keep_mask = df['rating'] >= 3.0
clean_df = df.loc[keep_mask, ['user', 'item', 'rating']]
clean_df

Unnamed: 0,user,item,rating
0,7773,0,5.0
1,61894,0,4.0
2,53977,0,4.0
3,8128,0,5.0
4,50527,0,4.0
...,...,...,...
982614,35142,61933,5.0
982615,4097,61933,5.0
982616,18464,61933,5.0
982617,5981,61933,5.0


In [45]:
import torch
import torch.nn as nn
import torch.optim as optim

In [46]:
class RatingDataset(torch.utils.data.Dataset):
    """Tensor-backed dataset of (user id, item id, normalised rating) triples.

    df: frame with 'user', 'item' and 'rating' columns
    num_users: exclusive upper bound for values in df['user']
    num_items: exclusive upper bound for values in df['item']
    rating_min: rating value that is mapped to 0.0
    rating_max: rating value that is mapped to 1.0

    The defaults reproduce the bounds and the 1-5 rating scale that were
    previously hard-coded, so RatingDataset(df) behaves as before.

    Raises AssertionError when an id lies outside [0, num_users) or
    [0, num_items), and ValueError when rating_max <= rating_min.
    """

    def __init__(self, df, num_users=68223, num_items=61934,
                 rating_min=1.0, rating_max=5.0):
        if rating_max <= rating_min:
            raise ValueError("rating_max must be greater than rating_min")

        # Make sure user and item IDs are within bounds before they are used
        # as indices elsewhere.
        assert df['user'].max() < num_users, "User ID out of bounds"
        assert df['item'].max() < num_items, "Item ID out of bounds"
        assert df['user'].min() >= 0, "User ID out of bounds"
        assert df['item'].min() >= 0, "Item ID out of bounds"

        self.users = torch.tensor(df['user'].values, dtype=torch.long)
        self.items = torch.tensor(df['item'].values, dtype=torch.long)
        # Ratings are stored as a column vector of shape [N, 1].
        ratings = torch.tensor(df['rating'].values, dtype=torch.float).view(-1, 1)
        # Normalize ratings for BCE loss: rating_min -> 0.0, rating_max -> 1.0.
        # Values outside [rating_min, rating_max] fall outside [0, 1].
        self.ratings = (ratings - rating_min) / (rating_max - rating_min)

    def __len__(self):
        """Return the number of stored interactions."""
        return len(self.users)

    def __getitem__(self, idx):
        """Return the (user, item, normalised rating) tensors at position idx."""
        return self.users[idx], self.items[idx], self.ratings[idx]

In [47]:
class NCF(nn.Module):
    """Two-branch recommender: a GMF branch and an MLP branch.

    The GMF branch multiplies a user embedding and an item embedding
    element-wise. The MLP branch concatenates a second pair of embeddings and
    passes them through Linear+ReLU layers. Both branch outputs are
    concatenated and mapped to one sigmoid-activated score per pair.
    """

    def __init__(self, num_users, num_items, embedding_dimension=8, mlp_layers=(16, 8)):
        """
        num_users: number of users (rows of each user embedding table)
        num_items: number of items (rows of each item embedding table)
        embedding_dimension: dimension of every embedding vector
        mlp_layers: sizes of the hidden multi-layer-perceptron layers; may be
            empty, in which case the concatenated MLP embeddings are passed
            to the final layer unchanged
        """
        super().__init__()

        # Work on a private list so the caller's sequence is never shared.
        mlp_layers = list(mlp_layers)

        # Matrix factorization embeddings
        self.user_embeddings_mf = nn.Embedding(num_users, embedding_dimension)
        self.item_embeddings_mf = nn.Embedding(num_items, embedding_dimension)

        # Multi-layer perceptron embeddings
        self.user_embeddings_mlp = nn.Embedding(num_users, embedding_dimension)
        self.item_embeddings_mlp = nn.Embedding(num_items, embedding_dimension)

        # Re-initialize all embedding tables with a small-variance normal.
        for table in (
            self.user_embeddings_mf,
            self.item_embeddings_mf,
            self.user_embeddings_mlp,
            self.item_embeddings_mlp,
        ):
            nn.init.normal_(table.weight, std=0.01)

        # MLP layers: each hidden size adds a Linear followed by a ReLU.
        mlp_modules = []
        input_size = embedding_dimension * 2
        for layer_size in mlp_layers:
            mlp_modules.append(nn.Linear(input_size, layer_size))
            mlp_modules.append(nn.ReLU())
            input_size = layer_size

        self.mlp = nn.Sequential(*mlp_modules)

        # input_size now holds the width of the MLP branch output: the last
        # hidden size, or embedding_dimension * 2 when mlp_layers is empty.
        predict_size = embedding_dimension + input_size

        # Final prediction layer -> output single score
        self.final_layer = nn.Linear(predict_size, 1)

        # Squashes the score into (0, 1), as required by nn.BCELoss.
        self.sigmoid = nn.Sigmoid()

    def forward(self, user_ids, item_ids):
        """
        user_ids: [batch_size]
        item_ids: [batch_size]
        returns: predicted score after the sigmoid [batch_size, 1]
        """
        # 1) GMF part
        user_gmf = self.user_embeddings_mf(user_ids)        # [batch_size, embedding_dim]
        item_gmf = self.item_embeddings_mf(item_ids)        # [batch_size, embedding_dim]
        gmf_output = user_gmf * item_gmf                    # element-wise product

        # 2) MLP part
        user_mlp = self.user_embeddings_mlp(user_ids)       # [batch_size, embedding_dim]
        item_mlp = self.item_embeddings_mlp(item_ids)       # [batch_size, embedding_dim]
        mlp_input = torch.cat((user_mlp, item_mlp), dim=1)  # [batch_size, embedding_dim*2]
        mlp_output = self.mlp(mlp_input)                    # [batch_size, MLP output width]

        # 3) Concatenate GMF & MLP
        concat = torch.cat((gmf_output, mlp_output), dim=1)

        # 4) Final layer
        logits = self.final_layer(concat)    # [batch_size, 1]
        preds = self.sigmoid(logits)         # apply sigmoid for probability

        return preds

In [48]:
def leave_one_out_split(df, order_col="item"):
    """Hold out one interaction per user for testing; the rest go to training.

    df: frame with a 'user' column and the column named by order_col
    order_col: column that orders each user's interactions; the row sorted
        last within a user is the one held out (default "item", pass a
        timestamp column to hold out the most recent interaction instead)

    returns: (train_df, test_df), both with a fresh 0..n-1 index and with the
        column dtypes of df preserved. A user with a single interaction
        appears only in test_df. An empty df yields two empty frames.
    """
    ordered = df.sort_values(by=["user", order_col])

    # After sorting, the last occurrence of each user is that user's held-out
    # row; duplicated(keep="last") flags every other row of the user.
    is_held_out = ~ordered.duplicated(subset="user", keep="last")

    test_df = ordered[is_held_out]
    train_df = ordered[~is_held_out]

    return train_df.reset_index(drop=True), test_df.reset_index(drop=True)

In [55]:
# Hyperparameters and run configuration, collected in one place.
SEED = 42
BATCH_SIZE = 128
LEARNING_RATE = 0.001
embedding_dimension = 8
epochs = 5

# Seed PyTorch before the model and the shuffling loader are created, so that
# weight initialisation and batch order repeat when the cell is re-run on the
# same setup.
torch.manual_seed(SEED)

train, test = leave_one_out_split(clean_df)

train_dataset = RatingDataset(train)
test_dataset = RatingDataset(test)

train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Embedding table sizes: ids are 0-indexed, so the size is the largest id + 1.
# int() turns the pandas/numpy scalar into a plain Python int.
num_users = int(clean_df['user'].max()) + 1
num_items = int(clean_df['item'].max()) + 1

model = NCF(num_users, num_items, embedding_dimension=embedding_dimension, mlp_layers=[16, 8])

# BCE against the ratings that RatingDataset has rescaled for this loss.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)


# Training loop
for epoch in range(epochs):
    # Set training mode at the start of every epoch so the loop stays correct
    # even if the model was switched to eval mode in between.
    model.train()
    total_loss = 0.0
    for batch_users, batch_items, batch_ratings in train_loader:
        optimizer.zero_grad()
        predictions = model(batch_users, batch_items)
        loss = criterion(predictions, batch_ratings)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so the
        # epoch average is exact even when the last batch is smaller.
        total_loss += loss.item() * batch_users.size(0)

    avg_loss = total_loss / len(train_dataset)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

Epoch 1/5, Loss: 0.3446
Epoch 2/5, Loss: 0.3088
Epoch 3/5, Loss: 0.2840
Epoch 4/5, Loss: 0.2688
Epoch 5/5, Loss: 0.2602


In [56]:
# Evaluate on the held-out interactions: switch to inference mode and
# accumulate the size-weighted batch losses without tracking gradients.
model.eval()
test_loss = 0.0
with torch.no_grad():
    for users, items, targets in test_loader:
        batch_loss = criterion(model(users, items), targets)
        # criterion yields a batch mean; scale it back to a batch sum.
        test_loss += batch_loss.item() * users.size(0)

avg_test_loss = test_loss / len(test_dataset)
print(f"Test Loss: {avg_test_loss:.4f}")


Test Loss: 0.3602
