In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/readyforrecommendationmodel/user_book_matrix.npz
/kaggle/input/readyforrecommendationmodel/avg_embeddings_matrix.npz
/kaggle/input/readyforrecommendationmodel/emotion_matrix.npz


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [3]:
from scipy.sparse import load_npz

def load_data():
    user_book_matrix = load_npz('/kaggle/input/readyforrecommendationmodel/user_book_matrix.npz')
    emotion_matrix = load_npz('/kaggle/input/readyforrecommendationmodel/emotion_matrix.npz')
    book_embeddings = load_npz('/kaggle/input/readyforrecommendationmodel/avg_embeddings_matrix.npz')
    
    return user_book_matrix, emotion_matrix, book_embeddings

In [4]:
class NCFDataset(Dataset):
    def __init__(self, interactions, emotions, review_embeddings):
        self.users, self.items = interactions.nonzero()
        self.ratings = interactions[self.users, self.items].A1
        self.emotions = emotions[self.items].toarray()  # Convert to dense array
        self.review_embeddings = review_embeddings[self.items].toarray()  # Convert to dense array

    def __len__(self):
        return len(self.users)
    
    def __getitem__(self, idx):
        return (self.users[idx], self.items[idx], self.emotions[idx], 
                self.review_embeddings[idx], self.ratings[idx])

In [5]:
class NCF(nn.Module):
    def __init__(self, num_users, num_items, num_emotions, review_embedding_dim, 
                 embedding_dim=64, mlp_dims=[256, 128, 64], dropout=0.2):
        super(NCF, self).__init__()
        
        # Embedding layers
        self.user_embedding_mf = nn.Embedding(num_users, embedding_dim)
        self.item_embedding_mf = nn.Embedding(num_items, embedding_dim)
        self.user_embedding_mlp = nn.Embedding(num_users, embedding_dim)
        self.item_embedding_mlp = nn.Embedding(num_items, embedding_dim)
        self.emotion_embedding = nn.Embedding(num_emotions, embedding_dim)
        
        # MF layer
        self.mf_output = embedding_dim
        
        # MLP layers
        self.mlp = nn.ModuleList()
        input_dim = embedding_dim * 3 + review_embedding_dim  # user + item + emotion + review
        mlp_dims = [input_dim] + mlp_dims
        for i in range(len(mlp_dims) - 1):
            self.mlp.append(nn.Linear(mlp_dims[i], mlp_dims[i+1]))
            self.mlp.append(nn.ReLU())
            self.mlp.append(nn.BatchNorm1d(mlp_dims[i+1]))
            self.mlp.append(nn.Dropout(dropout))
        
        # Final prediction layer
        self.final = nn.Linear(self.mf_output + mlp_dims[-1], 1)
        
    def forward(self, user_indices, item_indices, emotion_indices, review_embeddings):
        # MF component
        user_embedding_mf = self.user_embedding_mf(user_indices)
        item_embedding_mf = self.item_embedding_mf(item_indices)
        mf_vector = torch.mul(user_embedding_mf, item_embedding_mf)
        
        # MLP component
        user_embedding_mlp = self.user_embedding_mlp(user_indices)
        item_embedding_mlp = self.item_embedding_mlp(item_indices)
        emotion_embedding = self.emotion_embedding(emotion_indices)
        emotion_embedding = self.emotion_embedding(emotion_indices).mean(dim=1)  # Adjusted for mean over emotions
        mlp_vector = torch.cat([user_embedding_mlp, item_embedding_mlp, emotion_embedding, review_embeddings], dim=-1)
 
        for layer in self.mlp:
            mlp_vector = layer(mlp_vector)
        
        # Combine MF and MLP
        combined = torch.cat([mf_vector, mlp_vector], dim=-1)
        
        # Final prediction
        prediction = self.final(combined)
        
        return prediction.squeeze()

In [6]:
def get_data_info(user_item_interactions, emotion_labels):
    print(user_item_interactions.shape)
    # Get the number of unique users and items
    num_users = user_item_interactions.shape[0]
    num_items = user_item_interactions.shape[1]
    
    # Get the number of unique emotions
    num_emotions = emotion_labels.shape[1]
    
    return num_users, num_items, num_emotions

In [7]:
def custom_collate_fn(batch):
    users, items, emotions, review_embeddings, ratings = zip(*batch)
    
    users = torch.tensor(users, dtype=torch.long)
    items = torch.tensor(items, dtype=torch.long)
    emotions = torch.tensor(emotions, dtype=torch.long)
    review_embeddings = torch.tensor(review_embeddings, dtype=torch.float)
    ratings = torch.tensor(ratings, dtype=torch.float)
    
    # Move tensors to device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    users = users.to(device)
    items = items.to(device)
    emotions = emotions.to(device)
    review_embeddings = review_embeddings.to(device)
    ratings = ratings.to(device)
    
    return users, items, emotions, review_embeddings, ratings

In [8]:
embedding_dim = 32
review_embedding_dim = 100
learning_rate = 0.001
batch_size = 64
num_epochs = 10

# Assume we have these functions to load and preprocess data
user_item_interactions, emotion_labels, review_embeddings = load_data()
num_users, num_items, num_emotions = get_data_info(user_item_interactions, emotion_labels)

# Create dataset and dataloader
dataset = NCFDataset(user_item_interactions, emotion_labels, review_embeddings)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=custom_collate_fn)

(575887, 74298)


In [9]:
# Initialize model, loss, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = NCF(num_users, num_items, num_emotions, 
            embedding_dim=32, 
            review_embedding_dim=100, 
            mlp_dims=[256, 128, 64], 
            dropout=0.2).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [10]:
class EarlyStopping:
    def __init__(self, patience=5, verbose=False, delta=0, path='checkpoint.pt'):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta
        self.path = path

    def __call__(self, val_loss, model):
        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        """Saves model when validation loss decreases."""
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [11]:
# Initialize EarlyStopping object
early_stopping = EarlyStopping(patience=5, verbose=True, path='/kaggle/working/NCF_model.pth')

for epoch in range(num_epochs):
    model.train()
    for user, item, emotion, review_emb, rating in train_loader:
        optimizer.zero_grad()
        prediction = model(user, item, emotion, review_emb)
        loss = criterion(prediction, rating)
        loss.backward()
        optimizer.step()

    # Validation step
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for user, item, emotion, review_emb, rating in test_loader:
            prediction = model(user, item, emotion, review_emb)
            loss = criterion(prediction, rating)
            val_loss += loss.item() * user.size(0)
        val_loss /= len(test_loader.dataset)
    
    # Print validation loss
    print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {val_loss:.4f}')

    # Update early stopping object
    early_stopping(val_loss, model)

    # Check if early stopping criterion is met
    if early_stopping.early_stop:
        print("Early stopping")
        break

# Load the best model checkpoint
model.load_state_dict(torch.load('/kaggle/working/NCF_model.pth'))

  emotions = torch.tensor(emotions, dtype=torch.long)


KeyboardInterrupt: 