In [1]:
import sys
sys.path.append('..')

In [2]:
from src.utils import data_preprocessing as prep

In [3]:
import numpy as np
import pandas as pd

In [5]:
# Load the three ';'-delimited tables that the rest of the notebook works from.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
# NOTE(review): the saved output shows a warning pointing at the Users.csv read,
# presumably a mixed-dtype warning; the flag is now applied there as it already
# was for Books.csv. Confirm against a fresh run.
books_df = pd.read_csv('../data/Books.csv', delimiter=';', low_memory=False)
ratings_df = pd.read_csv('../data/Ratings.csv', delimiter=';')
users_df = pd.read_csv('../data/Users.csv', delimiter=';', low_memory=False)

  users_df = pd.read_csv('../data/Users.csv', delimiter=';')


In [43]:
# Pull the raw identifier columns and the rating values out of the ratings table.
user_ids_str = ratings_df['User-ID']
book_ids_str = ratings_df['ISBN']
ratings = ratings_df['Rating'].to_numpy()

# Give every distinct raw id a 0-based integer, numbered in order of first
# appearance, so the ids can be used directly as embedding row indices.
distinct_books = book_ids_str.unique()
distinct_users = user_ids_str.unique()
book_str_to_int = dict(zip(distinct_books, range(len(distinct_books))))
user_str_to_int = dict(zip(distinct_users, range(len(distinct_users))))

# Translate each rating row's raw ids into the integer ids defined above.
book_ids = book_ids_str.map(book_str_to_int).to_numpy()
user_ids = user_ids_str.map(user_str_to_int).to_numpy()

In [44]:
# Sanity check: report the first mapped book id (if any) that is not a NumPy
# int64. Nothing is printed when every entry has the expected type.
_NO_OFFENDER = object()
offender = next(
    (entry for entry in book_ids if type(entry) is not np.int64),
    _NO_OFFENDER,
)
if offender is not _NO_OFFENDER:
    print(offender, type(offender))

In [45]:
book_ids

array([     0,      1,      2, ...,  12065,  78598, 340555])

In [80]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm

class BookCrossingDataset(Dataset):
    """Map-style dataset over three parallel, index-aligned sequences.

    Item ``idx`` is the tuple ``(user_ids[idx], book_ids[idx], ratings[idx])``.
    The dataset length is taken from ``user_ids``; the other two sequences are
    not length-checked here.
    """

    def __init__(self, user_ids, book_ids, ratings):
        # Keep references to the caller's sequences; nothing is copied.
        self.user_ids, self.book_ids, self.ratings = user_ids, book_ids, ratings

    def __len__(self):
        """Number of samples, as given by the length of ``user_ids``."""
        return len(self.user_ids)

    def __getitem__(self, idx):
        """Return the ``(user_id, book_id, rating)`` triple stored at ``idx``."""
        sample = (self.user_ids[idx], self.book_ids[idx], self.ratings[idx])
        return sample

class LTRModel(nn.Module):
    """Embedding-based scorer for (user, book) pairs.

    Each id is looked up in its own embedding table; the two embeddings are
    concatenated and passed through a three-layer MLP (2*embedding_dim -> 128
    -> 64 -> 1) with ReLU activations. The final output is squashed with a
    sigmoid and multiplied by 5.
    """

    def __init__(self, num_users, num_books, embedding_dim):
        super().__init__()
        # One learnable vector per user / per book.
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.book_embedding = nn.Embedding(num_books, embedding_dim)
        # MLP head over the concatenated [user | book] embedding.
        self.fc1 = nn.Linear(embedding_dim * 2, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, user_id, book_id):
        """Score each (user_id, book_id) pair.

        Both arguments are integer index tensors of matching shape. The result
        has one trailing dimension of size 1 appended to that shape.
        """
        pair_features = torch.cat(
            (self.user_embedding(user_id), self.book_embedding(book_id)),
            dim=-1,
        )
        hidden = F.relu(self.fc2(F.relu(self.fc1(pair_features))))
        return 5 * torch.sigmoid(self.fc3(hidden))

def pairwise_hinge_loss(pos_scores, neg_scores, margin=1.0):
    """Mean hinge loss over score pairs: ``mean(max(0, margin - pos + neg))``.

    A pair contributes zero once the positive score exceeds the negative score
    by at least ``margin``.
    """
    shortfall = margin - pos_scores + neg_scores
    return shortfall.clamp(min=0).mean()

# Example of training the model
def train_model(model, train_loader, optimizer, device, num_items=None):
    """Run one epoch of pairwise hinge-loss training with random negatives.

    For every batch the observed (user, book) pairs are scored as positives and
    the same users paired with uniformly sampled book ids are scored as
    negatives; the two score sets are compared with ``pairwise_hinge_loss``.

    Args:
        model: Module called as ``model(user_id, book_id)``.
        train_loader: Iterable of ``(user_id, book_id, rating)`` batches. The
            rating is not used by this loss.
        optimizer: Optimizer over ``model``'s parameters.
        device: Device the batches are moved to.
        num_items: Exclusive upper bound for sampled negative book ids. When
            omitted it is read from ``model.book_embedding.num_embeddings`` so
            the sampler can never index past the embedding table, instead of
            depending on a notebook-level global.

    Returns:
        float: Mean batch loss for the epoch (NaN if the loader yielded no
        batches).
    """
    if num_items is None:
        num_items = model.book_embedding.num_embeddings

    model.train()
    total_loss = 0.0
    num_batches = 0
    for user_id, book_id, _rating in tqdm(train_loader):
        user_id = user_id.to(device)
        book_id = book_id.to(device)

        optimizer.zero_grad()

        pos_scores = model(user_id, book_id)
        # NOTE(review): negatives are drawn uniformly and may coincide with a
        # book the user actually rated; acceptable for a sparse matrix, but
        # worth confirming for this dataset.
        neg_book_id = torch.randint(0, num_items, book_id.size(), device=device)
        neg_scores = model(user_id, neg_book_id)

        loss = pairwise_hinge_loss(pos_scores, neg_scores)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # Guard the average so an empty loader reports NaN instead of raising.
    avg_loss = total_loss / num_batches if num_batches else float('nan')
    print(f"Average training loss: {avg_loss:.4f}")
    return avg_loss

# Hyperparameters and dataset setup
embedding_dim = 16
# Embedding table sizes: one row per distinct id present in the data.
num_users = len(np.unique(user_ids))
num_books = len(np.unique(book_ids))
batch_size = 64
learning_rate = 0.003

dataset = BookCrossingDataset(user_ids, book_ids, ratings)
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# is_available is a function and must be *called*: the bare attribute is a
# function object, which is always truthy, so 'mps' would be selected even on
# machines without MPS support.
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
print(device)

model = LTRModel(num_users, num_books, embedding_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)



mps


In [77]:
# Train for a fixed number of passes over the data, announcing each epoch
# before train_model prints that epoch's average loss.
num_epochs = 5
for epoch in range(num_epochs):
    epoch_banner = f"Epoch {epoch+1}/{num_epochs}"
    print(epoch_banner)
    train_model(model, train_loader, optimizer, device)


Epoch 1/5


100%|██████████| 17966/17966 [05:51<00:00, 51.06it/s]


Average training loss: 0.3609
Epoch 2/5


100%|██████████| 17966/17966 [06:13<00:00, 48.12it/s]


Average training loss: 0.3501
Epoch 3/5


100%|██████████| 17966/17966 [06:02<00:00, 49.54it/s]


Average training loss: 0.3399
Epoch 4/5


100%|██████████| 17966/17966 [06:05<00:00, 49.13it/s]


Average training loss: 0.3303
Epoch 5/5


100%|██████████| 17966/17966 [05:50<00:00, 51.29it/s]

Average training loss: 0.3208





In [79]:
ratings

array([ 0,  5,  0, ..., 10, 10,  8])

In [78]:
# Spot-check the model: score user i against book i for i = 0..63.
# torch.arange builds the integer index tensor directly on the target device,
# instead of creating a float tensor from NumPy and casting it back to int twice.
sample_ids = torch.arange(64, device=device)
model(sample_ids, sample_ids)

tensor([[ 1.8237],
        [ 0.5236],
        [ 2.4722],
        [-0.0794],
        [-1.2224],
        [-0.0794],
        [ 1.1973],
        [ 0.9286],
        [ 2.1904],
        [-1.4322],
        [ 2.0835],
        [ 2.3873],
        [ 2.2468],
        [ 2.8970],
        [ 1.4793],
        [ 1.0199],
        [ 2.3285],
        [ 2.1610],
        [ 1.4643],
        [ 2.5709],
        [ 1.1719],
        [ 0.8368],
        [-0.0794],
        [-0.0794],
        [ 0.5975],
        [ 2.4500],
        [-0.0794],
        [ 1.4538],
        [ 1.9972],
        [ 2.2352],
        [-0.9513],
        [ 1.5490],
        [-3.6300],
        [ 1.7268],
        [ 2.7080],
        [-0.6589],
        [-2.2945],
        [-1.1603],
        [-2.0338],
        [ 1.4952],
        [-1.1852],
        [-1.7919],
        [-2.9719],
        [-0.0794],
        [-0.0794],
        [-3.5794],
        [-1.3562],
        [-2.5260],
        [-2.0439],
        [-2.7487],
        [-0.6022],
        [-2.6267],
        [-2.

In [39]:
np.unique(book_ids)

array([     0,      1,      2, ..., 340553, 340554, 340555])

NameError: name 'user_id' is not defined

In [5]:
# Run the project's preprocessing on the frames loaded from CSV.
# The inputs are the *_df frames so that this cell works on a fresh kernel:
# `books` and `users` are not defined anywhere earlier, and `ratings` is a NumPy
# array of rating values by this point. Re-running the cell also stays idempotent.
# NOTE(review): assumes prep.preprocess takes and returns (books, ratings, users)
# in this order, as the original call did; confirm against src/utils.
books, ratings, users = prep.preprocess(books_df, ratings_df, users_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_result['Original_NaN'] = df_result['Age'].isna()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_result['Age'] = pd.to_numeric(df_result['Age'], errors='coerce')
  df_result = df_result[~(users['Age'].isna() & ~df_result['Original_NaN'])]


In [8]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# # Sample data for demonstration purposes
# books_data = {
#     'ISBN': ['0002005018', '0002005019', '0002005020'],
#     'Title': ['Book1', 'Book2', 'Book3'],
#     'Author': ['Author1', 'Author2', 'Author3'],
#     'Year': [2001, 2002, 2003]
# }

# users_data = {
#     'User-ID': [1, 2, 3],
#     'Age': [23, 34, 45]
# }

# ratings_data = {
#     'User-ID': [1, 1, 2, 2, 3, 3],
#     'ISBN': ['0002005018', '0002005019', '0002005018', '0002005020', '0002005019', '0002005020'],
#     'Rating': [5, 3, 4, 2, 1, 5]
# }

# books = pd.DataFrame(books_data)
# users = pd.DataFrame(users_data)
# ratings = pd.DataFrame(ratings_data)
# Preprocessing
label_encoders = {
    'ISBN': LabelEncoder(),
    'User-ID': LabelEncoder()
}

# Fit each encoder on the union of the values found in BOTH frames that share
# the key. Fitting on one frame and transforming the other raises on any label
# present only in the second frame (e.g. a rated ISBN missing from the books table).
label_encoders['ISBN'].fit(pd.concat([books['ISBN'], ratings['ISBN']]).unique())
label_encoders['User-ID'].fit(pd.concat([ratings['User-ID'], users['User-ID']]).unique())

books['ISBN'] = label_encoders['ISBN'].transform(books['ISBN'])
ratings['ISBN'] = label_encoders['ISBN'].transform(ratings['ISBN'])
ratings['User-ID'] = label_encoders['User-ID'].transform(ratings['User-ID'])
users['User-ID'] = label_encoders['User-ID'].transform(users['User-ID'])

# Merge data: one row per rating, enriched with its book and user columns.
data = ratings.merge(books, on='ISBN').merge(users, on='User-ID')

# Feature engineering
# NOTE(review): the encoded ids are fed to the model as plain numeric features,
# and 'Age'/'Year' may contain missing values; confirm both are intended.
features = data[['User-ID', 'ISBN', 'Age', 'Year']].values
labels = data['Rating'].values

# Train-test split (fixed seed so the split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

class LambdaRankLoss(nn.Module):
    """Pairwise ranking loss over all ordered pairs of a batch.

    For every ordered pair ``(i, j)`` let ``S_ij`` be +1 / -1 / 0 according to
    whether ``labels[i]`` is greater than / less than / equal to ``labels[j]``.
    The pair contributes ``0.5 * (1 - S_ij) - sigmoid(S_ij * (s_i - s_j))`` and
    the sum is divided by ``n * (n - 1)``. Tied pairs and the diagonal
    contribute exactly zero.

    NOTE(review): this is the formula the notebook already used; it is not the
    textbook RankNet/LambdaRank cross-entropy. Confirm that this surrogate is
    the one intended.
    """

    def __init__(self):
        super().__init__()

    def forward(self, scores, labels):
        """Compute the loss for one batch.

        Args:
            scores: Predicted scores, one per item (flattened to 1-D).
            labels: Relevance labels aligned with ``scores`` (flattened to 1-D).

        Returns:
            Scalar tensor. Batches with fewer than two items have no pairs and
            yield a zero that stays connected to ``scores``, instead of NaN or
            a division-by-zero error.

        The computation builds n-by-n intermediate tensors, so it is meant for
        mini-batches rather than a whole dataset at once.
        """
        scores = scores.reshape(-1)
        labels = labels.reshape(-1)
        n = scores.size(0)
        if n < 2:
            return scores.sum() * 0.0

        # Entry [i, j] of each matrix describes the ordered pair (i, j).
        score_diff = scores.unsqueeze(1) - scores.unsqueeze(0)
        label_i = labels.unsqueeze(1)
        label_j = labels.unsqueeze(0)
        # Built from the same > / < comparisons as the scalar definition, so
        # ties (and any non-comparable labels) map to 0.
        pair_sign = (label_i > label_j).to(scores.dtype) - (label_i < label_j).to(scores.dtype)

        pair_loss = 0.5 * (1.0 - pair_sign) - torch.sigmoid(pair_sign * score_diff)
        return pair_loss.sum() / (n * (n - 1))

class Learn2RankModel(nn.Module):
    """Two-layer MLP scorer: ``input_dim -> hidden_dim -> 1`` with a ReLU between."""

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return one unbounded score per input row (trailing dimension of size 1)."""
        hidden = F.relu(self.fc1(x))
        return self.fc2(hidden)

# Use the MPS backend when PyTorch reports it as available; otherwise use the CPU.
if torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

# Model parameters: the input width is the number of feature columns.
hidden_dim = 128
input_dim = X_train.shape[1]

# Scorer, pairwise loss and optimizer, with both modules moved to the chosen device.
model = Learn2RankModel(input_dim, hidden_dim).to(device)
criterion = LambdaRankLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training
epochs = 20

# The training tensors are identical in every epoch, so convert them and move
# them to the device once rather than on each iteration.
features_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
labels_tensor = torch.tensor(y_train, dtype=torch.float32).to(device)

# NOTE(review): this is full-batch training and the criterion compares every
# pair of rows, so cost grows quadratically with the number of training rows;
# consider mini-batches if X_train is large.
for epoch in range(epochs):
    model.train()

    optimizer.zero_grad()
    # squeeze(-1) drops only the trailing size-1 score dimension, so a
    # single-row input keeps its batch dimension.
    scores = model(features_tensor).squeeze(-1)
    loss = criterion(scores, labels_tensor)
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item()}")

# Evaluation: score the held-out split with gradients disabled and report the
# same pairwise loss that was used for training.
model.eval()
with torch.no_grad():
    test_features_tensor = torch.tensor(X_test, dtype=torch.float32, device=device)
    test_labels_tensor = torch.tensor(y_test, dtype=torch.float32, device=device)
    test_scores = model(test_features_tensor).squeeze()
    test_loss = criterion(test_scores, test_labels_tensor)
print(f"Test Loss: {test_loss.item()}")


In [None]:
import lightgbm as lgb
import pandas as pd
import numpy as np

class L2RModel:
    """LightGBM LambdaRank wrapper over side-by-side user and book features."""

    def __init__(self):
        # Trained booster; stays None until train() has been called.
        self.model = None

    def prepare_data(self, user_features, book_features, labels):
        """
        Prepares the input matrix for the L2R model.

        Rows are paired by POSITION: row k of ``user_features`` goes with row k
        of ``book_features``. Both indexes are reset before concatenating
        because ``pd.concat(axis=1)`` aligns on index labels, which would
        silently create NaN-padded rows for frames with different indexes.

        Args:
            user_features (pd.DataFrame): DataFrame containing user features.
            book_features (pd.DataFrame): DataFrame containing book features.
            labels (pd.Series): Relevance labels, or None at prediction time.
        Returns:
            tuple: ``(data, labels)`` where ``data`` is the combined DataFrame
            of user and book features and ``labels`` is passed through unchanged.
        Raises:
            ValueError: If the two feature frames have different row counts.
        """
        if len(user_features) != len(book_features):
            raise ValueError(
                "user_features and book_features must have the same number of rows "
                f"(got {len(user_features)} and {len(book_features)})"
            )
        data = pd.concat(
            [user_features.reset_index(drop=True), book_features.reset_index(drop=True)],
            axis=1,
        )
        return data, labels

    def train(self, user_features, book_features, labels, group=None):
        """
        Trains the L2R model.
        Args:
            user_features (pd.DataFrame): DataFrame containing user features.
            book_features (pd.DataFrame): DataFrame containing book features.
            labels (pd.Series): Series containing relevance labels for training.
            group (sequence of int, optional): Sizes of the consecutive query
                groups (e.g. number of rows per user), summing to the number of
                rows. The 'lambdarank' objective needs query information; when
                omitted, all rows are treated as one query.
                NOTE(review): the single-query fallback only keeps the call
                usable; pass real per-user group sizes for meaningful rankings.
        """
        data, labels = self.prepare_data(user_features, book_features, labels)

        if group is None:
            group = [len(data)]

        train_data = lgb.Dataset(data, label=labels, group=group)

        params = {
            'objective': 'lambdarank',
            'metric': 'ndcg',
            'ndcg_at': [1, 3, 5],
            'learning_rate': 0.1,
            'num_leaves': 31,
            'min_data_in_leaf': 20
        }

        self.model = lgb.train(params, train_data, num_boost_round=100)

    def predict(self, user_features, book_features):
        """
        Predicts the relevance scores for the given features.
        Args:
            user_features (pd.DataFrame): DataFrame containing user features.
            book_features (pd.DataFrame): DataFrame containing book features.
        Returns:
            np.ndarray: Predicted relevance scores.
        Raises:
            RuntimeError: If called before train().
        """
        if self.model is None:
            raise RuntimeError("L2RModel.predict() called before train()")
        data, _ = self.prepare_data(user_features, book_features, None)
        return self.model.predict(data)

    def rank_books(self, user_features, book_features):
        """
        Ranks the given rows by predicted relevance score.
        Args:
            user_features (pd.DataFrame): DataFrame containing user features.
            book_features (pd.DataFrame): DataFrame containing book features.
        Returns:
            np.ndarray: Row positions ordered from highest to lowest predicted
            score.
        """
        scores = self.predict(user_features, book_features)
        rankings = np.argsort(scores)[::-1]
        return rankings

# Example usage:
# Assuming user_features_df and book_features_df are prepared DataFrames
# and labels is a Series of relevance labels.

# l2r = L2RModel()
# l2r.train(user_features_df, book_features_df, labels)
# rankings = l2r.rank_books(user_features_df, book_features_df)


In [None]:
I

In [None]:
books