In [108]:
import numpy as np
import pandas as pd

import sys
sys.path.append('../..')
from src.recommenders.dnn_recommender import DNNRecommender

users = pd.read_csv('../../data/Users.csv', delimiter=';')
books = pd.read_csv('../../data/Books.csv', delimiter=';', dtype={'ISBN': str, 'Title': str, 'Author': str, 'Year': np.int16, 'Publisher': str})
ratings = pd.read_csv('../../data/Ratings.csv', delimiter=';', dtype={'User-ID': np.int32, 'ISBN': str, 'Rating': np.int8})

  users = pd.read_csv('../../data/Users.csv', delimiter=';')


In [109]:
print('Users records:', len(users))
users['Countries'] = [x if (not isinstance(x, (int, float)) and not x.isnumeric()) else None for x in users['Age']]
users['Age'] = [float(x) if (isinstance(x, (str)) and x.isnumeric()) else None for x in users['Age']]
print('Countries', users['Countries'].notna().sum(), 'Ages', users['Age'].notna().sum())

users.drop(columns=['Countries'], inplace=True)

Users records: 278859
Countries 1476 Ages 167151


In [99]:
ratings.head()
ratings['User-ID'].unique().shape

(105283,)

In [115]:
print(ratings.shape)
print(ratings[ratings.Rating > 0].shape)

ratings = ratings[ratings.Rating > 0]

(1149780, 4)
(433671, 4)


In [None]:
books.drop_duplicates(subset='ISBN', inplace=True)
books = books.reset_index()

In [105]:
books.head()

Unnamed: 0,ISBN,Title,Author,Year,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company


In [106]:
users.head()

Unnamed: 0,User-ID,Age,Countries
0,1,,
1,2,18.0,
2,3,,
3,4,17.0,
4,5,,


In [107]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [116]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Handle missing values
books['Year'] = books['Year'].fillna(books['Year'].median())
users['Age'] = users['Age'].fillna(users['Age'].median())

books.dropna(inplace=True)
users.dropna(inplace=True)
ratings.dropna(inplace=True)

users['User-ID'] = pd.to_numeric(users['User-ID'], errors='coerce')

# Encode categorical variables
isbn_encoder = LabelEncoder()
author_encoder = LabelEncoder()
publisher_encoder = LabelEncoder()
user_encoder = LabelEncoder()
country_encoder = LabelEncoder()

# Merge datasets
ratings_users_merged = ratings.merge(users, left_on='User-ID', right_on='User-ID')
final_df = ratings_users_merged.merge(books, left_on='ISBN', right_on='ISBN')

final_df['ISBN_encoded'] = isbn_encoder.fit_transform(final_df['ISBN'])
final_df['Author_encoded'] = author_encoder.fit_transform(final_df['Author'])
final_df['Publisher_encoded'] = publisher_encoder.fit_transform(final_df['Publisher'])
final_df['User_ID_encoded'] = user_encoder.fit_transform(final_df['User-ID'])

# Select relevant columns
final_df = final_df[['User_ID_encoded', 'ISBN_encoded', 'Rating', 'Age', 'Author_encoded', 'Year', 'Publisher_encoded']]

print(final_df.head())


   User_ID_encoded  ISBN_encoded  Rating   Age  Author_encoded  Year  \
0            67542         13121       5  32.0           31470  2001   
1            67543         61684       3  16.0           47694  1999   
2            67543         61704       6  16.0           56133  2001   
3            67544         38768       7  32.0           24910  2001   
4            67545          3136       9  25.0           49200  2003   

   Publisher_encoded  
0               4785  
1               1805  
2               1805  
3               2953  
4               4635  


In [None]:
user_features = np.array(final_df[['User_ID_encoded', 'Age']])
item_features = np.array(final_df[['ISBN_encoded', 'Author_encoded', 'Year', 'Publisher_encoded']])
ratings = np.array(final_df['Rating'])

In [128]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Split data
train_data, test_data, train_ratings, test_ratings = train_test_split(
    final_df.drop('Rating', axis=1),
    final_df['Rating'],
    test_size=0.2,
    random_state=42
)

# Convert data to PyTorch tensors
train_data_tensor = torch.tensor(train_data.values, dtype=torch.float32)
train_ratings_tensor = torch.tensor(train_ratings.values, dtype=torch.float32)
test_data_tensor = torch.tensor(test_data.values, dtype=torch.float32)
test_ratings_tensor = torch.tensor(test_ratings.values, dtype=torch.float32)

# Define the model
class DNNRecommenderWithFeatures(nn.Module):
    def __init__(self, num_users, num_items, num_user_features, num_item_features, embedding_dim, hidden_dim):
        super(DNNRecommenderWithFeatures, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        
        self.user_feature_layer = nn.Linear(num_user_features, embedding_dim)
        self.item_feature_layer = nn.Linear(num_item_features, embedding_dim)
        
        self.fc1 = nn.Linear(embedding_dim * 2 * 2, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.fc3 = nn.Linear(hidden_dim // 2, 1)
        
    def forward(self, user_ids, item_ids, user_features, item_features):
        user_embeds = self.user_embedding(user_ids)
        item_embeds = self.item_embedding(item_ids)
        
        user_feature_embeds = self.user_feature_layer(user_features)
        item_feature_embeds = self.item_feature_layer(item_features)
        
        user_combined = torch.cat([user_embeds, user_feature_embeds], dim=1)
        item_combined = torch.cat([item_embeds, item_feature_embeds], dim=1)
        
        x = torch.cat([user_combined, item_combined], dim=1)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x.squeeze()

# Initialize model, optimizer, and loss function
num_users = len(user_encoder.classes_)
num_items = len(isbn_encoder.classes_)

num_user_features = user_features.shape[1]  # Age
num_item_features = item_features.shape[1]  # Author_encoded, Year, and Publisher_encoded

embedding_dim = 50
hidden_dim = 128

model = DNNRecommenderWithFeatures(num_users, num_items, num_user_features, num_item_features, embedding_dim, hidden_dim)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

# Training loop
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    
    user_ids = train_data_tensor[:, 0].long()
    item_ids = train_data_tensor[:, 1].long()
    user_features = train_data_tensor[:, 2:3]
    item_features = train_data_tensor[:, 3:]

    predictions = model(user_ids, item_ids, user_features, item_features)
    loss = criterion(predictions, train_ratings_tensor)
    
    loss.backward()
    optimizer.step()
    
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')

# Evaluation
model.eval()
with torch.no_grad():
    user_ids = test_data_tensor[:, 0].long()
    item_ids = test_data_tensor[:, 1].long()
    user_features = test_data_tensor[:, 2:3]
    item_features = test_data_tensor[:, 3:]
    
    predictions = model(user_ids, item_ids, user_features, item_features)
    test_loss = criterion(predictions, test_ratings_tensor)
    print(f'Test Loss: {test_loss.item()}')

Epoch 1/20, Loss: 14759.5400390625
Epoch 2/20, Loss: 590756.9375
Epoch 3/20, Loss: 35257.85546875
Epoch 4/20, Loss: 152305.125
Epoch 5/20, Loss: 229049.3125
Epoch 6/20, Loss: 130084.3828125
Epoch 7/20, Loss: 35356.265625
Epoch 8/20, Loss: 1559.8394775390625
Epoch 9/20, Loss: 32877.8828125
Epoch 10/20, Loss: 73503.203125
Epoch 11/20, Loss: 76066.109375
Epoch 12/20, Loss: 47980.29296875
Epoch 13/20, Loss: 16931.10546875
Epoch 14/20, Loss: 1486.2379150390625
Epoch 15/20, Loss: 5017.234375
Epoch 16/20, Loss: 16844.78125
Epoch 17/20, Loss: 22456.65625
Epoch 18/20, Loss: 17835.423828125
Epoch 19/20, Loss: 8459.88671875
Epoch 20/20, Loss: 1716.762451171875
Test Loss: 575.8331298828125
