In [1]:
import torch
import pickle
import numpy as np
import pandas as pd
from torch import nn
from tqdm import tqdm
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# Load Bert Model
from transformers import BertTokenizer, BertModel
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertModel.from_pretrained('bert-base-chinese').cuda()

  torch.utils._pytree._register_pytree_node(


In [3]:
# Calculate Tag Embeddings
loaded_data = pd.read_csv('data\selected_book_top_1200_data_tag.csv')

tag_embedding_dict = {}

with torch.no_grad():
    for index, rows in tqdm(loaded_data.iterrows()):
        # Convert tag list to string
        tags_str = " ".join(rows.Tags)
        # Use BERT to get tag embedding
        inputs = tokenizer(tags_str, truncation=True, return_tensors='pt')
        outputs = model(inputs.input_ids.cuda(), inputs.token_type_ids.cuda(), inputs.attention_mask.cuda())
        tag_embedding = outputs.last_hidden_state.mean(dim=1).cpu()
        tag_embedding_dict[rows.Book] = tag_embedding

1200it [00:21, 56.79it/s]


In [4]:
# Save embeddings dict
with open('data/tag_embedding_dict.pkl', 'wb') as f:
    pickle.dump(tag_embedding_dict, f)

In [5]:
# Load embeddings dict
with open('data/tag_embedding_dict.pkl', 'rb') as f:
    tag_embedding_dict = pickle.load(f)

In [6]:
# Calculate Score Tag Embeddings
loaded_data = pd.read_csv('data\\book_score.csv')

rating_embedding_dict = {}
string_to_embedding_dict = {}

with torch.no_grad():
    for index, rows in tqdm(loaded_data.iterrows()):
        # Convert tag list to string
        tags_str = '' if rows.Tag != rows.Tag else rows.Tag
        if tags_str in string_to_embedding_dict:
            tag_embedding = string_to_embedding_dict[tags_str]
        else:
            # Use BERT to get tag embedding
            inputs = tokenizer(tags_str, truncation=True, return_tensors='pt')
            outputs = model(inputs.input_ids.cuda(), inputs.token_type_ids.cuda(), inputs.attention_mask.cuda())
            tag_embedding = outputs.last_hidden_state.mean(dim=1).cpu()
            string_to_embedding_dict[tags_str] = tag_embedding
        rating_embedding_dict[(rows.User, rows.Book)] = tag_embedding

21105it [00:33, 625.56it/s] 


KeyboardInterrupt: 

In [None]:
# Save rating embeddings dict
with open('data/rating_embedding_dict.pkl', 'wb') as f:
    pickle.dump(rating_embedding_dict, f)

In [7]:
# Load rating embeddings dict
with open('data/rating_embedding_dict.pkl', 'rb') as f:
    rating_embedding_dict = pickle.load(f)

In [8]:
# Create Dataset Class
class BookRatingDataset(Dataset):
    def __init__(self, data, user_idx, book_idx, tag_embedding_dict, rating_embedding_dict):
        self.data = data
        self.user_idx = user_idx
        self.book_idx = book_idx
        self.tag_embedding_dict = tag_embedding_dict
        self.rating_embedding_dict = rating_embedding_dict

    def __getitem__(self, index):
        row = self.data.iloc[index]
        user = self.user_idx[row['User']]
        book = self.book_idx[row['Book']]
        rating = row['Rate'].astype('float32')
        tag_embedding = self.tag_embedding_dict[row['Book']]
        rating_embedding = self.rating_embedding_dict[(row['User'], row['Book'])]
        return user, book, rating, tag_embedding, rating_embedding

    def __len__(self):
        return len(self.data)

In [9]:
# Create Model
class RatingPredictionModel(nn.Module):
    def __init__(self, user_count, book_count, entity_embeddings_dim, text_embeddings_dim):
        super(RatingPredictionModel, self).__init__()
        self.user_embeddings = nn.Embedding(user_count, entity_embeddings_dim)
        self.book_embeddings = nn.Embedding(book_count, entity_embeddings_dim)
        self.book_tag_embeddings = nn.Linear(text_embeddings_dim, entity_embeddings_dim)
        self.rating_tag_embeddings = nn.Linear(text_embeddings_dim, entity_embeddings_dim)
        self.book_integrated = nn.Linear(entity_embeddings_dim * 3, entity_embeddings_dim)
        self.integrated = nn.Linear(entity_embeddings_dim * 2, 16)
        self.predict_rating = nn.Linear(16, 1)
        self.activation = nn.Sigmoid()
    
    def forward(self, user, book, tag_embedding, rating_embedding):
        user_embedding = self.user_embeddings(user)
        book_embedding = self.book_embeddings(book)
        book_tag_embedding = self.activation(self.book_tag_embeddings(tag_embedding))
        rating_tag_embedding = self.activation(self.rating_tag_embeddings(rating_embedding))
        book_embeddings_integrated = torch.cat([book_embedding, book_tag_embedding, rating_tag_embedding], dim=1)
        book_integrated_result = self.activation(self.book_integrated(book_embeddings_integrated))
        embeddings_integrated = torch.cat([user_embedding, book_integrated_result], dim=1)
        integrated_result = self.activation(self.integrated(embeddings_integrated))
        return self.predict_rating(integrated_result)

In [10]:
# Given uniqued id list, returns two direction mappings
def id_map(ids):
    id_to_idx = {v: k for k, v in enumerate(ids)}
    idx_to_id = {k: v for k, v in enumerate(ids)}
    return id_to_idx, idx_to_id

In [20]:
# Initializing data
user_to_idx, idx_to_user = id_map(loaded_data['User'].unique())
book_to_idx, idx_to_book = id_map(loaded_data['Book'].unique())

# Split data
train_data, test_data = train_test_split(loaded_data, test_size=0.5, random_state=42)

# Create Dataset and DataLoader
train_dataset = BookRatingDataset(train_data, user_to_idx, book_to_idx, tag_embedding_dict, rating_embedding_dict)
test_dataset = BookRatingDataset(test_data, user_to_idx, book_to_idx, tag_embedding_dict, rating_embedding_dict)
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False, drop_last=True)

# Create Model
model = RatingPredictionModel(len(user_to_idx), len(book_to_idx), 50, 768).cuda()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.02)

In [21]:
# Training
norm_penalty = 0.001
total_epoches = 20

for epoch in range(total_epoches):
    model.train()
    train_loss, test_loss = 0., 0.
    for batch_id, (user, book, rating, tag_embedding, rating_embedding) in tqdm(enumerate(train_loader)):
        user = user.cuda()
        book = book.cuda()
        rating = rating.cuda()
        tag_embedding = tag_embedding.squeeze(1).cuda()
        rating_embedding = rating_embedding.squeeze(1).cuda()
        optimizer.zero_grad()
        pred = model(user, book, tag_embedding, rating_embedding)
        loss = criterion(pred, rating.unsqueeze(1))
        l2_loss = norm_penalty * sum(p.pow(2.0).sum().sqrt() for p in model.parameters())
        loss += l2_loss
    
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    model.eval()
    average_train_loss = train_loss / (batch_id + 1)
    results = []

    with torch.no_grad():
        for batch_id, (user, book, rating, tag_embedding, rating_embedding) in tqdm(enumerate(test_loader)):
            user = user.cuda()
            book = book.cuda()
            rating = rating.cuda()
            tag_embedding = tag_embedding.squeeze(1).cuda()
            rating_embedding = rating_embedding.squeeze(1).cuda()
            pred = model(user, book, tag_embedding, rating_embedding)
            loss = criterion(pred, rating.unsqueeze(1))
            test_loss += loss.item()

            user_ids = user.cpu().unsqueeze(1)
            # pred_ratings = pred.cpu().unsqueeze(1)
            pred_ratings = pred.cpu()
            true_ratings = rating.cpu().unsqueeze(1)
            results.append(torch.cat([user_ids, pred_ratings, true_ratings], dim=1))
        
        results = torch.cat(results).numpy()
        results_dataframe = pd.DataFrame(results, columns=['user_id', 'pred_rating', 'true_rating'])
        avg_ndcg_scores = []
        
        for user_id in results_dataframe['user_id'].unique():
            user_data = results_dataframe[results_dataframe['user_id'] == user_id]
            if len(user_data) > 1:
                avg_ndcg_scores.append(torch.tensor(ndcg_score([user_data['true_rating'].values], [user_data['pred_rating'].values], k=50)))
    
        avg_ndcg_scores = torch.stack(avg_ndcg_scores)
        avg_ndcg_score = avg_ndcg_scores.mean()
        
        average_test_loss = test_loss / (batch_id + 1)
    print('Epoch [{}/{}], Train Loss: {:.4f}, Test Loss: {:.4f}, Avg NDCG Score: {:.4f}'.format(epoch+1, total_epoches, average_train_loss, average_test_loss, avg_ndcg_score))
    # torch.save(model, 'data/models/sodiumcl10_model_{}.pth'.format(epoch))
    # print("Model saved to data/models/sodiumcl10_model_{}.pth".format(epoch))

1244it [00:29, 42.31it/s]
1244it [00:23, 53.44it/s]


Epoch [1/20], Train Loss: 2.8002, Test Loss: 2.1234, Avg NDCG Score: 0.7426


1244it [00:29, 42.23it/s]
1244it [00:23, 53.51it/s]


Epoch [2/20], Train Loss: 2.4439, Test Loss: 2.0867, Avg NDCG Score: 0.7489


1244it [00:28, 43.23it/s]
1244it [00:23, 53.76it/s]


Epoch [3/20], Train Loss: 2.3989, Test Loss: 2.0422, Avg NDCG Score: 0.7549


1244it [00:29, 42.24it/s]
1244it [00:22, 54.20it/s]


Epoch [4/20], Train Loss: 2.3662, Test Loss: 2.0175, Avg NDCG Score: 0.7592


1244it [00:31, 39.55it/s]
1244it [00:23, 52.03it/s]


Epoch [5/20], Train Loss: 2.3394, Test Loss: 1.9931, Avg NDCG Score: 0.7631


1244it [00:29, 42.14it/s]
1244it [00:23, 52.85it/s]


Epoch [6/20], Train Loss: 2.3122, Test Loss: 1.9969, Avg NDCG Score: 0.7631


1244it [00:30, 41.23it/s]
1244it [00:23, 53.17it/s]


Epoch [7/20], Train Loss: 2.3041, Test Loss: 1.9887, Avg NDCG Score: 0.7643


1244it [00:29, 42.15it/s]
1244it [00:23, 53.88it/s]


Epoch [8/20], Train Loss: 2.2921, Test Loss: 2.0100, Avg NDCG Score: 0.7643


1244it [00:29, 42.14it/s]
1244it [00:23, 53.55it/s]


Epoch [9/20], Train Loss: 2.2953, Test Loss: 1.9889, Avg NDCG Score: 0.7654


1244it [00:29, 42.18it/s]
1244it [00:23, 53.41it/s]


Epoch [10/20], Train Loss: 2.2975, Test Loss: 1.9786, Avg NDCG Score: 0.7643


1244it [00:29, 42.23it/s]
1244it [00:23, 53.35it/s]


Epoch [11/20], Train Loss: 2.2967, Test Loss: 1.9926, Avg NDCG Score: 0.7667


1244it [00:28, 42.94it/s]
1244it [00:23, 53.81it/s]


Epoch [12/20], Train Loss: 2.2891, Test Loss: 1.9810, Avg NDCG Score: 0.7642


1244it [00:29, 42.20it/s]
1244it [00:23, 53.62it/s]


Epoch [13/20], Train Loss: 2.2917, Test Loss: 1.9800, Avg NDCG Score: 0.7630


1244it [00:29, 42.38it/s]
1244it [00:23, 53.14it/s]


Epoch [14/20], Train Loss: 2.2940, Test Loss: 1.9795, Avg NDCG Score: 0.7658


1244it [00:29, 42.19it/s]
1244it [00:23, 53.54it/s]


Epoch [15/20], Train Loss: 2.2923, Test Loss: 2.0163, Avg NDCG Score: 0.7648


1244it [00:29, 42.26it/s]
1244it [00:23, 53.75it/s]


Epoch [16/20], Train Loss: 2.2934, Test Loss: 1.9747, Avg NDCG Score: 0.7648


1244it [00:29, 42.73it/s]
1244it [00:23, 53.83it/s]


Epoch [17/20], Train Loss: 2.2976, Test Loss: 2.0053, Avg NDCG Score: 0.7653


1244it [00:28, 43.14it/s]
1244it [00:23, 53.49it/s]


Epoch [18/20], Train Loss: 2.2916, Test Loss: 1.9738, Avg NDCG Score: 0.7637


1244it [00:28, 43.11it/s]
1244it [00:23, 53.29it/s]


Epoch [19/20], Train Loss: 2.2913, Test Loss: 1.9777, Avg NDCG Score: 0.7647


1244it [00:29, 42.17it/s]
1244it [00:23, 53.70it/s]


Epoch [20/20], Train Loss: 2.2968, Test Loss: 1.9917, Avg NDCG Score: 0.7669
