In [None]:
import torch
import pickle
import numpy as np
import pandas as pd
from torch import nn
from tqdm import tqdm
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score

# Notebook-wide compute device: the GPU when PyTorch can see one, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Load Bert Model
from transformers import BertTokenizer, BertModel
import torch

# Tokenizer and encoder are loaded from the same checkpoint name.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
# Place the encoder on the notebook-wide `device` (defined in the first cell)
# rather than calling .cuda() unconditionally, which raises on machines
# without a CUDA-capable GPU.
model = BertModel.from_pretrained('bert-base-chinese').to(device)

In [None]:
# Calculate Tag Embeddings
# Builds `tag_embedding_dict`: one mean-pooled BERT vector per row, keyed by `Book`.
# Forward slash keeps the path portable; the previous 'data\selected...' literal
# contained the invalid escape '\s' and only resolved on Windows.
loaded_data = pd.read_csv('data/selected_book_top_1200_data_tag.csv')

tag_embedding_dict = {}

# Inference only: no_grad() stops autograd from recording the forward pass.
with torch.no_grad():
    for index, rows in tqdm(loaded_data.iterrows(), total=len(loaded_data)):
        # Convert tag list to string
        # NOTE(review): read_csv presumably yields `Tags` as one plain string, in
        # which case this join puts a space between every character (brackets and
        # quotes included) instead of between tags - confirm the intended format.
        tags_str = " ".join(rows.Tags)
        # Tokenize and move every tensor to the device the model lives on
        # (no hard-coded .cuda(), so the cell also runs on CPU).
        inputs = tokenizer(tags_str, truncation=True, return_tensors='pt').to(model.device)
        # Use BERT to get tag embedding.
        # Arguments are passed by keyword on purpose: BertModel.forward takes
        # (input_ids, attention_mask, token_type_ids) positionally, so the former
        # positional call fed the segment ids in as the attention mask and vice versa.
        outputs = model(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            token_type_ids=inputs.token_type_ids,
        )
        # Average over the token axis (dim=1) and keep the result on the CPU so the
        # dictionary can be pickled without a GPU.
        tag_embedding = outputs.last_hidden_state.mean(dim=1).cpu()
        tag_embedding_dict[rows.Book] = tag_embedding

In [None]:
# Save embeddings dict
# Serialise the in-memory `tag_embedding_dict` to disk in binary pickle format.
with open('data/tag_embedding_dict.pkl', 'wb') as pkl_out:
    pickle.dump(tag_embedding_dict, pkl_out)

In [None]:
# Load embeddings dict
# Rebinds `tag_embedding_dict` from the pickle file on disk.
# pickle.load can execute arbitrary code: only load files you produced yourself.
with open('data/tag_embedding_dict.pkl', 'rb') as pkl_in:
    tag_embedding_dict = pickle.load(pkl_in)

In [None]:
# Calculate Score Tag Embeddings
# Builds `rating_embedding_dict`: one mean-pooled BERT vector per row, keyed by
# the (User, Book) pair of that row.
# Forward slash keeps the path portable; 'data\\book_score.csv' only resolved on Windows.
loaded_data = pd.read_csv('data/book_score.csv')

rating_embedding_dict = {}

# Inference only: no_grad() stops autograd from recording the forward pass.
with torch.no_grad():
    for index, rows in tqdm(loaded_data.iterrows(), total=len(loaded_data)):
        # Convert tag list to string
        # NOTE(review): read_csv presumably yields `Tags` as one plain string, in
        # which case this join puts a space between every character instead of
        # between tags - confirm the intended format of this column.
        tags_str = " ".join(rows.Tags)
        # Tokenize and move every tensor to the device the model lives on
        # (no hard-coded .cuda(), so the cell also runs on CPU).
        inputs = tokenizer(tags_str, truncation=True, return_tensors='pt').to(model.device)
        # Use BERT to get tag embedding.
        # Arguments are passed by keyword on purpose: BertModel.forward takes
        # (input_ids, attention_mask, token_type_ids) positionally, so the former
        # positional call fed the segment ids in as the attention mask and vice versa.
        outputs = model(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            token_type_ids=inputs.token_type_ids,
        )
        # Average over the token axis (dim=1) and keep the result on the CPU so the
        # dictionary can be pickled without a GPU.
        tag_embedding = outputs.last_hidden_state.mean(dim=1).cpu()
        rating_embedding_dict[(rows.User, rows.Book)] = tag_embedding

In [None]:
# Save rating embeddings dict
# Serialise the in-memory `rating_embedding_dict` to disk in binary pickle format.
with open('data/rating_embedding_dict.pkl', 'wb') as pkl_out:
    pickle.dump(rating_embedding_dict, pkl_out)

In [None]:
# Load rating embeddings dict
# Rebinds `rating_embedding_dict` from the pickle file on disk.
# pickle.load can execute arbitrary code: only load files you produced yourself.
with open('data/rating_embedding_dict.pkl', 'rb') as pkl_in:
    rating_embedding_dict = pickle.load(pkl_in)

In [None]:
# Create Dataset Class
class BookRatingDataset(Dataset):
    """Map-style dataset yielding one rating record per DataFrame row.

    Each item is the tuple
    ``(user, book, rating, tag_embedding, rating_embedding)`` where ``user`` and
    ``book`` are the values stored in ``user_idx`` / ``book_idx`` for the row's
    raw ``'User'`` / ``'Book'`` entries, ``rating`` is the row's ``'Rate'`` cast
    to float32, and the two embeddings are looked up in the supplied dicts.
    """

    def __init__(self, data, user_idx, book_idx, tag_embedding_dict, rating_embedding_dict):
        """Store the frame and the lookup tables; nothing is copied or validated.

        Args:
            data: DataFrame indexed positionally; must provide the columns
                'User', 'Book' and 'Rate'.
            user_idx: mapping from raw 'User' value to the value returned as ``user``.
            book_idx: mapping from raw 'Book' value to the value returned as ``book``.
            tag_embedding_dict: mapping keyed by raw 'Book' value.
            rating_embedding_dict: mapping keyed by ``(raw 'User', raw 'Book')``.
        """
        self.data = data
        self.user_idx, self.book_idx = user_idx, book_idx
        self.tag_embedding_dict = tag_embedding_dict
        self.rating_embedding_dict = rating_embedding_dict

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the sample at positional ``index``.

        Raises:
            KeyError: if the row's user, book or (user, book) pair is missing
                from the corresponding lookup table.
        """
        record = self.data.iloc[index]
        raw_user = record['User']
        user = self.user_idx[raw_user]
        raw_book = record['Book']
        book = self.book_idx[raw_book]
        # The scalar's own .astype is used, so 'Rate' must be a NumPy scalar here.
        rating = record['Rate'].astype('float32')
        return (
            user,
            book,
            rating,
            self.tag_embedding_dict[raw_book],
            self.rating_embedding_dict[(raw_user, raw_book)],
        )

In [None]:
# Create Model
class RatingPredictionModel(nn.Module):
    """Linear rating predictor over four concatenated feature blocks.

    User and book ids are looked up in learned embedding tables; the two text
    embeddings are each projected by their own linear layer to the same width.
    The four resulting blocks are concatenated along dim 1 and mapped to a
    single output value by a final linear layer.
    """

    def __init__(self, user_count, book_count, entity_embeddings_dim, text_embeddings_dim):
        """Create the layers.

        Args:
            user_count: number of rows in the user embedding table.
            book_count: number of rows in the book embedding table.
            entity_embeddings_dim: width of every feature block fed to the head.
            text_embeddings_dim: input width of the two text projections.
        """
        super().__init__()
        # Layers are created in this fixed order; reordering would change which
        # random draws each layer receives under a fixed seed.
        self.user_embeddings = nn.Embedding(user_count, entity_embeddings_dim)
        self.book_embeddings = nn.Embedding(book_count, entity_embeddings_dim)
        self.book_tag_embeddings = nn.Linear(text_embeddings_dim, entity_embeddings_dim)
        self.rating_tag_embeddings = nn.Linear(text_embeddings_dim, entity_embeddings_dim)
        # Four blocks of width entity_embeddings_dim are concatenated before this head.
        self.predict_rating = nn.Linear(entity_embeddings_dim * 4, 1)

    def forward(self, user, book, tag_embedding, rating_embedding):
        """Predict one value per sample.

        Args:
            user: integer index tensor for the user embedding table.
            book: integer index tensor for the book embedding table.
            tag_embedding: text features for the book; last dim must equal
                ``text_embeddings_dim``.
            rating_embedding: text features for the rating; last dim must equal
                ``text_embeddings_dim``.

        Returns:
            Output of the final linear layer (last dimension of size 1).
        """
        # NOTE(review): torch.cat on dim=1 needs all four blocks to have the same
        # rank - presumably callers pass 1-D id tensors and 2-D text features; confirm.
        feature_blocks = (
            self.user_embeddings(user),
            self.book_embeddings(book),
            self.book_tag_embeddings(tag_embedding),
            self.rating_tag_embeddings(rating_embedding),
        )
        joined = torch.cat(feature_blocks, dim=1)
        return self.predict_rating(joined)