In [4]:
import torch
import pickle
import numpy as np
import pandas as pd
from torch import nn
from tqdm import tqdm
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
# Load Bert Model
from transformers import BertTokenizer, BertModel
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertModel.from_pretrained('bert-base-chinese').cuda()

In [6]:
# Calculate Tag Embeddings
loaded_data = pd.read_csv('data\selected_movie_top_1200_data_tag.csv')

tag_embedding_dict = {}

with torch.no_grad():
    for index, rows in tqdm(loaded_data.iterrows()):
        # Convert tag list to string
        tags_str = " ".join(rows.Tags)
        # Use BERT to get tag embedding
        inputs = tokenizer(tags_str, truncation=True, return_tensors='pt')
        outputs = model(inputs.input_ids.cuda(), inputs.token_type_ids.cuda(), inputs.attention_mask.cuda())
        tag_embedding = outputs.last_hidden_state.mean(dim=1).cpu()
        tag_embedding_dict[rows.Movie] = tag_embedding

1200it [00:37, 32.12it/s]


In [7]:
# Save embeddings dict
with open('data/movie_tag_embedding_dict.pkl', 'wb') as f:
    pickle.dump(tag_embedding_dict, f)

In [8]:
# Load embeddings dict
with open('data/movie_tag_embedding_dict.pkl', 'rb') as f:
    tag_embedding_dict = pickle.load(f)

In [9]:
# Calculate Score Tag Embeddings
loaded_data = pd.read_csv('data\\movie_score.csv')

rating_embedding_dict = {}
string_to_embedding_dict = {}

with torch.no_grad():
    for index, rows in tqdm(loaded_data.iterrows()):
        # Convert tag list to string
        tags_str = '' if rows.Tag != rows.Tag else rows.Tag
        if tags_str in string_to_embedding_dict:
            tag_embedding = string_to_embedding_dict[tags_str]
        else:
            # Use BERT to get tag embedding
            inputs = tokenizer(tags_str, truncation=True, return_tensors='pt')
            outputs = model(inputs.input_ids.cuda(), inputs.token_type_ids.cuda(), inputs.attention_mask.cuda())
            tag_embedding = outputs.last_hidden_state.mean(dim=1).cpu()
            string_to_embedding_dict[tags_str] = tag_embedding
        rating_embedding_dict[(rows.User, rows.Movie)] = tag_embedding

715024it [22:09, 537.87it/s] 


In [10]:
# Save rating embeddings dict
with open('data/movie_rating_embedding_dict.pkl', 'wb') as f:
    pickle.dump(rating_embedding_dict, f)

In [11]:
# Load rating embeddings dict
with open('data/movie_rating_embedding_dict.pkl', 'rb') as f:
    rating_embedding_dict = pickle.load(f)

In [12]:
# Create Dataset Class
class MovieRatingDataset(Dataset):
    def __init__(self, data, user_idx, movie_idx, tag_embedding_dict, rating_embedding_dict):
        self.data = data
        self.user_idx = user_idx
        self.movie_idx = movie_idx
        self.tag_embedding_dict = tag_embedding_dict
        self.rating_embedding_dict = rating_embedding_dict

    def __getitem__(self, index):
        row = self.data.iloc[index]
        user = self.user_idx[row['User']]
        movie = self.movie_idx[row['Movie']]
        rating = row['Rate'].astype('float32')
        tag_embedding = self.tag_embedding_dict[row['Movie']]
        rating_embedding = self.rating_embedding_dict[(row['User'], row['Movie'])]
        return user, movie, rating, tag_embedding, rating_embedding

    def __len__(self):
        return len(self.data)

In [13]:
# Create Model
class RatingPredictionModel(nn.Module):
    def __init__(self, user_count, movie_count, entity_embeddings_dim, text_embeddings_dim):
        super(RatingPredictionModel, self).__init__()
        self.user_embeddings = nn.Embedding(user_count, entity_embeddings_dim)
        self.movie_embeddings = nn.Embedding(movie_count, entity_embeddings_dim)
        self.movie_tag_embeddings = nn.Linear(text_embeddings_dim, entity_embeddings_dim)
        self.rating_tag_embeddings = nn.Linear(text_embeddings_dim, entity_embeddings_dim)
        self.movie_integrated = nn.Linear(entity_embeddings_dim * 3, entity_embeddings_dim)
        self.integrated = nn.Linear(entity_embeddings_dim * 2, 16)
        self.predict_rating = nn.Linear(16, 1)
        self.activation = nn.Sigmoid()
    
    def forward(self, user, movie, tag_embedding, rating_embedding):
        user_embedding = self.user_embeddings(user)
        movie_embedding = self.movie_embeddings(movie)
        movie_tag_embedding = self.activation(self.movie_tag_embeddings(tag_embedding))
        rating_tag_embedding = self.activation(self.rating_tag_embeddings(rating_embedding))
        movie_embeddings_integrated = torch.cat([movie_embedding, movie_tag_embedding, rating_tag_embedding], dim=1)
        movie_integrated_result = self.activation(self.movie_integrated(movie_embeddings_integrated))
        embeddings_integrated = torch.cat([user_embedding, movie_integrated_result], dim=1)
        integrated_result = self.activation(self.integrated(embeddings_integrated))
        return self.predict_rating(integrated_result)

In [14]:
# Given uniqued id list, returns two direction mappings
def id_map(ids):
    id_to_idx = {v: k for k, v in enumerate(ids)}
    idx_to_id = {k: v for k, v in enumerate(ids)}
    return id_to_idx, idx_to_id

In [23]:
# Initializing data
user_to_idx, idx_to_user = id_map(loaded_data['User'].unique())
movie_to_idx, idx_to_movie = id_map(loaded_data['Movie'].unique())

# Split data
train_data, test_data = train_test_split(loaded_data, test_size=0.5, random_state=42)

# Create Dataset and DataLoader
train_dataset = MovieRatingDataset(train_data, user_to_idx, movie_to_idx, tag_embedding_dict, rating_embedding_dict)
test_dataset = MovieRatingDataset(test_data, user_to_idx, movie_to_idx, tag_embedding_dict, rating_embedding_dict)
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False, drop_last=True)

# Create Model
model = RatingPredictionModel(len(user_to_idx), len(movie_to_idx), 50, 768).cuda()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.02)

In [24]:
# Training
norm_penalty = 0.001
total_epoches = 20

for epoch in range(total_epoches):
    model.train()
    train_loss, test_loss = 0., 0.
    for batch_id, (user, movie, rating, tag_embedding, rating_embedding) in tqdm(enumerate(train_loader)):
        user = user.cuda()
        movie = movie.cuda()
        rating = rating.cuda()
        tag_embedding = tag_embedding.squeeze(1).cuda()
        rating_embedding = rating_embedding.squeeze(1).cuda()
        optimizer.zero_grad()
        pred = model(user, movie, tag_embedding, rating_embedding)
        loss = criterion(pred, rating.unsqueeze(1))
        l2_loss = norm_penalty * sum(p.pow(2.0).sum().sqrt() for p in model.parameters())
        loss += l2_loss
    
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    model.eval()
    average_train_loss = train_loss / (batch_id + 1)
    results = []

    with torch.no_grad():
        for batch_id, (user, movie, rating, tag_embedding, rating_embedding) in tqdm(enumerate(test_loader)):
            user = user.cuda()
            movie = movie.cuda()
            rating = rating.cuda()
            tag_embedding = tag_embedding.squeeze(1).cuda()
            rating_embedding = rating_embedding.squeeze(1).cuda()
            pred = model(user, movie, tag_embedding, rating_embedding)
            loss = criterion(pred, rating.unsqueeze(1))
            test_loss += loss.item()

            user_ids = user.cpu().unsqueeze(1)
            # pred_ratings = pred.cpu().unsqueeze(1)
            pred_ratings = pred.cpu()
            true_ratings = rating.cpu().unsqueeze(1)
            results.append(torch.cat([user_ids, pred_ratings, true_ratings], dim=1))
        
        results = torch.cat(results).numpy()
        results_dataframe = pd.DataFrame(results, columns=['user_id', 'pred_rating', 'true_rating'])
        avg_ndcg_scores = []
        
        for user_id in results_dataframe['user_id'].unique():
            user_data = results_dataframe[results_dataframe['user_id'] == user_id]
            if len(user_data) > 1:
                avg_ndcg_scores.append(torch.tensor(ndcg_score([user_data['true_rating'].values], [user_data['pred_rating'].values], k=50)))
    
        avg_ndcg_scores = torch.stack(avg_ndcg_scores)
        avg_ndcg_score = avg_ndcg_scores.mean()
        
        average_test_loss = test_loss / (batch_id + 1)
    print('Epoch [{}/{}], Train Loss: {:.4f}, Test Loss: {:.4f}, Avg NDCG Score: {:.4f}'.format(epoch+1, total_epoches, average_train_loss, average_test_loss, avg_ndcg_score))
    # torch.save(model, 'data/models/sodiumcl10_model_{}.pth'.format(epoch))
    # print("Model saved to data/models/sodiumcl10_model_{}.pth".format(epoch))

1396it [00:32, 42.98it/s]
1396it [00:25, 55.04it/s]


Epoch [1/20], Train Loss: 2.1055, Test Loss: 1.5707, Avg NDCG Score: 0.7343


1396it [00:32, 43.22it/s]
1396it [00:26, 53.33it/s]


Epoch [2/20], Train Loss: 1.8101, Test Loss: 1.5093, Avg NDCG Score: 0.7495


1396it [00:31, 43.67it/s]
1396it [00:24, 56.98it/s]


Epoch [3/20], Train Loss: 1.7799, Test Loss: 1.4846, Avg NDCG Score: 0.7547


1396it [00:32, 43.11it/s]
1396it [00:25, 54.58it/s]


Epoch [4/20], Train Loss: 1.7561, Test Loss: 1.4844, Avg NDCG Score: 0.7615


1396it [00:32, 42.96it/s]
1396it [00:24, 55.89it/s]


Epoch [5/20], Train Loss: 1.7420, Test Loss: 1.4587, Avg NDCG Score: 0.7617


1396it [00:31, 44.02it/s]
1396it [00:25, 55.60it/s]


Epoch [6/20], Train Loss: 1.7358, Test Loss: 1.4654, Avg NDCG Score: 0.7640


1396it [00:32, 42.50it/s]
1396it [00:25, 54.85it/s]


Epoch [7/20], Train Loss: 1.7183, Test Loss: 1.4472, Avg NDCG Score: 0.7692


1396it [00:31, 44.42it/s]
1396it [00:24, 56.92it/s]


Epoch [8/20], Train Loss: 1.7163, Test Loss: 1.4565, Avg NDCG Score: 0.7675


1396it [00:32, 42.51it/s]
1396it [00:24, 56.70it/s]


Epoch [9/20], Train Loss: 1.7177, Test Loss: 1.4439, Avg NDCG Score: 0.7689


1396it [00:32, 43.60it/s]
1396it [00:24, 57.73it/s]


Epoch [10/20], Train Loss: 1.7173, Test Loss: 1.4410, Avg NDCG Score: 0.7692


1396it [00:30, 45.22it/s]
1396it [00:25, 54.11it/s]


Epoch [11/20], Train Loss: 1.7140, Test Loss: 1.4407, Avg NDCG Score: 0.7682


1396it [00:31, 43.90it/s]
1396it [00:24, 57.58it/s]


Epoch [12/20], Train Loss: 1.7062, Test Loss: 1.4358, Avg NDCG Score: 0.7673


1396it [00:32, 43.60it/s]
1396it [00:24, 56.84it/s]


Epoch [13/20], Train Loss: 1.7063, Test Loss: 1.4490, Avg NDCG Score: 0.7667


1396it [00:31, 44.35it/s]
1396it [00:25, 55.42it/s]


Epoch [14/20], Train Loss: 1.7067, Test Loss: 1.4854, Avg NDCG Score: 0.7664


1396it [00:31, 44.31it/s]
1396it [00:24, 56.81it/s]


Epoch [15/20], Train Loss: 1.7049, Test Loss: 1.4429, Avg NDCG Score: 0.7660


1396it [00:31, 43.65it/s]
1396it [00:24, 55.90it/s]


Epoch [16/20], Train Loss: 1.7046, Test Loss: 1.4421, Avg NDCG Score: 0.7689


1396it [00:32, 42.87it/s]
1396it [00:24, 56.41it/s]


Epoch [17/20], Train Loss: 1.7038, Test Loss: 1.4414, Avg NDCG Score: 0.7698


1396it [00:30, 45.67it/s]
1396it [00:24, 56.75it/s]


Epoch [18/20], Train Loss: 1.6954, Test Loss: 1.4602, Avg NDCG Score: 0.7660


1396it [00:30, 45.18it/s]
1396it [00:25, 55.27it/s]


Epoch [19/20], Train Loss: 1.6973, Test Loss: 1.4351, Avg NDCG Score: 0.7708


1396it [00:30, 46.28it/s]
1396it [00:24, 56.37it/s]


Epoch [20/20], Train Loss: 1.6976, Test Loss: 1.4452, Avg NDCG Score: 0.7706
