In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Install the necessary libraries
!pip install pytorch_lightning
!pip install transformers
!pip install dataset


Collecting pytorch_lightning
  Downloading pytorch_lightning-1.6.0-py3-none-any.whl (582 kB)
[K     |████████████████████████████████| 582 kB 31.6 MB/s 
Collecting torchmetrics>=0.4.1
  Downloading torchmetrics-0.7.3-py3-none-any.whl (398 kB)
[K     |████████████████████████████████| 398 kB 62.9 MB/s 
Collecting typing-extensions>=4.0.0
  Downloading typing_extensions-4.1.1-py3-none-any.whl (26 kB)
Collecting fsspec[http]!=2021.06.0,>=2021.05.0
  Downloading fsspec-2022.3.0-py3-none-any.whl (136 kB)
[K     |████████████████████████████████| 136 kB 66.9 MB/s 
[?25hCollecting pyDeprecate<0.4.0,>=0.3.1
  Downloading pyDeprecate-0.3.2-py3-none-any.whl (10 kB)
Collecting PyYAML>=5.4
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 68.3 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12

In [None]:
#import the necessary packages
import pickle
import torch
import random
import pandas as pd
import numpy as np
import os
import math
from torch.nn import BCEWithLogitsLoss,MSELoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tqdm import tqdm, trange
import matplotlib.pyplot as plt
from transformers import DistilBertTokenizer, DistilBertForMaskedLM,AdamW

%matplotlib inline


In [None]:
# Load the pickled dataset from the mounted Google Drive.
# The loaded object is indexed later in this notebook with the keys
# 'all_movies', 'train', 'test' and 'valid'.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
with open('./drive/MyDrive/Data/English_TGD2_TGD_All_data.pkl','rb') as file:
    TGD_All_data=pickle.load(file)


In [None]:
# Collect every movie title and every movie id found in the dataset.
# The ids are registered further down as extra tokens of the pre-trained tokenizer.
TGD_movie_names = [movie_info['name'] for movie_info in TGD_All_data['all_movies'].values()]
TGD_movie_ids = list(TGD_All_data['all_movies'].keys())

In [None]:
# reshape all the movies for each user in a sequence way. example:"user1" : @movie1,@movie2,@movie3 ...
def create_rows_movie_sequences(TGD_All_data,typee='train'):
  """Build one space-separated string of movie ids per dialogue of a split.

  Args:
    TGD_All_data: mapping whose ``typee`` entry is an iterable of dialogue
      dicts, each holding a 'user_history_movies_interaction_ids' sequence
      of strings.
    typee: key of the split to read (this notebook uses 'train', 'test'
      and 'valid').

  Returns:
    A list of strings, one per dialogue, in the original order. Dialogues
    whose joined history is the empty string are left out.
  """
  joined_histories = (
      ' '.join(dialogue['user_history_movies_interaction_ids'])
      for dialogue in TGD_All_data[typee]
  )
  # An empty interaction history joins to '', which carries nothing to learn from.
  return [history for history in joined_histories if history != '']

train_list_of_sequences=create_rows_movie_sequences(TGD_All_data,typee='train')
test_list_of_sequences=create_rows_movie_sequences(TGD_All_data,typee='test')
valid_list_of_sequences=create_rows_movie_sequences(TGD_All_data,typee='valid')

In [None]:
# create the train, test, and validation dataframes
train_df= pd.DataFrame({'interactions':train_list_of_sequences})
test_df= pd.DataFrame({'interactions':test_list_of_sequences})
valid_df= pd.DataFrame({'interactions':valid_list_of_sequences})




In [None]:
# Randomly shuffle the rows of each split.
# A fixed random_state makes the resulting row order reproducible from one
# "Restart & Run All" to the next; change the value to draw another permutation.
from sklearn.utils import shuffle
train_df=shuffle(train_df, random_state=42)
test_df=shuffle(test_df, random_state=42)
valid_df=shuffle(valid_df, random_state=42)

In [None]:
# Instantiate the DistilBERT tokenizer and add the movie ids (TGD_movie_ids, not the movie names) to its vocabulary.
# num_added_toks holds the value returned by add_tokens.
tokenizer_4rec = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
num_added_toks = tokenizer_4rec.add_tokens(TGD_movie_ids)


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
# Invert the tokenizer vocabulary: maps each vocabulary value (the integer token id) back to its token string
idtotoken={value:key for key,value in tokenizer_4rec.get_vocab().items()}

In [None]:
# create a function to tokenize the movie sequences
def sequence_Input_Tokenization(liste, tokenizer):
    """Tokenize movie sequences and attach a copy of the ids as labels.

    Args:
        liste: the sequences to encode (the notebook passes a list of strings).
        tokenizer: callable tokenizer; it is invoked with truncation enabled,
            padding to ``max_length`` 512 and PyTorch tensors as return type.

    Returns:
        The tokenizer output with an extra ``'labels'`` entry holding a
        detached clone of ``input_ids``.
    """
    tokenizer_options = dict(
        truncation=True,
        return_tensors='pt',
        return_attention_mask=True,
        max_length=512,
        padding='max_length',
    )
    encoded = tokenizer(liste, **tokenizer_options)
    # Clone so that later in-place masking of input_ids leaves the labels intact.
    encoded['labels'] = encoded.input_ids.detach().clone()
    return encoded

In [None]:
#for each user we keep only the movie sequences 
train_data=train_df['interactions']
test_data=test_df['interactions']
valid_data=valid_df['interactions']


In [None]:
len(train_data)

8328

In [None]:
#store the tokenizer 

tokenizer_4rec.save_pretrained('drive/MyDrive/Models/NextMovie/Tokenizer')

('drive/MyDrive/Models/NextMovie/Tokenizer/tokenizer_config.json',
 'drive/MyDrive/Models/NextMovie/Tokenizer/special_tokens_map.json',
 'drive/MyDrive/Models/NextMovie/Tokenizer/vocab.txt',
 'drive/MyDrive/Models/NextMovie/Tokenizer/added_tokens.json')

In [None]:
MLM_train_inputs=sequence_Input_Tokenization(list(train_data.values), tokenizer_4rec) # tokenize the training set 

In [None]:
MLM_test_inputs=sequence_Input_Tokenization(list(test_data.values), tokenizer_4rec)# tokenize the testing set 

In [None]:
MLM_valid_inputs=sequence_Input_Tokenization(list(valid_data.values), tokenizer_4rec) # tokenize the validation set 

In [None]:
#Create the first type of inputs : The first network input : the Cusal network input
# Mask random 15% movies for each sequence 
#This code borrow ideas from : https://towardsdatascience.com/masked-language-modelling-with-bert-7d49793e5d2c
def mask_train_data(inputs):
  """Randomly replace about 15% of the real tokens of every sequence with [MASK].

  Positions holding [CLS], [SEP] or [PAD] are never masked. The replacement is
  done in place on ``inputs.input_ids``; ``inputs.labels`` is not touched.

  Args:
    inputs: tokenizer output exposing an ``input_ids`` tensor.

  Returns:
    The same ``inputs`` object, with ``input_ids`` modified in place.
  """
  # Look the special-token ids up once instead of calling get_vocab() for
  # every comparison and for every row.
  vocab = tokenizer_4rec.get_vocab()
  token_ids = inputs.input_ids

  rand = torch.rand(token_ids.shape)
  is_maskable = (
      (token_ids != vocab['[CLS]'])
      & (token_ids != vocab['[SEP]'])
      & (token_ids != vocab['[PAD]'])
  )
  # One boolean-mask assignment covers all rows at once.
  token_ids[(rand < 0.15) & is_maskable] = vocab['[MASK]']

  # NOTE(review): the labels keep the original id at every position (padding
  # included); if the loss should only cover masked positions, the other label
  # positions would need the ignore index — confirm the intent.
  return inputs
MLM_train_inputs=mask_train_data(MLM_train_inputs)
MLM_valid_inputs=mask_train_data(MLM_valid_inputs)


In [None]:
# Create the second type of inputs: the prefix network input,
# in which only the last movie of each sequence is masked.
# The ids are cloned from `labels` (the untouched copy of the token ids) and not from
# `input_ids`: mask_train_data has already overwritten about 15% of `input_ids` in place
# with [MASK], and cloning it would carry those random masks into this input as well.
MLM_train_inputs_mask_last_input_ids = MLM_train_inputs.labels.detach().clone()
MLM_train_inputs_mask_last_mask = MLM_train_inputs.attention_mask.detach().clone()
MLM_train_inputs_mask_last_labels = MLM_train_inputs.labels.detach().clone()
MLM_valid_inputs_mask_last_input_ids = MLM_valid_inputs.labels.detach().clone()
MLM_valid_inputs_mask_last_mask = MLM_valid_inputs.attention_mask.detach().clone()
MLM_valid_inputs_mask_last_labels = MLM_valid_inputs.labels.detach().clone()
def mask_last_train_data(inputs):
  """Replace the token right before [SEP] in every row with [MASK].

  Args:
    inputs: 2-D tensor of token ids (one row per sequence); modified in place.

  Returns:
    The same tensor, with the position preceding each [SEP] set to the
    [MASK] id.
  """
  # Resolve the two special-token ids once, outside the row loop.
  vocab = tokenizer_4rec.get_vocab()
  sep_id = vocab['[SEP]']
  mask_id = vocab['[MASK]']

  for row in range(inputs.shape[0]):
    sep_positions = (inputs[row] == sep_id).nonzero(as_tuple=True)[0]
    # The last item of the sequence sits immediately before [SEP].
    inputs[row, sep_positions - 1] = mask_id

  return inputs
MLM_train_inputs_mask_last_input_ids=mask_last_train_data(MLM_train_inputs_mask_last_input_ids)
MLM_valid_inputs_mask_last_input_ids=mask_last_train_data(MLM_valid_inputs_mask_last_input_ids)


In [None]:
# change the names of our input and labels for simplicity

MLM_X_train,MLM_train_masks,MLM_Y_train, = (MLM_train_inputs.input_ids,MLM_train_inputs.attention_mask,MLM_train_inputs.labels)
MLM_X_valid,MLM_valid_masks,MLM_Y_valid = (MLM_valid_inputs.input_ids,MLM_valid_inputs.attention_mask,MLM_valid_inputs.labels)


In [None]:

# Wrap the tensors in datasets and batch them for the PyTorch training loop.
# Each dataset item is a 6-tuple, in the order the training loop unpacks it:
# (input ids, attention mask, labels) for the first network, followed by
# (input ids, attention mask, labels) for the "last item masked" network.
batch_size=5
MLM_train_data = TensorDataset(MLM_X_train,MLM_train_masks,MLM_Y_train,MLM_train_inputs_mask_last_input_ids,MLM_train_inputs_mask_last_mask,MLM_train_inputs_mask_last_labels)
# Training batches are drawn in random order.
MLM_train_sampler = RandomSampler(MLM_train_data)
MLM_train_dataloader = DataLoader(MLM_train_data,\
                              sampler=MLM_train_sampler,\
                              batch_size=batch_size)

MLM_validation_data = TensorDataset(MLM_X_valid,MLM_valid_masks,MLM_Y_valid,MLM_valid_inputs_mask_last_input_ids,MLM_valid_inputs_mask_last_mask,MLM_valid_inputs_mask_last_labels)
# Validation batches are read sequentially.
MLM_validation_sampler = SequentialSampler(MLM_validation_data)
MLM_validation_dataloader = DataLoader(MLM_validation_data,\
                                   sampler=MLM_validation_sampler,\
                                   batch_size=batch_size)

In [None]:
# define the training device 
device =torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 


#OnlyNextItemToPredict

In [None]:
#Create our model
class DistilBert_TGD_Ensemble_no_BIG5(torch.nn.Module):
  """Two DistilBERT masked-language models trained jointly on movie-id sequences.

  ``electra`` is fed the first set of inputs and ``electra_mask_last`` the
  ``*_mask_last`` inputs. The two networks are separate instances and share
  no parameters. The attribute names are kept as they are because they are
  the key prefixes of the state dict stored in saved checkpoints.
  """

  def __init__(self):
    super(DistilBert_TGD_Ensemble_no_BIG5, self).__init__()

    # First DistilBERT model.
    model=DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')
    # Resize the embeddings so they match the tokenizer, which was extended with the movie tokens.
    model.resize_token_embeddings(len(tokenizer_4rec))

    # Second DistilBERT model, resized the same way.
    model_mask_last=DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')
    model_mask_last.resize_token_embeddings(len(tokenizer_4rec))

    self.electra = model
    self.electra_mask_last = model_mask_last

  def forward(self, input_ids, token_type_ids=None,attention_mask=None, labels=None,input_ids_mask_last=None, token_type_ids_mask_last=None,attention_mask_mask_last=None, labels_mask_last=None):
    """Run both networks on their respective inputs.

    Args:
      input_ids, attention_mask, labels: inputs of the first network.
      input_ids_mask_last, attention_mask_mask_last, labels_mask_last:
        inputs of the second network.
      token_type_ids, token_type_ids_mask_last: accepted for interface
        compatibility; not used.

    Returns:
      The sum of the two networks' losses when ``labels`` is given
      (``labels_mask_last`` is expected to be given together with it),
      otherwise the tuple ``(logits, logits_mask_last)``.
    """
    MLM_outputs = self.electra(input_ids, attention_mask=attention_mask,labels=labels)
    MLM_outputs_mask_last= self.electra_mask_last(input_ids_mask_last, attention_mask=attention_mask_mask_last,labels=labels_mask_last)

    if labels is not None:
      # Joint objective: both networks are optimised through a single scalar.
      return MLM_outputs.loss + MLM_outputs_mask_last.loss
    return MLM_outputs.logits,MLM_outputs_mask_last.logits



In [None]:
# Define the number of training epochs and the file the checkpoint is written to
num_epochs=20

# The checkpoint path is built relative to the current working directory;
# output_model_file is bound to the same string as model_save_path.
# NOTE(review): the original trailing note on the path line was not in English; it appears
# to be a reminder to set this Drive path explicitly — confirm with the author.
cwd = os.getcwd()
model_save_path = output_model_file = os.path.join(cwd, "drive/MyDrive/Models/NextMovie/DistilBert_TGD_Ensemble_no_BIG5_last_masked_and_NormalMask_OnlyNextItemToPredict.bin")



In [None]:
# instanciate our model
model = DistilBert_TGD_Ensemble_no_BIG5()
model.to(device) # specify that the model will be running on the perdefined device that we have 


Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

DistilBert_TGD_Ensemble_no_BIG5(
  (distilbert): DistilBertForMaskedLM(
    (distilbert): DistilBertModel(
      (embeddings): Embeddings(
        (word_embeddings): Embedding(64356, 768)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (transformer): Transformer(
        (layer): ModuleList(
          (0): TransformerBlock(
            (attention): MultiHeadSelfAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=True)
              (v_lin): Linear(in_features=768, out_features=768, bias=True)
              (out_lin): Linear(in_features=768, out_features=768, bias=True)
            )
            (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (ffn): FFN(
  

In [None]:
#Define the training function : this code is copied from https://towardsdatascience.com/multi-label-text-classification-with-xlnet-b5f5755302df


def train(model, num_epochs,
          optimizer,
          train_dataloader, valid_dataloader,
          model_save_path,
          train_loss_set=None, valid_loss_set=None,
          lowest_eval_loss=None, start_epoch=0,
          device="cpu"
          ):
  """
  Train the model and save the model with the lowest validation loss

  Args:
    model: module returning a scalar loss when called with the six batch
      tensors as keyword arguments (see the calls below).
    num_epochs: number of epochs to run.
    optimizer: optimizer already bound to the model parameters.
    train_dataloader, valid_dataloader: iterables yielding 6-tuples of tensors.
    model_save_path: file the best checkpoint is written to via save_model.
    train_loss_set, valid_loss_set: optional existing loss histories to
      extend (e.g. when resuming); a new list is created when omitted.
    lowest_eval_loss: best validation loss seen so far, or None.
    start_epoch: offset added to the epoch index stored in the checkpoint.
    device: device the model and the batches are moved to.

  Returns:
    (model, train_loss_set, valid_loss_set)

  NOTE(review): each batch adds its loss value once while the divisor counts
  samples, so if the model returns a batch-averaged loss the reported epoch
  losses are scaled down by roughly the batch size. Left unchanged so the
  values stay comparable with previously saved checkpoints.
  """
  # Create fresh histories per call: mutable `[]` defaults would be shared
  # between calls and silently accumulate losses from earlier runs.
  if train_loss_set is None:
    train_loss_set = []
  if valid_loss_set is None:
    valid_loss_set = []

  model.to(device)

  for i in trange(num_epochs, desc="Epoch"):

    actual_epoch = start_epoch + i

    # ---- Training ----
    model.train()

    tr_loss = 0
    num_train_samples = 0

    for batch in train_dataloader:
      # Move the batch to the training device
      batch = tuple(t.to(device) for t in batch)
      # Unpack the inputs from our dataloader
      b_input_ids, b_input_mask, b_labels,mask_last_input_ids,mask_last_attention_mask,mask_last_labels= batch
      # Clear out the gradients (by default they accumulate)
      optimizer.zero_grad()
      # Forward pass
      loss = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels,input_ids_mask_last=mask_last_input_ids,attention_mask_mask_last=mask_last_attention_mask,labels_mask_last=mask_last_labels)
      # Store train loss
      tr_loss += loss.item()
      num_train_samples += b_labels.size(0)
      # Backward pass
      loss.backward()
      # Update parameters and take a step using the computed gradient
      optimizer.step()

    # Update tracking variables
    epoch_train_loss = tr_loss/num_train_samples
    train_loss_set.append(epoch_train_loss)

    print("Train loss: {}".format(epoch_train_loss))

    # ---- Validation ----
    # Put model in evaluation mode to evaluate loss on the validation set
    model.eval()

    eval_loss = 0
    num_eval_samples = 0

    for batch in valid_dataloader:
      batch = tuple(t.to(device) for t in batch)
      b_input_ids, b_input_mask, b_labels,mask_last_input_ids,mask_last_attention_mask,mask_last_labels= batch
      # No gradients are needed for validation: saves memory and time
      with torch.no_grad():
        loss = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels,input_ids_mask_last=mask_last_input_ids,attention_mask_mask_last=mask_last_attention_mask,labels_mask_last=mask_last_labels)
        eval_loss += loss.item()
        num_eval_samples += b_labels.size(0)

    epoch_eval_loss = eval_loss/num_eval_samples
    valid_loss_set.append(epoch_eval_loss)

    print("Valid loss: {}".format(epoch_eval_loss))

    # Checkpoint on the first epoch and whenever the validation loss improves
    if lowest_eval_loss is None or epoch_eval_loss < lowest_eval_loss:
      lowest_eval_loss = epoch_eval_loss
      save_model(model, model_save_path, actual_epoch,
                 lowest_eval_loss, train_loss_set, valid_loss_set)
    print("\n")

  return model, train_loss_set, valid_loss_set


def save_model(model, save_path, epochs, lowest_eval_loss, train_loss_hist, valid_loss_hist):
  """
  Write a training checkpoint to ``save_path``.

  The checkpoint is a dict with the keys 'epochs', 'lowest_eval_loss',
  'state_dict', 'train_loss_hist' and 'valid_loss_hist'. When ``model``
  exposes a ``module`` attribute, the state dict of that inner object is
  the one stored. A one-line confirmation is printed. Returns None.
  """
  # Use the wrapped network if the model carries one under `.module`.
  unwrapped = getattr(model, 'module', model)
  checkpoint = dict(
      epochs=epochs,
      lowest_eval_loss=lowest_eval_loss,
      state_dict=unwrapped.state_dict(),
      train_loss_hist=train_loss_hist,
      valid_loss_hist=valid_loss_hist,
  )
  torch.save(checkpoint, save_path)
  message = "Saving model at epoch {} with validation loss of {}"
  print(message.format(epochs, lowest_eval_loss))
  return
  
def load_model(save_path, map_location=None):
  """
  Load the model and its training history from a checkpoint file.

  Args:
    save_path: path of a checkpoint written by save_model.
    map_location: optional value forwarded to torch.load, e.g. 'cpu' to
      restore a checkpoint on a machine without the device it was saved from.
      The default (None) keeps torch.load's standard behaviour.

  Returns:
    (model, epochs, lowest_eval_loss, train_loss_hist, valid_loss_hist)
  """
  checkpoint = torch.load(save_path, map_location=map_location)

  # The model class takes no constructor arguments and has no classifier
  # head, so it is simply instantiated and filled from the saved weights.
  model = DistilBert_TGD_Ensemble_no_BIG5()
  model.load_state_dict(checkpoint['state_dict'])

  epochs = checkpoint["epochs"]
  lowest_eval_loss = checkpoint["lowest_eval_loss"]
  train_loss_hist = checkpoint["train_loss_hist"]
  valid_loss_hist = checkpoint["valid_loss_hist"]

  return model, epochs, lowest_eval_loss, train_loss_hist, valid_loss_hist

In [None]:
#Define the optemizer, the learning rate and the weight decay
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01, correct_bias=False)




In [None]:
# Train the model
model, train_loss_set, valid_loss_set = train(model=model,\
                                              num_epochs=num_epochs,\
                                              optimizer=optimizer,\
                                              train_dataloader=MLM_train_dataloader,\
                                              valid_dataloader=MLM_validation_dataloader,\
                                              model_save_path=model_save_path,\
                                              device=device)

Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

Train loss: 0.07577978054063321
Valid loss: 0.03401650510230299


Epoch:   5%|▌         | 1/20 [16:58<5:22:28, 1018.33s/it]

Saving model at epoch 0 with validation loss of 0.03401650510230299


Train loss: 0.03749836801607645
Valid loss: 0.03339392716245469


Epoch:  10%|█         | 2/20 [33:36<5:02:00, 1006.69s/it]

Saving model at epoch 1 with validation loss of 0.03339392716245469


Train loss: 0.03607006206675491
Valid loss: 0.03332759413009288


Epoch:  15%|█▌        | 3/20 [50:15<4:44:08, 1002.84s/it]

Saving model at epoch 2 with validation loss of 0.03332759413009288


Train loss: 0.0343210718039366


Epoch:  20%|██        | 4/20 [1:06:50<4:26:40, 1000.06s/it]

Valid loss: 0.03363639440534873


Train loss: 0.031031275649826103


Epoch:  25%|██▌       | 5/20 [1:23:26<4:09:38, 998.58s/it] 

Valid loss: 0.033831656980360024


Train loss: 0.02721212798189175


Epoch:  30%|███       | 6/20 [1:40:02<3:52:47, 997.65s/it]

Valid loss: 0.03364173538911717


Train loss: 0.023448641542801553


Epoch:  35%|███▌      | 7/20 [1:56:38<3:36:02, 997.08s/it]

Valid loss: 0.03344249094126169


Train loss: 0.01983260258913262
Valid loss: 0.03317237190526146


Epoch:  40%|████      | 8/20 [2:13:17<3:19:30, 997.57s/it]

Saving model at epoch 7 with validation loss of 0.03317237190526146


Train loss: 0.01649349638932777
Valid loss: 0.03295272796593942


Epoch:  45%|████▌     | 9/20 [2:29:55<3:02:57, 997.91s/it]

Saving model at epoch 8 with validation loss of 0.03295272796593942


Train loss: 0.0134889589858355
Valid loss: 0.03278924507504433


Epoch:  50%|█████     | 10/20 [2:46:34<2:46:21, 998.15s/it]

Saving model at epoch 9 with validation loss of 0.03278924507504433


Train loss: 0.010846197792113285
Valid loss: 0.03260557732124283


Epoch:  55%|█████▌    | 11/20 [3:03:13<2:29:45, 998.34s/it]

Saving model at epoch 10 with validation loss of 0.03260557732124283


Train loss: 0.008512571834243152
Valid loss: 0.032496952389507956


Epoch:  60%|██████    | 12/20 [3:19:52<2:13:07, 998.49s/it]

Saving model at epoch 11 with validation loss of 0.032496952389507956


Train loss: 0.006497149448808261
Valid loss: 0.03236977822092676


Epoch:  65%|██████▌   | 13/20 [3:36:31<1:56:30, 998.61s/it]

Saving model at epoch 12 with validation loss of 0.03236977822092676


Train loss: 0.00477981110122775
Valid loss: 0.03226642369573471


Epoch:  70%|███████   | 14/20 [3:53:09<1:39:51, 998.61s/it]

Saving model at epoch 13 with validation loss of 0.03226642369573471


Train loss: 0.003401274373067103
Valid loss: 0.03226317176633447


Epoch:  75%|███████▌  | 15/20 [4:09:48<1:23:12, 998.58s/it]

Saving model at epoch 14 with validation loss of 0.03226317176633447


Train loss: 0.002334149602151321
Valid loss: 0.032261354581924286


Epoch:  80%|████████  | 16/20 [4:26:26<1:06:34, 998.62s/it]

Saving model at epoch 15 with validation loss of 0.032261354581924286


Train loss: 0.0015942168866016526
Valid loss: 0.03225603537052097


Epoch:  85%|████████▌ | 17/20 [4:43:05<49:56, 998.75s/it]  

Saving model at epoch 16 with validation loss of 0.03225603537052097


Train loss: 0.0010976909572183889


Epoch:  90%|█████████ | 18/20 [4:59:41<33:15, 997.86s/it]

Valid loss: 0.03231957634608547


Train loss: 0.000775743217634711


Epoch:  95%|█████████▌| 19/20 [5:16:17<16:37, 997.22s/it]

Valid loss: 0.0323848969888801


Train loss: 0.0005873433442882622


Epoch: 100%|██████████| 20/20 [5:32:53<00:00, 998.67s/it]

Valid loss: 0.032421887144548016







In [None]:
# Restore the best version of the model from the checkpoint written during training.
# map_location=device lets a checkpoint saved from a GPU run be restored on
# whatever device this runtime selected (including CPU-only runtimes).
checkpoint = torch.load(model_save_path, map_location=device)
model_state_dict = checkpoint['state_dict']
model = DistilBert_TGD_Ensemble_no_BIG5()
model.load_state_dict(model_state_dict)

<All keys matched successfully>

In [None]:
model.to(device) # specify that our model will be runing on the specefied device

DistilBert_TGD_Ensemble_no_BIG5(
  (electra): DistilBertForMaskedLM(
    (distilbert): DistilBertModel(
      (embeddings): Embeddings(
        (word_embeddings): Embedding(64356, 768)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (transformer): Transformer(
        (layer): ModuleList(
          (0): TransformerBlock(
            (attention): MultiHeadSelfAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=True)
              (v_lin): Linear(in_features=768, out_features=768, bias=True)
              (out_lin): Linear(in_features=768, out_features=768, bias=True)
            )
            (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (ffn): FFN(
     

In [None]:
model_save_path #DistilBert_TGD_Ensemble_no_BIG5_last_masked_and_NormalMask_OnlyNextItemToPredict

'/content/drive/MyDrive/Models/NextMovie/DistilBert_TGD_Ensemble_no_BIG5_last_masked_and_NormalMask_OnlyNextItemToPredict.bin'

In [None]:
# define the word prediction function :
#This code borrow ideas from  : https://colab.research.google.com/github/YuvalPeleg/transformers-workshop/blob/master/MLM.ipynb
def word_prediction(text, model, tokenizer, topn=10,device="cuda"):
  """Return the ``topn`` highest-scoring tokens for the first [MASK] in ``text``.

  Args:
    text: input sequence containing a '[MASK]' token; '[CLS]' and '[SEP]'
      are added around it before tokenization.
    model: callable accepting ``input_ids`` and ``input_ids_mask_last`` and
      returning a tuple whose first element holds the logits, indexed as
      [batch, position, vocabulary]. Only that first element is used.
    tokenizer: object providing ``tokenize``, ``convert_tokens_to_ids`` and
      ``convert_ids_to_tokens``.
    topn: number of tokens to return (capped at the vocabulary size).
    device: device the input tensor is moved to.

  Returns:
    A list of token strings ordered from the highest to the lowest score.

  Raises:
    ValueError: if the tokenized text contains no '[MASK]' token.
  """
  tokenized_text = tokenizer.tokenize('[CLS] '+ text+ ' [SEP]')
  # Fail loudly instead of silently scoring the last position of the sequence.
  if '[MASK]' not in tokenized_text:
    raise ValueError("word_prediction expects a '[MASK]' token in the input text")
  masked_index = tokenized_text.index('[MASK]')

  tok_t = torch.tensor([tokenizer.convert_tokens_to_ids(tokenized_text)]).to(device)

  with torch.no_grad():
      out =model(input_ids=tok_t,input_ids_mask_last=tok_t)
  masked_logits = out[0][0, masked_index]

  # Only the best `topn` entries are needed, so select them directly rather
  # than sorting and converting the whole vocabulary.
  k = max(0, min(topn, masked_logits.shape[-1]))
  top_ids = torch.topk(masked_logits, k).indices.tolist()
  return tokenizer.convert_ids_to_tokens(top_ids)



In [None]:
#Collect the test sequences and maske the last token and use the masked token as the next movie label
def get_test_data_and_labels(data,topn=1):
  """Turn full movie sequences into (masked input, next-movie label) pairs.

  For every whitespace-separated sequence in ``data`` the last item becomes
  the label and is replaced by ' [MASK]' in the input string.

  Args:
    data: iterable of sequence strings.
    topn: accepted for call compatibility; not used.

  Returns:
    ``(test, labels)``: the list of masked input strings and the list of
    held-out last items, aligned by position.
  """
  histories = [sequence.split() for sequence in data]
  labels = [movies[-1] for movies in histories]
  test = [" ".join(movies[:-1]) + " [MASK]" for movies in histories]
  return test,labels
test,test_lab=get_test_data_and_labels(test_data,topn=5)

In [None]:
# Define the Hit ratio function
def hit_ratio(data_row,label,topn=1):
  """Return 1 if ``label`` is among the ``topn`` predictions for ``data_row``, else 0.

  The predictions come from word_prediction, called with the notebook-level
  ``model`` and ``tokenizer_4rec``.
  """
  predicted_items = word_prediction(data_row,model,tokenizer_4rec,topn=topn)
  return 1 if label in predicted_items else 0



In [None]:
# Define the Mean reciprocal rank function

def MRR(data_row,label, topn=10):
  """Return the reciprocal rank of ``label`` within the ``topn`` predictions.

  The predictions come from word_prediction, called with the notebook-level
  ``model`` and ``tokenizer_4rec``. The rank is 1-based; 0 is returned when
  ``label`` is not among the predictions.
  """
  predicted_items = list(word_prediction(data_row,model,tokenizer_4rec,topn=topn))
  for rank, item in enumerate(predicted_items, start=1):
    if item == label:
      return 1/rank
  return 0



In [None]:
#Define the evaluation function for all instances
def evaluation_all_users(data,topn=1,typee='HR'):
  """Average a ranking metric over every sequence in ``data``.

  Args:
    data: sequences passed to get_test_data_and_labels to build the masked
      inputs and their labels.
    topn: cut-off forwarded to the metric function.
    typee: 'MRR' selects the MRR function; any other value selects hit_ratio.

  Returns:
    The sum of the per-sequence scores divided by ``len(data)``.
  """
  test,test_lab=get_test_data_and_labels(data,topn=topn)

  metric = MRR if typee=='MRR' else hit_ratio

  total_score = sum(metric(row, expected, topn=topn) for row, expected in zip(test, test_lab))
  return total_score/len(data)

In [None]:
#MRR@1
all_hits=evaluation_all_users(test_data,topn=1,typee='MRR')
all_hits
#0.07366984993178717

0.07366984993178717

In [None]:
#MRR@3

all_hits=evaluation_all_users(test_data,topn=3,typee='MRR')
all_hits
#0.08481127785356982

0.08481127785356982

In [None]:
#MRR@5

all_hits=evaluation_all_users(test_data,topn=5,typee='MRR')
all_hits
#0.0884947703501592


0.08849477035015918

In [None]:
#MRR@10

all_hits=evaluation_all_users(test_data,topn=10,typee='MRR')
all_hits
#0.09128121007384313


0.09128121007384313

In [None]:
#HRR@1

all_hits=evaluation_all_users(test_data,topn=1)
all_hits
#0.07366984993178717


0.07366984993178717

In [None]:
#HRR@3

all_hits=evaluation_all_users(test_data,topn=3)
all_hits
#0.09959072305593451


0.09959072305593451

In [None]:
#HRR@5

all_hits=evaluation_all_users(test_data,topn=5)
all_hits
#0.11596180081855388


0.11596180081855388

In [None]:
#HRR@10

all_hits=evaluation_all_users(test_data,topn=10)
all_hits
#0.1364256480218281

0.1364256480218281