In [1]:
from sklearn.metrics import matthews_corrcoef
from transformers import BertForSequenceClassification, BertTokenizer
import torch
import numpy as np
import pickle
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
import h3
import folium



In [2]:
# This file test the first version of the model: classification with context

PRETRAINED_MODEL_NAME = '/home/daril_kw/data/savings_for_60_rows/model_before_training_opti_full_for_para_60'
TOKENIZER_DIR = '/home/daril_kw/data/savings_for_60_rows/tokenizer_final_opti_full_for_para_60'
DATALOADER_DIR = "/home/daril_kw/data/savings_for_60_rows/test_dataloader_60.pt"


In [4]:
# device = torch.device("cpu")
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
# to use a specific GPU, use cuda:1
# device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

# load the prediction_dataloader
prediction_dataloader = torch.load(DATALOADER_DIR)
tokenizer = BertTokenizer.from_pretrained(TOKENIZER_DIR)


# we load the model
model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME)
model.to(device)
print("we evaluate")
model.eval()

# Tracking variables
predictions, true_labels, list_inputs_test = [], [], []


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


we evaluate


In [5]:
len(prediction_dataloader)

1

## Test for building autoregression

In [6]:
# get the first batch in the prediction_dataloader

first_batch = next(iter(prediction_dataloader))
first_batch # this is a tuple of 3 elements: input_ids, attention_mask, labels
# first_batch[0].shape, first_batch[1].shape, first_batch[2].shape
first_batch = tuple(t.to(device) for t in first_batch) # we put the batch on the device


In [7]:
first_batch

(tensor([[  101, 29308,   128,  ...,     0,     0,     0],
         [  101, 29174,   128,  ...,     0,     0,     0],
         [  101, 29125,   128,  ...,     0,     0,     0],
         ...,
         [  101, 29282,   128,  ...,     0,     0,     0],
         [  101, 29850,   128,  ...,     0,     0,     0],
         [  101, 29725,   128,  ...,     0,     0,     0]], device='cuda:1'),
 tensor([[1., 1., 1.,  ..., 0., 0., 0.],
         [1., 1., 1.,  ..., 0., 0., 0.],
         [1., 1., 1.,  ..., 0., 0., 0.],
         ...,
         [1., 1., 1.,  ..., 0., 0., 0.],
         [1., 1., 1.,  ..., 0., 0., 0.],
         [1., 1., 1.,  ..., 0., 0., 0.]], device='cuda:1'),
 tensor([39, 34, 18, 14, 12, 48, 10,  0,  9, 40, 42, 36], device='cuda:1'))

In [8]:
b_input_ids, b_input_mask, b_labels = first_batch


In [9]:
b_input_ids

tensor([[  101, 29308,   128,  ...,     0,     0,     0],
        [  101, 29174,   128,  ...,     0,     0,     0],
        [  101, 29125,   128,  ...,     0,     0,     0],
        ...,
        [  101, 29282,   128,  ...,     0,     0,     0],
        [  101, 29850,   128,  ...,     0,     0,     0],
        [  101, 29725,   128,  ...,     0,     0,     0]], device='cuda:1')

In [10]:

# move the batch to the device because we are using the GPU. the previous instruction tuple(t.to(device) for t in first_batch) is just a short cut for this
b_input_ids = b_input_ids.to(device)
b_input_mask = b_input_mask.to(device)
b_labels = b_labels.to(device)

In [14]:
b_input_ids[3]

tensor([  101, 29230,   128,  1406,  1744,   123, 29901, 29186, 29186, 29526,
        29526, 29013, 29013, 29013, 29526, 29206, 29206, 29735, 29833, 29740,
        29740, 29505, 29437, 29035, 29828, 29818, 29497, 29497, 29135, 29209,
        29594, 29806, 29827,   102,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

Clone the b_input

In [25]:
b_input_ids_clone = b_input_ids.clone()
b_input_mask_clone = b_input_mask.clone()


In [22]:
def get_start_of_trajectory_based_on_proportion(input_ids, input_mask, proportion,context_length=6 ):
    """
    Cette fonction permet de recuperer le debut de la trajectoire en fonction de la proportion donnée et de la longueur du contexte
    La stratégie est la suivante:
    - On garde les tokens avant la proportion donnée et on remplace les tokens apres par des [PAD]: 0 mais tout en gardant ces tokens la car on va les utiliser plus tard pour tester la prediction

    Args:
        input_ids (torch.tensor): les ids des tokens
        input_mask (torch.tensor): le mask des tokens
        proportion (float): la proportion de la trajectoire que l'on veut garder: elle doit etre entre 0 et 1
        context_length (int): la longueur du contexte
        
    Returns:
        start_of_trajectory (torch.tensor): les ids des tokens de la trajectoire
        start_of_trajectory_mask (torch.tensor): le mask des tokens de la trajectoire
        tokens_to_predict list: les ids des tokens a predire: ce sont les tokens qui sont apres la proportion donnée qui sont remplacés par des [PAD]: 0
    """
    
    
    # On va verifier au la proportion est bien entre 0 et 1 si ce n'est pas le cas on va la mettre a 1 et retourner la trajectoire entiere sans token [SEP]
    if proportion < 0 or proportion > 1:
        proportion = 1
    
    # if proportion == 1:
        #On retire le token [SEP] à la fin de la trajectoire et ca sera ce token qui sera remplacé par des [PAD]: 0 et qui sera predit
    # La proportion a prendre est apres les tokens de contexte sachant que le format est le suivant: [CLS] contexte trajectoire [PAD]...
    # On detecte lepremier token [PAD] 
    first_pad = torch.where(input_ids == 0)[0]
    # la longueur est juste avant le premier token [PAD]
    length = first_pad[0] - context_length - 1 # on retire 1 pour le token [CLS] et 1 pour le token [SEP]
    # Calcul de la proportion de debut de trajectoire
    start_of_trajectory_length = int(length * proportion) # le resu
    # On met les tokens apres start_of_trajectory_length a 0 en les gardant pour la prediction
    tokens_to_predict = input_ids[start_of_trajectory_length + context_length + 1:]
    # On retire les 0 de la prediction
    tokens_to_predict = tokens_to_predict[tokens_to_predict != 0]
    input_ids[start_of_trajectory_length + context_length + 1:] = 0
    # On met le mask a 0 pour les tokens a predire
    input_mask[start_of_trajectory_length + context_length + 1:] = 0
    return input_ids, input_mask, tokens_to_predict        

In [33]:
get_start_of_trajectory_based_on_proportion(b_input_ids_clone.clone()[0], b_input_mask_clone.clone()[0], 0.1, context_length=6)

(tensor([  101, 29308,   128,  1406,  1744,   123, 29912, 29830, 29830, 29830,
         29830,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [16]:
int(11*0.3)

3

# Inference

## Inference time

In [23]:
# We will compute the inference time
import time
t0 = time.time()
with torch.no_grad():
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    logits = outputs[0]
t1 = time.time()
infer_time = t1-t0
print("inference time: ", infer_time) # this time is given in seconds

inference time:  0.016776323318481445


In [30]:
# the size of b_input_ids is 
b_input_ids.size()

torch.Size([12, 512])

In [84]:
# Then for one trajectory, the inference time is 
trajectory_inference_time = infer_time/len(b_input_ids)
# If we have 300 points in the trajectory, the inference time is 300*trajectory_inference_time
print(f"One trajectory inference time: {trajectory_inference_time}\n Total inference time for 300 points: {300*trajectory_inference_time}")

One trajectory inference time: 0.003326117992401123
 Total inference time for 300 points: 0.9978353977203369


Details of the outputs


In [96]:
# The outputs are the logits(=scores) for each class. We take the class with the highest score as the prediction
# For each input, we take the class with the highest score as the prediction
logits #logits means the scores for each class. Then if we have 10 classes, we have 10 scores for each input
# For example 


tensor([[ 0.1756,  0.0554, -0.2076,  ..., -0.3755,  0.3280, -0.0929],
        [ 0.1270,  0.0911, -0.2356,  ..., -0.4460,  0.2182, -0.0972],
        [ 0.1986,  0.1220, -0.2504,  ..., -0.4339,  0.1873,  0.0108],
        ...,
        [ 0.1416,  0.1100, -0.2492,  ..., -0.4008,  0.2238, -0.1127],
        [ 0.1544,  0.2864, -0.3465,  ..., -0.3810,  0.1673, -0.1443],
        [ 0.1566,  0.1921, -0.4025,  ..., -0.3802,  0.1699, -0.0809]],
       device='cuda:0')

In [97]:
logits[0] # this is the scores for the first input

tensor([ 1.7556e-01,  5.5411e-02, -2.0764e-01, -5.3356e-02, -4.8283e-01,
         3.9959e-01,  2.4936e-01, -6.1580e-02,  1.2208e+00,  8.8233e-01,
         7.9652e-02, -9.1809e-01, -6.1648e-01,  2.2795e-02, -4.9137e-01,
         2.5167e-02,  5.0348e-01,  8.9886e-01,  5.3918e-01,  4.7088e-01,
        -2.3605e-01,  3.0360e-01,  6.9202e-01, -2.3386e-01,  4.7819e-01,
         6.2067e-01, -2.8057e-01,  5.3282e-01, -1.3414e-01,  7.5720e-02,
         2.0141e-01, -7.7584e-01, -3.0211e-01,  3.8304e-02, -2.9572e-01,
        -3.7977e-01, -2.4700e-01, -6.0180e-01,  6.9280e-01, -7.9707e-01,
        -1.0991e+00, -6.2875e-02,  2.0101e-01,  2.4983e-01,  1.3069e-01,
        -1.1322e-01, -2.4029e-01, -4.3113e-02, -4.4805e-01,  1.3670e-01,
         5.0791e-02,  5.8126e-03,  1.4891e-01,  1.6357e-01, -6.0650e-03,
        -7.7350e-02, -3.3025e-01, -3.7892e-01, -2.1078e-01,  1.2388e-01,
        -7.3392e-01,  5.1020e-01, -3.2051e-02, -3.1992e-01, -5.2206e-01,
        -6.3849e-01,  4.4133e-01, -4.0169e-01, -3.0

In [98]:
# The number of classes is the number of columns in the logits
num_classes = logits.shape[1]
num_classes

892

In [31]:
logits.shape

torch.Size([12, 892])

In [100]:
# for each input, we take the class with the highest score as the prediction
_, current_prediction = torch.max(logits, 1)
current_prediction

tensor([  8,   8, 827, 725, 725, 827, 827, 827, 725, 827, 725, 725],
       device='cuda:0')

In [101]:
len(current_prediction)

12

## Addition of the predicted values to the initial trajectory

In [32]:
# Now, we add these prediction to the initial inputs and try to predict the next point.
# We will move the sep token (102)  on position and replace it position with the predicted class.
# For example, if we have the input [101, 5, 7, 8, 9 ,102, 0,0,0,0,0], and the predicted class is 3 then we'll have [101,5, 7,8,9,3,102,0,0,0,0] as the new input


def add_prediction_to_input(input_ids, prediction, attention_masks, sep_token_id=102):
    """
    Cette fonction permet de remplacer le token de séparation par la prédiction et de déplacer le token de séparation à la position suivante
    
    Args:
    input_ids (torch.Tensor): les ids des tokens
    prediction (int): la prédiction
    attention_masks (torch.Tensor): le masque d'attention
    sep_token_id (int): l'id du token de séparation par défaut 102
    
    Returns:
    input_ids (torch.Tensor): les ids des tokens avec la prédiction ajoutée et le token de séparation déplacé
    """
    
    
    
    
 
    #On cherche la position du token de séparation
    sep_token_position = (input_ids == sep_token_id).nonzero(as_tuple=True)[0]
    # On remplace le token de séparation par la prédiction
    input_ids[sep_token_position] = prediction
    # On déplace le token de séparation à la position suivante
    input_ids[sep_token_position+1] = sep_token_id

    # Mise à jour du masque d'attention
    attention_masks[sep_token_position+1] = 1 # Dans le masque d'attention, on met à 1 tous les tokens n´étant pas des pads
    return input_ids , attention_masks

In [33]:
def add_prediction_to_input_batch(input_idss, predictions, attention_maskss, sep_token_id=102):
    """
    Ajoute une prédiction au lot d'entrée en modifiant les input_ids et les attention_masks et en utilisant la fonction add_prediction_to_input.

    Args:
        input_idss (list): La liste des input_ids pour chaque exemple du lot.
        predictions (list): Une liste de prédictions pour chaque exemple du lot.
        attention_maskss (list): Une liste de masques d'attention pour chaque exemple du lot.
        sep_token_id (int, facultatif): L'ID du token de séparation. Par défaut à 102.

    Returns:
        tuple: Un tuple contenant les input_idss et les attention_maskss mis à jour.

    """
    for i in range(len(predictions)):
        if predictions[i] == sep_token_id or (input_idss[i] == sep_token_id).nonzero(as_tuple=True)[0] == len(input_idss[i])-1:
            # Si la prédiction est le token de séparation ou si le token de séparation est déjà à la fin de la séquence, on ne fait rien
            continue
        else:
            input_idss[i], attention_maskss[i] = add_prediction_to_input(input_idss[i], predictions[i], attention_maskss[i])
    return input_idss, attention_maskss

In [145]:
inputs , attentions = add_prediction_to_input(b_input_ids[0], current_prediction[0], b_input_mask[0])
attentions

tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 

In [168]:
b_input_ids[1] * b_input_mask[1] == b_input_ids[1]

tensor([ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True, 

In [167]:
inputs * attentions == inputs

tensor([ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True, 

In [146]:
(attentions == 1).nonzero(as_tuple=True) 

(tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
         18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
         36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 61, 62, 63, 64, 65, 66, 67],
        device='cuda:0'),)

In [148]:
(attentions == 0).nonzero(as_tuple=True) 


(tensor([ 47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,
          68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,
          82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
          96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
         110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123,
         124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137,
         138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151,
         152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165,
         166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
         180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193,
         194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
         208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221,
         222, 223, 224, 225, 226, 227, 228, 229, 230

In [140]:
# # This show the position of token sep before and after the modification of b_input_ids
# old_positions = [ (b_input_ids[i] == 102).nonzero(as_tuple=True)[0] for i in range(len(b_input_ids))]
# new_positions = [ (add_prediction_to_input(b_input_ids[i], current_prediction[i]) == 102).nonzero(as_tuple=True)[0] for i in range(len(b_input_ids))]
# print(f"{old_positions}\n {new_positions}\n")
# # The position shoudn't be the same

# bools = [old_positions[i] == new_positions[i] for i in range(len(old_positions))]
# print(f"Are the positions the same?\n {bools}\n")
# supposed_bools =[ old_positions[i] == new_positions[i] - 1 for i in range(len(old_positions))]
# print(f"Are the positions the same?\n {supposed_bools}\n")

# print("And of supposed_bools, are they all True? ", all(supposed_bools))

old_positions_ids = [ (b_input_ids[i] == 102).nonzero(as_tuple=True)[0] for i in range(len(b_input_ids))]
old_positions_attention = [ (b_input_mask[i] == 0).nonzero(as_tuple=True)[0] for i in range(len(b_input_mask))]
print(f"{old_positions_ids}\n {old_positions_attention}\n")

# new_input_ids, new_attention_masks = add_prediction_to_input_batch(b_input_ids, current_prediction, b_input_mask)
# new_positions_ids = [ (new_input_ids[i] == 102).nonzero(as_tuple=True)[0] for i in range(len(new_input_ids))]
# new_positions_attention = [ (new_attention_masks[i] == 0).nonzero(as_tuple=True)[0] for i in range(len(new_attention_masks))]
# print(f"{new_positions_ids}\n {new_positions_attention}\n")

# bools_ids = [old_positions_ids[i] == new_positions_ids[i] for i in range(len(old_positions_ids))]
# bools_attention = [old_positions_attention[i] == new_positions_attention[i] for i in range(len(old_positions_attention))]
# print(f"Are the positions the same?\n {bools_ids}\n")
# print(f"Are the positions the same?\n {bools_attention}\n")

# supposed_bools_ids =[ old_positions_ids[i] == (new_positions_ids[i] - 1) for i in range(len(old_positions_ids))]
# supposed_bools_attention =[ old_positions_attention[i] == (new_positions_attention[i] - 1) for i in range(len(old_positions_attention))]
# print(f"Are the positions the same?\n {supposed_bools_ids}\n")
# print(f"Are the positions the same?\n {supposed_bools_attention}\n")

# print("And of supposed_bools, are they all True? ", all(supposed_bools_ids))
# print("And of supposed_bools, are they all True? ", all(supposed_bools_attention))




[tensor([62], device='cuda:0'), tensor([60], device='cuda:0'), tensor([36], device='cuda:0'), tensor([47], device='cuda:0'), tensor([51], device='cuda:0'), tensor([44], device='cuda:0'), tensor([41], device='cuda:0'), tensor([42], device='cuda:0'), tensor([38], device='cuda:0'), tensor([48], device='cuda:0'), tensor([73], device='cuda:0'), tensor([39], device='cuda:0')]
 [tensor([ 47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,
         63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,
         77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
         91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
        105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118,
        119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132,
        133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146,
        147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
 

In [113]:
b_input_ids = add_prediction_to_input_batch(b_input_ids, current_prediction)

## Test of the add predicted values to the previous submission

In [34]:
# get the first batch in the prediction_dataloader

second_batch = next(iter(prediction_dataloader))
second_batch = next(iter(prediction_dataloader))

second_batch = tuple(t.to(device) for t in second_batch) # we put the batch on the device
second_batch_input_ids, second_batch_input_mask, second_batch_labels = second_batch

In [35]:
(second_batch_input_ids[0])

tensor([  101, 29308,   128,  1406,  1744,   123, 29912, 29830, 29830, 29830,
        29830, 29830, 29830, 29830, 29830, 29830, 29830, 29830, 29532, 29532,
        29502, 29479, 29479, 29201, 29748, 29820, 29040, 29823, 29112, 29831,
        29193, 29212, 29372, 29146, 29015, 29240, 29593, 29356, 29526, 29206,
        29206, 29308, 29308, 29308, 29308, 29308,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [173]:
(second_batch_input_ids[0 ] == 102).nonzero(as_tuple=True) 

(tensor([46], device='cuda:0'),)

In [36]:
(second_batch_input_ids[0 ] == 102)

tensor([False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False,  True, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, 

In [178]:
(second_batch_input_ids[0 ] == 0).nonzero(as_tuple=True)[0][0]

tensor(47, device='cuda:0')

In [184]:
(second_batch_input_ids * second_batch_input_mask == second_batch_input_ids).all()

tensor(True, device='cuda:0')

In [37]:
# make sure that all is true
are_the_same = [ (second_batch_input_ids[i] * second_batch_input_mask[i] == second_batch_input_ids[i]).all() for i in range(len(second_batch_input_ids))]

In [40]:
# On teste si tous les éléments sont True
all(are_the_same)

True

In [189]:
second_batch_cloned = second_batch_input_ids.clone()
second_batch_input_mask_cloned = second_batch_input_mask.clone()

In [190]:
second_batch_cloned, second_batch_input_mask_cloned = add_prediction_to_input_batch(second_batch_cloned, current_prediction, second_batch_input_mask_cloned)

In [192]:
(second_batch_cloned[0] == 102).nonzero(as_tuple=True)

(tensor([47], device='cuda:0'),)

In [194]:
(second_batch_cloned[0] == 0).nonzero(as_tuple=True)[0][0]

tensor(48, device='cuda:0')

In [191]:
(second_batch_cloned * second_batch_input_mask_cloned == second_batch_cloned).all()

tensor(True, device='cuda:0')

In [195]:
second_batch_cloned_prime = second_batch_input_ids.clone()
second_batch_input_mask_cloned_prime = second_batch_input_mask.clone()

## Autoregressive prediction

In [65]:
def should_predict(input_idss, predictions):
    """
    This function return true if for all the inputs, for at least one, the last token is not the sep token
    or the predicted token is not the sep token.
     Cette fonction permet de dire si oui ou non on devrait continuer à faire la prediction auto-régressive.
     Alors, la prédiction auto-régressive s'arrête si le token Sep est le dernier token de la séquence ou si la prédiction est le token Sep.
     
    Args:
        input_ids (list): La liste des input_ids pour chaque exemple du lot.
        predictions (list): Une liste de prédictions pour chaque exemple du lot.
    """
    for i in range(len(input_idss)):
        if (input_idss[i] == 102).nonzero(as_tuple=True)[0] == len(input_idss[i])-1 or predictions[i] == 102:
            return False
    return True


# Maintenant, on doit definir une fonction qui verifie si tous les inputs ont le token sep comme dernier token ou la prediction est le token sep
# C´est le contraire de should_predict

def all_have_sep_as_last_token_or_prediction_is_sep(input_idss, predictions):
    """
    Cette fonction renvoi True si pour tous les inputs, le token sep est le dernier token ou la prédiction est le token sep.
    
    Args:
    input_ids (list): La liste des input_ids pour chaque exemple du lot.
    predictions (list): Une liste de prédictions pour chaque exemple du lot.
    
    Returns:
    bool: True si pour tous les inputs, le token sep est le dernier token ou la prédiction est le token sep.
    
    """

    for i in range(len(input_idss)):
        if (input_idss[i] == 102).nonzero(as_tuple=True)[0] != len(input_idss[i])-1 and predictions[i] != 102:
            return False
    return True

    


def batch_auto_regressive_prediction(batch_input_ids, batch_input_masks, model, device):
    """
      
         
        Cette fonction prend les batch_input_ids, les batch_input_masks, le modèle et le device
        Prédit récursivement le prochain token pour chaque input dans le lot jusqu'à ce que 
        pour chaque input, le token prédit soit le token sep ou la position du token sep soit la dernière position
        
        Args:
        batch_input_ids (torch.Tensor): les ids des tokens
        batch_input_masks (torch.Tensor): les masques d'attention
        model (torch.nn.Module): le modèle
        device (torch.device): le device
        
        Returns:
        batch_input_ids (torch.Tensor): les ids des tokens avec les prédictions ajoutées
        batch_input_masks (torch.Tensor): les masques d'attention mis à jour
        
    """
    with torch.no_grad():
        outputs = model(batch_input_ids, token_type_ids=None, attention_mask=batch_input_masks)
        logits = outputs[0]
        _, current_prediction = torch.max(logits, 1)
        batch_input_ids, batch_input_masks = add_prediction_to_input_batch(batch_input_ids, current_prediction, batch_input_masks)
        if should_predict(batch_input_ids, current_prediction):
            print("Prediction is not finished. We continue")
            return batch_auto_regressive_prediction(batch_input_ids, batch_input_masks, model, device)
        else:
            print("Prediction is finished. We stop")
            return batch_input_ids, batch_input_masks

In [66]:
preds = batch_auto_regressive_prediction(second_batch_input_ids, second_batch_input_mask, model, device)

In [67]:
pred_input_ids, pred_input_masks = preds

In [77]:
second_batch_input_ids[0]

tensor([  101, 29308,   128,  1406,  1744,   123, 29912, 29830, 29830, 29830,
        29830, 29830, 29830, 29830, 29830, 29830, 29830, 29830, 29532, 29532,
        29502, 29479, 29479, 29201, 29748, 29820, 29040, 29823, 29112, 29831,
        29193, 29212, 29372, 29146, 29015, 29240, 29593, 29356, 29526, 29206,
        29206, 29308, 29308, 29308, 29308, 29308,     8,     8,   827,   725,
          827,   725,   725,   725,   725,   725,   725,   725,   725,   725,
          725,   725,   725,   725,   725,   725,   827,   827,   827,   827,
          827,   827,   827,   827,     8,   725,   827,   827,   827,    83,
          725,   827,   827,    83,   827,   827,    83,   725,   827,   827,
           83,   725,   725,   725,   725,   725,   725,   725,   725,   725,
          725,   725,   725,   725,   725,   725,   725,   725,   725,   725,
          725,   725,   725,   725,   725,   725,   725,   725,   725,   725,
          725,   725,   725,   725,   725,   725,   725,   725, 

In [75]:
pred_input_ids[0]

tensor([  101, 29308,   128,  1406,  1744,   123, 29912, 29830, 29830, 29830,
        29830, 29830, 29830, 29830, 29830, 29830, 29830, 29830, 29532, 29532,
        29502, 29479, 29479, 29201, 29748, 29820, 29040, 29823, 29112, 29831,
        29193, 29212, 29372, 29146, 29015, 29240, 29593, 29356, 29526, 29206,
        29206, 29308, 29308, 29308, 29308, 29308,     8,     8,   827,   725,
          827,   725,   725,   725,   725,   725,   725,   725,   725,   725,
          725,   725,   725,   725,   725,   725,   827,   827,   827,   827,
          827,   827,   827,   827,     8,   725,   827,   827,   827,    83,
          725,   827,   827,    83,   827,   827,    83,   725,   827,   827,
           83,   725,   725,   725,   725,   725,   725,   725,   725,   725,
          725,   725,   725,   725,   725,   725,   725,   725,   725,   725,
          725,   725,   725,   725,   725,   725,   725,   725,   725,   725,
          725,   725,   725,   725,   725,   725,   725,   725, 

In [203]:
pred_input_masks

tensor([[1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 0., 0., 0.]], device='cuda:0')

In [213]:
pred_input_ids

tensor([[  101, 29308,   128,  ...,     0,     0,     0],
        [  101, 29174,   128,  ...,     0,     0,     0],
        [  101, 29125,   128,  ...,     0,     0,     0],
        ...,
        [  101, 29282,   128,  ...,     0,     0,     0],
        [  101, 29850,   128,  ...,   827,     8,   102],
        [  101, 29725,   128,  ...,     0,     0,     0]], device='cuda:0')

## Detokenization: Reverse the prediction to get the original values

In [68]:
# to get the original values of ids, we can use the tokenizer
tokenizer.decode(pred_input_ids[0])
# to recongnize h3 tokens, we can call


'[CLS] 8a39220f0667fff 7 20 26 2 20000571 8a39220c68cffff 8a39220c68cffff 8a39220c68cffff 8a39220c68cffff 8a39220c68cffff 8a39220c68cffff 8a39220c68cffff 8a39220c68cffff 8a39220c68cffff 8a39220c68cffff 8a39220c68cffff 8a39220c68c7fff 8a39220c68c7fff 8a39220c68effff 8a39220c6bb7fff 8a39220c6bb7fff 8a39220c68e7fff 8a39220c680ffff 8a39220c6817fff 8a39220c699ffff 8a39220f16cffff 8a39220c6d2ffff 8a39220c6d37fff 8a39220f3a77fff 8a39220f3a2ffff 8a39220f3b07fff 8a39220f3b37fff 8a39220f3867fff 8a39220f3827fff 8a39220f391ffff 8a39220f064ffff 8a39220f065ffff 8a39220f066ffff 8a39220f066ffff 8a39220f0667fff 8a39220f0667fff 8a39220f0667fff 8a39220f0667fff 8a39220f0667fff [unused8] [unused8] ₓ ᶠ ₓ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ₓ ₓ ₓ ₓ ₓ ₓ ₓ ₓ [unused8] ᶠ ₓ ₓ ₓ [unused83] ᶠ ₓ ₓ [unused83] ₓ ₓ [unused83] ᶠ ₓ ₓ [unused83] ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ₁ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ᶠ ₁ ₁ [unused83] ᶠ ᶠ [unused8] ᶠ [unused8] ᶠ [unused

In [70]:
# check if an ID is an h3 token
h3.h3_is_valid("8a39220c68cffff")

True

In [63]:
def get_h3_token_without_context(input_ids, tokenizer, context_token_number=6):
    """
   
    Cette fonction prend les ids des tokens, le tokenizer et le nombre de tokens de contexte,
    Commence à vérifier les tokens après le nombre de tokens de contexte +1 
    token CLS s'il y a des tokens h3 et les renvoie
    Ceci sachant que les inputs sont sous la forme: [CLS] jetons_de_context jetons_h3 [SEP]
    
    Args:
    input_ids (torch.Tensor): les ids des tokens
    tokenizer (transformers.tokenizer): le tokenizer
    context_token_number (int): le nombre de tokens de contexte
    
    Returns:
    list: une liste des tokens h3
    
    """
    h3_tokens = []
    detokenized_tokens = tokenizer.decode(input_ids)
    tokens = detokenized_tokens.split()
    for i in range(context_token_number+1, len(tokens)):
        if h3.h3_is_valid(tokens[i]):
            h3_tokens.append(tokens[i])
    return h3_tokens

In [221]:
get_h3_token_without_context(pred_input_ids[0], tokenizer)

['8a39220c68cffff',
 '8a39220c68cffff',
 '8a39220c68cffff',
 '8a39220c68cffff',
 '8a39220c68cffff',
 '8a39220c68cffff',
 '8a39220c68cffff',
 '8a39220c68cffff',
 '8a39220c68cffff',
 '8a39220c68cffff',
 '8a39220c68cffff',
 '8a39220c68c7fff',
 '8a39220c68c7fff',
 '8a39220c68effff',
 '8a39220c6bb7fff',
 '8a39220c6bb7fff',
 '8a39220c68e7fff',
 '8a39220c680ffff',
 '8a39220c6817fff',
 '8a39220c699ffff',
 '8a39220f16cffff',
 '8a39220c6d2ffff',
 '8a39220c6d37fff',
 '8a39220f3a77fff',
 '8a39220f3a2ffff',
 '8a39220f3b07fff',
 '8a39220f3b37fff',
 '8a39220f3867fff',
 '8a39220f3827fff',
 '8a39220f391ffff',
 '8a39220f064ffff',
 '8a39220f065ffff',
 '8a39220f066ffff',
 '8a39220f066ffff',
 '8a39220f0667fff',
 '8a39220f0667fff',
 '8a39220f0667fff',
 '8a39220f0667fff',
 '8a39220f0667fff']

## Show H3 trajectory on the map

In [253]:
m = folium.Map(location=[41.156183, -8.620241], zoom_start=13)

In [59]:
test_list_for_duplicate_and_order = [7,1,5,6,7,6,4,4,3,2,7,7,3,1,6,7,2,73, 72,9,86,5,4,3,2]

# On enleve les doublons
# test_list_for_duplicate_and_order_without_duplicate = list(set(test_list_for_duplicate_and_order))
# test_list_for_duplicate_and_order_without_duplicate

In [61]:
list(dict.fromkeys(test_list_for_duplicate_and_order))

[7, 1, 5, 6, 4, 3, 2, 73, 72, 9, 86]

In [62]:
def add_h3_tokens_on_map(h3_tokens=[],add_centers=True, polyline_color="red", polygon_color="green", remove_duplicates=True):
    
    
    if remove_duplicates:
        # On retire les doublons et on garde l'ordre
        h3_tokens = list(dict.fromkeys(h3_tokens))
  
    h3_boundaries = [h3.h3_to_geo_boundary(h3_token) for h3_token in h3_tokens]

    polyline = []
    if add_centers:
        polyline = [h3.h3_to_geo(h3_token) for h3_token in h3_tokens]
        folium.PolyLine(
            locations=polyline,
            color=polyline_color,
            opacity=1,
            weight=2.5,
        ).add_to(m)
    
    for i in range(len(h3_boundaries)):
        folium.Polygon(
            locations=h3_boundaries[i],
            color=polygon_color,
            fill=True,
            fill_color=polygon_color,
            fill_opacity=0.4,
        ).add_to(m)
       

In [257]:
add_h3_tokens_on_map(get_h3_token_without_context(pred_input_ids[0], tokenizer), add_centers=True, polyline_color="cyan", polygon_color="purple")

In [234]:
# get the center of each h3 token
def get_centers_of_h3_tokens(h3_tokens):
    centers = [h3.h3_to_geo(h3_token) for h3_token in h3_tokens]
    return centers

In [245]:
b_input_ids[0] == pred_input_ids[0]

tensor([ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, 

In [246]:
# get the initial ids up to sep token
b_input_ids[0][0: (b_input_ids[0] == 102).nonzero(as_tuple=True)[0][0]]

tensor([  101, 29308,   128,  1406,  1744,   123, 29912, 29830, 29830, 29830,
        29830, 29830, 29830, 29830, 29830, 29830, 29830, 29830, 29532, 29532,
        29502, 29479, 29479, 29201, 29748, 29820, 29040, 29823, 29112, 29831,
        29193, 29212, 29372, 29146, 29015, 29240, 29593, 29356, 29526, 29206,
        29206, 29308, 29308, 29308, 29308, 29308,     8,     8,     8,     8,
            8,     8,     8,     8,     8,     8,     8,     8,     8,     8,
            8,     8,     8,     8,     8,     8,     8], device='cuda:0')

In [255]:
# get the initial ids
initial_input_ids = b_input_ids[0]
# get the h3 tokens
h3_tokens = get_h3_token_without_context(pred_input_ids[1], tokenizer)
# add the h3 tokens on the map
add_h3_tokens_on_map(h3_tokens, add_centers=True, polyline_color="blue", polygon_color="red")

In [230]:
h3.h3_to_geo("8a39220f0667fff")

(41.142923000922266, -8.615950254277127)

In [258]:
m

In [87]:
# get the foruth batch in the prediction_dataloader

fourth_batch = next(iter(prediction_dataloader))
fourth_batch = tuple(t.to(device) for t in fourth_batch) # we put the batch on the device


In [90]:
len(prediction_dataloader)

1

In [83]:
fourth_batch_input_ids, fourth_batch_input_mask, fourth_batch_labels = fourth_batch

In [80]:
# clone the input_ids and input_masks
fourth_batch_cloned = fourth_batch_input_ids.clone()
fourth_batch_input_mask_cloned = fourth_batch_input_mask.clone()


In [88]:
all(fourth_batch_input_ids[0] == fourth_batch_cloned[0])

True

In [81]:
fourth_batch_cloned[0]

tensor([  101, 29308,   128,  1406,  1744,   123, 29912, 29830, 29830, 29830,
        29830, 29830, 29830, 29830, 29830, 29830, 29830, 29830, 29532, 29532,
        29502, 29479, 29479, 29201, 29748, 29820, 29040, 29823, 29112, 29831,
        29193, 29212, 29372, 29146, 29015, 29240, 29593, 29356, 29526, 29206,
        29206, 29308, 29308, 29308, 29308, 29308,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [264]:
# get the h3 tokens
fourth_h3_tokens = get_h3_token_without_context(fourth_batch_input_ids[1], tokenizer)
# add the h3 tokens on the map
add_h3_tokens_on_map(fourth_h3_tokens, add_centers=True, polyline_color="green", polygon_color="orange")

In [265]:
m

## Next predictions

In [None]:

# losses
losses = 0
print("We predict")
# Predict
for batch in prediction_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)

    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch

    # move to device
    b_input_ids = b_input_ids.to(device)
    b_input_mask = b_input_mask.to(device)
    b_labels = b_labels.to(device)

    # Telling the model not to compute or store gradients, saving memory and
    # speeding up prediction
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        # the ouputs are a tuple with the loss and the logits
        # the losses are the item 0 of the tuple
        # and the logits are the item 1 of the tuple
        # The loss is computed with the CrossEntropyLoss

    logits = outputs[0]
    losses += outputs[0].mean().item()

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to("cpu").numpy()

    # Store predictions and true labels
    # we have to append  the max of the logits
    # because the logits are the output of the softmax
    # and the max of the logits is the class with the highest probability
    predictions.append(logits)
    true_labels.append(label_ids)

    # Store the inputs

    list_inputs_test.append(b_input_ids.tolist())

print("DONE.")


matthews_set = []

# Evaluate each test batch using Matthew's correlation coefficient
print("Calculating Matthews Corr. Coef. for each batch...")

pred_label = []
# compute the loss

# For each input batch...
for i in range(len(true_labels)):
    # The predictions for this batch are a 2-column ndarray (one column for "0"
    # and one column for "1"). Pick the label with the highest value and turn this
    # in to a list of 0s and 1s.
    pred_labels_i = np.argmax(predictions[i], axis=1).flatten()
    pred_label.append(pred_labels_i)
    # Calculate and store the coef for this batch.
    matthews = matthews_corrcoef(true_labels[i], pred_labels_i)
    matthews_set.append(matthews)


# Combine the predictions for each batch into a single list of 0s and 1s.
flat_predictions = [item for sublist in predictions for item in sublist]
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()

# Combine the correct labels for each batch into a single list.
flat_true_labels = [item for sublist in true_labels for item in sublist]


# Combine the inputs for each batch into a single list.
flat_list_inputs_test = [item for sublist in list_inputs_test for item in sublist]

# Calculate the MCC
mcc = matthews_corrcoef(flat_true_labels, flat_predictions)

print("MCC: %.3f" % mcc)


# compute the accuracy
accuracy = (flat_true_labels == flat_predictions).mean()
print("accuracy: %.3f" % accuracy)

# print the loss
print("loss: %.3f" % (losses / len(true_labels)))


# save flat_list_inputs_test
