# Imports

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import f1_score
import random
import json
import time
import os
import datetime
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef
import torch
import pickle
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
# Learning-rate scheduler helper from transformers (the `import` keyword was missing here).
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW
from torch.nn.parallel import DistributedDataParallel

# Number of trailing tokens set aside from each tokenized trajectory: the
# training preparation drops the last NOMBRE tokens to build DEB_TRAJ and
# takes TARGET from the slice [-NOMBRE:-1] of the same token list.
NOMBRE =2

In [None]:
# Load the tokenizer saved under /home/daril_kw/data/tokenizer_final.
tokenizer = BertTokenizer.from_pretrained('/home/daril_kw/data/tokenizer_final')
# Load the dataset from /home/daril_kw/data/data_with_time_info_ok.json.
# The context manager guarantees the file handle is closed once parsing is
# done (a bare json.load(open(...)) leaves it open until garbage collection).
with open('/home/daril_kw/data/data_with_time_info_ok.json', 'r') as dataset_file:
    data_format = json.load(dataset_file)

In [None]:
# Read the dataset JSON and wrap it in a DataFrame.
# The path previously repeated the directory prefix
# ('/home/daril_kw/data//home/daril_kw/data/...'); it now points at the same
# file as the plain json.load call made earlier in the notebook.
with open('/home/daril_kw/data/data_with_time_info_ok.json', 'r') as openfile:
    json_loaded = json.load(openfile)

data_format = pd.DataFrame(data=json_loaded)

Contexte : le même, que l'on soit dans le cas train/validation ou test


In [None]:
# Put a single leading space in front of every context field so the fields
# can later be concatenated directly into one space-separated string.
for column in ('HOUR', 'WEEK', 'CALL_TYPE', 'TAXI_ID', 'DAY'):
    data_format[column] = data_format[column].apply(lambda value: ' ' + value)

In [None]:
# CONTEXT_INPUT is a plain string concatenation, in this order: the last
# token of Tokenization_2, then DAY, HOUR, WEEK, CALL_TYPE and TAXI_ID.
last_token = data_format['Tokenization_2'].apply(lambda tokens: tokens[-1])
data_format['CONTEXT_INPUT'] = (
    last_token
    + data_format['DAY']
    + data_format['HOUR']
    + data_format['WEEK']
    + data_format['CALL_TYPE']
    + data_format['TAXI_ID']
)

In [None]:
# Count how many pieces of information CONTEXT_INPUT carries.
# The column stores them as one string separated by single spaces, so the
# entry labelled 0 is split on ' ' and the resulting pieces are counted.
first_context = data_format['CONTEXT_INPUT'][0]
context_pieces = first_context.split(' ')
len_context_info = len(context_pieces)

# Séparation du dataframe   


In [None]:
# Split the dataframe into a training part and a test part (20% of the rows
# go to test); the fixed random_state makes the split reproducible.
data_train, data_test = train_test_split(
    data_format,
    test_size=0.2,
    random_state=2023,
)

## Gestion du dataframe de training

In [None]:
def prepare(dataframe, sixty_percent, thirty_percent, ten_percent, last_prob, sep_prob ):
    """Validate the target-selection probabilities and draw one random number per row.

    The function is meant to decide, for every row, which token becomes the
    prediction target (the token just before the target being the last one
    kept in DEB_TRAJ). NOTE(review): only the validation and the random
    draws are implemented in this block; the draws are neither used nor
    returned yet, so the selection logic appears unfinished.

    Parameters (intended meaning, per the original author's notes):
        dataframe: any sized object (``len()`` is taken); one draw per row.
        sixty_percent: proportion of rows whose target is picked within the
            last 60% of the tokens (described elsewhere as the band between
            the last 30% and the last 60%).
        thirty_percent: proportion of rows whose target is picked within the
            last 30% of the tokens (band between the last 10% and 30%).
        ten_percent: proportion of rows whose target is picked within the
            last 10% of the tokens.
        last_prob: proportion of rows whose target is the very last token.
        sep_prob: proportion of rows whose target is the [SEP] token that
            follows the very last token.

    Returns:
        None (nothing is returned at this stage).

    Raises:
        ValueError: if the five proportions do not sum to 1.
    """
    import math

    total = sixty_percent + thirty_percent + ten_percent + last_prob + sep_prob
    # Compare with a tolerance: an exact `!= 1` test can reject valid inputs
    # because binary floating-point sums are subject to rounding error.
    if not math.isclose(total, 1.0, rel_tol=1e-9):
        raise ValueError('The sum of the five parameters must be equal to 1')
    # random.random() returns a random float in [0, 1); one draw per row.
    list_random = [random.random() for _ in range(len(dataframe))]


        


    



# DEB_TRAJ: every token except the last NOMBRE ones, clipped to the most
# recent tokens that still fit in a 512-token input once the context pieces
# and two extra positions are accounted for, then stored as one
# space-separated string.
traj_budget = 512 - len_context_info - 2
data_train['DEB_TRAJ'] = data_train['Tokenization_2'].apply(
    lambda tokens: ' '.join(tokens[:-NOMBRE][-traj_budget:])
)

# TARGET: the slice running from position -NOMBRE up to (but excluding) the
# final token; it stays a list.
data_train['TARGET'] = data_train['Tokenization_2'].apply(
    lambda tokens: tokens[-NOMBRE:-1]
)

In [None]:
# Remove the columns that are no longer needed; each one is dropped only if
# it is actually present in the training dataframe.
for unused_column in (
    'Tokenization',
    'CALL_TYPE',
    'TAXI_ID',
    'DAY',
    'HOUR',
    'WEEK',
    'Nb_points_token',
):
    if unused_column in data_train.columns:
        data_train.drop([unused_column], axis=1, inplace=True)


In [None]:
# Save the training dataframe to disk in JSON format (one record per row).
data_train.to_json(
    '/home/daril_kw/data/data_train.json',
    orient='records',
)

### Concaténation, padding et ajout tokens spéciaux

In [None]:
# Pull the three columns used below out of the dataframe as arrays.
c_inputs = data_train['CONTEXT_INPUT'].values
traj_inputs = data_train['DEB_TRAJ'].values
targets = data_train['TARGET'].values

Gestion de l'entrée : 

In [None]:
# For every training row, build the text given to the tokenizer, its id
# sequence padded to 512 positions, and the matching attention mask.
input_ids = []
full_inputs = []
attention_masks = []
for row in tqdm(range(len(c_inputs))):
    # No truncation here: the trajectory length was already limited upstream.
    # The context and the trajectory are joined with the [CLS] and [SEP]
    # markers written by hand, which is why the tokenizer is told not to add
    # special tokens itself. (Original author's note: the alternative is to
    # encode both parts separately and add ids 101 = [CLS] and 102 = [SEP].)
    text = ' '.join(('[CLS]', c_inputs[row], traj_inputs[row], '[SEP]'))
    full_inputs.append(text)

    token_ids = tokenizer.encode(text, add_special_tokens=False)
    # Right-pad with id 0 up to a length of 512.
    pad_count = 512 - len(token_ids)
    padded_ids = token_ids + [0] * pad_count
    input_ids.append(padded_ids)

    # Attention mask: 1.0 where the id is greater than 0 (a real token),
    # 0.0 where the id is 0 (padding).
    attention_masks.append([float(token_id > 0) for token_id in padded_ids])
    

Gestion des targets :

In [None]:
# TARGET cells hold the slice x[-NOMBRE:-1] of the token list, i.e. a list.
# Lists are unhashable (they cannot be dict keys) and JSON object keys must
# be strings, so every target is first flattened into one space-joined
# string. Values that are already scalars pass through unchanged.
target_keys = [
    ' '.join(target) if isinstance(target, (list, tuple)) else target
    for target in targets
]

# Give every distinct target a class index, numbered by first appearance.
targets_dict = {}
for target_key in target_keys:
    targets_dict.setdefault(target_key, len(targets_dict))

targets_input = [targets_dict[target_key] for target_key in target_keys]

# The dictionary is saved in JSON format.
with open('/home/daril_kw/data/targets_dict.json', 'w') as fp:
    json.dump(targets_dict, fp)

# How to use the dictionary:
#
# given a raw target key (the flattened token string), its class index is
# targets_dict[target_key];
# conversely, given an integer 'target_encoded' between 0 and
# len(targets_dict)-1, the matching key is
# list(targets_dict.keys())[target_encoded].

# targets_input holds the dataset targets encoded with targets_dict, i.e.
# expressed in their new index space; it is saved with pickle.

with open('/home/daril_kw/data/targets_input.pkl', 'wb') as f:
    pickle.dump(targets_input, f)

# NOTE: the triple-quoted string below is not executed as code. It is a
# disabled copy of the DEB_TRAJ / TARGET preparation kept "pour test"
# (French for "for testing").
"""data_train['DEB_TRAJ']=data_train['Tokenization_2'].apply(lambda x: x[:-NOMBRE])
data_train['DEB_TRAJ']=data_train['DEB_TRAJ'].apply(lambda x: x[-(512-len_context_info-2):] if len(x)>512-len_context_info-2 else x)
#then we keep the column in form of a string
data_train['DEB_TRAJ']=data_train['DEB_TRAJ'].apply(lambda x: ' '.join(x))


data_train['TARGET']=data_train['Tokenization_2'].apply(lambda x: x[-NOMBRE:-1]) pour test""" 