In [None]:
# Mount the user's Google Drive inside the Colab runtime at /content/drive so
# that the notebook can read and write files stored on the Drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import sys

# Root folder of the project on the mounted Google Drive. The data files are
# read from and written to sub-folders of this directory.
ROOT = '/content/drive/MyDrive/A5/Mémoire/notebooks'

# Make modules stored in ROOT importable (presumably where the project-local
# `dataset` module lives). The membership test keeps the cell idempotent:
# re-running it does not pile up duplicate entries in sys.path.
if ROOT not in sys.path:
    sys.path.append(ROOT)

# GPT preprocessing

## Objectif

L'objectif de ce notebook est d'effectuer le preprocessing et la tokenisation des données afin de pouvoir utiliser un modèle de langage (ici GPT-2 small).

La première étape sera donc de tokeniser les valeurs continues de taux de retour en les discrétisant par intervalles de valeurs.

La deuxième étape est de construire le BPE Tokenizer associé (si nécessaire, car je ne connais pas encore son utilité).


## Import des librairies

In [None]:
import os  # for path dirs and files
# NOTE: the module imported on the next line is immediately shadowed by the
# `tqdm` class imported from `tqdm.auto` just below; after this cell the name
# `tqdm` refers to the progress-bar class, not to the package.
import tqdm  # progress bar
from tqdm.auto import tqdm

# to deal with data
import numpy as np
import pandas as pd

# plot the results
import matplotlib
import matplotlib.pyplot as plt
# Default figure size (inches) and resolution for every plot in the notebook.
plt.rcParams['figure.figsize'] = [12, 4]
plt.rcParams['figure.dpi'] = 100 # 200 e.g. is really fine, but slower

import torch
import sklearn.model_selection

# Project-local module; presumably resolved through the ROOT entry appended
# to sys.path in an earlier cell.
from dataset import Dataset

## Import du dataset

Dans le notebook précédent (numéro 3), nous avons créé une classe `Dataset` pour simplifier les traitements. On peut donc l'importer.

In [None]:
# Load ROOT/data/data_optimized.csv through the project's Dataset wrapper.
dataset = Dataset(os.path.join(ROOT, 'data', 'data_optimized.csv'))
print(f'\nLength of the full dataset: {len(dataset)}\n')
# Last expression of the cell: the underlying DataFrame is rendered as output.
dataset.df

Import file...
Convert day to date...
Convert columns to list...

Length of the full dataset: 225208



Unnamed: 0,pair,day,return,set,day_count,look_back_days
0,AUDCAD,2007-10-23,0.007541,unpredictable,0,
1,AUDCAD,2007-10-24,0.003915,unpredictable,1,
2,AUDCAD,2007-10-25,0.002523,unpredictable,2,
3,AUDCAD,2007-10-26,0.005835,unpredictable,3,
4,AUDCAD,2007-10-29,0.007962,unpredictable,4,
...,...,...,...,...,...,...
225203,ZARJPY,2020-12-25,-0.002822,test,2639,"[-0.00248756218905477, -0.009056306601916251, ..."
225204,ZARJPY,2020-12-28,0.004670,test,2640,"[-0.009056306601916251, -0.011788079470198665,..."
225205,ZARJPY,2020-12-29,-0.002113,test,2641,"[-0.011788079470198665, 0.00616539337890365, -..."
225206,ZARJPY,2020-12-30,-0.006775,test,2642,"[0.00616539337890365, -0.009990675369655031, -..."


In [None]:
# Size of the token vocabulary used to discretize the continuous values.
N_TOKEN = 1000
# Lower and upper bound of the value range mapped onto the token ids; the
# tokenizer below clips anything outside this range to the first/last token.
mini, maxi = -0.03, 0.03

class Tokenizer:
    """Uniform quantizer between continuous values and integer token ids.

    The interval ``[mini, maxi]`` is covered by ``n_token`` equally spaced grid
    points: token ``0`` stands for ``mini``, token ``n_token - 1`` for ``maxi``
    and two consecutive tokens are ``step`` apart.

    Args:
        mini: Value represented by token ``0``.
        maxi: Value represented by token ``n_token - 1``.
        n_token: Number of distinct tokens (must be at least 2).

    Raises:
        ValueError: If ``n_token`` is smaller than 2 or ``mini == maxi``
            (both would make the grid step undefined).
    """

    def __init__(self, mini=-0.03, maxi=0.03, n_token=1000):
        if n_token < 2:
            raise ValueError(f'n_token must be >= 2, got {n_token}')
        if maxi == mini:
            raise ValueError('mini and maxi must be different')

        self.mini = mini
        self.maxi = maxi

        self.n_token = n_token
        # Distance between the values represented by two consecutive tokens.
        self.step = (maxi - mini) / (n_token - 1)

    def tokenize(self, x):
        """Map continuous value(s) to the id of the nearest grid point.

        Args:
            x: Array-like of numbers (a scalar is accepted as well).

        Returns:
            Plain Python ints in ``[0, n_token - 1]``, with the same nesting as
            ``x`` (a flat list for 1-D input). Values outside
            ``[mini, maxi]`` are clipped to the first / last token.
        """
        x = np.asarray(x, dtype=float)
        scaled = (x - self.mini) / (self.maxi - self.mini) * (self.n_token - 1)
        # Clip while still in floating point so that very large or infinite
        # inputs cannot overflow the integer cast below.
        scaled = np.clip(scaled, 0, self.n_token - 1)
        # Round to the nearest grid point instead of truncating: truncation
        # turned e.g. 776.9999 (float round-off of 777) into 776 and biased
        # every decoded value downwards. `np.int` no longer exists in recent
        # NumPy releases, hence the explicit np.int64.
        # `.tolist()` yields built-in ints, whose str()/repr() is the bare
        # number regardless of the NumPy version.
        return np.rint(scaled).astype(np.int64).tolist()

    def decode(self, x, is_torch=False):
        """Map token id(s) back to the continuous value they represent.

        Args:
            x: Token id(s); array-like, or a tensor when ``is_torch`` is True.
            is_torch: When True, ``x`` is used as-is (no NumPy conversion) so
                the arithmetic is carried out by the object's own operators.

        Returns:
            ``x * step + mini`` — a NumPy array, or the same kind of object as
            ``x`` when ``is_torch`` is True.
        """
        if is_torch:
            return x * self.step + self.mini
        return np.asarray(np.asarray(x) * self.step + self.mini)

        


# Tokenizer instance used by the rest of the notebook.
tokenizer = Tokenizer(mini, maxi, N_TOKEN)

# Quick sanity check: tokenize ten evenly spaced values spanning the whole
# range, then decode the tokens back so the three printouts can be compared.
values = np.linspace(mini, maxi, 10)
print(values)
tokens = tokenizer.tokenize(values)
print(tokens)
decoded = tokenizer.decode(tokens)
print(decoded)


[-0.03       -0.02333333 -0.01666667 -0.01       -0.00333333  0.00333333
  0.01        0.01666667  0.02333333  0.03      ]
[0, 111, 222, 333, 444, 555, 666, 776, 888, 999]
[-0.03       -0.02333333 -0.01666667 -0.01       -0.00333333  0.00333333
  0.01        0.01660661  0.02333333  0.03      ]


In [None]:
%%time
tqdm.pandas()
print('Compute the standardized rate of return in features...')
dataset.df['look_back_days'] = dataset.df.progress_apply(
    lambda row: list(tokenizer.tokenize(row['look_back_days']))
    if row['look_back_days'] == row['look_back_days'] else np.nan, 
    axis=1
)



Compute the standardized rate of return in features...


  from pandas import Panel


HBox(children=(FloatProgress(value=0.0, max=225208.0), HTML(value='')))


CPU times: user 32.3 s, sys: 3.22 s, total: 35.5 s
Wall time: 32.5 s


In [None]:
# Write, for every split, one CSV with the token sequences and one CSV with
# the matching metadata (same row order, no header, no index).
# The three split names must live in ONE list: the previous
# `['training', 'validation'], 'test'` was a 2-tuple whose first element was a
# list, so the first iteration compared the "set" column against a list.
for split in ['training', 'validation', 'test']:
    _df = dataset.df[dataset.df["set"] == split]

    # One line per sample: the token ids separated by single spaces.
    # Building a fresh frame (instead of assigning into a slice of `_df`)
    # avoids pandas' SettingWithCopyWarning, and joining the ids explicitly
    # does not depend on how a list of NumPy integers is printed.
    # Assumes every row of the split holds a token list (no NaN) — TODO confirm.
    _data_df = _df['look_back_days'].map(
        lambda tokens: ' '.join(str(token) for token in tokens)
    ).to_frame()

    _metadata_df = _df[["pair", "day"]]

    _data_df.to_csv(os.path.join(ROOT, 'data', f'tokenized_{split}_data.csv'), index=False, header=False)
    _metadata_df.to_csv(os.path.join(ROOT, 'data', f'tokenized_{split}_metadata.csv'), index=False, header=False)
