In [None]:
# Source: https://medium.com/analytics-vidhya/create-a-tokenizer-and-train-a-huggingface-roberta-model-from-scratch-f3ed1138180c

In [1]:
# Step 1.
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
import datasets
import pandas as pd
from pathlib import Path
import os

TRAIN_BATCH_SIZE = 16    # input batch size for training (default: 64)
VALID_BATCH_SIZE = 8    # input batch size for testing (default: 1000)
TRAIN_EPOCHS = 10 #15        # number of epochs to train (default: 10)
LEARNING_RATE = 1e-4    # learning rate (default: 0.001)
WEIGHT_DECAY = 0.01
SEED = 42               # random seed (default: 42)
MAX_LEN = 128
SUMMARY_LEN = 7

paths = [str(x) for x in Path("/home/info/MyNotebooks/Datasets/SentencasTRT1/TXT").glob("*.txt")]
tokenizer_folder = "/home/info/MyNotebooks/RobertaSenTRT/Tokenizer"

In [None]:
%%time 
# 2.TREINANDO O TOKENIZADOR e salvando em disco

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer(lowercase=True)

tokenizer.train(files=paths, vocab_size=8192, min_frequency=2,
                show_progress=True,
                special_tokens=[
                                "<s>",
                                "<pad>",
                                "</s>",
                                "<unk>",
                                "<mask>",
])

tokenizer.save_model(tokenizer_folder)


In [2]:
# Step 2. CARREGANDO O TOKENIZADOR
# Load the tokenizer using vocab.json and mrege.txt files
tokenizer = ByteLevelBPETokenizer(
    os.path.abspath(os.path.join(tokenizer_folder,'vocab.json')),
    os.path.abspath(os.path.join(tokenizer_folder,'merges.txt'))
)

# Prepare the tokenizer
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)

tokenizer.enable_truncation(max_length=512)


In [3]:
# Step 3.
from transformers import RobertaTokenizerFast
# Create the tokenizer from a trained one
tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_folder, max_len=MAX_LEN)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'RobertaTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'RobertaTokenizerFast'.


In [None]:
''' - Language Modeling does not use Labeled data! - '''

In [4]:
# Step 4. Cria DataFrames e as LISTAS contendo dados das sentenças e as classificacoes.
# LOADING DATAFRAMES DATASETDICTS FOR TRAINING "THE MODEL"
#    Ver notebook WorkflowSolucaoTCC como o Corpus foi criado (pdf->txt->DS) e o dataset dividido (train/test)
#
import datasets

FOLDER_BASE = "/home/info/MyNotebooks/Datasets/SentencasTRT1/"
DS_FOLDER   = FOLDER_BASE + "DS/"

DsClassAnot = datasets.load_from_disk(DS_FOLDER+"DsClassAnot/Train_Test")
DsClassAnot

df_train = pd.DataFrame(DsClassAnot['train'])
df_test = pd.DataFrame(DsClassAnot['test'])

df_train = df_train.drop('disp', 1)
df_train = df_train.drop('label', 1)

df_test = df_test.drop('disp', 1)
df_test = df_test.drop('label', 1)

In [5]:
# Step 5

config = RobertaConfig(
    vocab_size=8192,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config=config)
print('Num parameters: ',model.num_parameters())

NameError: name 'RobertaConfig' is not defined

In [None]:
# Step 6.
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, df, tokenizer):
        # or use the RobertaTokenizer from `transformers` directly.

        self.examples = []
        
        for example in df.values:
            x=tokenizer.encode_plus(example, max_length = MAX_LEN, truncation=True, padding=True)
            self.examples += [x.input_ids]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        # We’ll pad at the batch level.
        return torch.tensor(self.examples[i])

In [None]:
# Step 7.
# Create the train and evaluation dataset
train_tokenized_ds = CustomDataset(df_train['text'], tokenizer)
test_tokenized_ds  = CustomDataset(df_test['text'], tokenizer)

In [None]:
# 8.
# CONFIGURING THE MODEL
# initialize our model using the configuration file

from transformers import RobertaConfig
from transformers import RobertaForMaskedLM

# Set a configuration for our RoBERTa model
config = RobertaConfig(
    vocab_size=8192,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
# Initialize the model from a configuration without pretrained weights
model = RobertaForMaskedLM(config=config)
print('Num parameters: ',model.num_parameters())

In [None]:
# Step 9?

from transformers import DataCollatorForLanguageModeling

# Define the Data Collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)



In [None]:
# Step 10.

from transformers import Trainer, TrainingArguments


model_folder = "/home/info/MyNotebooks/RobertaSenTRT/Model"
print(model_folder)

# Define the training arguments
training_args = TrainingArguments(
    output_dir=model_folder,
    overwrite_output_dir=True,
    evaluation_strategy = 'epoch',
    num_train_epochs=TRAIN_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=VALID_BATCH_SIZE,
    save_steps=8192,
    #eval_steps=4096,
    save_total_limit=1,
)
# Create the trainer for our model
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_tokenized_ds,
    eval_dataset=test_tokenized_ds,
    #prediction_loss_only=True,
)

In [None]:
#os.environ["WANDB_DISABLED"] = "True"

In [None]:
# Step 11.
# Train the model
import torch

trainer.train()


In [None]:
# 12.
# Saving the model

out_fold = model_folder+"/outs"

trainer.save_model(out_fold)

#trainer.save_model(f'out_fold{i}')
#trainer.save_model(model_folder)

# trainer.save_model()
# trainer.save_model(f'out_fold{i}')

In [None]:
# FINE TUNING DAQUI PARA BAIXO. ABRIR EM OUTRO NOTEBOOK!

In [None]:
# Fine tuning a Roberta Classification
# https://jesusleal.io/2020/10/20/RoBERTA-Text-Classification/

In [None]:
import pandas as pd
import datasets
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification,Trainer, TrainingArguments
import torch.nn as nn
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
#import wandb
import os

In [None]:
Nom_Classes = ["Acordo ou outros", "Improcedente", "Procedente","Parcialmente procedente"]


In [None]:
# 4.1 Cria DataFrames e as LISTAS contendo dados das sentenças e as classificacoes.
df_train_class = pd.DataFrame(DsClassAnot['train'])
df_train_class = df_train_class.drop('text', 1)
df_train_class.rename(columns = {'disp':'text'}, inplace = True)
Labels_train_class   = df_train_class['label']
#df_train_class = df_train_class.drop('label', 1)

# 4.2 Cria DataFrames e as LISTAS contendo dados das sentenças e as classificacoes.
df_test_class = pd.DataFrame(DsClassAnot['test'])
df_test_class = df_test_class.drop('text', 1)
df_test_class.rename(columns = {'disp':'text'}, inplace = True)
Labels_test   = df_test_class['label']
#df_test_class = df_train_class.drop('label', 1)

In [None]:
print(len(df_train['text'][0]))

In [None]:
# df_train['text'] = df_train['text'][-1000:-1]  # Tomando a ultima parte do Dispositivo melhora a previsão?

In [None]:
print(len(df_train_class['text'][0]))

In [None]:
df_train_class

In [None]:
def one_hot_encoder(df):
    one_hot_encoding = []
    for i in tqdm(range(len(df))):
        temp = [0] * n_labels
        label_indices = df.iloc[i]["label"]
        for index in label_indices:
            temp[index] = 1
        one_hot_encoding.append(temp)
        
    return pd.DataFrame(one_hot_encoding)

train_ohe_labels = one_hot_encoder(train)
valid_ohe_labels = one_hot_encoder(valid)
test_ohe_labels = one_hot_encoder(test)

print(train_ohe_labels.shape)
#(43410, 28)

train = pd.concat([train, train_ohe_labels], axis=1)
valid = pd.concat([valid, valid_ohe_labels], axis=1)
test = pd.concat([test, test_ohe_labels], axis=1)

In [None]:
Nom_Classes = ["Acordo ou outros", "Improcedente", "Procedente","Parcialmente procedente"]
n_labels = len(Nom_Classes)
n_labels

In [None]:
def one_hot_encoder(df):
    one_hot_encoding = []
    for i in tqdm(range(len(df))):
        temp = [0] * n_labels
        label_indices = df.iloc[i]["label"]
        for index in label_indices:
            temp[index] = 1
        one_hot_encoding.append(temp)
        
    df_ohe_labels = pd.DataFrame(one_hot_encoding)    
    return pd.concat([df, df_ohe_labels], axis=1)



In [None]:
train_ohe = one_hot_encoder(df_train_class)

In [None]:
OHE_Classes = [torch.Tensor([1,0,0,0]),torch.Tensor([0,1,0,0]),torch.Tensor([0,0,1,0]),torch.Tensor([0,0,0,1])]

In [None]:
teste = OHE_Classes[3]
teste

In [None]:
import torch.nn.functional as F 

F.one_hot(teste, num_classes=4)

In [None]:
# Agora sim, funcionando!
import pandas as pd

def label_ohe(df):
    Nom_Classes = ["Acordo ou outros", "Improcedente", "Procedente","Parcialmente procedente"]
    OHE_Classes = [(1,0,0,0),(0,1,0,0),(0,0,1,0),(0,0,0,1)]
       
    for i in range(len(df)):
        for j in range(len(Nom_Classes)):
            if df['label'][i] == Nom_Classes[j]:
                df['label'][i] = OHE_Classes[j]
    #print(df['label'][100])
    return(df)


In [None]:
df_train_class = label_ohe(df_train_class)
df_test_class = label_ohe(df_test_class)

In [None]:
type(df_test_class['label'][3])

In [None]:
df_test_class['label'] = list(df_test_class['label'])

In [None]:
df_test_class['label'][3]

In [None]:
# Nao precisa mais deste trecho!
import pandas as pd

def label_ids(df):
    Nom_Classes = ["Acordo ou outros", "Improcedente", "Procedente","Parcialmente procedente"]
       
    for i in range(len(df)):
        for j in range(len(Nom_Classes)):
            if df['label'][i] == Nom_Classes[j]:
                df['label'][i] = j                
    return(df)

# Como ficaria este processamento com list compreension?

In [None]:
df_train_class = label_ids(df_train_class)

In [None]:
df_test_class = label_ids(df_test_class)

In [None]:
df_train_class['label'][13]

In [None]:
from datasets import Dataset, DatasetDict

train_data = Dataset.from_pandas(df_train_class)
test_data  = Dataset.from_pandas(df_test_class)

#ataset_target = DatasetDict()
#ataset_target['train'] = dataset_train
#ataset_target['test'] = dataset_test

In [None]:
train_data

In [None]:
train_data['label'][0]

In [None]:
type(train_data['label'][0])

In [None]:
# Carregar o modelo treinado.
model_path = "/home/info/MyNotebooks/RobertaSenTRT/Model/outs"
vocab_path = "/home/info/MyNotebooks/RobertaSenTRT/Tokenizer"
#'roberta-base'

In [None]:
# load model and tokenizer and define length of the text sequence
roberta_model = RobertaForSequenceClassification.from_pretrained(model_path)

In [None]:
#roberta_tokenizer = RobertaTokenizerFast.from_pretrained(vocab_path, max_length = 512)
roberta_tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_folder, max_len=MAX_LEN)

In [None]:
# define a function that will tokenize the model, and will return the relevant inputs for the model
def tokenization(batched_text):
    return roberta_tokenizer(batched_text['text'], padding = True, truncation=True)

In [None]:
train_data = train_data.map(tokenization, batched = True, batch_size = len(train_data))
test_data = test_data.map(tokenization, batched = True, batch_size = len(test_data))

In [None]:
train_data

In [None]:
type(train_data['label'])

In [None]:
type(train_data['label'][2])

In [None]:
type(train_data['label'])

In [None]:
for i in range(len(train_data)):
    train_data['label'][i] = str(train_data['label'][i])

In [None]:
type(train_data['label'][0])

In [None]:
train_data['label'][0]

In [None]:
train_data

In [None]:
train_data.set_format('torch', columns=['attention_mask','input_ids','label'])
#train_data.set_format('torch', columns=['attention_mask','input_ids'])
test_data.set_format('torch', columns=['attention_mask','input_ids','label'])
#test_data.set_format('torch', columns=['attention_mask','input_ids'])

In [None]:
type(train_data['input_ids'].shape)

In [None]:
type(train_data['label'])

In [None]:
train_data['label'].shape

In [None]:
train_data['input_ids'].shape

In [None]:
train_data.features

In [None]:
# Dataset(features: {'text': Value(dtype='string', id=None), 
#'label': ClassLabel(num_classes=2, names=['neg', 'pos'], names_file=None, id=None)}, num_rows: 25000)

In [None]:
ClassLabel(num_classes=4, names=["Acordo ou outros", "Improcedente", "Procedente","Parcialmente procedente"]

In [None]:
# define accuracy metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


In [None]:
# define the training arguments
training_args = TrainingArguments(
    output_dir = '/home/info/MyNotebooks/RobertaSenTRT/Model/outs_classification',
    num_train_epochs=5,
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 16,    
    per_device_eval_batch_size= 8,
    evaluation_strategy = "epoch",
    disable_tqdm = False, 
    #load_best_model_at_end=True,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps = 8,
    fp16 = False, # era True
    logging_dir='/home/info/MyNotebooks/RobertaSenTRT/Model/outs_classification/logs',
    #dataloader_num_workers = 1, # Era 8
    run_name = 'roberta-classification'
)


In [None]:
# instantiate the trainer class and check for available devices
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_tokenized_ds,
    eval_dataset=test_tokenized_ds
)
#device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = 'cpu'


In [None]:
train_tokenized_ds[1]

In [None]:
# train the model
trainer.train()


In [None]:
# How about to train a DistilBert (ligthweigth) model from Scratch?
# Or a T5 (multitask) model?
# Ver também as APIs para acesso a modelos que aceleram a sua utilização na produção (Accelerated Inference API):
# https://api-inference.huggingface.co/docs/python/html/index.html