In [2]:
import os, torch
import pandas as pd
import numpy as np
from transformers import DistilBertForSequenceClassification, BertForSequenceClassification
from transformers import Trainer, TrainingArguments

os.getcwd()

  from .autonotebook import tqdm as notebook_tqdm
  device: Optional[torch.device] = torch.device("cuda"),


'c:\\Users\\T-Gamer\\Documents\\SideDrive\\UFMA\\2022.1\\Topicos Especiais (NLP)\\Exercicios\\Trabalho Final\\Implementação\\source'

In [3]:
def levenshtein(source:str, target:str) -> int :
    n = len(source)
    m = len(target)
    
    D = np.zeros((n + 1, m + 1), dtype=int)
    for i in range(1, n + 1) :
        D[i][0] = D[i - 1][0] + 1
    for j in range(1, m + 1) :
        D[0][j] = D[0][j - 1] + 1

    subst_cost = lambda x, y : 0 if x == y else 4
    for i in range(1, n + 1) :
        for j in range(1, m + 1) :
            D[i][j] = min([D[i - 1][j    ] + 1,
                           D[i - 1][j - 1] + subst_cost(source[i - 1], target[j - 1]),
                           D[i    ][j - 1] + 1])
    return D[n][m]
def distance(source:str, target:str) -> float :
    lev = levenshtein(source, target)
    if source in target or target in source : lev = lev - .5 
    return lev

In [4]:
class Dataset(torch.utils.data.Dataset) :
    def __init__(self, 
                 dataframe:pd.DataFrame, 
                 vocabulario:list[str],
                 max_length:int=512,
                 text_column:str='text',
                 label_column:str='class',
                 n_classes:int=1) :
        self.dataframe = dataframe
        self.vocabulario = vocabulario
        self.max_length = max_length
        self.text_column = text_column
        self.label_column = label_column
        self.n_classes = n_classes
    def __len__(self) :
        return self.dataframe.shape[0]
    def text_proxessing(self, text:str) :
        tokenlist = text.split()
        
        sequence = []
        for token in tokenlist :
            # if not token in self.vocabulario :
            #     voc_sort = self.vocabulario.copy()
            #     voc_sort.sort(key=lambda x : distance(token, x))
            #     token = voc_sort[0]
            sequence.append(self.vocabulario.index(token) + 1)

        mask = [1 for _ in range(len(sequence))]
        if len(sequence) > self.max_length :
            sequence = sequence[ : self.max_length]
            mask = mask[ : self.max_length]
        else :
            padding = [0 for _ in range(self.max_length - len(sequence))]
            sequence = sequence + padding
            mask = mask + padding
        
        return sequence, mask

    def __getitem__(self, index) :
        # print(index, type(index))
        sequence, mask = self.text_proxessing(self.dataframe.iloc[index][self.text_column])
        label = self.dataframe.iloc[index][self.label_column]
        if self.n_classes > 1 :
            label = [int(label)]
            # label = [([1] if i == int(label) else [0]) for i in range(self.n_classes)]
        else :
            label = [label]
        label = torch.tensor(label)
        return {'input_ids'      : torch.tensor(sequence),
                'attention_mask' : torch.tensor(mask),
                'labels'         : label}
        

In [6]:
def train(dataset:Dataset, embeddings:pd.DataFrame, epochs:int, output_dir:str, logging_dir:str) -> DistilBertForSequenceClassification :
    global resume_from_checkpoint
    model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        num_labels=dataset.n_classes
    )
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)
    embeddings_np_array = embeddings.values
    embeddings_np_array = np.append(np.zeros((1, embeddings_np_array.shape[1])), embeddings_np_array, axis=0)
    embeddings_module = torch.nn.Embedding.from_pretrained(
        embeddings=torch.tensor(embeddings_np_array),
        padding_idx=0
    )
    model.set_input_embeddings(embeddings_module)
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir=logging_dir,
        logging_steps=10
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset
    )
    trainer.train()
    return model

In [5]:
# Para dataset simples, sem folding
# Args
dataset_folder = "../resources/datasets/StanfordSentimentTreebank"
dataset_name   = "SST2Processed-train"

# Exec
embeddings_df = pd.read_csv(f"{dataset_folder}/embeddings/{dataset_name}.csv", index_col=0)
dataset_df    = pd.read_csv(f"{dataset_folder}/split/{dataset_name}.csv",      index_col=0)
output_dir    = f"../resources/output/{dataset_name}"
logging_dir   = f"../resources/logs/{dataset_name}"

dataset = Dataset(dataset_df, embeddings_df.index.tolist())
model = train(dataset, embeddings_df, epochs=5, output_dir=output_dir, logging_dir=logging_dir)
model.save_pretrained(output_dir + "/final")

In [7]:
# Para dataset com k-folding
# Args
k = 10
dataset_folder = "../resources/datasets/TwitterAirlines"
dataset_name   = "TweetsProcessed"
n_classes = 3
n_epochs = 3
label_column = "class"
# Exec
folds = []
for i_fold in range(k) :
    dataset_fold_name = f"{dataset_name}_Fold{i_fold + 1}"
    fold_df = pd.read_csv(f"{dataset_folder}/folds/{dataset_fold_name}.csv", index_col = 0)
    folds.append(fold_df)
for i_fold in range(1, k) :
    dataset_fold_name = f"{dataset_name}_Fold{i_fold + 1}"
    embeddings_df = pd.read_csv(f"{dataset_folder}/embeddings/{dataset_fold_name}.csv", index_col=0)
    dataset_df    = pd.concat(folds[ : i_fold] + folds[i_fold + 1 : ])
    
    output_dir    = f"../resources/output/{dataset_fold_name}"
    logging_dir   = f"../resources/logs/{dataset_fold_name}"
    for dir in (output_dir, logging_dir) :
        if not os.path.exists(dir) : 
            os.makedirs(dir)

    dataset = Dataset(dataset_df, embeddings_df.index.tolist(), label_column=label_column, n_classes=n_classes)
    model = train(dataset, embeddings_df, epochs=n_epochs, output_dir=output_dir, logging_dir=logging_dir)
    model.save_pretrained(output_dir + "/final")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classi

{'loss': 1.1321, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.01}


  1%|          | 20/2472 [09:16<18:55:01, 27.77s/it]

{'loss': 1.1136, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.02}


  1%|          | 30/2472 [13:47<18:23:53, 27.12s/it]

{'loss': 1.0396, 'learning_rate': 3e-06, 'epoch': 0.04}


  2%|▏         | 40/2472 [18:19<18:27:21, 27.32s/it]

{'loss': 0.9914, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.05}


  2%|▏         | 50/2472 [22:54<18:17:21, 27.18s/it]

{'loss': 0.9461, 'learning_rate': 5e-06, 'epoch': 0.06}


  2%|▏         | 60/2472 [27:25<18:10:49, 27.14s/it]

{'loss': 0.8986, 'learning_rate': 6e-06, 'epoch': 0.07}


  3%|▎         | 70/2472 [31:56<18:01:29, 27.01s/it]

{'loss': 0.8014, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.08}


  3%|▎         | 80/2472 [36:31<18:10:39, 27.36s/it]

{'loss': 0.9407, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.1}


  4%|▎         | 90/2472 [41:04<18:20:51, 27.73s/it]

{'loss': 0.7799, 'learning_rate': 9e-06, 'epoch': 0.11}


  4%|▍         | 100/2472 [45:37<17:59:16, 27.30s/it]

{'loss': 0.7643, 'learning_rate': 1e-05, 'epoch': 0.12}


  4%|▍         | 110/2472 [50:13<18:13:46, 27.78s/it]

{'loss': 0.77, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.13}


  5%|▍         | 120/2472 [54:46<17:47:20, 27.23s/it]

{'loss': 0.7709, 'learning_rate': 1.2e-05, 'epoch': 0.15}


  5%|▌         | 130/2472 [59:19<17:41:21, 27.19s/it]

{'loss': 0.7298, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.16}


  6%|▌         | 140/2472 [1:03:51<17:35:12, 27.15s/it]

{'loss': 0.7445, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.17}


  6%|▌         | 150/2472 [1:08:22<17:26:49, 27.05s/it]

{'loss': 0.6836, 'learning_rate': 1.5e-05, 'epoch': 0.18}


  6%|▋         | 160/2472 [1:12:55<17:21:10, 27.02s/it]

{'loss': 0.8125, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.19}


  7%|▋         | 170/2472 [1:17:27<17:25:16, 27.24s/it]

{'loss': 0.771, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.21}


  7%|▋         | 180/2472 [1:21:58<17:10:23, 26.97s/it]

{'loss': 0.7202, 'learning_rate': 1.8e-05, 'epoch': 0.22}


  8%|▊         | 190/2472 [1:26:28<17:10:00, 27.08s/it]

{'loss': 0.8799, 'learning_rate': 1.9e-05, 'epoch': 0.23}


  8%|▊         | 200/2472 [1:31:00<17:04:51, 27.06s/it]

{'loss': 0.7162, 'learning_rate': 2e-05, 'epoch': 0.24}


  8%|▊         | 210/2472 [1:35:30<17:01:52, 27.11s/it]

{'loss': 0.7203, 'learning_rate': 2.1e-05, 'epoch': 0.25}


  9%|▉         | 220/2472 [1:40:01<16:57:37, 27.11s/it]

{'loss': 0.7458, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.27}


  9%|▉         | 230/2472 [1:44:33<16:55:43, 27.18s/it]

{'loss': 0.8245, 'learning_rate': 2.3000000000000003e-05, 'epoch': 0.28}


 10%|▉         | 240/2472 [1:49:06<16:58:06, 27.37s/it]

{'loss': 0.7228, 'learning_rate': 2.4e-05, 'epoch': 0.29}


 10%|█         | 250/2472 [1:53:39<16:53:09, 27.36s/it]

{'loss': 0.6751, 'learning_rate': 2.5e-05, 'epoch': 0.3}


 11%|█         | 260/2472 [1:58:15<16:48:18, 27.35s/it]

{'loss': 0.7099, 'learning_rate': 2.6000000000000002e-05, 'epoch': 0.32}


 11%|█         | 270/2472 [2:02:44<16:25:24, 26.85s/it]

{'loss': 0.6674, 'learning_rate': 2.7000000000000002e-05, 'epoch': 0.33}


 11%|█▏        | 280/2472 [2:07:14<16:24:48, 26.96s/it]

{'loss': 0.8146, 'learning_rate': 2.8000000000000003e-05, 'epoch': 0.34}


 12%|█▏        | 290/2472 [2:11:44<16:34:27, 27.35s/it]

{'loss': 0.6554, 'learning_rate': 2.9e-05, 'epoch': 0.35}


 12%|█▏        | 300/2472 [2:16:22<16:48:26, 27.86s/it]

{'loss': 0.6093, 'learning_rate': 3e-05, 'epoch': 0.36}


 13%|█▎        | 310/2472 [2:20:55<16:15:57, 27.09s/it]

{'loss': 0.6759, 'learning_rate': 3.1e-05, 'epoch': 0.38}


 13%|█▎        | 320/2472 [2:25:25<16:03:36, 26.87s/it]

{'loss': 0.7037, 'learning_rate': 3.2000000000000005e-05, 'epoch': 0.39}


 13%|█▎        | 330/2472 [2:29:55<16:01:33, 26.93s/it]

{'loss': 0.8191, 'learning_rate': 3.3e-05, 'epoch': 0.4}


 14%|█▍        | 340/2472 [2:34:24<15:55:35, 26.89s/it]

{'loss': 0.6998, 'learning_rate': 3.4000000000000007e-05, 'epoch': 0.41}


 14%|█▍        | 350/2472 [2:38:53<16:00:15, 27.15s/it]

{'loss': 0.7815, 'learning_rate': 3.5e-05, 'epoch': 0.42}


 15%|█▍        | 360/2472 [2:43:22<15:42:58, 26.79s/it]

{'loss': 0.7191, 'learning_rate': 3.6e-05, 'epoch': 0.44}


 15%|█▍        | 370/2472 [2:47:52<15:42:31, 26.90s/it]

{'loss': 0.7509, 'learning_rate': 3.7e-05, 'epoch': 0.45}


 15%|█▌        | 380/2472 [2:52:24<15:49:06, 27.22s/it]

{'loss': 0.6984, 'learning_rate': 3.8e-05, 'epoch': 0.46}


 16%|█▌        | 390/2472 [2:56:55<15:35:41, 26.97s/it]

{'loss': 0.8204, 'learning_rate': 3.9000000000000006e-05, 'epoch': 0.47}


 16%|█▌        | 400/2472 [3:01:23<15:25:15, 26.79s/it]

{'loss': 0.7405, 'learning_rate': 4e-05, 'epoch': 0.49}


 17%|█▋        | 410/2472 [3:05:51<15:17:08, 26.69s/it]

{'loss': 0.7959, 'learning_rate': 4.1e-05, 'epoch': 0.5}


 17%|█▋        | 420/2472 [3:10:19<15:18:52, 26.87s/it]

{'loss': 0.6365, 'learning_rate': 4.2e-05, 'epoch': 0.51}


 17%|█▋        | 430/2472 [3:14:49<15:15:26, 26.90s/it]

{'loss': 0.7625, 'learning_rate': 4.3e-05, 'epoch': 0.52}


 18%|█▊        | 440/2472 [3:19:16<14:59:34, 26.56s/it]

{'loss': 0.7008, 'learning_rate': 4.4000000000000006e-05, 'epoch': 0.53}


 18%|█▊        | 450/2472 [3:23:43<15:00:53, 26.73s/it]

{'loss': 0.6058, 'learning_rate': 4.5e-05, 'epoch': 0.55}


 19%|█▊        | 460/2472 [3:28:11<15:00:19, 26.85s/it]

{'loss': 0.7783, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.56}


 19%|█▉        | 470/2472 [3:32:40<14:56:02, 26.85s/it]

{'loss': 0.7027, 'learning_rate': 4.7e-05, 'epoch': 0.57}


 19%|█▉        | 480/2472 [3:37:09<14:56:19, 27.00s/it]

{'loss': 0.6234, 'learning_rate': 4.8e-05, 'epoch': 0.58}


 20%|█▉        | 490/2472 [3:41:40<14:56:23, 27.14s/it]

{'loss': 0.7099, 'learning_rate': 4.9e-05, 'epoch': 0.59}


 20%|██        | 500/2472 [3:46:09<14:39:47, 26.77s/it]Saving model checkpoint to ../resources/output/TweetsProcessed_Fold2\checkpoint-500
Configuration saved in ../resources/output/TweetsProcessed_Fold2\checkpoint-500\config.json


{'loss': 0.5821, 'learning_rate': 5e-05, 'epoch': 0.61}


Model weights saved in ../resources/output/TweetsProcessed_Fold2\checkpoint-500\pytorch_model.bin
 21%|██        | 510/2472 [3:50:41<14:40:16, 26.92s/it]

{'loss': 0.6887, 'learning_rate': 4.974645030425964e-05, 'epoch': 0.62}


 21%|██        | 520/2472 [3:55:11<14:39:19, 27.03s/it]

{'loss': 0.6864, 'learning_rate': 4.949290060851927e-05, 'epoch': 0.63}


 21%|██▏       | 530/2472 [3:59:41<14:28:12, 26.82s/it]

{'loss': 0.7861, 'learning_rate': 4.923935091277891e-05, 'epoch': 0.64}


 22%|██▏       | 540/2472 [4:04:10<14:25:48, 26.89s/it]

{'loss': 0.7535, 'learning_rate': 4.898580121703854e-05, 'epoch': 0.66}


 22%|██▏       | 550/2472 [4:08:43<14:36:12, 27.35s/it]

{'loss': 0.8945, 'learning_rate': 4.873225152129818e-05, 'epoch': 0.67}


 23%|██▎       | 560/2472 [4:13:13<14:18:33, 26.94s/it]

{'loss': 0.6996, 'learning_rate': 4.847870182555781e-05, 'epoch': 0.68}


 23%|██▎       | 570/2472 [4:17:41<14:08:11, 26.76s/it]

{'loss': 0.672, 'learning_rate': 4.8225152129817444e-05, 'epoch': 0.69}


 23%|██▎       | 580/2472 [4:22:13<14:23:15, 27.38s/it]

{'loss': 0.7025, 'learning_rate': 4.7971602434077076e-05, 'epoch': 0.7}


 24%|██▍       | 590/2472 [4:26:42<14:05:02, 26.94s/it]

{'loss': 0.7745, 'learning_rate': 4.7718052738336714e-05, 'epoch': 0.72}


 24%|██▍       | 600/2472 [4:31:10<13:55:03, 26.76s/it]

{'loss': 0.6667, 'learning_rate': 4.746450304259635e-05, 'epoch': 0.73}


 25%|██▍       | 610/2472 [4:35:37<13:42:37, 26.51s/it]

{'loss': 0.6978, 'learning_rate': 4.7210953346855984e-05, 'epoch': 0.74}


 25%|██▌       | 620/2472 [4:40:04<13:45:49, 26.75s/it]

{'loss': 0.7471, 'learning_rate': 4.695740365111562e-05, 'epoch': 0.75}


 25%|██▌       | 630/2472 [4:44:32<13:42:21, 26.79s/it]

{'loss': 0.6332, 'learning_rate': 4.6703853955375254e-05, 'epoch': 0.76}


 26%|██▌       | 640/2472 [4:49:02<13:49:19, 27.16s/it]

{'loss': 0.663, 'learning_rate': 4.645030425963489e-05, 'epoch': 0.78}


 26%|██▋       | 650/2472 [4:53:32<13:35:25, 26.85s/it]

{'loss': 0.7098, 'learning_rate': 4.6196754563894524e-05, 'epoch': 0.79}


 27%|██▋       | 660/2472 [4:58:02<13:32:05, 26.89s/it]

{'loss': 0.6688, 'learning_rate': 4.594320486815416e-05, 'epoch': 0.8}


 27%|██▋       | 670/2472 [5:02:29<13:17:15, 26.55s/it]

{'loss': 0.6841, 'learning_rate': 4.5689655172413794e-05, 'epoch': 0.81}


 28%|██▊       | 680/2472 [5:06:56<13:21:29, 26.84s/it]

{'loss': 0.746, 'learning_rate': 4.543610547667343e-05, 'epoch': 0.83}


 28%|██▊       | 690/2472 [5:11:26<13:25:17, 27.11s/it]

{'loss': 0.722, 'learning_rate': 4.5182555780933065e-05, 'epoch': 0.84}


 28%|██▊       | 700/2472 [5:15:55<13:11:28, 26.80s/it]

{'loss': 0.7133, 'learning_rate': 4.4929006085192696e-05, 'epoch': 0.85}


 29%|██▊       | 710/2472 [5:20:25<13:17:12, 27.15s/it]

{'loss': 0.6223, 'learning_rate': 4.4675456389452335e-05, 'epoch': 0.86}


 29%|██▉       | 720/2472 [5:24:54<13:03:36, 26.84s/it]

{'loss': 0.9443, 'learning_rate': 4.4421906693711966e-05, 'epoch': 0.87}


 30%|██▉       | 730/2472 [5:29:22<12:53:16, 26.63s/it]

{'loss': 0.7526, 'learning_rate': 4.4168356997971605e-05, 'epoch': 0.89}


 30%|██▉       | 740/2472 [5:33:52<12:58:33, 26.97s/it]

{'loss': 0.6993, 'learning_rate': 4.3914807302231236e-05, 'epoch': 0.9}


 30%|███       | 750/2472 [5:38:20<12:47:22, 26.74s/it]

{'loss': 0.7198, 'learning_rate': 4.3661257606490875e-05, 'epoch': 0.91}


 31%|███       | 760/2472 [5:42:50<12:49:29, 26.97s/it]

{'loss': 0.5971, 'learning_rate': 4.340770791075051e-05, 'epoch': 0.92}


 31%|███       | 770/2472 [5:47:16<12:35:51, 26.65s/it]

{'loss': 0.6404, 'learning_rate': 4.3154158215010145e-05, 'epoch': 0.93}


 32%|███▏      | 780/2472 [5:51:42<12:31:34, 26.65s/it]

{'loss': 0.6117, 'learning_rate': 4.290060851926978e-05, 'epoch': 0.95}


 32%|███▏      | 790/2472 [5:56:10<12:34:04, 26.90s/it]

{'loss': 0.7753, 'learning_rate': 4.2647058823529415e-05, 'epoch': 0.96}


 32%|███▏      | 800/2472 [6:00:37<12:18:14, 26.49s/it]

{'loss': 0.7054, 'learning_rate': 4.2393509127789046e-05, 'epoch': 0.97}


 33%|███▎      | 810/2472 [6:05:04<12:18:30, 26.66s/it]

{'loss': 0.7028, 'learning_rate': 4.213995943204868e-05, 'epoch': 0.98}


 33%|███▎      | 812/2472 [6:05:58<12:23:25, 26.87s/it]

ValueError: 'nan' is not in list