In [1]:
import os, torch
import pandas as pd
import numpy as np
from transformers import DistilBertForSequenceClassification, BertForSequenceClassification
from transformers import Trainer, TrainingArguments

# Display the current working directory; the relative '../resources/...' paths
# used later in this notebook are resolved against it.
os.getcwd()

  from .autonotebook import tqdm as notebook_tqdm
  device: Optional[torch.device] = torch.device("cuda"),


'c:\\Users\\T-Gamer\\Documents\\SideDrive\\UFMA\\2022.1\\Topicos Especiais (NLP)\\Exercicios\\Trabalho Final\\Implementação\\source'

In [2]:
def levenshtein(source:str, target:str) -> int :
    """Weighted edit distance between ``source`` and ``target``.

    Insertions and deletions cost 1 each; replacing one character by a
    different one costs 4 (matching characters cost 0). Because a
    delete-plus-insert pair (cost 2) is cheaper than a substitution, the
    result is effectively an insertion/deletion-only distance.

    Args:
        source: String to transform.
        target: String to reach.

    Returns:
        The minimal total cost, read from a NumPy integer table.
    """
    rows, cols = len(source) + 1, len(target) + 1

    # table[r, c] = cost of turning source[:r] into target[:c].
    table = np.zeros((rows, cols), dtype=int)
    table[:, 0] = np.arange(rows)   # delete every character of the prefix
    table[0, :] = np.arange(cols)   # insert every character of the prefix

    for r, src_char in enumerate(source, start=1) :
        for c, tgt_char in enumerate(target, start=1) :
            delete_cost  = table[r - 1, c] + 1
            replace_cost = table[r - 1, c - 1] + (0 if src_char == tgt_char else 4)
            insert_cost  = table[r, c - 1] + 1
            table[r, c] = min(delete_cost, replace_cost, insert_cost)

    return table[rows - 1, cols - 1]
def distance(source:str, target:str) -> float :
    """Edit distance with a small bonus when one string contains the other.

    Computes ``levenshtein(source, target)`` and subtracts 0.5 from it when
    either argument is a substring of the other; otherwise the edit distance
    is returned unchanged.
    """
    edit_cost = levenshtein(source, target)
    one_contains_other = source in target or target in source
    return edit_cost - .5 if one_contains_other else edit_cost

In [9]:
class Dataset(torch.utils.data.Dataset) :
    """Map-style dataset that encodes whitespace-tokenised text into fixed-length tensors.

    Every token is mapped to ``1 + its position in vocabulario``; id 0 is
    reserved for padding. Sequences are truncated or zero-padded to
    ``max_length`` and returned together with an attention mask and the label.

    Args:
        dataframe: Table holding one sample per row.
        vocabulario: Known tokens; a token's id is its first position in this
            list plus one.
        max_length: Length of every returned sequence and mask.
        text_column: Name of the column holding the text.
        label_column: Name of the column holding the label.
        n_classes: When greater than 1 the label is one-hot encoded over
            ``n_classes`` positions; otherwise it is wrapped in a one-element
            list.
    """
    def __init__(self, 
                 dataframe:pd.DataFrame, 
                 vocabulario:list[str],
                 max_length:int=512,
                 text_column:str='text',
                 label_column:str='class',
                 n_classes:int=1) :
        self.dataframe = dataframe
        self.vocabulario = vocabulario
        self.max_length = max_length
        self.text_column = text_column
        self.label_column = label_column
        self.n_classes = n_classes

        # Token -> id table built once, so encoding a token is a dict lookup
        # instead of a linear scan of the vocabulary list. ``setdefault`` keeps
        # the FIRST position of a duplicated token, matching ``list.index``.
        self._token_ids = {}
        for position, token in enumerate(vocabulario, start=1) :
            self._token_ids.setdefault(token, position)

    def __len__(self) :
        """Number of rows in the underlying dataframe."""
        return self.dataframe.shape[0]

    def text_proxessing(self, text:str) :
        """Encode ``text`` as ``(sequence, mask)``, both lists of length ``max_length``.

        The (misspelled) method name is kept so existing callers keep working.

        Raises:
            ValueError: If a token is not part of the vocabulary.
        """
        sequence = []
        for token in text.split() :
            token_id = self._token_ids.get(token)
            if token_id is None :
                # A nearest-vocabulary-word fallback was sketched here in the
                # past but left disabled; unknown tokens are an error.
                raise ValueError(f"token {token!r} is not in the vocabulary")
            sequence.append(token_id)

        # Truncate, then pad with zeros up to max_length.
        sequence = sequence[ : self.max_length]
        n_real = len(sequence)
        n_pad = self.max_length - n_real

        mask = [1] * n_real + [0] * n_pad
        sequence = sequence + [0] * n_pad

        return sequence, mask

    def __getitem__(self, index) :
        """Return the ``input_ids`` / ``attention_mask`` / ``labels`` tensors for one row."""
        row = self.dataframe.iloc[index]  # fetch the row once for both columns
        sequence, mask = self.text_proxessing(row[self.text_column])
        label = row[self.label_column]
        if self.n_classes > 1 :
            target_class = int(label)
            label = [(1 if i == target_class else 0) for i in range(self.n_classes)]
        else :
            label = [label]
        # NOTE(review): the label tensor dtype is inferred from the column
        # values; the loss used downstream may require a specific dtype
        # (e.g. float for regression) -- TODO confirm against the model.
        return {'input_ids'      : torch.tensor(sequence),
                'attention_mask' : torch.tensor(mask),
                'labels'         : torch.tensor(label)}
        
        

In [10]:
def train(dataset:Dataset, embeddings:pd.DataFrame) :
    """Fine-tune DistilBERT for sequence classification on top of custom input embeddings.

    The pretrained 'distilbert-base-uncased' word-embedding table is replaced
    by one built from ``embeddings``, with an all-zero row prepended at index 0
    for padding (token ids produced by ``Dataset`` start at 1).
    ``torch.nn.Embedding.from_pretrained`` is used with its default settings,
    which keep the embedding weights frozen.

    Args:
        dataset: Training samples; ``dataset.n_classes`` sets ``num_labels``.
        embeddings: One row per vocabulary token, one column per embedding
            dimension. The number of columns must equal the model's hidden
            size (``model.config.dim``).

    Returns:
        The trained model.

    Raises:
        ValueError: If ``embeddings`` is not 2-D or its width does not match
            the model's hidden size.
    """
    model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        num_labels=dataset.n_classes
    )

    # Cast to float32: pandas yields float64 by default, which would not match
    # the dtype of the rest of the (float32) model weights.
    embedding_matrix = embeddings.to_numpy(dtype=np.float32)
    if embedding_matrix.ndim != 2 or embedding_matrix.shape[1] != model.config.dim :
        raise ValueError(
            f"embeddings must have shape (vocab, {model.config.dim}), "
            f"got {embedding_matrix.shape}"
        )

    # Row 0 is the padding vector; real tokens occupy rows 1..vocab.
    padding_row = np.zeros((1, embedding_matrix.shape[1]), dtype=np.float32)
    embedding_matrix = np.concatenate([padding_row, embedding_matrix], axis=0)
    embeddings_module = torch.nn.Embedding.from_pretrained(
        embeddings=torch.tensor(embedding_matrix, dtype=torch.float32),
        padding_idx=0
    )
    model.set_input_embeddings(embeddings_module)
    # Keep the config consistent with the new table so saved checkpoints
    # describe the model that was actually trained.
    model.config.vocab_size = embeddings_module.num_embeddings

    # Move the model only after the embedding swap so the new module ends up
    # on the same device as everything else.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)

    training_args = TrainingArguments(
        output_dir='../resources/output',
        num_train_epochs=10,
        per_device_train_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='../resources/logs',
        logging_steps=10
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset
    )
    trainer.train()
    return model

In [5]:
# Location and base name of the dataset files read by this notebook.
dataset_folder = "../resources/datasets/StanfordSentimentTreebank"
dataset_name   = "SST2Processed-train"

# Embedding table: the CSV's first column becomes the index.
embeddings_df = pd.read_csv(
    f"{dataset_folder}/embeddings/{dataset_name}_dim768.csv",
    index_col=0,
)
# Samples of the split, also indexed by the CSV's first column.
dataset_df = pd.read_csv(
    f"{dataset_folder}/split/{dataset_name}.csv",
    index_col=0,
)

In [6]:
# Quick sanity check: (number of rows, number of columns) of the loaded split.
dataset_df.shape

(8544, 3)

In [7]:
# The index of the embeddings table is passed as the vocabulary list, so a
# token's id is its row position in embeddings_df plus one.
dataset = Dataset(dataset_df, embeddings_df.index.tolist())

In [11]:
# Run training with the dataset and the embedding table loaded above.
model = train(dataset, embeddings_df)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at C:\Users\T-Gamer/.cache\huggingface\transformers\23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.21.1",
  "vocab_size": 30522
}

loading weights file https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin from cache at C:\Users