In [1]:
import numpy as np
import pandas as pd
import os, json, math

# Show the working directory; the relative "../resources" paths used below are resolved against it.
# NOTE(review): the saved output of this cell exposes an absolute local path — consider clearing it before sharing.
os.getcwd()

'c:\\Users\\T-Gamer\\Documents\\SideDrive\\UFMA\\2022.1\\Topicos Especiais (NLP)\\Exercicios\\Trabalho Final\\Implementação\\source'

In [2]:
import torch

In [3]:
from transformers import AutoModelForSequenceClassification

In [4]:
def levenshtein(source:str, target:str) -> int :
    """Weighted edit distance between ``source`` and ``target``.

    Insertions and deletions cost 1 and a substitution costs 4. Since a
    deletion followed by an insertion (1 + 1 = 2) is cheaper than one
    substitution, a substitution is never the cheapest way to resolve a
    mismatch; mismatching characters are effectively charged 2.

    Args:
        source: String to transform.
        target: String to reach.

    Returns:
        The minimal total edit cost as a built-in ``int``.
    """
    insert_cost, delete_cost, subst_cost = 1, 1, 4

    # Only the previous row of the DP table is ever read, so keep two rows of
    # plain Python ints instead of a full numpy matrix.
    # Row 0: cost of building each prefix of ``target`` from the empty string.
    previous = [j * insert_cost for j in range(len(target) + 1)]
    for i, source_char in enumerate(source, start=1) :
        # Column 0: cost of deleting the first ``i`` characters of ``source``.
        current = [i * delete_cost]
        for j, target_char in enumerate(target, start=1) :
            replace = previous[j - 1] + (0 if source_char == target_char else subst_cost)
            current.append(min(previous[j] + delete_cost,
                               replace,
                               current[j - 1] + insert_cost))
        previous = current
    return previous[-1]
def distance(source:str, target:str) -> float :
    """Edit distance with a small bonus for substring matches.

    Computes ``levenshtein(source, target)`` and subtracts 0.5 when either
    string is contained in the other; otherwise the edit distance is returned
    unchanged.
    """
    base = levenshtein(source, target)
    one_contains_other = source in target or target in source
    return base - .5 if one_contains_other else base

In [5]:
class Dataset(torch.utils.data.Dataset) :
    """Token-id dataset built from a dataframe of whitespace-separated sentences.

    Every token is mapped to its position in ``vocabulario``. Tokens that are
    not in the vocabulary are mapped to the id of the closest known word
    according to the module-level ``distance`` function; those substitutions
    are recorded in ``self.new_words``.

    Items are dicts with the keys ``sentence_ids``, ``input_ids``,
    ``attention_mask`` and ``labels``. Indexing with an integer returns one
    example, indexing with a slice returns a stacked batch.
    """
    def __init__(self, 
                 dataframe:pd.DataFrame, 
                 vocabulario:list[str],
                 translation_dict:dict[str,str]=None,
                 max_length:int=512,
                 text_column:str='text',
                 label_column:str='class',
                 n_classes:int=1) :
        """Build the token -> id table, resolving out-of-vocabulary tokens.

        Args:
            dataframe: Source of the examples, one row per sentence.
            vocabulario: Known words; a word's id is its position in this list.
            translation_dict: Optional ``{unknown_word: known_word}`` mapping
                applied before the nearest-word search. It is copied, so the
                caller's dict is left untouched.
            max_length: Length every sequence is truncated or padded to.
            text_column: Column of ``dataframe`` holding the sentence text.
            label_column: Column of ``dataframe`` holding the label.
            n_classes: Stored as ``self.n_classes``; not used inside the class.

        Raises:
            KeyError: If a value of ``translation_dict`` is not a known word.
        """
        self.dataframe = dataframe
        self.vocabulario_original = vocabulario
        self.vocabulario_dict = {word : i for i, word in enumerate(self.vocabulario_original)}

        # Distinct tokens of the dataframe, in order of first appearance.
        vocabulario_real = list(dict.fromkeys(
            token
            for txt in self.dataframe[text_column]
            for token in txt.split()
        ))

        # Work on a copy: words discovered below must not leak into the caller's dict.
        self.new_words = {} if translation_dict is None else dict(translation_dict)
        for new, translation in self.new_words.items() :
            self.vocabulario_dict[new] = self.vocabulario_dict[translation]

        for tkn in vocabulario_real :
            if tkn not in self.vocabulario_dict :
                # min() returns the first closest word in insertion order, which
                # is what a stable sort followed by [0] would select, without
                # sorting the whole vocabulary for every unknown token.
                nearest = min(self.vocabulario_dict, key=lambda known : distance(tkn, known))
                print(tkn, "to", nearest)
                self.new_words[tkn] = nearest
                self.vocabulario_dict[tkn] = self.vocabulario_dict[nearest]
        print("No. new words:", len(self.new_words))

        self.max_length = max_length
        self.text_column = text_column
        self.label_column = label_column
        self.n_classes = n_classes

    def __len__(self) :
        """Number of rows in the underlying dataframe."""
        return self.dataframe.shape[0]

    def text_processing(self, text:str) :
        """Convert ``text`` into fixed-length id and mask lists.

        Returns:
            ``(sequence, mask)``, two lists of length ``self.max_length``.
            ``sequence`` holds vocabulary ids shifted by one so that 0 is free
            for padding; ``mask`` is 1 on real tokens and 0 on padding.

        Raises:
            KeyError: If a token is missing from ``self.vocabulario_dict``.
        """
        sequence = [self.vocabulario_dict[token] + 1 for token in text.split()]
        sequence = sequence[ : self.max_length]

        n_padding = self.max_length - len(sequence)
        mask = [1] * len(sequence) + [0] * n_padding
        sequence = sequence + [0] * n_padding

        return sequence, mask

    @staticmethod
    def _stack(tensors) :
        """Stack per-example tensors into a batch; an empty list yields an empty tensor."""
        # torch.stack() rejects an empty list, so keep returning torch.tensor([]) there.
        return torch.stack(tensors) if tensors else torch.tensor([])

    def __getitem__(self, index) :
        """Return one example (integer index) or a stacked batch (slice).

        Slices follow regular Python semantics, including negative bounds,
        a step, and bounds past the end of the dataset.

        Raises:
            IndexError: If ``index`` is neither an integer nor a slice, or if
                an integer index is out of range.
        """
        if isinstance(index, slice) :
            items = [self[i] for i in range(*index.indices(len(self)))]
            return {'sentence_ids'   : [item['sentence_ids'] for item in items],
                    'input_ids'      : self._stack([item['input_ids'] for item in items]),
                    'attention_mask' : self._stack([item['attention_mask'] for item in items]),
                    'labels'         : self._stack([item['labels'] for item in items])}

        # Accept numpy integers too; bool stays rejected although it subclasses int.
        if isinstance(index, (int, np.integer)) and not isinstance(index, bool) :
            row = self.dataframe.iloc[index]
            sequence, mask = self.text_processing(row[self.text_column])
            return {
                    'sentence_ids'   : row.name,
                    'input_ids'      : torch.tensor(sequence),
                    'attention_mask' : torch.tensor(mask),
                    'labels'         : torch.tensor([row[self.label_column]])
                }

        raise IndexError(f"unsupported index type: {type(index).__name__}")
        

In [11]:
# For a simple dataset, without folding
# Arguments: everything the cells below read to locate files and size batches.
# Root folder (relative to the working directory) of the embeddings, datasets and output folders.
resource_folder = "../resources"
# Sub-folder of the dataset inside the embeddings/ and datasets/ folders.
dataset_folder = "StanfordSentimentTreebank"
# File-name prefix shared by the split CSVs, the embeddings CSV and the output folders.
dataset_name   = "SST2Processed2"
# Which split CSV is evaluated.
dataset_split   = "test"
# Number of sentences sent through the model per forward pass.
batch_size = 16

In [14]:
# Exec
# The index of the training embeddings table is used as the model vocabulary.
embeddings_df = pd.read_csv(f"{resource_folder}/embeddings/{dataset_folder}/{dataset_name}-train_dim768.csv",        index_col=0)
dataset_df    = pd.read_csv(f"{resource_folder}/datasets/{dataset_folder}/split/{dataset_name}-{dataset_split}.csv", index_col=0)

# Built from resource_folder so it stays next to the checkpoint folder read later.
output_dir    = f"{resource_folder}/output/{dataset_name}-Eval"

# Reuse a previously saved {unknown word: known word} mapping when it exists,
# so the nearest-word search does not have to be repeated.
translation_dict_path = f"{resource_folder}/datasets/{dataset_folder}/split/{dataset_name}-test-new-words.json"
if os.path.exists(translation_dict_path) :
    with open(translation_dict_path, "r", encoding="utf-8") as f :
        new_words = json.load(f)
else :
    new_words = None
dataset = Dataset(dataset_df, embeddings_df.index.tolist(), new_words)

wasabi to wasabi's
showed to show
curls to curse
fed to ed
ponders to ponder
it' to it
author to author's
enrapturing to featuring
tempt to attempt
inscrutable to suitable
bleakly to bleak
professionalism to professionals
latent to late
gangsta to angst
shook to hook
rattled to rated
deceptive to deceptively
grimness to dimness
fatalist to fatalism
worldview to world
lobbies to lies
roadside to broadside
cafes to cares
permeate to permeates
outings to outing
cartoonlike to cartoon
magnet to magnetic
fuller to full
karmen to karen
chanting to enchanting
braided to aided
wipe to swipe
jeweled to welled
beads to ads
lacerating to lactating
old' to old
50's to 50s
cheesiness to cheesiest
johnson's to son's
orchestrates to orchestrated
siberian to brian
sheep to she
hosts to host
parka to park
everytime to every
rips to rip
1998's to 1998
uncontrolled to controlled
farther to father
quietness to quiet
buildup to build
maverick to eric
shadings to shaking
coloring to color
treebeard to beard

In [15]:
# Persist the unknown-word substitutions so the next run can load them instead
# of searching again. Writes to the same path the loading cell reads from.
with open(translation_dict_path, "w", encoding="utf-8") as f :
    json.dump(dataset.new_words, f, indent=4)

In [18]:
# Load the sequence-classification model saved at step 1500 of the training run.
# num_labels comes from the dataset (n_classes defaults to 1, i.e. a single output logit).
model = AutoModelForSequenceClassification.from_pretrained(
    f"{resource_folder}/output/{dataset_name}-train/checkpoint-1500",
    num_labels=dataset.n_classes
)

  t = torch.tensor([], dtype=storage.dtype, device=storage._untyped().device)


In [19]:
# Use the first CUDA device when one is available, otherwise stay on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# As the last expression of the cell, this also displays the model architecture.
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(16176, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [20]:
# Run the model over the whole dataset in batches and collect one score per sentence.
sentence_ids, predictions = [], []
n_batches = math.ceil(len(dataset) / batch_size)

# Make sure the Dropout layers are disabled while predicting.
model.eval()
# Inference only: skip building the autograd graph.
with torch.no_grad() :
    for batch in range(n_batches) :
        print("Batch", batch + 1, "of", n_batches)
        input_dict = dataset[batch * batch_size : (batch + 1) * batch_size]
        # The inputs must live on the same device the model was moved to.
        preds = model(input_ids=input_dict['input_ids'].to(device),
                      attention_mask=input_dict['attention_mask'].to(device))
        sentence_ids += input_dict['sentence_ids']
        # First (and, with num_labels=1, only) logit of every sentence.
        predictions  += preds['logits'][:, 0].tolist()

# Indexed by the dataframe index labels so it can be aligned with dataset_df.
prediction_column = pd.Series(predictions, index=sentence_ids, name="predictions")

Batch 1 of 139
Batch 2 of 139
Batch 3 of 139
Batch 4 of 139
Batch 5 of 139
Batch 6 of 139
Batch 7 of 139
Batch 8 of 139
Batch 9 of 139
Batch 10 of 139
Batch 11 of 139
Batch 12 of 139
Batch 13 of 139
Batch 14 of 139
Batch 15 of 139
Batch 16 of 139
Batch 17 of 139
Batch 18 of 139
Batch 19 of 139
Batch 20 of 139
Batch 21 of 139
Batch 22 of 139
Batch 23 of 139
Batch 24 of 139
Batch 25 of 139
Batch 26 of 139
Batch 27 of 139
Batch 28 of 139
Batch 29 of 139
Batch 30 of 139
Batch 31 of 139
Batch 32 of 139
Batch 33 of 139
Batch 34 of 139
Batch 35 of 139
Batch 36 of 139
Batch 37 of 139
Batch 38 of 139
Batch 39 of 139
Batch 40 of 139
Batch 41 of 139
Batch 42 of 139
Batch 43 of 139
Batch 44 of 139
Batch 45 of 139
Batch 46 of 139
Batch 47 of 139
Batch 48 of 139
Batch 49 of 139
Batch 50 of 139
Batch 51 of 139
Batch 52 of 139
Batch 53 of 139
Batch 54 of 139
Batch 55 of 139
Batch 56 of 139
Batch 57 of 139
Batch 58 of 139
Batch 59 of 139
Batch 60 of 139
Batch 61 of 139
Batch 62 of 139
Batch 63 of 139
B

In [21]:
# Append the predictions as a new column; rows are aligned on the index, which
# prediction_column shares with dataset_df (it was built from the row labels).
dataset_with_preds_df = pd.concat([dataset_df, prediction_column], axis=1)

In [22]:
# Create the output folder if needed (no separate existence check) and save
# the dataset together with its predictions.
os.makedirs(output_dir, exist_ok=True)
dataset_with_preds_df.to_csv(f"{output_dir}/Predictions.csv")

In [23]:
# Display the labelled sentences next to their predicted scores.
dataset_with_preds_df

Unnamed: 0,text,phrase_id,class,predictions
3,<start> effective but too tepid biopic,13995,0.513890,0.356394
4,<start> if you sometimes like to go to the mov...,14123,0.736110,0.739910
5,<start> emerges as something rare an issue mov...,13999,0.861110,0.601264
6,<start> the film provides some great insight i...,14498,0.597220,0.697847
7,<start> offers that rare combination of entert...,14351,0.833330,0.775601
...,...,...,...,...
11621,<start> an imaginative comedythriller,13851,0.777780,0.768816
11623,<start> a rare beautiful film,18182,0.916670,0.827225
11626,<start> an hilarious romantic comedy,23211,0.888890,0.738332
11628,<start> never sinks into exploitation,26177,0.625000,0.232675


In [24]:
# Accuracy after binarising both the prediction and the gold score at 0.5.
pred  = dataset_with_preds_df["predictions"] >= .5
label = dataset_with_preds_df["class"]       >= .5
# Column-wise comparison instead of a Python loop over iterrows().
hits  = int((pred == label).sum())
total = len(dataset_with_preds_df)
accuracy = hits / total
accuracy

0.7330316742081447

In [25]:
# Mean absolute deviation between the gold score and the predicted score.
# `hits` accumulates the absolute differences, `total` counts the rows.
hits, total = 0, 0
for label, pred in zip(dataset_with_preds_df["class"], dataset_with_preds_df["predictions"]) :
    hits  += abs(label - pred)
    total += 1
avg_dev = hits / total
avg_dev

0.1626037349799219

In [26]:
# Confusion-matrix counts with both prediction and gold score binarised at 0.5.
pred  = dataset_with_preds_df["predictions"] >= .5
label = dataset_with_preds_df["class"]       >= .5
# Boolean masks replace the row-by-row iterrows() loop; the counts are the same.
true_positives  = int(( pred &  label).sum())
true_negatives  = int((~pred & ~label).sum())
false_positives = int(( pred & ~label).sum())
false_negatives = int((~pred &  label).sum())
print(true_positives, true_negatives, false_positives, false_negatives)

694 926 173 417


In [27]:
# Precision: share of positive predictions that are actually positive.
# Raises ZeroDivisionError if nothing was predicted positive.
precision = true_positives / (true_positives + false_positives)
precision

0.8004613610149942

In [28]:
# Recall: share of actual positives that were predicted positive.
# Raises ZeroDivisionError if the data contains no positive example.
recall = true_positives / (true_positives + false_negatives)
recall

0.6246624662466247

In [29]:
# F1 score: harmonic mean of precision and recall.
f1 = (2 * precision * recall) / (precision + recall)
f1

0.7017189079878666