# Γιάννης Δαλιάνης
# 1115201700027
# Homework 3
# Άσκηση 1

[Source](https://github.com/bentrevett/pytorch-sentiment-analysis)

## Imports

In [None]:
import os
import re
import pandas as pd
import time
import random
import numpy as np
import nltk
import spacy
from google.colab import drive
from matplotlib import pyplot as plt
nltk.download('stopwords')
nlp = spacy.load('en')
from sklearn.preprocessing import LabelEncoder, MaxAbsScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_curve, auc, precision_score, recall_score, f1_score, classification_report
import torch   
from torchtext import data, datasets
import torch.optim as optim
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

## Load Dataset and Preprocessing

Drop unnecessary columns.

In [None]:
# Mount Google Drive so the Colab runtime can read the dataset stored there.
drive.mount('/content/gdrive', force_remount=True)

# Load the raw tweets and show which columns the CSV provides.
tweets = pd.read_csv('/content/gdrive/My Drive/DI/Colab Notebooks/ex3/SentimentTweets.csv')
print(tweets.columns)

# Discard the metadata columns and expose the sentiment target as "label".
tweets = (
    tweets
    .drop(columns=['Unnamed: 0', 'id', 'date', 'flag', 'user'])
    .rename(columns={'target': 'label'})
)
tweets

In [None]:
# Work on a subsample of at most 3000 tweets.  The fixed random_state (the same
# value the notebook later uses as SEED) makes the subsample reproducible, and
# capping n at len(tweets) avoids a ValueError on a CSV with fewer rows.
tweets = tweets.sample(n=min(3000, len(tweets)), random_state=1234)

Clean tweet text, remove stopwords and save to new csv file.

In [None]:
# Cast every column to string and lowercase it (this also turns the label
# column into strings; it is converted back to int further down).
tweets = tweets.astype(str).apply(lambda column: column.str.lower())
def cleanText(text):
    """Clean a pandas Series of tweet strings and return the cleaned Series.

    Steps, in order: drop retweet markers, decode '&amp;', drop hashtags,
    mentions and URLs, expand a few contractions, drop escaped control
    sequences, digits, one/two-letter words and punctuation, then collapse
    runs of spaces.  Leading/trailing single spaces are NOT stripped.

    Every pattern is passed with an explicit ``regex=`` argument: the default
    of ``Series.str.replace`` changed to ``regex=False`` in pandas 2.0, which
    would silently turn all of these patterns into literal strings.

    Args:
        text: pandas Series of strings.

    Returns:
        pandas Series of cleaned strings (same index as ``text``).
    """
    def sub(series, pattern, replacement):
        # Single place that makes the regex interpretation explicit.
        return series.str.replace(pattern, replacement, regex=True)

    # Retweet marker. Case-insensitive because the caller lowercases the text
    # first; the word boundary keeps words such as "start" intact.
    text = sub(text, r'(?i)\bRT\s+', '')
    text = text.str.replace('&amp;', '&', regex=False)      # decode '&amp;'
    # '#\S*' / '@\S*' match the same spans as the former lazy look-ahead form.
    text = sub(text, r'#\S*', '')                           # hashtags
    text = sub(text, r'@\S*', '')                           # mentions
    text = sub(text, r'http\S+', '')                        # urls
    text = sub(text, r'www\S+', '')
    # Kept from the original: coerces every value to str (NaN -> 'nan') and
    # cuts anything from a remaining 'https://' onwards.
    text = text.apply(lambda x: re.split(r'https://.*', str(x))[0])
    text = sub(text, r"'re", " are")                        # 're -> are
    text = sub(text, r"'t", " not")                         # 't  -> not
    text = sub(text, r"'d", " would")                       # 'd  -> would
    # Literal backslash sequences ("\n", "\u", "\x") left in the raw text.
    text = sub(text, r'\\n', ' ')
    text = sub(text, r'\\u', ' ')
    text = sub(text, r'\\x', ' ')
    text = sub(text, r'\d+', '')                            # numbers
    text = sub(text, r'\b(\w{1,2})\b', '')                  # 1-2 letter words
    text = sub(text, r'[^\w\s]', '')                        # punctuation
    text = text.apply(lambda x: re.sub(' +', ' ', x))       # collapse spaces
    return text
def remove_stops(row):
  """Return the tokens of ``row`` that are not NLTK English stopwords.

  Args:
      row: iterable of word tokens.

  Returns:
      list of the tokens that are not in the stopword list, in input order.
  """
  # A frozenset gives O(1) membership tests; the NLTK list was being scanned
  # linearly for every token.
  stops = frozenset(nltk.corpus.stopwords.words("english"))
  return [w for w in row if w not in stops]
def rejoin_words(row):
  """Join an iterable of word tokens into one space-separated string."""
  separator = " "
  return separator.join(row)

tweets['text'] = cleanText(tweets['text'])

# Mark tweets that are empty (or a single space) after cleaning as missing.
# The result is assigned back to the column: a chained
# `tweets['text'].replace(..., inplace=True)` operates on a temporary Series
# and is not guaranteed to update the frame (it does not under pandas
# Copy-on-Write).
nan_value = float("NaN")
tweets['text'] = tweets['text'].replace([" ", ""], nan_value)

# Labels arrive as 0 / 4: fold 4 into 1, then name them
# (0 -> "neg", 1 -> "pos").
tweets['label'] = (
    tweets['label']
    .astype(int)
    .replace(4, 1)
    .replace({1: "pos", 0: "neg"})
)

# Stopword removal is deliberately disabled: it did not help sentiment
# classification and takes some time.  To re-enable:
# tweets["text"] = tweets["text"].str.split()
# tweets['text'] = tweets['text'].apply(lambda x: remove_stops(x))
# tweets['text'] = tweets['text'].apply(lambda x: rejoin_words(x))

# Drop the empty tweets and renumber the rows.
tweets = tweets.dropna(subset=['text']).reset_index(drop=True)

# 80/20 train/test split, persisted so the torchtext loaders can read it back.
trainTWEETS, testTWEETS = train_test_split(tweets, test_size=0.2)
print("Value counts for Train sentiments")
print(trainTWEETS.label.value_counts())
print("Value counts for Test sentiments")
print(testTWEETS.label.value_counts())
trainTWEETS.to_csv(r'/content/gdrive/My Drive/DI/Colab Notebooks/ex3/trainTWEETS.csv')
testTWEETS.to_csv(r'/content/gdrive/My Drive/DI/Colab Notebooks/ex3/testTWEETS.csv')
tweets

## Test Data Set

In [None]:
# Read the saved test split back into pandas; the first CSV column is the
# row index written by to_csv.
testTRY = pd.read_csv(
    '/content/gdrive/My Drive/DI/Colab Notebooks/ex3/testTWEETS.csv',
    index_col=0,
)
testTRY

## A GPU can be added by going to the menu and selecting:
## Runtime -> Change runtime type -> Hardware accelerator: GPU

In [None]:
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps into whole minutes and seconds.

    Args:
        start_time: start timestamp in seconds.
        end_time: end timestamp in seconds.

    Returns:
        Tuple ``(minutes, seconds)``, both truncated to int.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds


# Wall-clock reference used at the end of the notebook to report total time.
start_timeTOTAL = time.time()

In [None]:
# Pick the compute device: CPU unless CUDA reports an available GPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

## RNN

An RNN takes in sequence of words, $X=\{x_1, ..., x_T\}$, one at a time, and produces a _hidden state_, $h$, for each word. We use the RNN _recurrently_ by feeding in the current word $x_t$ as well as the hidden state from the previous word, $h_{t-1}$, to produce the next hidden state, $h_t$. 

$$h_t = \text{RNN}(x_t, h_{t-1})$$

We'll be using an RNN architecture called a Long Short-Term Memory (LSTM). Why is an LSTM better than a standard RNN? Standard RNNs suffer from the [vanishing gradient problem](https://en.wikipedia.org/wiki/Vanishing_gradient_problem). LSTMs overcome this by having an extra recurrent state called a _cell_, $c$ - which can be thought of as the "memory" of the LSTM - and by using multiple _gates_ which control the flow of information into and out of the memory. We can simply think of the LSTM as a function of $x_t$, $h_{t-1}$ and $c_{t-1}$, instead of just $x_t$ and $h_{t-1}$.

$$(h_t, c_t) = \text{LSTM}(x_t, h_{t-1}, c_{t-1})$$

The initial cell state, $c_0$, like the initial hidden state is initialized to a tensor of all zeros. The sentiment prediction is still, however, only made using the final hidden state, not the final cell state, i.e. $\hat{y}=f(h_T)$.

The concept behind a bidirectional RNN is simple. As well as having an RNN processing the words in the sentence from the first to the last (a forward RNN), we have a second RNN processing the words in the sentence from the **last to the first** (a backward RNN). At time step $t$, the forward RNN is processing word $x_t$, and the backward RNN is processing word $x_{T-t+1}$. 

In PyTorch, the hidden state (and cell state) tensors returned by the forward and backward RNNs are stacked on top of each other in a single tensor. 

We make our sentiment prediction using a concatenation of the last hidden state from the forward RNN (obtained from final word of the sentence), $h_T^\rightarrow$, and the last hidden state from the backward RNN (obtained from the first word of the sentence), $h_T^\leftarrow$, i.e. $\hat{y}=f(h_T^\rightarrow, h_T^\leftarrow)$

Multi-layer RNNs (also called *deep RNNs*) are another simple concept. The idea is that we add additional RNNs on top of the initial standard RNN, where each RNN added is another *layer*. The hidden state output by the first (bottom) RNN at time-step $t$ will be the input to the RNN above it at time step $t$. The prediction is then made from the final hidden state of the final (highest) layer.

The more parameters you have in your model, the higher the probability that your model will overfit (memorize the training data, causing a low training error but high validation/testing error, i.e. poor generalization to new, unseen examples). To combat this, we use regularization. More specifically, we use a method of regularization called *dropout*. Dropout works by randomly *dropping out* (setting to 0) neurons in a layer during a forward pass. The probability that each neuron is dropped out is set by a hyperparameter and each neuron with dropout applied is considered independently. One theory about why dropout works is that a model with parameters dropped out can be seen as a "weaker" (fewer parameters) model. The predictions from all these "weaker" models (one for each forward pass) get averaged together within the parameters of the model. Thus, your one model can be thought of as an ensemble of weaker models, none of which are over-parameterized and thus should not overfit.

In [None]:
SEED = 1234

# Seed Python, NumPy and PyTorch with the same value, then ask cuDNN for
# deterministic kernels.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)
torch.backends.cudnn.deterministic = True

In [None]:
# TEXT tokenizes with spaCy and also yields each example's length (needed for
# packed sequences); LABEL holds the float sentiment target.
TEXT  = data.Field(tokenize = 'spacy', include_lengths = True)
LABEL = data.LabelField(dtype = torch.float, is_target=True, unk_token=None, batch_first=True)

# Column order of the saved CSVs: the first column is ignored (presumably the
# pandas index written by to_csv), followed by label and text.
fields = [(None, None), ('label', LABEL), ('text',TEXT)]

# Options shared by the train and test loaders, stated once.
_csv_options = dict(format = 'csv', fields = fields, skip_header = True)
training_data = data.TabularDataset(
                          path =  '/content/gdrive/My Drive/DI/Colab Notebooks/ex3/trainTWEETS.csv',
                          **_csv_options
                  )
test_data = data.TabularDataset(
                          path =  '/content/gdrive/My Drive/DI/Colab Notebooks/ex3/testTWEETS.csv',
                          **_csv_options
                  )

# Preview up to five parsed examples; slicing avoids an IndexError when the
# training file holds fewer than five rows.
for example in training_data.examples[:5]:
  print(vars(example))

In [None]:
# 70/30 train/validation split.  random.seed() returns None, so the former
# `random_state=random.seed(SEED)` only worked through its side effect on the
# global generator; seed first and pass the generator state explicitly.
random.seed(SEED)
train_data, valid_data = training_data.split(split_ratio=0.7, random_state=random.getstate())
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of test examples: {len(test_data)}')

These pre-trained vectors already have words with similar semantic meaning close together in vector space, e.g. "terrible", "awful", "dreadful" are nearby. This gives our embedding layer a good initialization as it does not have to learn these relations from scratch.

In [None]:
# Build the vocabularies from the training split only.  TEXT keeps at most
# MAX_VOCAB_SIZE words and attaches GloVe vectors; words without a pre-trained
# vector are initialised with torch.Tensor.normal_.
MAX_VOCAB_SIZE = 35_000
TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE, vectors="glove.6B.100d", unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)

The label-to-index mapping is not fixed between runs: sometimes `neg` is mapped to 0 and sometimes `neg` is mapped to 1.

In [None]:
# Vocabulary sizes.
text_vocab, label_vocab = TEXT.vocab, LABEL.vocab
print(f"Unique tokens in TEXT vocabulary: {len(text_vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(label_vocab)}")

# Most frequent words, the first index-to-string entries, and the label mapping.
print(text_vocab.freqs.most_common(20))
print(text_vocab.itos[:20])
print(label_vocab.stoi)

two possibilities:

defaultdict(<function _default_unk_index at 0x7f588bf2eae8>, {'neg': 0, 'pos': 1})

defaultdict(<function _default_unk_index at 0x7f588bf2eae8>, {'neg': 1, 'pos': 0})

In [None]:
#define hyperparameters
INPUT_DIM       = len(TEXT.vocab)
EMBEDDING_DIM   = 100
HIDDEN_DIM      = 300
OUTPUT_DIM      = 1
N_LAYERS        = 2
BIDIRECTIONAL   = True
DROPOUT         = 0.5
N_EPOCHS        = 50
PAD_IDX         = TEXT.vocab.stoi[TEXT.pad_token]
BATCH_SIZE      = 256

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
      (train_data, valid_data, test_data), 
      batch_size = BATCH_SIZE,
      sort_key = lambda x: len(x.text),
      sort_within_batch=True,
      device = device
    )

In [None]:
# Plain shuffled DataLoader over the training split (not referenced by the
# training loop below, which uses train_iterator).
trainloader = DataLoader(train_data, shuffle=True, batch_size=BATCH_SIZE)

Our three layers are an _embedding_ layer, our RNN, and a _linear_ layer. All layers have their parameters initialized to random values, unless explicitly specified.

The embedding layer is used to transform our sparse one-hot vector (sparse as most of the elements are 0) into a dense embedding vector (dense as the dimensionality is a lot smaller and all the elements are real numbers). This embedding layer is simply a single fully connected layer. As well as reducing the dimensionality of the input to the RNN, there is the theory that words which have similar impact on the sentiment of the review are mapped close together in this dense vector space.

The RNN layer is our RNN which takes in our dense vector and the previous hidden state $h_{t-1}$, which it uses to calculate the next hidden state, $h_t$.

Finally, the linear layer takes the final hidden state and feeds it through a fully connected layer, $f(h_T)$, transforming it to the correct output dimension.

We feed the last hidden state, `hidden`, through the linear layer, `fc`, to produce a prediction.

In [None]:
class RNN(nn.Module):
    """Embedding -> (optionally bidirectional, multi-layer) LSTM -> linear head.

    The linear layer receives the final hidden state of the top LSTM layer;
    for a bidirectional LSTM the final forward and backward states are
    concatenated.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, pad_idx):
        """
        Args:
            vocab_size (int): number of rows in the embedding table.
            embedding_dim (int): size of each embedding vector.
            hidden_dim (int): LSTM hidden size per direction.
            output_dim (int): size of the linear layer's output.
            n_layers (int): number of stacked LSTM layers.
            bidirectional (bool): whether the LSTM runs in both directions.
            dropout (float): dropout probability (embeddings, between LSTM
                layers, and on the final hidden state).
            pad_idx (int): index of the padding token in the embedding table.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        self.rnn = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional,
                           # LSTM dropout only acts between layers; passing 0
                           # for a single layer avoids PyTorch's warning.
                           dropout=dropout if n_layers > 1 else 0.0)

        # The head sees one hidden state per direction.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)

        self.dropout = nn.Dropout(p=dropout)

    def forward(self, text, text_lengths):
        """Return one row of ``output_dim`` logits per sequence in the batch.

        Args:
            text: token indices; the model is used sequence-first
                (nn.LSTM default), i.e. (seq_len, batch).
            text_lengths: true length of each sequence, sorted in decreasing
                order as pack_padded_sequence requires by default.
        """
        embedded = self.dropout(self.embedding(text))

        # pack_padded_sequence needs the lengths on the CPU; .cpu() is a no-op
        # for CPU tensors, so no device branch is required.
        lengths = text_lengths.cpu() if torch.is_tensor(text_lengths) else text_lengths
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths)

        # Only the final hidden state is used; the per-step outputs are not
        # unpacked because nothing reads them.
        _, (hidden, _) = self.rnn(packed_embedded)

        if self.rnn.bidirectional:
            # Last two entries: top layer, forward and backward directions.
            final_hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            final_hidden = hidden[-1, :, :]
        return self.fc(self.dropout(final_hidden))

In [None]:
# Instantiate the network from the hyperparameters defined above and show
# its layer structure.
model = RNN(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)
print(model)
def count_parameters(model):
    """Return the number of scalar elements in the parameters of ``model`` that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
# Report the count returned by count_parameters, with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

We'll be using *packed padded sequences*, which will make our RNN only process the non-padded elements of our sequence, and for any padded element the `output` will be a zero tensor.

Copying the pre-trained word embeddings we loaded earlier into the `embedding` layer of our model.

We then replace the initial weights of the `embedding` layer with the pre-trained embeddings.

As our `<unk>` and `<pad>` token aren't in the pre-trained vocabulary they have been initialized using `unk_init` (an $\mathcal{N}(0,1)$ distribution) when building our vocab. It is preferable to initialize them both to all zeros to explicitly tell our model that, initially, they are irrelevant for determining sentiment. 

We do this by manually setting their row in the embedding weights matrix to zeros. We get their row by finding the index of the tokens, which we have already done for the padding index.

In [None]:
# Overwrite the randomly initialised embedding table with the vectors that
# were attached to the TEXT vocabulary.
pretrained_embeddings = TEXT.vocab.vectors
print(pretrained_embeddings.shape)
embedding_weights = model.embedding.weight.data
embedding_weights.copy_(pretrained_embeddings)

# Zero the rows of the <unk> and <pad> tokens.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
for special_idx in (UNK_IDX, PAD_IDX):
    embedding_weights[special_idx] = torch.zeros(EMBEDDING_DIM)
print(embedding_weights)

In [None]:
# Adam over all model parameters with a small learning rate.
optimizer = optim.Adam(params=model.parameters(), lr=1e-4)
print(optimizer)

# Move the model and the loss to the selected device.  BCEWithLogitsLoss
# applies the sigmoid itself, so the model outputs raw logits.
model = model.to(device)
criterion = nn.BCEWithLogitsLoss().to(device)

# Metric
def binary_accuracy(preds, y):
    """Return the fraction of correct predictions in a batch (e.g. 0.8, not 8).

    Args:
        preds: raw logits.
        y: target values (0. or 1.) of the same shape.

    Returns:
        Scalar tensor: matches divided by the length of the first dimension.
    """
    predicted_classes = torch.sigmoid(preds).round()
    matches = predicted_classes.eq(y).float()  # float so the division is not integer
    return matches.sum() / len(matches)

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch and return ``(mean_loss, mean_accuracy)``.

    Both values are averaged over the number of batches in ``iterator``.
    Gradients are clipped to a total norm of 4.0 before each optimizer step.
    """
    model.train()
    running_loss = 0
    running_acc = 0
    for batch in iterator:
        tokens, lengths = batch.text
        optimizer.zero_grad()

        logits = model(tokens, lengths).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)

        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 4.0)
        optimizer.step()

        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [None]:
def evaluate(model, iterator, criterion):
    """Evaluate ``model`` on ``iterator`` and return ``(mean_loss, mean_accuracy)``.

    The model is switched to eval mode and no gradients are tracked; both
    values are averaged over the number of batches.
    """
    model.eval()
    running_loss = 0
    running_acc = 0
    with torch.no_grad():
        for batch in iterator:
            tokens, lengths = batch.text
            logits = model(tokens, lengths).squeeze(1)
            running_loss += criterion(logits, batch.label).item()
            running_acc += binary_accuracy(logits, batch.label).item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

Early stopping implementation from https://github.com/Bjarten/early-stopping-pytorch/blob/master/pytorchtools.py

In [None]:
class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""

    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt', trace_func=print):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved. Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement. Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement. Default: 0
            path (str): Path for the checkpoint to be saved to. Default: 'checkpoint.pt'
            trace_func (function): trace print function. Default: print
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0            # consecutive calls without improvement
        self.best_score = None      # best (negated) validation loss seen so far
        self.early_stop = False     # set once `counter` reaches `patience`
        # np.Inf was removed in NumPy 2.0; np.inf works on every NumPy version.
        self.val_loss_min = np.inf
        self.delta = delta
        self.path = path
        self.trace_func = trace_func

    def __call__(self, val_loss, model):
        """Record one validation loss; checkpoint on improvement, otherwise count towards stopping."""
        # Negate so that "higher is better".
        score = -val_loss

        if self.best_score is None:
            # First call: always checkpoint.
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            # Did not improve by more than delta.
            self.counter += 1
            self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [None]:
# Per-epoch history used by the plots below.
train_losses  = []
val_losses    = []
train_accL  = []
val_accL    = []
iter = []  # NOTE: shadows the builtin iter(); name kept because the plotting cells read it

early_stopping = EarlyStopping(patience=4, verbose=True)

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.4f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.4f} |  Val. Acc: {valid_acc*100:.2f}%\n')

    train_losses.append(train_loss)
    val_losses.append(valid_loss)
    train_accL.append(train_acc)
    val_accL.append(valid_acc)
    iter.append(epoch)

    # early_stopping needs the validation loss to check if it has decreased,
    # and if it has, it will make a checkpoint of the current model
    early_stopping(valid_loss, model)
    if early_stopping.early_stop:
        print("Early stopping")
        break

# Restore the checkpoint with the lowest validation loss.  Without this the
# model keeps the weights of the last epoch, which after early stopping is
# `patience` epochs past the best one.
if train_losses:
    model.load_state_dict(torch.load(early_stopping.path, map_location=device))

plt.plot(train_losses, label='Training loss')
plt.plot(val_losses, label='Validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

If no early stopping used, red line is on last epoch

In [None]:
fig = plt.figure(figsize=(10,8))

# Loss curves with epochs numbered from 1.
for curve, curve_label in ((train_losses, 'Training Loss'), (val_losses, 'Validation Loss')):
    plt.plot(range(1, len(curve) + 1), curve, label=curve_label)

# Mark the (1-based) epoch with the lowest validation loss.
minposs = val_losses.index(min(val_losses)) + 1
plt.axvline(minposs, linestyle='--', color='r', label='Early Stopping Checkpoint')

plt.xlabel('Epochs')
plt.ylabel('Loss')

plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(10,4))

# Left panel: training loss per epoch index.
plt.subplot(1,2,1)
plt.title("Training Curve")
plt.plot(iter, train_losses, label="Train")
plt.xlabel("Iterations")
plt.ylabel("Loss")

# Right panel: training and validation accuracy per epoch index.
plt.subplot(1,2,2)
plt.title("Accuracy Curve")
for accuracy_history, accuracy_label in ((train_accL, "Train"), (val_accL, "Validation")):
    plt.plot(iter, accuracy_history, label=accuracy_label)
plt.xlabel("Iterations")
plt.ylabel("Training Accuracy")
plt.legend(loc='best')
plt.show()

# Accuracies recorded for the last epoch that ran.
final_train_acc, final_val_acc = train_accL[-1], val_accL[-1]
print("Final Training Accuracy: {}".format(final_train_acc))
print("Final Validation Accuracy: {}".format(final_val_acc))

In [None]:
# Report performance on the held-out test split.  The previous version passed
# valid_iterator, so the printed "Test" numbers were validation numbers.
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

In [None]:
def predictFIRST(model, sentence, flag=False):
    """Return the sigmoid of the model's output for a single sentence.

    Args:
        model: trained network, called as ``model(tensor, lengths)``.
        sentence: raw string, or an already tokenized list of strings when
            ``flag`` is truthy.
        flag (bool): if truthy, ``sentence`` is used as the token list as-is;
            otherwise it is tokenized with the spaCy tokenizer ``nlp``.

    Returns:
        float in (0, 1).

    Raises:
        ValueError: if the sentence yields no tokens (the packed-sequence
            LSTM cannot process a zero-length input).
    """
    model.eval()
    tokenized = sentence if flag else [tok.text for tok in nlp.tokenizer(sentence)]
    if not tokenized:
        raise ValueError("predictFIRST needs at least one token to score")

    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    # Shape (seq_len, 1): one sequence, sequence-first as the LSTM expects.
    tensor = torch.LongTensor(indexed).unsqueeze(1).to(device)
    length_tensor = torch.LongTensor([len(indexed)])

    # Inference only: no need to build an autograd graph.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor, length_tensor))

    return prediction.item()

In [None]:
# Score a few hand-written sentences.  The printed text is unchanged; the old
# literals with four consecutive quotes were implicit concatenations that
# produced exactly these strings.
terrible_score = predictFIRST(model, "This film is terrible")
print("For text: This film is terrible ->", terrible_score)

# PRED_VAL is the integer that the label vocabulary assigns to "neg".  It used
# to be inferred by rounding the prediction above, which silently flips every
# label whenever the model happens to misclassify that one sentence.
PRED_VAL = LABEL.vocab.stoi["neg"]
print( PRED_VAL )
if round(terrible_score) != PRED_VAL:
    print("Warning: the model does not classify 'This film is terrible' as negative")

print("For text: This film is great ->", predictFIRST(model, "This film is great"))
# neg
print("For text: david carradine sad thai law sure fowl play ->", predictFIRST(model, "david carradine sad thai law sure fowl play"))
# pos
print("For text: tell bro say congrats ->", predictFIRST(model, "tell bro say congrats"))

In [None]:
def evaluate_roc(probs, y_true):
    """Print the AUC and plot the ROC curve.

    Args:
        probs: predicted scores, passed to ``roc_curve`` as ``y_score``.
        y_true: true binary labels.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, probs)
    roc_auc = auc(false_pos_rate, true_pos_rate)
    print(f'AUC: {roc_auc:.4f}')

    # ROC curve against the chance diagonal.
    plt.title('Receiver Operating Characteristic')
    plt.plot(false_pos_rate, true_pos_rate, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    unit_range = [0, 1]
    plt.xlim(unit_range)
    plt.ylim(unit_range)
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

In [None]:
def writePredsSECOND(row):
  """Map a probability to "neg" when it rounds to PRED_VAL, otherwise "pos"."""
  return "neg" if round( row ) == PRED_VAL else "pos"
def writeProbs(row):
  """Return the probability that predictFIRST assigns to the text ``row``."""
  probability = predictFIRST(model, row)
  return probability
def ProbsToPreds(row):
  """Round a probability to the nearest integer class (Python round(): ties go to even)."""
  predicted_class = round( row )
  return predicted_class
def LabelTOINT(row):
  """Map "neg" to PRED_VAL and any other label to 1 - PRED_VAL."""
  return PRED_VAL if row == "neg" else 1 - PRED_VAL

def computeDFAccuracySECOND(df):
  """Score every row of ``df`` with the model, print metrics and plot the ROC curve.

  Adds the columns 'Probs', 'Preds', 'PredsINTS' and 'labelINT' to ``df``
  (the frame is modified in place) and returns it.

  Args:
      df: DataFrame with a 'text' column and a 'label' column ("neg"/"pos").
  """
  df['Probs'] = df['text'].apply(writeProbs)
  df['Preds'] = df['Probs'].apply(writePredsSECOND)
  df['PredsINTS'] = df['Probs'].apply(ProbsToPreds)
  df['labelINT'] = df['label'].apply(LabelTOINT)

  y_true, y_pred = df['label'], df['Preds']
  print(classification_report(y_true, y_pred))
  print("Accuracy Score -> ",   accuracy_score(y_true, y_pred))
  print("Precision Score -> ",  precision_score(y_true, y_pred, average='macro')*100)  # warnings for small epochs
  print("Recall Score -> ",     recall_score(y_true, y_pred, average='macro')*100)
  print("F-Measure Score -> ",  f1_score(y_true, y_pred, average='macro')*100)

  # roc_curve needs integer labels
  evaluate_roc(df['Probs'], df['labelINT'])

  return df

# Score the test tweets, print the metrics / ROC curve, and display the frame
# with the columns that computeDFAccuracySECOND added.
testTRY = computeDFAccuracySECOND(testTRY)
testTRY

## Time Needed

In [None]:
# Total wall-clock time since start_timeTOTAL was recorded.
end_timeTOTAL = time.time()
mins, secs = epoch_time(start_time=start_timeTOTAL, end_time=end_timeTOTAL)
print(f'Total Time: {mins}m {secs}s')