In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# !pip install transformers -qq
# !pip install sentencepiece -qq
# !pip install tokenizer -qq

## Load Dataset

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
def to_df(x, y):
    """Pair texts with their labels in a two-column DataFrame.

    Args:
        x: Values for the ``"text"`` column.
        y: Values for the ``"label"`` column.

    Returns:
        A ``pd.DataFrame`` with the columns ``"text"`` and ``"label"``.
    """
    return pd.DataFrame({"text": x, "label": y})

In [None]:
import random
from itertools import chain
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
from nlpaug.util import Action


# Shared augmentation rates: `alpha_common_error` is passed to every augmenter
# as its word-level rate (`aug_word_p` / `aug_p`), `alpha_common_error_char`
# as the character-level rate (`aug_char_p`) of the RandomCharAug variants.
alpha_common_error = 0.10
alpha_common_error_char = 0.05
# Seven nlpaug augmenters, read as module-level globals by `text2augment`.
# The RandomCharAug variants touch exactly one character per selected word
# (aug_char_min = aug_char_max = 1).
aug1_OCR = nac.OcrAug(aug_word_p=alpha_common_error)
aug2_Rins = nac.RandomCharAug(action="insert", aug_word_p=alpha_common_error, aug_char_min=1, aug_char_max=1, aug_char_p=alpha_common_error_char)
aug3_Rsub = nac.RandomCharAug(action="substitute", aug_word_p=alpha_common_error, aug_char_min=1, aug_char_max=1, aug_char_p=alpha_common_error_char)
aug4_Rswa = nac.RandomCharAug(action="swap", aug_word_p=alpha_common_error,aug_char_min=1, aug_char_max=1, aug_char_p=alpha_common_error_char) #
aug5_Rdel = nac.RandomCharAug(action="delete", aug_word_p=alpha_common_error, aug_char_min=1, aug_char_max=1, aug_char_p=alpha_common_error_char)
aug6_Kb = nac.KeyboardAug(aug_word_p=alpha_common_error)
aug7_Split = naw.SplitAug(aug_p=alpha_common_error)


def text2augment(text, m):
    """Return ``text`` followed by ``m - 1`` noisy variants of it.

    ``m - 1`` distinct augmenters are drawn at random, without replacement,
    from the seven module-level nlpaug augmenters. The selected ones are then
    applied in a fixed order: OCR, insert, substitute, swap, delete, keyboard,
    split.

    The result always contains exactly ``m`` strings, so it stays aligned
    with the label replication done by ``aug_replicate``.

    Args:
        text: The original sentence.
        m: Total number of strings to return, the original included (1 to 8).

    Returns:
        A list of ``m`` strings whose first element is the unmodified ``text``.

    Raises:
        ValueError: If ``m - 1`` is negative or exceeds the number of
            augmenters (raised by ``random.sample``).
    """
    augmenters = (aug1_OCR, aug2_Rins, aug3_Rsub, aug4_Rswa,
                  aug5_Rdel, aug6_Kb, aug7_Split)

    def _single_variant(augmenter):
        # `augment` returns a list in recent nlpaug releases and a plain string
        # in older ones. The previous `append(*result)` raised TypeError unless
        # the result unpacked to exactly one item. Normalise to one string and
        # fall back to the original text when nothing was produced.
        result = augmenter.augment(text)
        if isinstance(result, str):
            return result
        if not result:
            return text
        return result[0]

    chosen = set(random.sample(range(len(augmenters)), m - 1))

    output = [text]
    output.extend(
        _single_variant(augmenter)
        for position, augmenter in enumerate(augmenters)
        if position in chosen
    )
    return output


def _copies_for_label(y):
    """Number of texts (original + augmented) kept per sample: 2 for label 0, else 4."""
    return 2 if y == 0 else 4


def aug_replicate(y_labels):
    """Repeat every label as often as ``aug_text`` expands its sentence.

    Args:
        y_labels: Iterable of labels.

    Returns:
        A flat list in which each label appears ``_copies_for_label(label)``
        times, in the original order.
    """
    return list(chain.from_iterable([y] * _copies_for_label(y) for y in y_labels))


def aug_text(x_text, y_labels):
    """Expand every sentence into its original plus augmented variants.

    The number of variants per sentence comes from ``_copies_for_label``, the
    same rule ``aug_replicate`` uses, so texts and labels stay aligned.

    Args:
        x_text: Iterable of sentences.
        y_labels: Iterable of labels, parallel to ``x_text``.

    Returns:
        A ``pd.Series`` (default integer index) with all variants flattened in
        input order.
    """
    expanded = [text2augment(x, _copies_for_label(y)) for x, y in zip(x_text, y_labels)]
    return pd.Series(list(chain.from_iterable(expanded)), index=None)

def split_3_aug(df, test_size=0.2, valid_size=0.2, random_state=None):
    """Shuffle ``df``, split it into train/valid/test and augment train and valid.

    Both splits are stratified on ``label``. The test split is deliberately
    left un-augmented. Train and validation are expanded with ``aug_text`` /
    ``aug_replicate`` after splitting.

    Args:
        df: DataFrame with at least the columns ``"text"`` and ``"label"``.
        test_size: Fraction of all rows held out for the test split.
        valid_size: Fraction of the remaining (non-test) rows used for
            validation.
        random_state: Optional seed forwarded to the shuffle and to both
            ``train_test_split`` calls. ``None`` (default) keeps the previous
            unseeded behaviour. The text augmentation has its own randomness
            and is not controlled by this argument.

    Returns:
        ``(train_df, valid_df, test_df)``, each with the columns ``"text"``
        and ``"label"``.
    """
    # drop=True discards the old index instead of materialising it as an
    # "index" column that is thrown away on the next line anyway.
    _df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    _df = _df[["text", "label"]]

    x = _df["text"].copy()
    y = _df["label"].copy()
    # split train-test
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, stratify=y, random_state=random_state)
    # split train-valid
    x_train, x_valid, y_train, y_valid = train_test_split(
        x_train, y_train, test_size=valid_size, stratify=y_train, random_state=random_state)
    # augment train and valid only
    x_train = aug_text(x_train, y_train)
    y_train = aug_replicate(y_train)
    x_valid = aug_text(x_valid, y_valid)
    y_valid = aug_replicate(y_valid)

    print(x_valid.shape)
    print("DONE")
    print(len(y_valid))

    print(x_train.shape)
    print("DONE")
    print(len(y_train))

    return to_df(x_train, y_train), to_df(x_valid, y_valid), to_df(x_test, y_test)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import numpy as np
import pandas as pd

# Path of the input CSV, without the ".csv" extension.
tname_data = "./hsd_merge_cleaned_lowered"
data = pd.read_csv(f"{tname_data}.csv")

# Shuffle, split into train/valid/test and augment (see `split_3_aug`).
# NOTE: later cells rebind `data` (`from torchtext import data`) and `train`
# (`def train(model)`), so these two names only refer to the DataFrames until
# those cells run.
train, valid, test = split_3_aug(data)

# Convenience views of the text and label columns of each split.
X_train = train['text']
y_train = train['label']

X_valid = valid['text']
y_valid = valid['label']

X_test = test['text']
y_test = test['label']

(5772,)
DONE
5772
(23082,)
DONE
23082


## Set random seeds and check CUDA

In [None]:
import torch
from torchtext import data
from torchtext import datasets
import random
import numpy as np

# One fixed seed shared by the Python, NumPy and PyTorch random generators.
# NOTE(review): this cell comes after the split/augmentation cell, so that
# step is not covered by these seeds.
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

# Shown as the cell result: whether a CUDA device is usable.
torch.cuda.is_available()

True

# Extract features using BETO

In [None]:
import pandas as pd
from glob import glob

# Materialise each split as parallel Python lists of sentences and labels
# for the tokenisation and label-encoding cells that follow.
train_sentences, train_labels = list(train['text'].values), list(train['label'].values)

valid_sentences, valid_labels = list(valid['text'].values), list(valid['label'].values)

test_sentences, test_labels = list(test['text'].values), list(test['label'].values)

Load the BETO tokenizer

In [None]:
from transformers import AutoTokenizer

# Tokenizer matching the BETO checkpoint (cased Spanish BERT); used as a
# module-level global by `encoder_generator`.
tokenizer = AutoTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')

In [None]:
# Encode train label

from sklearn import preprocessing

# Fit the encoder on the training labels only, then map labels to integer ids.
le = preprocessing.LabelEncoder()
le.fit(train_labels)
encoded_labels = le.transform(train_labels)
# NOTE: despite its name, `encoded_test_labels` holds the encoded *validation*
# labels (it is built from `valid_labels`).
encoded_test_labels = le.transform(valid_labels)

In [None]:
# Tokens IDs tensor

def encoder_generator(sentences, labels, max_length=30):
    """Tokenise ``sentences`` and return index, ids, masks and labels as tensors.

    Uses the module-level ``tokenizer``. Every sentence is truncated or padded
    to exactly ``max_length`` tokens, special tokens included.

    Args:
        sentences: Sequence of strings to encode.
        labels: Integer labels, one per sentence.
        max_length: Fixed token length of every encoded sentence. Defaults to
            30, the value that was previously hard-coded.

    Returns:
        ``(sent_index, input_ids, attention_masks, labels)``. ``sent_index``
        is ``0..N-1``, ``input_ids`` and ``attention_masks`` have shape
        ``(N, max_length)``. All tensors live on the GPU when CUDA is
        available, otherwise on the CPU.

    Raises:
        ValueError: If ``sentences`` is empty or its length differs from
            ``labels``.
    """
    sentences = list(sentences)
    if not sentences:
        raise ValueError("encoder_generator needs at least one sentence")
    if len(labels) != len(sentences):
        raise ValueError("sentences and labels must have the same length")

    # Resolved locally instead of calling `.cuda()`: the function then also
    # works on CPU-only machines and does not depend on a global `device`.
    target = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # One batched tokenizer call replaces the per-sentence `encode_plus` loop.
    # `padding='max_length'` is the replacement for the deprecated
    # `pad_to_max_length=True`.
    encoded = tokenizer(sentences,
                        add_special_tokens=True,
                        max_length=max_length,
                        padding='max_length',
                        truncation=True,
                        return_attention_mask=True,
                        return_tensors='pt')

    input_ids = encoded['input_ids'].to(target)
    attention_masks = encoded['attention_mask'].to(target)
    labels = torch.tensor(labels).to(target)
    sent_index = torch.arange(len(sentences)).to(target)

    return sent_index,input_ids,attention_masks,labels

# Tokenise the training and validation sentences; each call returns
# (sentence indices, token ids, attention masks, label tensor).
train_sent_index,train_input_ids,train_attention_masks,train_encoded_label_tensors = encoder_generator(train_sentences,encoded_labels)
valid_sent_index,valid_input_ids,valid_attention_masks,valid_encoded_label_tensors = encoder_generator(valid_sentences,encoded_test_labels)
# Sanity check: show the first training sentence next to its token ids.
print('Original: ', train_sentences[0])
print('Token IDs:', train_input_ids[0])



Original:  a mi no me gusta porque el escudo es en blanco y negro y los colores del escudo no son esos puto fútbol​ moderno
Token IDs: tensor([    4,  1013,  1153,  1084,  1129,  2816,  1817,  1040, 11888,  1058,
         1036,  5122,  1042,  5499,  1042,  1065,  8855,  1072, 11888,  1084,
         1404,  2651, 11935,  5921, 11897,     5,     1,     1,     1,     1],
       device='cuda:0')


In [None]:
# Convert the train and validation tensors into TensorDatasets

from torch.utils.data import TensorDataset,random_split

# Each dataset item is an (input_ids, attention_mask, label) triple.
train_dataset = TensorDataset(train_input_ids,train_attention_masks,train_encoded_label_tensors)
valid_dataset = TensorDataset(valid_input_ids,valid_attention_masks,valid_encoded_label_tensors)

# Report the number of samples per split.
print('train data samples is {}'.format(len(train_dataset)))
print("valid data samples is {}".format(len(valid_dataset)))

train data samples is 23082
valid data samples is 5772


In [None]:
# Set cuda by using device

from torch.utils.data import DataLoader,RandomSampler,SequentialSampler

# Mini-batch size shared by the DataLoaders of this notebook.
bs=128

# Prefer the GPU when one is visible, otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Training batches are reshuffled every epoch.
train_data_loader = DataLoader(train_dataset,
                              sampler=RandomSampler(train_dataset),
                              batch_size=bs)
# Validation needs no shuffling: a sequential pass gives the same accuracy/F1,
# keeps the evaluation order deterministic and does not consume random numbers.
valid_data_loader = DataLoader(valid_dataset,
                              sampler=SequentialSampler(valid_dataset),
                              batch_size=bs)

Load the BETO model

In [None]:
from transformers import AutoModel

# Load the pretrained BETO encoder and move it to `device`.
# NOTE(review): no later cell visible in this notebook uses `beto` — the CNN
# below learns its own `nn.Embedding`. Confirm whether this load is still needed.
beto = AutoModel.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')
beto = beto.to(device)

Downloading pytorch_model.bin: 100%|██████████| 440M/440M [02:31<00:00, 2.89MB/s] 
Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-cased were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Build CNN

In [None]:
import torch
import torch.nn as nn

import torch
import torch.nn as nn

class CNNForNLP(nn.Module):
    """Text CNN: embedding -> parallel Conv1d -> max-over-time pooling -> dropout -> linear.

    Args:
        vocab_size: Number of rows of the embedding table.
        embedding_dim: Size of each token embedding (also the Conv1d input
            channel count).
        num_classes: Number of output logits.
        num_filters: Output channels of every convolution.
        filter_sizes: Iterable of kernel sizes; one Conv1d is built per entry.
            Input sequences must be at least as long as the largest kernel.
        dropout: Dropout probability applied to the pooled features.
            Defaults to 0.5, the value that was previously hard-coded.
        padding_idx: Optional token id forwarded to ``nn.Embedding`` so the
            padding token maps to a fixed zero vector. ``None`` (default)
            keeps the previous behaviour.

    Raises:
        ValueError: If ``filter_sizes`` is empty.
    """

    def __init__(self, vocab_size, embedding_dim, num_classes, num_filters, filter_sizes,
                 dropout=0.5, padding_idx=None):
        super().__init__()
        filter_sizes = list(filter_sizes)
        if not filter_sizes:
            # Fail here rather than later inside `torch.cat` in `forward`.
            raise ValueError("filter_sizes must contain at least one kernel size")
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.convs = nn.ModuleList([
            nn.Conv1d(embedding_dim, num_filters, filter_size)
            for filter_size in filter_sizes
        ])
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(filter_sizes) * num_filters, num_classes)

    def forward(self, x, _=None):
        """Compute class logits for a batch of token ids.

        Args:
            x: Long tensor of token ids, shape ``(batch_size, sequence_length)``.
            _: Ignored. Accepted so the model can be called with
                ``(input_ids, attention_mask)`` like the training loop does.

        Returns:
            Float tensor of logits, shape ``(batch_size, num_classes)``.
        """
        embedded = self.embedding(x)  # (batch_size, sequence_length, embedding_dim)
        embedded = embedded.permute(0, 2, 1)  # (batch_size, embedding_dim, sequence_length)
        pooled_maps = []
        for conv in self.convs:
            feature_map = torch.relu(conv(embedded))  # (batch_size, num_filters, H)
            pooled_maps.append(feature_map.max(dim=2).values)  # (batch_size, num_filters)
        combined = torch.cat(pooled_maps, dim=1)  # (batch_size, len(filter_sizes) * num_filters)
        combined = self.dropout(combined)
        return self.fc(combined)  # (batch_size, num_classes)


In [None]:

# Model hyper-parameters
# NOTE(review): 31002 is presumably the BETO tokenizer's vocabulary size — confirm against `tokenizer`.
vocab_size = 31002# vocabulary size (rows of the embedding table)
embedding_dim = 768 # dimensionality of the embedding vectors
num_classes = 2 # number of target classes
num_filters = 32  # number of convolutional filters per kernel size
filter_sizes = [3]  # convolution kernel sizes


CNNmodel = CNNForNLP(vocab_size,embedding_dim,num_classes,num_filters,filter_sizes)


In [None]:

# Print the model architecture
print(CNNmodel)

CNNForNLP(
  (embedding): Embedding(31002, 768)
  (convs): ModuleList(
    (0): Conv1d(768, 32, kernel_size=(3,), stride=(1,))
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=32, out_features=2, bias=True)
)


In [None]:
# Optimizer and criterion

import torch.optim as optim

# Collect every parameter of the CNN for the optimizer.
model_parameters = list(CNNmodel.parameters())

# NOTE(review): lr=2e-5 is a value commonly used for fine-tuning pretrained
# transformers; this CNN (embedding included) is trained from scratch —
# confirm the learning rate is intended.
optimizer = optim.Adam(model_parameters,lr=2e-5,eps=1e-8)
# Unweighted cross-entropy applied to the logits returned by the model.
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

In [None]:
# Calculate accuracy per batch during train

def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8

    Args:
        preds: Tensor of class scores, shape ``(batch_size, num_classes)``.
        y: Tensor of true class indices, shape ``(batch_size,)``, on the same
            device as ``preds``.

    Returns:
        One-element float tensor on the device of ``preds``.
    """
    predicted = preds.argmax(dim=1)  # index of the highest score per sample
    n_correct = predicted.eq(y).sum()
    # Built on the device of `preds` instead of calling `.cuda()`, so the
    # function also works on CPU-only machines.
    batch_size = torch.tensor([y.shape[0]], dtype=torch.float, device=preds.device)
    return n_correct / batch_size

In [None]:
# Def for training

from tqdm import tqdm

def train(model):
    """Run one training epoch over the global ``train_data_loader``.

    Uses the notebook-level ``optimizer``, ``criterion`` and ``device``.

    Loss and accuracy are averaged per sample. The previous version averaged
    the per-batch means, which over-weights the usually smaller last batch.
    ``criterion`` is assumed to return the batch mean (the default reduction
    of ``nn.CrossEntropyLoss``).

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning logits of shape ``(batch_size, num_classes)``.

    Returns:
        ``(mean_loss, accuracy)`` over all training samples, as Python floats.

    Raises:
        ValueError: If the loader yields no samples.
    """
    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    model.train()

    for batch in tqdm(train_data_loader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        optimizer.zero_grad()

        predictions = model(b_input_ids,b_input_mask)
        loss = criterion(predictions, b_labels)
        loss.backward()
        optimizer.step()

        # Accumulate sums, not means, so every sample carries equal weight.
        batch_size = b_labels.shape[0]
        total_loss += loss.item() * batch_size
        total_correct += (predictions.detach().argmax(dim=1) == b_labels).sum().item()
        total_samples += batch_size

    if total_samples == 0:
        raise ValueError("train_data_loader produced no samples")

    return total_loss / total_samples, total_correct / total_samples

In [None]:
# Class for predict label

import numpy as np

def predictions_labels(preds,labels):
    """Turn a batch of class scores into flat predicted and true label arrays.

    Args:
        preds: Array of scores with the classes along axis 1.
        labels: NumPy array of true labels.

    Returns:
        ``(pred, label)``: the flattened argmax of ``preds`` over axis 1 and
        the flattened ``labels``.
    """
    return np.argmax(preds, axis=1).flatten(), labels.flatten()

In [None]:
# Evaluate loss, acc  and f1-macro

from sklearn.metrics import classification_report,accuracy_score,f1_score
def eval(model, data_loader=None):
    """Evaluate ``model`` and report loss, accuracy and macro-F1.

    Uses the notebook-level ``criterion`` and ``device``. Prints a
    scikit-learn classification report and the accuracy.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning logits.
        data_loader: Optional loader of ``(input_ids, attention_mask, labels)``
            batches. Defaults to the global ``valid_data_loader``.

    Returns:
        ``(avg_loss, accuracy, macro_f1)``. The loss is averaged per sample,
        assuming ``criterion`` returns the batch mean.

    Raises:
        ValueError: If the loader yields no samples.
    """
    loader = valid_data_loader if data_loader is None else data_loader

    total_loss = 0.0
    total_samples = 0

    all_true_labels = []
    all_pred_labels = []

    model.eval()

    with torch.no_grad():

        for batch in tqdm(loader):
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            predictions = model(b_input_ids,b_input_mask)

            loss = criterion(predictions, b_labels)
            total_loss += loss.item() * b_labels.shape[0]
            total_samples += b_labels.shape[0]

            pred,true = predictions_labels(predictions.detach().cpu().numpy(),
                                           b_labels.to('cpu').numpy())

            all_pred_labels.extend(pred)
            all_true_labels.extend(true)

    if total_samples == 0:
        raise ValueError("evaluation loader produced no samples")

    # scikit-learn metrics take (y_true, y_pred). With the arguments swapped
    # the report showed precision and recall exchanged and the predicted
    # counts as "support". Accuracy and macro-F1 do not depend on the order.
    print(classification_report(all_true_labels,all_pred_labels))
    avg_val_accuracy = accuracy_score(all_true_labels,all_pred_labels)
    macro_f1_score = f1_score(all_true_labels,all_pred_labels,average='macro')

    avg_val_loss = total_loss/total_samples

    print("accuracy = {0:.2f}".format(avg_val_accuracy))

    return avg_val_loss,avg_val_accuracy,macro_f1_score

In [None]:
# Time for training

import time
def epoch_time(start_time, end_time):
    """Split the time between two timestamps into whole minutes and seconds.

    Args:
        start_time: Start timestamp in seconds.
        end_time: End timestamp in seconds.

    Returns:
        ``(minutes, seconds)`` as integers, both truncated toward zero.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [None]:
# Set device and gpu

# Select the compute device: a GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Only query the GPU name when a GPU exists; the call raises on CPU-only machines.
if torch.cuda.is_available():
    torch.cuda.get_device_name(0)

# `.to(device)` instead of `.cuda()` so the cell also runs without a GPU.
CNNmodel.to(device)

CNNForNLP(
  (embedding): Embedding(31002, 768)
  (convs): ModuleList(
    (0): Conv1d(768, 32, kernel_size=(3,), stride=(1,))
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=32, out_features=2, bias=True)
)

# Training

In [None]:
epochs = 20

# Start below any reachable score so the first epoch always writes a checkpoint.
best_macro_f1 = float('-inf')

for epoch in range(epochs):

    start_time = time.time()
    train_loss,train_acc = train(CNNmodel)
    valid_loss,valid_acc,macro_f1 = eval(CNNmodel)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Checkpoint only when validation macro-F1 improves. The save used to sit
    # outside this `if`, so the file was overwritten every epoch and ended up
    # holding the last model rather than the best one.
    if macro_f1 > best_macro_f1:
        best_macro_f1 = macro_f1
        torch.save(CNNmodel,'./cnn_model_part1_'+'task2a_2.pt')
        print("model saved")

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. acc: {valid_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. F1: {macro_f1*100:.2f}%')
    print('=============Epoch Ended==============')

100%|██████████| 181/181 [00:04<00:00, 38.36it/s] 
100%|██████████| 46/46 [00:00<00:00, 554.24it/s]


              precision    recall  f1-score   support

           0       0.99      0.77      0.86      5657
           1       0.06      0.71      0.11       115

    accuracy                           0.76      5772
   macro avg       0.53      0.74      0.49      5772
weighted avg       0.97      0.76      0.85      5772

accuracy = 0.76
model saved
Epoch: 01 | Epoch Time: 0m 4s
	Train Loss: 0.670 | Train acc: 63.16%
	 Val. Loss: 0.532 |  Val. acc: 76.46%
	 Val. Loss: 0.532 |  Val. F1: 48.60%


100%|██████████| 181/181 [00:01<00:00, 108.05it/s]
100%|██████████| 46/46 [00:00<00:00, 621.65it/s]


              precision    recall  f1-score   support

           0       0.98      0.78      0.87      5517
           1       0.12      0.69      0.21       255

    accuracy                           0.77      5772
   macro avg       0.55      0.73      0.54      5772
weighted avg       0.94      0.77      0.84      5772

accuracy = 0.77
model saved
Epoch: 02 | Epoch Time: 0m 1s
	Train Loss: 0.596 | Train acc: 71.22%
	 Val. Loss: 0.514 |  Val. acc: 77.25%
	 Val. Loss: 0.514 |  Val. F1: 53.88%


100%|██████████| 181/181 [00:01<00:00, 108.33it/s]
100%|██████████| 46/46 [00:00<00:00, 613.35it/s]


              precision    recall  f1-score   support

           0       0.98      0.78      0.87      5479
           1       0.14      0.67      0.23       293

    accuracy                           0.77      5772
   macro avg       0.56      0.73      0.55      5772
weighted avg       0.94      0.77      0.83      5772

accuracy = 0.77
model saved
Epoch: 03 | Epoch Time: 0m 1s
	Train Loss: 0.566 | Train acc: 73.46%
	 Val. Loss: 0.502 |  Val. acc: 77.36%
	 Val. Loss: 0.502 |  Val. F1: 54.94%


100%|██████████| 181/181 [00:01<00:00, 108.11it/s]
100%|██████████| 46/46 [00:00<00:00, 656.83it/s]


              precision    recall  f1-score   support

           0       0.98      0.78      0.87      5467
           1       0.15      0.68      0.24       305

    accuracy                           0.77      5772
   macro avg       0.56      0.73      0.55      5772
weighted avg       0.93      0.77      0.83      5772

accuracy = 0.77
model saved
Epoch: 04 | Epoch Time: 0m 1s
	Train Loss: 0.549 | Train acc: 74.42%
	 Val. Loss: 0.498 |  Val. acc: 77.46%
	 Val. Loss: 0.498 |  Val. F1: 55.41%


100%|██████████| 181/181 [00:01<00:00, 108.12it/s]
100%|██████████| 46/46 [00:00<00:00, 629.91it/s]


              precision    recall  f1-score   support

           0       0.98      0.78      0.87      5470
           1       0.14      0.68      0.24       302

    accuracy                           0.77      5772
   macro avg       0.56      0.73      0.55      5772
weighted avg       0.93      0.77      0.83      5772

accuracy = 0.77
model saved
Epoch: 05 | Epoch Time: 0m 1s
	Train Loss: 0.537 | Train acc: 75.24%
	 Val. Loss: 0.497 |  Val. acc: 77.44%
	 Val. Loss: 0.497 |  Val. F1: 55.31%


100%|██████████| 181/181 [00:01<00:00, 108.21it/s]
100%|██████████| 46/46 [00:00<00:00, 638.91it/s]


              precision    recall  f1-score   support

           0       0.98      0.78      0.87      5464
           1       0.15      0.67      0.24       308

    accuracy                           0.77      5772
   macro avg       0.56      0.72      0.55      5772
weighted avg       0.93      0.77      0.83      5772

accuracy = 0.77
model saved
Epoch: 06 | Epoch Time: 0m 1s
	Train Loss: 0.523 | Train acc: 75.67%
	 Val. Loss: 0.490 |  Val. acc: 77.41%
	 Val. Loss: 0.490 |  Val. F1: 55.37%


100%|██████████| 181/181 [00:01<00:00, 108.67it/s]
100%|██████████| 46/46 [00:00<00:00, 647.89it/s]


              precision    recall  f1-score   support

           0       0.98      0.78      0.87      5467
           1       0.15      0.67      0.24       305

    accuracy                           0.77      5772
   macro avg       0.56      0.73      0.55      5772
weighted avg       0.93      0.77      0.83      5772

accuracy = 0.77
model saved
Epoch: 07 | Epoch Time: 0m 1s
	Train Loss: 0.516 | Train acc: 76.26%
	 Val. Loss: 0.484 |  Val. acc: 77.43%
	 Val. Loss: 0.484 |  Val. F1: 55.34%


100%|██████████| 181/181 [00:01<00:00, 108.41it/s]
100%|██████████| 46/46 [00:00<00:00, 597.25it/s]


              precision    recall  f1-score   support

           0       0.98      0.78      0.87      5453
           1       0.15      0.67      0.25       319

    accuracy                           0.78      5772
   macro avg       0.56      0.73      0.56      5772
weighted avg       0.93      0.78      0.83      5772

accuracy = 0.78
model saved
Epoch: 08 | Epoch Time: 0m 1s
	Train Loss: 0.504 | Train acc: 76.63%
	 Val. Loss: 0.492 |  Val. acc: 77.53%
	 Val. Loss: 0.492 |  Val. F1: 55.84%


100%|██████████| 181/181 [00:01<00:00, 107.75it/s]
100%|██████████| 46/46 [00:00<00:00, 630.43it/s]


              precision    recall  f1-score   support

           0       0.98      0.78      0.87      5454
           1       0.15      0.68      0.25       318

    accuracy                           0.78      5772
   macro avg       0.57      0.73      0.56      5772
weighted avg       0.93      0.78      0.83      5772

accuracy = 0.78
model saved
Epoch: 09 | Epoch Time: 0m 1s
	Train Loss: 0.495 | Train acc: 77.19%
	 Val. Loss: 0.480 |  Val. acc: 77.58%
	 Val. Loss: 0.480 |  Val. F1: 55.92%


100%|██████████| 181/181 [00:01<00:00, 106.78it/s]
100%|██████████| 46/46 [00:00<00:00, 621.73it/s]


              precision    recall  f1-score   support

           0       0.98      0.78      0.87      5457
           1       0.15      0.68      0.25       315

    accuracy                           0.78      5772
   macro avg       0.56      0.73      0.56      5772
weighted avg       0.93      0.78      0.83      5772

accuracy = 0.78
model saved
Epoch: 10 | Epoch Time: 0m 1s
	Train Loss: 0.485 | Train acc: 77.68%
	 Val. Loss: 0.479 |  Val. acc: 77.56%
	 Val. Loss: 0.479 |  Val. F1: 55.83%


100%|██████████| 181/181 [00:01<00:00, 105.28it/s]
100%|██████████| 46/46 [00:00<00:00, 629.62it/s]


              precision    recall  f1-score   support

           0       0.98      0.78      0.87      5438
           1       0.16      0.68      0.26       334

    accuracy                           0.78      5772
   macro avg       0.57      0.73      0.56      5772
weighted avg       0.93      0.78      0.83      5772

accuracy = 0.78
model saved
Epoch: 11 | Epoch Time: 0m 1s
	Train Loss: 0.479 | Train acc: 77.70%
	 Val. Loss: 0.470 |  Val. acc: 77.69%
	 Val. Loss: 0.470 |  Val. F1: 56.46%


100%|██████████| 181/181 [00:01<00:00, 105.27it/s]
100%|██████████| 46/46 [00:00<00:00, 629.77it/s]


              precision    recall  f1-score   support

           0       0.98      0.78      0.87      5436
           1       0.16      0.69      0.26       336

    accuracy                           0.78      5772
   macro avg       0.57      0.74      0.57      5772
weighted avg       0.93      0.78      0.83      5772

accuracy = 0.78
model saved
Epoch: 12 | Epoch Time: 0m 1s
	Train Loss: 0.472 | Train acc: 78.31%
	 Val. Loss: 0.469 |  Val. acc: 77.79%
	 Val. Loss: 0.469 |  Val. F1: 56.70%


100%|██████████| 181/181 [00:01<00:00, 105.18it/s]
100%|██████████| 46/46 [00:00<00:00, 629.86it/s]


              precision    recall  f1-score   support

           0       0.98      0.78      0.87      5429
           1       0.17      0.70      0.27       343

    accuracy                           0.78      5772
   macro avg       0.57      0.74      0.57      5772
weighted avg       0.93      0.78      0.83      5772

accuracy = 0.78
model saved
Epoch: 13 | Epoch Time: 0m 1s
	Train Loss: 0.464 | Train acc: 78.42%
	 Val. Loss: 0.468 |  Val. acc: 77.98%
	 Val. Loss: 0.468 |  Val. F1: 57.22%


100%|██████████| 181/181 [00:01<00:00, 104.96it/s]
100%|██████████| 46/46 [00:00<00:00, 605.28it/s]


              precision    recall  f1-score   support

           0       0.98      0.79      0.87      5430
           1       0.17      0.71      0.28       342

    accuracy                           0.78      5772
   macro avg       0.57      0.75      0.57      5772
weighted avg       0.93      0.78      0.84      5772

accuracy = 0.78
model saved
Epoch: 14 | Epoch Time: 0m 1s
	Train Loss: 0.459 | Train acc: 79.05%
	 Val. Loss: 0.465 |  Val. acc: 78.10%
	 Val. Loss: 0.465 |  Val. F1: 57.43%


100%|██████████| 181/181 [00:01<00:00, 104.87it/s]
100%|██████████| 46/46 [00:00<00:00, 621.63it/s]


              precision    recall  f1-score   support

           0       0.98      0.79      0.87      5424
           1       0.18      0.72      0.29       348

    accuracy                           0.78      5772
   macro avg       0.58      0.75      0.58      5772
weighted avg       0.93      0.78      0.84      5772

accuracy = 0.78
model saved
Epoch: 15 | Epoch Time: 0m 1s
	Train Loss: 0.448 | Train acc: 79.54%
	 Val. Loss: 0.463 |  Val. acc: 78.27%
	 Val. Loss: 0.463 |  Val. F1: 57.89%


100%|██████████| 181/181 [00:01<00:00, 105.09it/s]
100%|██████████| 46/46 [00:00<00:00, 621.65it/s]


              precision    recall  f1-score   support

           0       0.98      0.79      0.87      5409
           1       0.19      0.73      0.30       363

    accuracy                           0.78      5772
   macro avg       0.58      0.76      0.59      5772
weighted avg       0.93      0.78      0.84      5772

accuracy = 0.78
model saved
Epoch: 16 | Epoch Time: 0m 1s
	Train Loss: 0.448 | Train acc: 79.55%
	 Val. Loss: 0.463 |  Val. acc: 78.47%
	 Val. Loss: 0.463 |  Val. F1: 58.55%


100%|██████████| 181/181 [00:01<00:00, 102.90it/s]
100%|██████████| 46/46 [00:00<00:00, 636.73it/s]


              precision    recall  f1-score   support

           0       0.98      0.79      0.87      5407
           1       0.19      0.73      0.30       365

    accuracy                           0.78      5772
   macro avg       0.58      0.76      0.59      5772
weighted avg       0.93      0.78      0.84      5772

accuracy = 0.78
model saved
Epoch: 17 | Epoch Time: 0m 1s
	Train Loss: 0.441 | Train acc: 79.74%
	 Val. Loss: 0.458 |  Val. acc: 78.47%
	 Val. Loss: 0.458 |  Val. F1: 58.59%


100%|██████████| 181/181 [00:01<00:00, 104.86it/s]
100%|██████████| 46/46 [00:00<00:00, 605.03it/s]


              precision    recall  f1-score   support

           0       0.98      0.79      0.87      5382
           1       0.20      0.73      0.32       390

    accuracy                           0.79      5772
   macro avg       0.59      0.76      0.59      5772
weighted avg       0.92      0.79      0.84      5772

accuracy = 0.79
model saved
Epoch: 18 | Epoch Time: 0m 1s
	Train Loss: 0.434 | Train acc: 80.25%
	 Val. Loss: 0.456 |  Val. acc: 78.69%
	 Val. Loss: 0.456 |  Val. F1: 59.49%


100%|██████████| 181/181 [00:01<00:00, 105.56it/s]
100%|██████████| 46/46 [00:00<00:00, 629.89it/s]


              precision    recall  f1-score   support

           0       0.98      0.79      0.87      5394
           1       0.20      0.74      0.31       378

    accuracy                           0.79      5772
   macro avg       0.59      0.77      0.59      5772
weighted avg       0.93      0.79      0.84      5772

accuracy = 0.79
model saved
Epoch: 19 | Epoch Time: 0m 1s
	Train Loss: 0.431 | Train acc: 80.22%
	 Val. Loss: 0.464 |  Val. acc: 78.76%
	 Val. Loss: 0.464 |  Val. F1: 59.40%


100%|██████████| 181/181 [00:01<00:00, 107.95it/s]
100%|██████████| 46/46 [00:00<00:00, 647.91it/s]


              precision    recall  f1-score   support

           0       0.98      0.79      0.87      5380
           1       0.21      0.74      0.32       392

    accuracy                           0.79      5772
   macro avg       0.59      0.77      0.60      5772
weighted avg       0.92      0.79      0.84      5772

accuracy = 0.79
model saved
Epoch: 20 | Epoch Time: 0m 1s
	Train Loss: 0.429 | Train acc: 80.32%
	 Val. Loss: 0.451 |  Val. acc: 78.86%
	 Val. Loss: 0.451 |  Val. F1: 59.85%


In [None]:
# Save the CNN as it is after the last epoch (the whole module is pickled).
# NOTE(review): only `CNNmodel` is written — no BETO weights — and the file name differs from the checkpoint written by the training loop.

torch.save(CNNmodel,'module2_part1.pt')


## EVALUATING

In [None]:
# Reload the CNN checkpoint written by the training loop and switch it to
# inference mode. Only the CNN is loaded here; BETO is not involved.
# NOTE: `torch.load` unpickles a whole module, so the `CNNForNLP` class must be
# defined in the session and the file must come from a trusted source.

import torch
CNNmodel = torch.load(r'cnn_model_part1_task2a_2.pt')
CNNmodel.eval()



CNNForNLP(
  (embedding): Embedding(31002, 768)
  (convs): ModuleList(
    (0): Conv1d(768, 32, kernel_size=(3,), stride=(1,))
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=32, out_features=2, bias=True)
)

Predict labels for the test set and collect them alongside the true labels

In [None]:
# Encode the held-out test split. Labels go through the same LabelEncoder as
# the train/validation labels, so all three splits share one label-to-id mapping.
test_sent_index, test_input_ids, test_attention_masks, test_encoded_label_tensors = encoder_generator(test_sentences, le.transform(test_labels))
test_dataset = TensorDataset(test_input_ids,test_attention_masks,test_encoded_label_tensors)

# Evaluation needs no shuffling; a sequential pass keeps the order deterministic.
test_data_loader = DataLoader(test_dataset,
                              sampler=SequentialSampler(test_dataset),
                              batch_size=bs)

all_pred_labels = []
all_true_labels = []

# Make sure dropout is disabled even if this cell is run on its own.
CNNmodel.eval()

with torch.no_grad():
    for batch in tqdm(test_data_loader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        predictions = CNNmodel(b_input_ids,b_input_mask)

        # Move logits and labels to NumPy and reduce the logits to class ids.
        predictions = predictions.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        pred, true = predictions_labels(predictions, label_ids)

        all_pred_labels.extend(pred)
        all_true_labels.extend(true)

100%|██████████| 25/25 [00:00<00:00, 532.42it/s]


In [None]:
# The final score in the test set (classification report)
# scikit-learn expects (y_true, y_pred); with the arguments swapped the report
# shows precision and recall exchanged and the predicted counts as "support".
print(classification_report(all_true_labels, all_pred_labels, digits = 4))

              precision    recall  f1-score   support

           0     0.9725    0.8870    0.9278      2990
           1     0.2318    0.5763    0.3306       177

    accuracy                         0.8696      3167
   macro avg     0.6022    0.7316    0.6292      3167
weighted avg     0.9311    0.8696    0.8944      3167



In [None]:
# Confusion matrix on the test set (rows: true labels, columns: predicted labels)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(all_true_labels, all_pred_labels)
cm

array([[2652,   75],
       [ 338,  102]], dtype=int64)