In [None]:
# Install the tokenizer/model dependencies into the runtime; note that no versions are pinned.
!pip install --no-cache-dir transformers sentencepiece

In [None]:
import numpy as np
import pandas as pd
import random
import math

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay

from torch.optim.lr_scheduler import CosineAnnealingLR
import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import AdamW
from torch.cuda.amp import GradScaler
from torch import autocast

import transformers
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoConfig, AutoModel

import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# Mount Google Drive at /content/drive; the dataset, pretrained model and checkpoint paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
def seed_everything(seed: int):
    """Seed every random number generator used in this notebook.

    Covers Python's ``random``, NumPy and PyTorch (CPU and every CUDA device),
    and switches cuDNN to its deterministic, non-autotuned mode.

    Args:
        seed: Value applied to all generators.
    """
    random.seed(seed)
    # Exported for child processes; the hash seed of the running interpreter is fixed at startup.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs, not only the current one; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # Prefer reproducible kernels over cuDNN's benchmark-driven algorithm selection.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Seed every global RNG before any model or data loader is created.
seed_everything(0)

# Dedicated generator handed to the DataLoaders (see get_loaders), seeded with the same value.
g = torch.Generator()
g.manual_seed(0)

In [None]:
class MeanHead(nn.Module):
    """Classification head: masked mean pooling followed by dropout and a linear layer.

    Args:
        hidden_size: Width of the encoder's token embeddings.
        num_hidden_layers: Accepted for interface compatibility; this head does not use it.
        num_classes: Number of output logits. Defaults to 3, the value that was
            previously hard-coded.
    """

    def __init__(self, hidden_size: int, num_hidden_layers: int, num_classes: int = 3):
        super().__init__()

        # Kept as a Sequential so the parameter names ('linear_output.1.*') stay
        # compatible with previously saved state dicts.
        self.linear_output = nn.Sequential(
            nn.Dropout(p=0.2),
            nn.Linear(hidden_size, num_classes),
        )

    def forward(self, head_inputs: dict):
        """Return logits of shape (batch, num_classes).

        Args:
            head_inputs: Dict holding 'output_model' (the encoder's output tuple) and
                'attention_mask'.
        """
        features = self.get_features(head_inputs)
        return self.linear_output(features)

    def get_features(self, head_inputs: dict):
        """Average the token embeddings of each sequence, ignoring masked-out positions.

        Args:
            head_inputs: Dict whose 'output_model'[0] is the (batch, seq, hidden)
                last hidden state and whose 'attention_mask' is (batch, seq).

        Returns:
            Tensor of shape (batch, hidden) with the masked mean embedding.
        """
        last_hidden_state = head_inputs['output_model'][0]
        attention_mask = head_inputs['attention_mask']

        # Broadcast the mask over the hidden dimension so padded tokens contribute zero.
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        # Clamp so a fully masked sequence cannot trigger a division by zero.
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)

        return sum_embeddings / sum_mask

In [None]:
class CustomModel(nn.Module):
    """Pretrained transformer encoder with a mean-pooling classification head.

    Args:
        model_path: Name or directory passed to ``AutoConfig`` / ``AutoModel``.
        layers_freeze: Number of leading encoder layers to freeze. ``0`` trains
            everything; a value equal to the encoder depth freezes the whole backbone.
    """

    def __init__(self, model_path: str, layers_freeze: int):
        super().__init__()

        # Dropout inside the backbone is disabled; the head applies its own dropout.
        self.config_model = AutoConfig.from_pretrained(model_path)
        self.config_model.attention_probs_dropout_prob = 0
        self.config_model.hidden_dropout_prob = 0

        self.model = AutoModel.from_pretrained(model_path, config=self.config_model)
        self.hidden_size = self.config_model.hidden_size
        self.num_hidden_layers = self.config_model.num_hidden_layers

        if layers_freeze > 0:
            if layers_freeze == self.num_hidden_layers:
                print('Freezing all model')
                self.model.requires_grad_(False)
            else:
                print(f'Freezing the first {layers_freeze} layers')
                self.freeze_layers(layers_freeze)

        self.head = MeanHead(self.hidden_size, self.num_hidden_layers)

    def freeze_layers(self, layers: int):
        """Disable gradients for the embeddings and the first ``layers`` encoder layers."""
        self.model.embeddings.requires_grad_(False)
        self.model.encoder.layer[:layers].requires_grad_(False)

    def take_features(self, inputs):
        """Run the backbone and return the batch extended with its raw output.

        Args:
            inputs: Dict of keyword arguments for the backbone (e.g. 'input_ids',
                'attention_mask').

        Returns:
            A new dict with every entry of ``inputs`` plus 'output_model', the tuple
            returned by the backbone.
        """
        output_model = self.model(**inputs, return_dict=False, output_hidden_states=False)

        # Build a fresh dict: writing 'output_model' into the caller's batch would make a
        # second forward pass on the same dict hand an unexpected kwarg to the backbone.
        return {**inputs, 'output_model': output_model}

    def forward(self, inputs):
        """Return the head's logits for a tokenized batch."""
        features = self.take_features(inputs)

        return self.head(features)

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    The text is read from the 'discurso' column and the integer class id from the
    'label' column.

    Args:
        df: DataFrame holding the 'discurso' and 'label' columns.
        tokenizer: Callable tokenizer returning 'input_ids' and 'attention_mask'.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(token_tensors, target)`` for the row at position ``idx``."""
        # Positional access: samplers yield positions 0..len-1, which only coincide with
        # index labels when the frame carries a default RangeIndex.
        text = self.df['discurso'].iloc[idx]

        tokenized = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            verbose=False
        )

        labels = self.df['label'].iloc[idx]
        targets = torch.tensor(labels, dtype=torch.long)

        token_output = {'input_ids': torch.tensor(tokenized['input_ids'], dtype=torch.long),
                        'attention_mask': torch.tensor(tokenized['attention_mask'], dtype=torch.long)}

        return token_output, targets

In [None]:
def seed_worker(worker_id):
    """DataLoader ``worker_init_fn``: reseed NumPy and ``random`` from torch's seed.

    Args:
        worker_id: Worker index supplied by the DataLoader; not used.
    """
    # Fold torch's 64-bit seed into the 32-bit range NumPy accepts.
    derived_seed = torch.initial_seed() % (1 << 32)
    for reseed in (np.random.seed, random.seed):
        reseed(derived_seed)

In [None]:
def get_loaders(CFG):
    """Create the training and validation DataLoaders described by ``CFG``.

    Reads 'TRAIN_DF', 'VAL_DF', 'TOKENIZER', 'MAX_LEN' and 'BATCH_SIZE'. The training
    loader shuffles; the validation loader keeps row order and uses batches six times
    larger. Both share the module-level generator ``g`` and ``seed_worker``.

    Returns:
        Tuple ``(train_loader, valid_loader)``.
    """
    # Options common to both loaders.
    shared_options = dict(
        pin_memory=True,
        num_workers=0,
        worker_init_fn=seed_worker,
        generator=g,
    )

    def _dataset_for(frame_key):
        return CustomDataset(CFG[frame_key], CFG['TOKENIZER'], CFG['MAX_LEN'])

    train_loader = torch.utils.data.DataLoader(
        _dataset_for('TRAIN_DF'),
        shuffle=True,
        batch_size=CFG['BATCH_SIZE'],
        **shared_options,
    )
    valid_loader = torch.utils.data.DataLoader(
        _dataset_for('VAL_DF'),
        shuffle=False,
        batch_size=CFG['BATCH_SIZE'] * 6,
        **shared_options,
    )

    return train_loader, valid_loader

In [None]:
from tqdm.notebook import tqdm

In [None]:
class CustomScheduler:
    """Per-group learning-rate schedule: linear warmup followed by linear decay.

    Each param group warms up from its ``min_lr`` (0 when the key is absent) to its
    configured ``lr`` over ``warmup_steps`` calls to :meth:`step`, then loses
    ``lr / (total_steps - warmup_steps)`` per call. The rate is floored at 0 so that
    stepping past ``total_steps`` can never produce a negative learning rate.

    Args:
        optimizer: Object exposing ``param_groups``, a list of dicts with an 'lr' key.
        total_steps: Planned number of scheduler steps (warmup included).
        warmup_steps: Number of initial steps spent warming up.
    """

    def __init__(self, optimizer, total_steps, warmup_steps=0):
        self.warmup_steps = int(warmup_steps)
        self.optimizer = optimizer
        self.lr_warmup = {}
        self.linear_decay_layers = {}
        self.initial_lr = {}

        steps_decay_linear_lr = total_steps - self.warmup_steps
        for index, group in enumerate(self.optimizer.param_groups):
            key = f"{index}"
            peak_lr = group["lr"]
            self.initial_lr[key] = peak_lr
            # With no steps left after warmup there is nothing to decay over: hold the
            # peak rate instead of dividing by zero (or decaying "backwards").
            if steps_decay_linear_lr > 0:
                self.linear_decay_layers[key] = peak_lr / steps_decay_linear_lr
            else:
                self.linear_decay_layers[key] = 0.0

            if self.warmup_steps:
                floor_lr = group.get("min_lr", 0.0)
                self.lr_warmup[key] = (peak_lr - floor_lr) / self.warmup_steps
                # Training starts from the floor; step() climbs back up to peak_lr.
                group["lr"] = floor_lr

    def step(self, current_step):
        """Advance every group's learning rate by one increment.

        Args:
            current_step: Zero-based global step; values below ``warmup_steps`` apply
                a warmup increment, all others a decay decrement.
        """
        warming_up = self.warmup_steps > current_step
        for index, group in enumerate(self.optimizer.param_groups):
            key = f"{index}"
            if warming_up:
                group["lr"] += self.lr_warmup[key]
            else:
                group["lr"] = max(group["lr"] - self.linear_decay_layers[key], 0.0)

In [None]:
# Load the labelled dataset from Drive; the 'Unnamed: 0' column (presumably a saved row index — confirm) is discarded.
all_data = pd.read_csv('/content/drive/MyDrive/PLN - Projeto/data_label.csv').drop(columns=['Unnamed: 0'])

In [None]:
# Replace the raw 'label' values with integer class ids; lb_encoder.classes_ is reused below to label the confusion matrices.
lb_encoder = LabelEncoder()
all_data['label'] = lb_encoder.fit_transform(all_data['label'])

In [None]:
# Display the encoded frame as a quick visual check.
all_data

In [None]:
# Hold out 10% of the rows as the test set, stratified on the label column.
train_val_df, test_df = train_test_split(all_data, test_size=0.1, random_state=42, stratify=all_data.label)

In [None]:
# Split the remaining rows 80/20 into training and validation sets, again stratified on the label.
train_df, val_df = train_test_split(train_val_df, test_size=0.2, random_state=42, stratify=train_val_df.label)

In [None]:
# Discard the row labels inherited from all_data so every split is indexed 0..n-1.
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [None]:
def get_optimizer_grouped_parameters(model, CFG):
    """Build six optimizer parameter groups with layer-wise learning rates.

    Groups, in order, with their learning-rate factor applied to ``CFG['LR']``:

    0. parameters that appear before the first encoder layer (embeddings) - 0.89
    1. encoder layers 0-2 - 0.91
    2. encoder layers 3-5 - 0.93
    3. encoder layers 6-8 - 0.95
    4. encoder layers 9-11 - 0.97
    5. everything after the first encoder layer that is not in layers 0-11
       (trailing encoder parameters, the head, any deeper layers) - 1

    Non-layer parameters are classified by their position relative to the encoder
    layers rather than by a fixed index window, so none of them is silently left out
    of the optimizer when a backbone has more than five embedding parameters.

    Args:
        model: Module exposing ``named_parameters()``.
        CFG: Config dict providing 'LR' and 'WGD' (weight decay).

    Returns:
        List of six dicts with keys 'params', 'weight_decay', 'lr' and 'min_lr'.
    """
    # The trailing dot keeps 'layer.1.' from matching 'layer.10.' / 'layer.11.'.
    layer_blocks = [
        tuple(f'layer.{layer}.' for layer in range(start, start + 3))
        for start in range(0, 12, 3)
    ]
    lr_factors = (0.89, 0.91, 0.93, 0.95, 0.97, 1)

    buckets = [[] for _ in lr_factors]
    seen_encoder_layer = False
    for name, param in model.named_parameters():
        block_index = next(
            (i for i, markers in enumerate(layer_blocks) if any(marker in name for marker in markers)),
            None,
        )
        if block_index is not None:
            buckets[block_index + 1].append(param)
            seen_encoder_layer = True
        elif seen_encoder_layer:
            buckets[-1].append(param)
        else:
            buckets[0].append(param)

    return [
        {'params': params, 'weight_decay': CFG['WGD'], 'lr': CFG['LR'] * factor, 'min_lr': 0}
        for params, factor in zip(buckets, lr_factors)
    ]

In [None]:
def tokenize_samples(samples, CFG):
    """Tokenize raw text into fixed-length tensors ready for the model.

    Args:
        samples: Text (or batch of texts) handed to ``CFG['TOKENIZER']``.
        CFG: Config dict providing 'TOKENIZER' and 'MAX_LEN'.

    Returns:
        Dict with long tensors 'input_ids' and 'attention_mask', padded or truncated
        to ``CFG['MAX_LEN']``.
    """
    encode = CFG['TOKENIZER']
    encoded = encode(
        samples,
        add_special_tokens=True,
        max_length=CFG['MAX_LEN'],
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        verbose=False,
    )

    # Keep only the two fields the model consumes, converted to int64 tensors.
    return {
        field: torch.tensor(encoded[field], dtype=torch.long)
        for field in ('input_ids', 'attention_mask')
    }

In [None]:
def inference(samples, model, CFG):
    """Predict class ids for raw text samples.

    Puts ``model`` in eval mode (it is left in that mode) and runs a single forward
    pass without gradient tracking.

    Args:
        samples: Text (or batch of texts) to classify.
        model: Model called with a dict of tokenized tensors; returns logits.
        CFG: Config dict providing 'TOKENIZER', 'MAX_LEN' and 'DEVICE'.

    Returns:
        NumPy array with the arg-max class id of each sample.
    """
    tokenized = tokenize_samples(samples, CFG)
    batch = {name: tokenized[name].to(device=CFG['DEVICE']) for name in tokenized.keys()}

    model.eval()
    with torch.no_grad():
        logits = model(batch)

    probabilities = F.softmax(logits, 1)
    predicted = torch.argmax(probabilities, 1)

    return predicted.cpu().detach().numpy()

In [None]:
def train_epoch(model, loader, optimizer, loss_func, schedule, global_steps, scaler, CFG):
    """Run one mixed-precision training epoch and print the mean loss and accuracy.

    Args:
        model: Model called with a dict of input tensors; returns logits.
        loader: Iterable of ``(inputs_dict, targets)`` batches.
        optimizer: Optimizer stepped through ``scaler``.
        loss_func: Callable ``(logits, targets) -> loss``.
        schedule: Scheduler stepped once per batch with the current global step.
        global_steps: Number of optimizer steps taken before this epoch.
        scaler: Gradient scaler used for the float16 autocast pass.
        CFG: Config dict providing 'DEVICE'.

    Returns:
        The updated global step count.
    """
    device = CFG['DEVICE']
    loss_total = 0
    epoch_preds, epoch_targets = [], []

    model.train()
    for inputs, targets in tqdm(loader, total=len(loader)):
        inputs = {name: tensor.to(device=device) for name, tensor in inputs.items()}
        targets = targets.to(device=device)

        # Forward pass and loss under float16 autocast.
        with autocast(device_type=device, dtype=torch.float16):
            output = model(inputs)
            loss = loss_func(output, targets)

        optimizer.zero_grad()
        scaler.scale(loss).backward()

        # Gradient clipping is disabled; to enable it, unscale the gradients first:
        #scaler.unscale_(optimizer)
        #torch.nn.utils.clip_grad_norm_(model.parameters(), 1)

        scaler.step(optimizer)
        scaler.update()

        schedule.step(global_steps)
        global_steps += 1

        loss_total += loss.item()
        epoch_preds.append(torch.argmax(F.softmax(output, 1), 1).cpu().detach().numpy())
        epoch_targets.append(targets.cpu().detach().numpy())

        # Drop the batch tensors before the next iteration allocates new ones.
        del inputs, targets, output, loss

    torch.cuda.empty_cache()

    mean_loss = loss_total / len(loader)
    accuracy = accuracy_score(np.concatenate(epoch_targets).reshape(-1),
                              np.concatenate(epoch_preds).reshape(-1))
    print(f'AVG TRAIN LOSS: {mean_loss} / TRAIN accuracy: {accuracy}')

    return global_steps

In [None]:
def validation_epoch(model, loader, loss_func, best_score, CFG,
                     save_path='/content/drive/MyDrive/PLN - Projeto/deberta_best.pth'):
    """Evaluate ``model`` on ``loader`` and checkpoint it when accuracy improves.

    Args:
        model: Model called with a dict of input tensors; returns logits.
        loader: Iterable of ``(inputs_dict, targets)`` batches.
        loss_func: Callable ``(logits, targets) -> loss``.
        best_score: Best accuracy seen so far; the checkpoint is written only when
            this pass strictly exceeds it.
        CFG: Config dict providing 'DEVICE'.
        save_path: Where the state dict is saved on improvement. Defaults to the
            notebook's original checkpoint location; pass ``None`` to evaluate
            without ever writing a file.

    Returns:
        Tuple ``(val_preds, val_targets, best_score)`` where the first two are flat
        NumPy arrays and ``best_score`` is the (possibly updated) best accuracy.
    """
    avg_valid_loss = 0

    val_preds = []
    val_targets = []

    model.eval()
    with torch.no_grad():
        for inputs, targets in tqdm(loader, total=len(loader)):
            inputs = {k: inputs[k].to(device=CFG['DEVICE']) for k in inputs.keys()}
            targets = targets.to(device=CFG['DEVICE'])

            with autocast(device_type=CFG['DEVICE'], dtype=torch.float16):
                output = model(inputs)
                loss = loss_func(output, targets)

            avg_valid_loss += loss.item()
            val_preds.append(torch.argmax(F.softmax(output, 1), 1).cpu().detach().numpy())
            val_targets.append(targets.cpu().detach().numpy())

            del inputs, targets, output, loss

    torch.cuda.empty_cache()

    val_preds = np.concatenate(val_preds).reshape(-1)
    val_targets = np.concatenate(val_targets).reshape(-1)

    valid_accuracy = accuracy_score(val_targets, val_preds)

    # Mean of the per-batch mean losses.
    print(f'AVG VALID LOSS: {avg_valid_loss / len(loader)} / VALID accuracy: {valid_accuracy}')

    if valid_accuracy > best_score:
        best_score = valid_accuracy
        if save_path is not None:
            print(f"{'-'*20} Saving model, Score: {best_score} {'-'*20}")
            torch.save(model.state_dict(), save_path)

    return val_preds, val_targets, best_score

In [None]:
def train_model(CFG):
    """Fine-tune a ``CustomModel`` end to end according to ``CFG``.

    Builds the loaders, model, layer-wise AdamW optimizer, warmup/decay scheduler and
    gradient scaler, then alternates one training epoch with one validation epoch.
    ``validation_epoch`` saves the checkpoint whenever validation accuracy improves.

    Args:
        CFG: Config dict providing 'TRAIN_DF', 'VAL_DF', 'TOKENIZER', 'MAX_LEN',
            'BATCH_SIZE', 'MODEL_PATH', 'FREEZE_LAYERS', 'LR', 'WGD', 'EPOCHS',
            'WARMUP_PERCENT' and 'DEVICE'.
    """
    train_loader, valid_loader = get_loaders(CFG)

    model = CustomModel(CFG['MODEL_PATH'], CFG['FREEZE_LAYERS'])
    # Place the model on its device before the optimizer captures its parameters,
    # as the torch.optim documentation recommends.
    model.to(CFG['DEVICE'])

    loss_func = nn.CrossEntropyLoss(reduction='mean')

    optimizer_grouped_parameters = get_optimizer_grouped_parameters(model, CFG)
    optimizer = AdamW(optimizer_grouped_parameters, lr=CFG['LR'])

    # The scheduler is stepped once per batch, so its horizon is epochs * batches.
    custom_lr_scheduler = CustomScheduler(optimizer,
                                          int(CFG['EPOCHS'] * len(train_loader)),
                                          int(CFG['EPOCHS'] * len(train_loader) * CFG['WARMUP_PERCENT'])
                                         )

    global_steps = 0
    best_score = 0

    scaler = GradScaler()

    for epoch in tqdm(range(0, CFG['EPOCHS']), desc="Training..."):
        print(f"{'-'*30} EPOCH {epoch + 1} / {CFG['EPOCHS']} {'-'*30}")

        global_steps = train_epoch(model, train_loader, optimizer, loss_func, custom_lr_scheduler, global_steps, scaler, CFG)
        _, _, best_score = validation_epoch(model, valid_loader, loss_func, best_score, CFG)

In [None]:
# Drive directory passed to AutoTokenizer.from_pretrained below.
model_path = '/content/drive/MyDrive/PLN - Projeto/DebertaV3'

In [None]:
# Load the tokenizer from model_path; use_fast=False requests the slow (Python) implementation.
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)

In [None]:
# Single source of settings shared by the training and evaluation cells.
Configs = {
    'TRAIN_DF': train_df,
    'VAL_DF': val_df,
    'LR': 1e-5,               # peak learning rate; per-group factors are applied on top of it
    'MIN_LR': 0,
    'WGD': 0.01,              # AdamW weight decay
    'MAX_LEN': 256,           # sequences are padded or truncated to this many tokens
    # Reuse model_path so the model and the tokenizer always come from the same directory.
    'MODEL_PATH': model_path,
    'EPOCHS': 5,
    'BATCH_SIZE': 20,         # training batch size; evaluation loaders use 6x this value
    'DEVICE': 'cuda' if torch.cuda.is_available() else 'cpu',
    'FREEZE_LAYERS': 0,       # number of leading encoder layers to freeze (0 = train everything)
    'WARMUP_PERCENT': 0.12,   # fraction of all training steps spent on warmup
    'TOKENIZER': tokenizer
}

In [None]:
# Fine-tune the model; validation_epoch writes the checkpoint to Drive whenever validation accuracy improves.
train_model(Configs)

In [None]:
# Rebuild the loaders for the evaluation cells below; the ones created inside train_model are local to it.
train_loader, valid_loader = get_loaders(Configs)

In [None]:
# Recreate the architecture, then restore the weights of the best checkpoint.
model = CustomModel(Configs['MODEL_PATH'], Configs['FREEZE_LAYERS'])
model.to(Configs['DEVICE'])

# map_location remaps the saved tensors onto the current device, so a checkpoint written
# on a GPU runtime still loads when only the CPU is available.
model.load_state_dict(torch.load('/content/drive/MyDrive/PLN - Projeto/deberta_best.pth',
                                 map_location=Configs['DEVICE']))

In [None]:
# Loss passed to validation_epoch, which reports its average over the batches.
loss_func = nn.CrossEntropyLoss(reduction='mean')

In [None]:
# best_score=1.0: accuracy can never exceed it, so this evaluation pass does not overwrite the saved checkpoint.
preds, targets, _ = validation_epoch(model, valid_loader, loss_func, 1.0, Configs)

In [None]:
# Confusion matrix of the validation split (true labels first, predictions second).
cm = confusion_matrix(targets, preds)

In [None]:
# Validation confusion matrix, with the original class names as tick labels.
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=lb_encoder.classes_,
)
disp.plot()
plt.show()

In [None]:
# Loader over the held-out test split, built with the same options as the validation loader.
test = CustomDataset(test_df, Configs['TOKENIZER'], Configs['MAX_LEN'])

test_loader = torch.utils.data.DataLoader(
    test,
    batch_size=Configs['BATCH_SIZE'] * 6,
    num_workers=0,
    pin_memory=True,
    worker_init_fn=seed_worker,
    generator=g,
)

In [None]:
# Evaluate on the test split; best_score=1.0 again prevents the checkpoint from being rewritten.
preds, targets, _ = validation_epoch(model, test_loader, loss_func, 1.0, Configs)

In [None]:
# Confusion matrix of the test split (true labels first, predictions second).
cm = confusion_matrix(targets, preds)

In [None]:
# Test confusion matrix, with the original class names as tick labels.
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=lb_encoder.classes_,
)
disp.plot()
plt.show()

In [None]:
# Evaluate on the training split; the loader shuffles, but predictions and targets are collected in the same order.
preds, targets, _ = validation_epoch(model, train_loader, loss_func, 1.0, Configs)

In [None]:
# Confusion matrix of the training split (true labels first, predictions second).
cm = confusion_matrix(targets, preds)

In [None]:
# Training confusion matrix, with the original class names as tick labels.
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=lb_encoder.classes_,
)
disp.plot()
plt.show()