# Imports + GPU Setup

In [1]:
import os

os.environ["WANDB_DISABLED"] = "true"

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import transformers
import logging
import re
import nltk
import torch.nn.functional as F
import torch.optim as optim


from sklearn import metrics
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from copy import deepcopy
from urllib import request
from dont_patronize_me import DontPatronizeMe # data manager module
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from torch.optim import lr_scheduler

from transformers import RobertaModel, RobertaTokenizer

from preprocessing import load_data, preprocess_data, DPMDataset

logging.basicConfig(level=logging.ERROR)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'device: {device}')



device: cuda


In [2]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU detected")

True
1
NVIDIA A30 MIG 2g.12gb


# Data Setup

Retrieves the data, applies the specified train and test split to organise data into **train_df** and **dev_df**.

In [3]:
train_df, dev_df, test_df = load_data()

# downsample negative instances
pcldf = train_df[train_df.label==1]
npos = len(pcldf)
balanced_train_df = pd.concat([pcldf, train_df[train_df.label==0][:int(2.5*npos)]])
balanced_train_df = balanced_train_df[['text', 'community', 'label', 'country']]

# Dataset

In [None]:
processed_train_df = preprocess_data(balanced_train_df, clean_data=False, augment_data=True, add_country=False, add_community=False)
processed_dev_df = preprocess_data(dev_df, clean_data=False, add_country=False, add_community=False)
processed_test_df = preprocess_data(test_df, clean_data=False, add_country=False, add_community=False)

# Models

In [7]:
from transformers import RobertaPreTrainedModel, TrainingArguments, DebertaModel,DebertaPreTrainedModel

class PoolingHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.pooling = nn.AdaptiveMaxPool1d(1)  # Global Max Pooling
        self.projection =nn.Sequential(
            torch.nn.Dropout(0.2),
            torch.nn.Linear(config.hidden_size,1),
            nn.Sigmoid()
        )

    def forward(self, x):
        deberta_output_permuted = x.permute(0, 2, 1)  # Change the shape for pooling
        pooled_output = self.pooling(deberta_output_permuted).squeeze(-1)  # Apply pooling
        logits = self.projection(pooled_output)  # Projection layer
        return logits
    
class CLSHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.projection =nn.Sequential(
            torch.nn.Dropout(0.2),
            torch.nn.Linear(config.hidden_size,1),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = x[:,0,:]
        logits = self.projection(x)  # Projection layer
        return logits

class DebertaClassification(DebertaPreTrainedModel):
    """
    Implementation of Deberta with a classifier head
    """
    def __init__(self,config):

        super().__init__(config)

        self.deberta = DebertaModel(config)
        self.head = CLSHead(config)

        self.init_weights()

    def forward(

        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        inputs_embeds=None,
        target=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None):
        
        deberta_output = self.deberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )[0]
        logits = self.head(deberta_output)

        return logits


# Training Loop

In [8]:
# TRAINING LOOP FOR TRAINING DEBERTA 
from transformers import Trainer, TrainingArguments, DebertaTokenizer, RobertaTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

class Trainer_PCL(Trainer):

    def __init__( 
        self,
        **kwargs
        ):

        super().__init__(**kwargs)
        
        self.epoch = 1
        self.results = {}


    def compute_loss(self, model, inputs, num_items_in_batch=None):
        
        outputs = model(**inputs).view(-1)

        loss_fn = nn.BCELoss()
        target = inputs['target'].float()
        loss = loss_fn(outputs, target)
        return loss
    
    # Custom Evaluation 
    def evaluate(self, evaluate_datset=None, ignore_keys=None, metric_key_prefix='eval'):
        
        if self.epoch < 10:
            self.epoch+=1
            return
        preds = []
        labels = []

        eval_dataloader = super().get_test_dataloader(self.eval_dataset)
        self.model.eval()
        with torch.no_grad():
            for data in tqdm(eval_dataloader):
                
                output = self.model(**data)
                pred = torch.max(output, 1)[1]
            
                preds.extend(pred.cpu().tolist())
                labels.extend(data['target'].cpu().tolist())

        # with the saved predictions and labels we can compute accuracy, precision, recall and f1-score
        metrics = compute_metrics((preds, labels))
        print(metrics)

        self.results[self.epoch] = metrics
        self.epoch += 1       

            
def compute_metrics(eval_pred):

    preds, labels = eval_pred

    report = classification_report(preds, labels, target_names=["Not PCL","PCL"], output_dict= True) 

    return {"f1": report['PCL']['f1-score'],
            "precision": report['PCL']['precision'],
            "recall": report['PCL']['recall']
            }

def train(model, data, num_epochs, lr=0.0001, optimizer=None, lr_scheduler=None):

    data = data.reset_index(drop=True)
    
    tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')
    train_dataset = DPMDataset(data, tokenizer, max_len=128)

    Training_args = TrainingArguments(
        output_dir="test_trainer",
        learning_rate=lr,
        logging_steps=100,
        per_device_train_batch_size=8,
        num_train_epochs=num_epochs,
        remove_unused_columns=False,
        logging_dir='./logs', 
    )
    
    trainer = Trainer_PCL(
        model = model,
        args = Training_args,
        train_dataset = train_dataset,
        # eval_dataset = eval_dataset,
        data_collator= train_dataset.collate_fn,
        optimizers = (optimizer, lr_scheduler),
    )

    trainer.train()
    trainer.save_model('deberta-finetuned')
    return trainer.results

In [12]:
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score

def evaluate(model, tokenizer, data_df, batch_size=16):
    model.eval()  # Mise en mode évaluation

    text_input = data_df["text"].tolist()
    labels = data_df["label"].tolist()

    # Tokenisation
    encodings = tokenizer(text_input, return_tensors='pt', padding=True, truncation=True, max_length=128)

    # Création du DataLoader pour l'évaluation
    dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'], torch.tensor(labels))
    dataloader = DataLoader(dataset, batch_size=batch_size)

    preds = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, batch_labels = [x.to(device) for x in batch]

            # Prédictions
            output = model(input_ids=input_ids, attention_mask=attention_mask).view(-1)
            batch_preds = torch.round(output).cpu().tolist()

            preds.extend(batch_preds)
            all_labels.extend(batch_labels.cpu().tolist())

    # Calcul du F1-score
    return f1_score(all_labels, preds)


# TRAINING

In [10]:
def set_seed(i):
    torch.manual_seed(i)
    np.random.seed(i)

In [13]:
scores = []

for i in range(5):
    set_seed(i)

    # model = JoBert.from_pretrained('FacebookAI/roberta-base').to(device)
    model = DebertaClassification.from_pretrained('microsoft/deberta-base').to(device)

    lr = 1e-5
    optimizer = optim.AdamW(model.parameters(), lr)
    lrs = lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)
    
    results = train(model, processed_train_df, num_epochs= 5, lr=lr, optimizer=optimizer, lr_scheduler=lrs)

    tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')

    score = evaluate(model, tokenizer, processed_dev_df)
    scores.append(score)

print(scores)
print(np.mean(scores))
print(np.std(scores))

Some weights of DebertaClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['head.projection.1.bias', 'head.projection.1.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
100,0.6604
200,0.4777
300,0.4216
400,0.344
500,0.375
600,0.2961
700,0.3774
800,0.3653
900,0.2775
1000,0.257


Some weights of DebertaClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['head.projection.1.bias', 'head.projection.1.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
100,0.6129
200,0.4608
300,0.4045
400,0.3356
500,0.3689
600,0.3336
700,0.329
800,0.3295
900,0.2544
1000,0.2431


Some weights of DebertaClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['head.projection.1.bias', 'head.projection.1.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
100,0.6597
200,0.4519
300,0.4358
400,0.3515
500,0.3666
600,0.335
700,0.3357
800,0.3936
900,0.3033
1000,0.2663


Some weights of DebertaClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['head.projection.1.bias', 'head.projection.1.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
100,0.6506
200,0.5307
300,0.467
400,0.4304
500,0.4289
600,0.3392
700,0.3874
800,0.3644
900,0.303
1000,0.2717


Some weights of DebertaClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['head.projection.1.bias', 'head.projection.1.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
100,0.6709
200,0.5098
300,0.4379
400,0.3613
500,0.3876
600,0.3458
700,0.4062
800,0.3593
900,0.2873
1000,0.2647


[0.5687645687645687, 0.5454545454545454, 0.5350089766606823, 0.551594746716698, 0.5447154471544715]
0.5491076569501931
0.011171296379381402
