# Main imports and code

In [None]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs, MultiLabelClassificationModel, MultiLabelClassificationArgs
import pandas as pd
import logging
import torch
import random
import os
from sklearn.metrics import accuracy_score, f1_score
from preprocessing import load_data, preprocess_data, DPMDataset
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm


os.environ["TOKENIZERS_PARALLELISM"] = "false"


In [2]:
# Configure logging: INFO level for the root logger
logging.basicConfig(level=logging.INFO)

# Restrict the "transformers" logger to WARNING and above
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Ask PyTorch whether a CUDA device is usable; the flag is reused later as `use_cuda`
cuda_available = torch.cuda.is_available()


print('Cuda available? ',cuda_available)

Cuda available?  False


In [None]:
# Pin work to GPU 1 only when that device actually exists: set_device(1) raises
# on a CPU-only or single-GPU machine, so otherwise keep PyTorch's default device.
if torch.cuda.is_available() and torch.cuda.device_count() > 1:
    torch.cuda.set_device(1)

In [4]:
# Load the train / dev / test frames through the project's `preprocessing.load_data` helper
train_df, dev_df, test_df = load_data()

In [5]:
# Run every split through the same preprocessing step, in train / dev / test order
train_df, dev_df, test_df = [
    preprocess_data(split) for split in (train_df, dev_df, test_df)
]

# RoBERTa Baseline for Task 1

In [6]:
# Keep only the two columns used for training: the input text and its label
training_set = train_df[['text', 'label']]

In [None]:
# Bare expression: the notebook renders the frame as this cell's output
training_set

Unnamed: 0,text,label
0,This concerns the poor-families. The scheme sa...,1
1,This concerns the homeless. Durban 's homeless...,1
2,This concerns the poor-families. The next imme...,1
3,This concerns the vulnerable. Far more importa...,1
4,This concerns the poor-families. To strengthen...,1
...,...,...
8370,This concerns the refugee. Rescue teams search...,0
8371,This concerns the hopeless. The launch of ' Ha...,0
8372,This concerns the homeless. The unrest has lef...,0
8373,This concerns the hopeless. You have to see it...,0


In [None]:
from transformers import DebertaPreTrainedModel, DebertaModel

class DebertaClassification(DebertaPreTrainedModel):
    """DeBERTa encoder with a max-pooling + linear classification head.

    The encoder's token representations are reduced with a global max over the
    sequence axis, passed through dropout (p=0.2) and projected to 2 logits.
    """

    def __init__(self, config):
        super().__init__(config)

        self.deberta = DebertaModel(config)
        self.pooling = torch.nn.AdaptiveMaxPool1d(1)  # global max pooling over the sequence
        self.projection = torch.nn.Sequential(
            torch.nn.Dropout(0.2),
            torch.nn.Linear(config.hidden_size, 2),
        )

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        inputs_embeds=None,
        target=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None):
        """Return classification logits of shape (batch, 2).

        Args:
            input_ids, attention_mask, token_type_ids, position_ids,
            inputs_embeds, output_attentions, output_hidden_states,
            return_dict: forwarded unchanged to the DeBERTa encoder.
            target: accepted so a batch dict containing the labels can be
                splatted into the call; it is not used here.

        When ``attention_mask`` is given, padded positions are excluded from
        the max pooling so the logits do not depend on how much padding the
        batch happens to contain. Without a mask all positions are pooled.
        """
        # Element 0 of the encoder output holds the per-token hidden states.
        hidden_states = self.deberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )[0]

        if attention_mask is not None:
            # Push padded tokens to the lowest representable value so they can
            # never win the max.
            is_padding = attention_mask.unsqueeze(-1) == 0
            hidden_states = hidden_states.masked_fill(
                is_padding, torch.finfo(hidden_states.dtype).min
            )

        # AdaptiveMaxPool1d pools over the last axis, so move the sequence axis there.
        pooled_output = self.pooling(hidden_states.permute(0, 2, 1)).squeeze(-1)
        logits = self.projection(pooled_output)
        return logits

In [None]:
# TRAINING LOOP FOR TRAINING DEBERTA 
from transformers import Trainer, TrainingArguments, DebertaTokenizer, RobertaTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Custom Training Functions
class Trainer_PCL(Trainer):
    """``Trainer`` subclass with a cross-entropy loss on raw logits and a custom evaluation loop.

    The model is expected to return a tensor of logits, and each batch must
    carry its labels under the ``'target'`` key.

    Attributes:
        eval_start_epoch: number of the first ``evaluate`` call that actually
            runs the evaluation; earlier calls only advance the counter.
        epoch: 1-based counter of ``evaluate`` calls.
        results: maps the value of ``epoch`` at evaluation time to the metrics
            dict produced by ``compute_metrics``.
    """

    def __init__(self, eval_start_epoch=10, **kwargs):
        """Build the trainer.

        Args:
            eval_start_epoch: see class docstring; defaults to 10.
            **kwargs: forwarded unchanged to ``Trainer.__init__``.
        """
        super().__init__(**kwargs)

        self.eval_start_epoch = eval_start_epoch
        self.epoch = 1
        self.results = {}

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Cross-entropy between the model's 2-class logits and ``inputs['target']``.

        ``return_outputs`` and ``num_items_in_batch`` are accepted because
        ``Trainer`` passes them (depending on the transformers version);
        ``num_items_in_batch`` is not used.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        outputs = model(**inputs)

        loss_fn = nn.CrossEntropyLoss()
        target = inputs['target']
        loss = loss_fn(outputs.view(-1, 2), target.view(-1))
        return (loss, outputs) if return_outputs else loss

    # Custom Evaluation
    def evaluate(self, evaluate_datset=None, ignore_keys=None, metric_key_prefix='eval'):
        """Score the evaluation set and store the metrics in ``self.results``.

        Calls made before ``eval_start_epoch`` only advance ``self.epoch``.

        Args:
            evaluate_datset: dataset to score; falls back to
                ``self.eval_dataset`` when ``None``.
            ignore_keys, metric_key_prefix: accepted for signature
                compatibility with ``Trainer.evaluate``; unused.

        Returns:
            None. Metrics are printed and recorded in ``self.results``.
        """
        if self.epoch < self.eval_start_epoch:
            self.epoch += 1
            return

        dataset = evaluate_datset if evaluate_datset is not None else self.eval_dataset
        preds = []
        labels = []

        eval_dataloader = self.get_test_dataloader(dataset)
        self.model.eval()
        with torch.no_grad():
            for data in tqdm(eval_dataloader):
                # Make sure the batch sits on the same device as the model;
                # some transformers versions do not do this in the dataloader.
                data = self._prepare_inputs(data)

                output = self.model(**data)
                pred = torch.max(output, 1)[1]

                preds.extend(pred.cpu().tolist())
                labels.extend(data['target'].cpu().tolist())

        # with the saved predictions and labels we can compute precision, recall and f1-score
        metrics = compute_metrics((preds, labels))
        print(metrics)

        self.results[self.epoch] = metrics
        self.epoch += 1

            
def compute_metrics(eval_pred):
    """Return F1, precision and recall of the positive ("PCL") class.

    Args:
        eval_pred: a ``(preds, labels)`` pair of equal-length sequences of
            0/1 class ids — predictions first, gold labels second.

    Returns:
        dict with keys ``"f1"``, ``"precision"`` and ``"recall"``.
    """
    preds, labels = eval_pred

    # classification_report expects (y_true, y_pred); passing them the other
    # way round swaps precision and recall.
    report = classification_report(
        labels,
        preds,
        labels=[0, 1],  # keep both rows even if one class is absent from the batch
        target_names=["Not PCL", "PCL"],
        output_dict=True,
        zero_division=0,  # report 0 instead of warning when a class is never predicted
    )

    return {"f1": report['PCL']['f1-score'],
            "precision": report['PCL']['precision'],
            "recall": report['PCL']['recall']
            }

def train(model, data, num_epochs, lr=0.0001, optimizer=None, lr_scheduler=None):
    """Fine-tune ``model`` on ``data`` with ``Trainer_PCL`` and save it.

    Args:
        model: model to train; handed to ``Trainer_PCL`` as is.
        data: DataFrame of training examples; its index is reset before it is
            wrapped in a ``DPMDataset``.
        num_epochs: number of training epochs.
        lr: learning rate recorded in ``TrainingArguments``.
        optimizer, lr_scheduler: passed to the trainer as its ``optimizers`` pair.

    Returns:
        ``trainer.results``. No evaluation dataset or evaluation strategy is
        configured here, so this is presumably empty unless ``evaluate`` is
        invoked some other way.

    Side effects:
        Saves the trained model to ``'deberta-finetuned'``.
    """
    # The dataset is built on a frame with a fresh 0..n-1 index.
    frame = data.reset_index(drop=True)

    deberta_tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')
    train_dataset = DPMDataset(frame, deberta_tokenizer, max_len=128)

    trainer_config = dict(
        output_dir="test_trainer",
        learning_rate=lr,
        logging_steps=100,
        per_device_train_batch_size=8,
        num_train_epochs=num_epochs,
        remove_unused_columns=False,  # keep every batch field, including 'target'
        logging_dir='./logs',
    )
    training_args = TrainingArguments(**trainer_config)

    pcl_trainer = Trainer_PCL(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=train_dataset.collate_fn,
        optimizers=(optimizer, lr_scheduler),
    )

    pcl_trainer.train()
    pcl_trainer.save_model('deberta-finetuned')
    return pcl_trainer.results

In [None]:
# Code for evaluating model performance 
def predict_PCL(input, tokenizer, model, country=None):
    """Run ``model`` on raw text and return the arg-max class per example.

    Args:
        input: text (or list of texts) handed to ``tokenizer``.
        tokenizer: callable producing a mapping of model inputs; called with
            ``return_tensors='pt'``, padding, truncation and ``max_length=128``.
        model: module returning a 2-D tensor of per-class scores.
        country: optional extra input, forwarded to the model as ``countries``.

    Returns:
        dict with ``'prediction'`` (index of the highest score per row) and
        ``'confidence'`` (that highest raw score — not a probability).
    """
    model.eval()
    encodings = tokenizer(input, return_tensors='pt', padding=True, truncation=True, max_length=128)
    if country is not None:
        encodings['countries'] = country

    # Tokenizer output lives on the CPU; move it to wherever the model is so
    # the call also works for a model on a GPU.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
        encodings = {
            key: (value.to(device) if torch.is_tensor(value) else value)
            for key, value in encodings.items()
        }

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = model(**encodings)
    preds = torch.max(output, 1)

    return {'prediction': preds[1], 'confidence': preds[0]}

def evaluate(model, tokenizer, data_loader, score=True):
    """Predict every batch of ``data_loader`` and optionally print a report.

    Args:
        model, tokenizer: forwarded to ``predict_PCL``.
        data_loader: iterable of batches; each batch provides ``'text'`` and,
            when ``score`` is true, ``'target'``.
        score: when true, gold labels are collected and a classification
            report is printed.

    Returns:
        Flat list with one predicted class id per example.
    """
    preds = []
    tot_labels = []

    with torch.no_grad():
        for data in tqdm(data_loader):
            pred = predict_PCL(data['text'], tokenizer, model)

            # extend, not append: a list of per-batch lists is not a valid
            # label vector for classification_report.
            preds.extend(pred['prediction'].tolist())
            if score:
                tot_labels.extend(data['target'].reshape(-1).tolist())

    if score:
        # with the saved predictions and labels we can compute accuracy, precision, recall and f1-score
        print(classification_report(tot_labels, preds, target_names=["Not PCL","PCL"]))

    return preds


In [None]:
import math

from torch.optim import lr_scheduler

model = DebertaClassification.from_pretrained('microsoft/deberta-base')

lr = 1e-5
num_epochs = 10
batch_size = 8  # must match per_device_train_batch_size used inside train()
optimizer = optim.AdamW(model.parameters(), lr=lr)

# Trainer steps the scheduler once per optimizer step, not once per epoch, so
# the cosine half-period has to span the whole run. With T_max=10 the learning
# rate would swing between lr and 0 every 20 steps.
# NOTE(review): assumes a single device and no gradient accumulation — adjust otherwise.
total_steps = num_epochs * math.ceil(len(training_set) / batch_size)
lrs = lr_scheduler.CosineAnnealingLR(optimizer, T_max=total_steps)

results = train(model, training_set, num_epochs=num_epochs, lr=lr, optimizer=optimizer, lr_scheduler=lrs)

In [None]:
# Class weights for the loss: label 1 counts four times as much as label 0
class_weights = [1.0, 4.0]  # plain list rather than a tensor

# Model hyper-parameters
task1_model_args = ClassificationArgs(
    num_train_epochs=5,  # more epochs so the model learns better
    no_save=True,
    no_cache=True,
    overwrite_output_dir=True,
    use_multiprocessing=False,
    use_multiprocessing_for_evaluation=False,
    process_count=1,
    learning_rate=4e-6,  # smaller learning rate for RoBERTa-Large
    train_batch_size=8,  # RoBERTa-Large needs more VRAM
    eval_batch_size=8,  # batch size used for evaluation
    max_seq_length=256,  # caps the length of the input texts
    # NOTE(review): the three early-stopping settings below presumably have no
    # effect unless use_early_stopping=True and evaluation during training
    # (with an eval_df) are also enabled — confirm against the library docs.
    early_stopping_consider_epochs=True,  # consider end-of-epoch scores for early stopping
    early_stopping_patience=2,  # stop after 2 evaluations without improvement
    early_stopping_delta=0.001,  # minimum change that counts as an improvement
)

# Build the model; the class weights are supplied here, at construction time
task1_model = ClassificationModel(
    "roberta",
    "roberta-large",
    args=task1_model_args,
    num_labels=2,
    use_cuda=cuda_available,
    weight=class_weights
)

# Train the model. Extra keyword arguments to train_model are interpreted as
# additional metric functions, so the class weights must not be passed again here.
# NOTE(review): `training_set1` is not defined in the cells shown here (only
# `training_set` is) — confirm which frame is intended.
task1_model.train_model(
    training_set1[['text', 'label']]
)

In [None]:
# Predict on the dev texts; the second value returned by predict() is discarded
dev_preds_task1, _ = task1_model.predict(dev_df.text.tolist())

In [None]:
import collections

# Tally how often each value occurs among the dev predictions
dev_prediction_counts = collections.Counter(dev_preds_task1)
print(dev_prediction_counts)


In [None]:
# Predict on the training texts as well; the second value returned by predict() is discarded
train_preds_task1, _ = task1_model.predict(train_df.text.tolist())

In [None]:
# Gold labels for the two splits that were predicted above
dev_true_labels = dev_df['label'].tolist()
train_true_labels = train_df['label'].tolist()

# The second line of each pair scores the dev split (test_df is never predicted
# in these cells), so it is labelled "dev" rather than "test".
print(f'Accuracy train: {accuracy_score(train_true_labels, train_preds_task1)}')
print(f'Accuracy dev: {accuracy_score(dev_true_labels, dev_preds_task1)}')
print(f'F1 train: {f1_score(train_true_labels, train_preds_task1)}')
print(f'F1 dev: {f1_score(dev_true_labels, dev_preds_task1)}')