In [35]:
import pandas as pd
import numpy as np
import os
import pickle
import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, accuracy_score

from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

In [2]:
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Using device: {DEVICE}')

Using device: cuda


In [33]:
file = "../news_datasets/full_training_dataset.csv"
BERT_OUTPUT_DIR = 'classification_report/distilbert_classification_model'
encoder_filename_pkl = 'classification_report/bert_label_encoder.pkl'

In [10]:
df = pd.read_csv(file)
print(f"Dataset loaded with shape: {df.shape}")

Dataset loaded with shape: (398, 5)


In [13]:
# Split the dataset into training and testing sets
X = df['Full Text']
y = df['llm_category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Encode the labels
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)
NUM_LABELS = len(label_encoder.classes_)

In [14]:
#  Calculate class weights to handle class imbalance
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train_encoded),
    y=y_train_encoded
)

class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(DEVICE)
print("\n[INFO] Class weights calculated (inversely proportional to frequency) :")
print(class_weights_tensor)


[INFO] Class weights calculated (inversely proportional to frequency) :
tensor([0.4229, 0.5372, 1.2823, 3.0577, 1.4722, 1.0743, 1.7283, 2.0921],
       device='cuda:0')


In [18]:
#  Tokenization
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
MAX_LENGTH = 512 # Maximum length for DistilBERT

def tokenize_data(texts):
    return tokenizer(
        list(texts),
        max_length=MAX_LENGTH,
        truncation=True,
        padding=True,
        return_tensors='pt'
    )

train_encodings = tokenize_data(X_train)
test_encodings = tokenize_data(X_test)
y_train_tensor = torch.tensor(y_train_encoded, dtype=torch.long)
y_test_tensor = torch.tensor(y_test_encoded, dtype=torch.long)

# Create Datasets and Dataloaders class
class EnvironmentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx].clone().detach()
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = EnvironmentDataset(train_encodings, y_train_tensor)
test_dataset = EnvironmentDataset(test_encodings, y_test_tensor)

BATCH_SIZE = 8
train_dataoader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [28]:
# Model class modified to ponderate loss with class weights
class WeightedDistilBert(nn.Module):
    def __init__(self, num_labels, class_weights):
        super(WeightedDistilBert, self).__init__()
        self.distilbert = DistilBertForSequenceClassification.from_pretrained(
            'distilbert-base-uncased',
            num_labels=num_labels
        )
        self.loss_fn = nn.CrossEntropyLoss(weight=class_weights)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.distilbert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        logits = outputs.logits

        loss=None
        if labels is not None:
            loss = self.loss_fn(logits.view(-1, self.distilbert.num_labels), labels.view(-1))   

        return loss, logits

In [29]:
# Model Initialization
model = WeightedDistilBert(NUM_LABELS, class_weights_tensor).to(DEVICE)
optimizer = optim.AdamW(model.parameters(), lr=2e-5) # Typical learning rate for fine-tuning

# Training Loop
def train_bert(model, dataloader, optimizer, epochs=3):
    model.train()
    for epoch in range(epochs):
        print(f"\n--- Epoch {epoch + 1} / {epochs} ---")
        for batch in tqdm(dataloader, desc="Training"):
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)

            loss, _ = model(input_ids, attention_mask, labels)

            loss.backward()
            optimizer.step()

train_bert(model, train_dataoader, optimizer, epochs=3)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Epoch 1 / 3 ---


Training: 100%|██████████| 40/40 [00:11<00:00,  3.59it/s]



--- Epoch 2 / 3 ---


Training: 100%|██████████| 40/40 [00:09<00:00,  4.01it/s]



--- Epoch 3 / 3 ---


Training: 100%|██████████| 40/40 [00:09<00:00,  4.26it/s]


In [30]:
# Evaluation
def evaluate_bert(model, dataloader, encoder):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluation"):
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)
            
            _, logits = model(input_ids, attention_mask)
            
            predictions = torch.argmax(logits, dim=-1).cpu().numpy()
            all_preds.extend(predictions)
            all_labels.extend(labels.cpu().numpy())

    target_names = encoder.classes_

    accuracy = accuracy_score(all_labels, all_preds)
    print(f"\n🚀 Accuracy for model BERT : {len(all_labels)} datas : {accuracy:.4f}")
    
    report = classification_report(all_labels, all_preds, target_names=target_names)
    print("\nClassification report (BERT) :")
    print(report)
    
    return all_labels, all_preds

# Lancement de l'évaluation
all_labels, all_preds = evaluate_bert(model, test_dataloader, label_encoder)


Evaluation: 100%|██████████| 10/10 [00:01<00:00,  9.90it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



🚀 Accuracy for model BERT : 80 datas : 0.7125

Classification report (BERT) :
                                     precision    recall  f1-score   support

        BIODIVERSITY AND ECOSYSTEMS       0.86      1.00      0.92        24
              CLIMATE AND EMISSIONS       0.77      0.53      0.62        19
              ENERGY AND TRANSITION       0.60      0.75      0.67         8
                  NATURAL RESOURCES       0.00      0.00      0.00         3
            POLICIES AND REGULATION       0.60      0.43      0.50         7
POLLUTION AND ENVIRONMENTAL QUALITY       0.60      1.00      0.75         9
                RISKS AND DISASTERS       0.62      1.00      0.77         5
              SOCIO-ECONOMIC IMPACT       0.00      0.00      0.00         5

                           accuracy                           0.71        80
                          macro avg       0.51      0.59      0.53        80
                       weighted avg       0.66      0.71      0.67      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [37]:
# Save the trained model and tokenizer
os.makedirs(BERT_OUTPUT_DIR, exist_ok=True)

model.distilbert.save_pretrained(BERT_OUTPUT_DIR)
tokenizer.save_pretrained(BERT_OUTPUT_DIR)

with open(encoder_filename_pkl, 'wb') as encoder_file:
    pickle.dump(label_encoder, encoder_file)

print(f"✅ DistilBERT model and Tokenizer registered in : {BERT_OUTPUT_DIR}")
print(f"✅ Label Encoder registered in : {encoder_filename_pkl}")

✅ DistilBERT model and Tokenizer registered in : classification_report/distilbert_classification_model
✅ Label Encoder registered in : classification_report/bert_label_encoder.pkl
