# Librairies Importation

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Preprocessing

In [28]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
# Charger les données
X_train = pd.read_csv('kaggle_data/train_x.csv', index_col=0)
y_train = pd.read_csv('kaggle_data/train_y.csv')
X_test = pd.read_csv('kaggle_data/test_x.csv')
X_val = pd.read_csv('kaggle_data/val_x.csv')
y_val = pd.read_csv('kaggle_data/val_y.csv')

#Change type of string column to string
X_train['string'] = X_train['string'].astype(str)
X_test['string'] = X_test['string'].astype(str)
X_val['string'] = X_val['string'].astype(str)


#Sample dataset
X_train_sample = X_train[:10]
y_train_sample = y_train[:10]
X_val_sample = X_val[:10]
y_val_sample = y_val[:10]

def train_bert_model(target):
    
    # Get the target column
    y_target = y_train_sample[target]
    
    # Initialize the BERT tokenizer and model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    # Tokenize the input data
    inputs = tokenizer(X_train_sample['string'].tolist(), padding=True, truncation=True, max_length=512, return_tensors='pt')
    
    # Create the dataloader
    dataset = torch.utils.data.TensorDataset(inputs['input_ids'], inputs['attention_mask'], torch.tensor(y_target.tolist()))
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)
    
    # Set up the optimizer and scheduler
    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
    total_steps = len(dataloader) * 10
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    
    # Train the model
    model.train()
    for epoch in range(10):
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}")
        for batch in progress_bar:
            input_ids, attention_mask, labels = batch
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()
            progress_bar.set_postfix({'Loss': loss.item()})
    
    # Set the name of the model
    model_name = f"model_BERT_{target}"
    
    return model, model_name


In [29]:
train_bert_model('male')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 1/1 [00:12<00:00, 12.44s/it, Loss=0.639]
Epoch 2: 100%|██████████| 1/1 [00:07<00:00,  7.56s/it, Loss=0.606]
Epoch 3: 100%|██████████| 1/1 [00:06<00:00,  6.25s/it, Loss=0.548]
Epoch 4: 100%|██████████| 1/1 [00:08<00:00,  8.12s/it, Loss=0.538]
Epoch 5: 100%|██████████| 1/1 [00:08<00:00,  8.07s/it, Loss=0.463]
Epoch 6: 100%|██████████| 1/1 [00:06<00:00,  6.88s/it, Loss=0.494]
Epoch 7: 100%|██████████| 1/1 [00:09<00:00,  9.04s/it, Loss=0.478]
Epoch 8: 100%|██████████| 1/1 [00:09<00:00,  9.77s/it, Loss=0.423]
Epoch 9: 100%|██████████| 1/1 [00:08<00:00,  8.94s/it, Loss=0.471]
Epoch 10: 100%|██████████| 1/1 [00:09<00:00,  9.45s/it, Loss=0.413]


(BertForSequenceClassification(
   (bert): BertModel(
     (embeddings): BertEmbeddings(
       (word_embeddings): Embedding(30522, 768, padding_idx=0)
       (position_embeddings): Embedding(512, 768)
       (token_type_embeddings): Embedding(2, 768)
       (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
       (dropout): Dropout(p=0.1, inplace=False)
     )
     (encoder): BertEncoder(
       (layer): ModuleList(
         (0-11): 12 x BertLayer(
           (attention): BertAttention(
             (self): BertSelfAttention(
               (query): Linear(in_features=768, out_features=768, bias=True)
               (key): Linear(in_features=768, out_features=768, bias=True)
               (value): Linear(in_features=768, out_features=768, bias=True)
               (dropout): Dropout(p=0.1, inplace=False)
             )
             (output): BertSelfOutput(
               (dense): Linear(in_features=768, out_features=768, bias=True)
               (LayerNorm): LayerN

In [31]:
import os
# Create the "models" directory if it doesn't exist
if not os.path.exists("models"):
    os.makedirs("models")

toxicity_categories = ['male', 'female', 'LGBTQ', 'christian', 'muslim', 'other_religions','black', 'white']

trained_models = {}

for category in toxicity_categories:
    model, model_name = train_bert_model(category)
    trained_models[model_name]=model
    model.save_pretrained(f"{model_name}")

trained_models


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 1: 100%|██████████| 1/1 [00:09<00:00,  9.59s/it, Loss=0.871]
Epoch 2: 100%|██████████| 1/1 [00:11<00:00, 11.26s/it, Loss=0.701]
Epoch 3: 100%|██████████| 1/1 [00:17<00:00, 17.21s/it, Loss=0.592]
Epoch 4: 100%|██████████| 1/1 [00:19<00:00, 19.63s/it, Loss=0.533]
Epoch 5: 100%|██████████| 1/1 [00:19<00:00, 19.28s/it, Loss=0.487]
Epoch 6: 100%|██████████| 1/1 [00:16<00:00, 16.83s/it, Loss=0.413]
Epoch 7: 100%|██████████| 1/1 [00:12<00:00, 12.26s/it, Loss=0.441]
Epoch 8: 100%|██████████| 1/1 [00:10<00:00, 10.90s/it, Loss=0.384]
Epoch 9: 100%|██████████| 1/1 [00:12<00:00, 12.37s/it, Loss=0.325]
Epoch 10: 100%|██████████| 1/1 [00:15<00:00, 15.24s/it, Loss=0.306]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 1/1 [00:14

{'model_BERT_male': BertForSequenceClassification(
   (bert): BertModel(
     (embeddings): BertEmbeddings(
       (word_embeddings): Embedding(30522, 768, padding_idx=0)
       (position_embeddings): Embedding(512, 768)
       (token_type_embeddings): Embedding(2, 768)
       (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
       (dropout): Dropout(p=0.1, inplace=False)
     )
     (encoder): BertEncoder(
       (layer): ModuleList(
         (0-11): 12 x BertLayer(
           (attention): BertAttention(
             (self): BertSelfAttention(
               (query): Linear(in_features=768, out_features=768, bias=True)
               (key): Linear(in_features=768, out_features=768, bias=True)
               (value): Linear(in_features=768, out_features=768, bias=True)
               (dropout): Dropout(p=0.1, inplace=False)
             )
             (output): BertSelfOutput(
               (dense): Linear(in_features=768, out_features=768, bias=True)
               

In [34]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Change type of string column to string
X_val['string'] = X_val['string'].astype(str)

X_val_sample = X_val[:100]
y_val_sample = y_val[:100]

categories = ['male', 'female', 'LGBTQ', 'christian', 'muslim', 'other_religions','black', 'white']
# Iterate over the trained models
evaluation_results = {}
for (model_name, model), category in zip(trained_models.items(), categories):
    # Tokenize the input data
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    inputs = tokenizer(X_val_sample['string'].tolist(), padding=True, truncation=True, max_length=512, return_tensors='pt')
    
    # Create the dataloader
    dataset = torch.utils.data.TensorDataset(inputs['input_ids'], inputs['attention_mask'], torch.tensor(y_val_sample[category].tolist()))
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=False)
    
    # Evaluate the model
    model.eval()
    predictions = []
    targets = []
    for batch in dataloader:
        input_ids, attention_mask, labels = batch
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predicted_labels = torch.argmax(logits, dim=1)
        predictions.extend(predicted_labels.tolist())
        targets.extend(labels.tolist())
    
    # Calculate evaluation metrics
    accuracy = accuracy_score(targets, predictions)
    precision = precision_score(targets, predictions)
    recall = recall_score(targets, predictions)
    f1 = f1_score(targets, predictions)
    
    # Store the evaluation results
    evaluation_results[model_name] = {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1': f1}

evaluation_results


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'model_BERT_male': {'Accuracy': 0.94,
  'Precision': 0.0,
  'Recall': 0.0,
  'F1': 0.0},
 'model_BERT_female': {'Accuracy': 0.89,
  'Precision': 0.0,
  'Recall': 0.0,
  'F1': 0.0},
 'model_BERT_LGBTQ': {'Accuracy': 0.99,
  'Precision': 0.0,
  'Recall': 0.0,
  'F1': 0.0},
 'model_BERT_christian': {'Accuracy': 0.97,
  'Precision': 0.0,
  'Recall': 0.0,
  'F1': 0.0},
 'model_BERT_muslim': {'Accuracy': 0.91,
  'Precision': 0.0,
  'Recall': 0.0,
  'F1': 0.0},
 'model_BERT_other_religions': {'Accuracy': 0.94,
  'Precision': 0.0,
  'Recall': 0.0,
  'F1': 0.0},
 'model_BERT_black': {'Accuracy': 0.93,
  'Precision': 0.0,
  'Recall': 0.0,
  'F1': 0.0},
 'model_BERT_white': {'Accuracy': 0.94,
  'Precision': 0.0,
  'Recall': 0.0,
  'F1': 0.0}}