# XLM-Roberta with Training Data Augmented by Translation and Back-Translation

## Import libraries

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/testdf/test_df.csv
/kaggle/input/roberta/transformers/default/1/RobertaModel/config.json
/kaggle/input/roberta/transformers/default/1/RobertaModel/tokenizer_config.json
/kaggle/input/roberta/transformers/default/1/RobertaModel/model.safetensors
/kaggle/input/roberta/transformers/default/1/RobertaModel/special_tokens_map.json
/kaggle/input/roberta/transformers/default/1/RobertaModel/sentencepiece.bpe.model
/kaggle/input/all-incl/all_incl_aug_shuffle.csv
/kaggle/input/smaller-aug-sets/tr_incl_non.csv
/kaggle/input/smaller-aug-sets/back_incl_non.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test_labels.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation-processed-seqlen128.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test-processed-seqlen128.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintende

## Prepare GPU

In [2]:
import torch

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Using device: {device}')

Using device: cuda


## Import and prepare the datasets

In [23]:
# Choose and import the training set for the current run.
# To switch data sets, set TRAIN_SET to one of the keys of TRAIN_SET_PATHS
# instead of commenting lines in and out.
TRAIN_SET_PATHS = {
    'all_incl_aug_shuffle': '/kaggle/input/all-incl/all_incl_aug_shuffle.csv',
    'back_incl_non': '/kaggle/input/smaller-aug-sets/back_incl_non.csv',
    'tr_incl_non': '/kaggle/input/smaller-aug-sets/tr_incl_non.csv',
}
TRAIN_SET = 'back_incl_non'

train_df = pd.read_csv(TRAIN_SET_PATHS[TRAIN_SET])

In [6]:
# Load the validation set from the Jigsaw multilingual competition data

val_csv_path = '/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation.csv'
val_df = pd.read_csv(val_csv_path)

In [7]:
# Load the test set

test_csv_path = '/kaggle/input/testdf/test_df.csv'
test_df = pd.read_csv(test_csv_path)

In [7]:
# Preview the first five rows of the training set, then show its (rows, columns) shape
print(train_df.head(n=5))
train_df.shape

                 id                                       comment_text  toxic  \
0  9391e0448ec3a85e  Mr Birdsmight, with due respect i have full ri...      0   
1  efd69e53c53a3127  So basically, the user Johnuniq is to follow t...      0   
2  b1ba84c35a13f661  Я говорю о правильных комментариях, приятель. ...      1   
3  d3a86ff124309a1d  "\n\n Jimmy Wales used to make money from Porn...      1   
4  ec05157d086edc23  I did not ignore the hangon notice. I read you...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  


(194400, 8)

In [None]:
# Preview the first five rows of the test set
print(test_df.head(n=5))

   id                                       comment_text lang  toxic
0   0  Doctor Who adlı viki başlığına 12. doctor olar...   tr      0
1   1   Вполне возможно, но я пока не вижу необходимо...   ru      0
2   2  Quindi tu sei uno di quelli   conservativi  , ...   it      1
3   3  Malesef gerçekleştirilmedi ancak şöyle bir şey...   tr      0
4   4  :Resim:Seldabagcan.jpg resminde kaynak sorunu ...   tr      0


In [None]:
# Preview the first five rows of the validation set
print(val_df.head(n=5))

   id                                       comment_text lang  toxic
0   0  Este usuario ni siquiera llega al rango de    ...   es      0
1   1  Il testo di questa voce pare esser scopiazzato...   it      0
2   2  Vale. Sólo expongo mi pasado. Todo tiempo pasa...   es      1
3   3  Bu maddenin alt başlığı olarak  uluslararası i...   tr      0
4   4  Belçika nın şehirlerinin yanında ilçe ve belde...   tr      0


In [8]:
# Report the number of rows in each of the three data sets
n_train, n_val, n_test = (len(frame) for frame in (train_df, val_df, test_df))
print(f"Train size: {n_train}, Val size: {n_val}, Test size: {n_test}")

Train size: 194400, Val size: 8000, Test size: 63812


## Prepare for training

In [8]:
# Import libraries

from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
from sklearn.metrics import roc_auc_score, accuracy_score


In [9]:
# Load the model. To continue an earlier run, point the path at a model
# directory that was saved during previous training.
pretrained_model_dir = '/kaggle/input/roberta/transformers/default/1/RobertaModel'
model = XLMRobertaForSequenceClassification.from_pretrained(pretrained_model_dir)


In [10]:
# Load the tokenizer. To continue an earlier run, point the path at a
# tokenizer directory that was saved during previous training.
pretrained_tokenizer_dir = '/kaggle/input/roberta/transformers/default/1/RobertaModel'
tokenizer = XLMRobertaTokenizer.from_pretrained(pretrained_tokenizer_dir)


In [24]:
# Put the model on the GPU if one is available. Reuses the `device` chosen in
# the "Prepare GPU" cell instead of repeating the CUDA availability check.
model = model.to(device)

In [25]:
# Check whether the model's parameters live on the GPU

model_on_gpu = next(model.parameters()).is_cuda
print("The model is on the GPU." if model_on_gpu else "The model is on the CPU.")


The model is on the GPU.


In [26]:
# Define Tokenization function
def tokenize_function(batch, max_length=128):
    """Tokenize a batch of comments and attach the toxicity labels.

    Written for use with ``Dataset.map(..., batched=True)``.

    Args:
        batch: Mapping with a ``'comment_text'`` sequence and a ``'toxic'``
            sequence. ``batch['comment_text']`` is overwritten in place with
            the string-cast values.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 128, the value that was previously hard-coded.

    Returns:
        The tokenizer output for the batch, with an additional ``'labels'``
        entry holding ``batch['toxic']``.
    """
    # Cast every entry to str so the tokenizer always receives strings,
    # whatever the column held (e.g. a missing value becomes 'None').
    texts = [str(text) for text in batch['comment_text']]
    batch['comment_text'] = texts

    tokenized = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )

    # Store the target column under the 'labels' key next to the encodings.
    tokenized['labels'] = batch['toxic']
    return tokenized

In [14]:
# Turn the validation and test frames into HuggingFace datasets and tokenize
# them; only the text and label columns are carried over.

val_dataset, test_dataset = (
    Dataset.from_pandas(frame[['comment_text', 'toxic']]).map(
        tokenize_function,
        batched=True,
        batch_size=1024,
    )
    for frame in (val_df, test_df)
)

Map:   0%|          | 0/194400 [00:00<?, ? examples/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/63812 [00:00<?, ? examples/s]

In [27]:
# Turn the training frame into a HuggingFace dataset and tokenize it;
# only the text and label columns are carried over.

train_columns = train_df[['comment_text', 'toxic']]
train_dataset = Dataset.from_pandas(train_columns).map(
    tokenize_function,
    batched=True,
    batch_size=1024,
)


Map:   0%|          | 0/194400 [00:00<?, ? examples/s]

In [29]:
# Define metric computation
def compute_metrics(eval_pred):
    """Compute ROC-AUC and accuracy from a ``(logits, labels)`` pair.

    Args:
        eval_pred: Object that unpacks into ``(logits, labels)``. The logits
            are indexed as a 2-D array; column 1 is used as the score of the
            positive class.

    Returns:
        dict with keys ``'roc_auc'`` and ``'accuracy'``. ``'roc_auc'`` is NaN
        when ``labels`` contains fewer than two distinct classes.
    """
    logits, labels = eval_pred

    # Convert the logits once and reuse the tensor for both derived quantities.
    logits_t = torch.tensor(logits)
    probs = torch.softmax(logits_t, dim=1)[:, 1].numpy()
    preds = torch.argmax(logits_t, dim=1).numpy()

    # ROC-AUC is undefined when only one class is present. Report NaN instead
    # of letting the metric call abort the whole evaluation run.
    if torch.unique(torch.as_tensor(labels)).numel() < 2:
        roc_auc = float('nan')
    else:
        roc_auc = roc_auc_score(labels, probs)
    accuracy = accuracy_score(labels, preds)

    return {
        'roc_auc': roc_auc,
        'accuracy': accuracy
    }

In [28]:

# Training arguments
# Output and log directories live under /kaggle/working/, the writable
# directory this notebook also uses when saving the model, instead of a
# Google Drive path that does not exist in the Kaggle environment.
training_args = TrainingArguments(
    output_dir='/kaggle/working/checkpoints',
    eval_strategy='steps',
    eval_steps=500,     # explicit; the recorded run evaluated every 500 steps
    logging_steps=500,  # explicit; keeps the training loss on the same cadence
    learning_rate=3e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir='/kaggle/working/logs',
    save_strategy='no',  # no checkpoints are written during training
    load_best_model_at_end=False,
    # Kept for runs that enable best-model selection.
    metric_for_best_model='roc_auc',
    greater_is_better=True,
    fp16=True,
    # The two settings below only matter once save_strategy is switched to
    # 'steps'; with save_strategy='no' nothing is checkpointed.
    save_steps=1000,
    save_total_limit=2,
    report_to="none"
)

# Initialize Trainer: validation data is used for the periodic evaluations,
# and compute_metrics supplies ROC-AUC and accuracy.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [30]:
# Check again whether the model is on the GPU

model_device = model.device
print(f"Using device: {model_device}")



Using device: cuda:0


## Training

In [31]:
# Start training; the returned TrainOutput is shown as the cell result
train_output = trainer.train()
train_output



Step,Training Loss,Validation Loss,Roc Auc,Accuracy
500,0.1022,0.996895,0.848208,0.709875
1000,0.0935,0.754129,0.858741,0.77
1500,0.0823,0.766818,0.869176,0.778625




TrainOutput(global_step=1519, training_loss=0.09249851019308707, metrics={'train_runtime': 2593.8116, 'train_samples_per_second': 74.948, 'train_steps_per_second': 0.586, 'total_flos': 1.2787197290496e+16, 'train_loss': 0.09249851019308707, 'epoch': 1.0})

## Evaluation

In [None]:
# Evaluate on the test dataset and print both metrics with four decimals
test_results = trainer.evaluate(eval_dataset=test_dataset)
for metric_label, metric_key in (("ROC-AUC", "eval_roc_auc"), ("Accuracy", "eval_accuracy")):
    print(f"Test {metric_label}: {test_results[metric_key]:.4f}")

The cell above prints the ROC-AUC and accuracy for the current run. In this case, an additional experiment was conducted but not finished. The results of the different experiments and runs were saved in tables; the final results can be found in our group's presentation.

## Save the model and tokenizer
To allow training to be continued later and previously trained models to be reused for additional epochs, the model and tokenizer are saved under an associated name.

In [19]:
# Save the model and the tokenizer side by side in one directory under
# /kaggle/working/
save_dir = '/kaggle/working/RobertaModelg'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)



('/kaggle/working/RobertaModelg/tokenizer_config.json',
 '/kaggle/working/RobertaModelg/special_tokens_map.json',
 '/kaggle/working/RobertaModelg/sentencepiece.bpe.model',
 '/kaggle/working/RobertaModelg/added_tokens.json')