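# RoBERTa Training for NLI

Fine-tunes `roberta-base` as a two-class sequence classifier on premise/hypothesis pairs, evaluates on a held-out dev set with accuracy and F1 after every epoch, and saves the resulting model and tokenizer.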

In [1]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score
import numpy as np


## Step 1: Load and prepare training & dev datasets

In [2]:
# Read the train and dev splits from the Kaggle input directory.
train_df, dev_df = (
    pd.read_csv("/kaggle/input/nlu-data-file/train.csv"),
    pd.read_csv("/kaggle/input/nlu-data-file/dev.csv"),
)

# Wrap each DataFrame in a `datasets.Dataset` (train first, then dev) for the
# later `map` / `set_format` steps.
dataset, dev_dataset = (Dataset.from_pandas(frame) for frame in (train_df, dev_df))


## Step 2: Tokenization

In [3]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def tokenize(batch):
    """Encode a batch of premise/hypothesis pairs with the RoBERTa tokenizer.

    Args:
        batch: Mapping with 'premise' and 'hypothesis' entries holding the
            texts for the rows being processed (as passed by
            `Dataset.map(..., batched=True)`).

    Returns:
        The tokenizer output for the sentence pairs, truncated to at most
        512 tokens per pair and left unpadded.
    """
    # Padding is intentionally not applied here. Padding inside a batched `map`
    # pads every row of a chunk to that chunk's longest sequence and stores the
    # pad tokens in the dataset. The Trainer is given the tokenizer, so each
    # mini-batch is padded to its own longest sequence at collation time, which
    # makes the stored padding pure overhead.
    return tokenizer(batch['premise'], batch['hypothesis'], truncation=True, max_length=512)

dataset = dataset.map(tokenize, batched=True)
dev_dataset = dev_dataset.map(tokenize, batched=True)

# Expose only the columns the model consumes, as torch tensors.
dataset.set_format("torch", columns=['input_ids', 'attention_mask', 'label'])
dev_dataset.set_format("torch", columns=['input_ids', 'attention_mask', 'label'])


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Map:   0%|          | 0/24432 [00:00<?, ? examples/s]

Map:   0%|          | 0/6736 [00:00<?, ? examples/s]

## Step 3: Define the model, metrics and Trainer, then fine-tune

In [None]:
# Seed PyTorch before loading. The classification head is not part of the
# roberta-base checkpoint and is randomly initialised, and the Trainer only
# applies its own seed once the model object already exists, so without this
# the starting weights differ from run to run.
# 42 mirrors the default `seed` of TrainingArguments.
torch.manual_seed(42)
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

def compute_metrics(pred):
    """Score one evaluation pass with accuracy and F1.

    Args:
        pred: Object exposing `predictions` (per-class scores; the argmax is
            taken along axis 1) and `label_ids` (the reference labels).

    Returns:
        dict with the keys "accuracy" and "f1".
    """
    predicted_classes = np.argmax(pred.predictions, axis=1)
    gold = pred.label_ids
    return {
        "accuracy": accuracy_score(gold, predicted_classes),
        "f1": f1_score(gold, predicted_classes),
    }

training_args = TrainingArguments(
    # Output locations for checkpoints and logs.
    output_dir="./results",
    logging_dir="./logs",
    # Optimisation settings.
    num_train_epochs=8,
    learning_rate=1e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Evaluate and checkpoint once per epoch, then restore the checkpoint with
    # the best "f1" (the key returned by compute_metrics) when training ends.
    # NOTE(review): recent transformers releases name the first argument
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # Logging: emit every 10 steps, keep progress bars, no external reporters.
    logging_strategy="steps",
    logging_steps=10,
    log_level="info",
    disable_tqdm=False,
    report_to="none",
)

# Wire the model, the tokenised splits, the tokenizer and the metric function
# into a Trainer configured by `training_args`.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=dataset,
    eval_dataset=dev_dataset,
)

# Start fine-tuning. Kept as the cell's final expression so the return value
# is displayed below the cell.
trainer.train()


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: premise, hypothesis. If premise, hypothesis are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 24,432
  Num Epochs = 8
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 12,216
  Number of trainable parameters = 124,647,170


Epoch,Training Loss,Validation Loss


## Step 4: Save the model and tokenizer

In [None]:
# Write the model weights/config and the tokenizer files into one directory.
save_dir = "./roberta_nli_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
