<a href="https://colab.research.google.com/github/1122iqra/AttractingContributors-shows-or-series/blob/main/finetune_pashto_paraphrase_detection(XLM_ROBERTA).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install Libraries
!pip install -qq transformers[torch] sentencepiece

In [3]:
# Import Libraries
import time
import json

import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import AdamW, get_linear_schedule_with_warmup

In [4]:
## make sure that gpu is available
torch.cuda.is_available()

True

In [5]:
## First, upload your csv file using column on the left side (via folder icon -> upload button)
## After that, run this cell
path_file = '/content/Book5.csv'
df = pd.read_csv(path_file)
df = df.rename(columns={'sentence 1': 'sent1', 'sentence 2': 'sent2'})
df = df.dropna()   ### Some cells seem empty and cause error, fix them when you use
df['label'] = df['label'].replace({'P': 1, 'NP': 0})
df

Unnamed: 0,sent1,sent2,label
0,امریکا: یوې روسۍ جټ الوتکې د امریکا له بې پیلو...,امریکا وایی، د روسیې یوې الوتکې د امریکا د بې ...,1
1,د ميلي په وينا، تر اوسه معلومه نه ده، چې بريد ...,"میلي وویل چې هغه ""تر اوسه ډاډه نه دی"" چې ایا د...",1
2,په ترکیه کې د وروستیو زلزلو له امله تر سلو زیا...,په ترکیه کې د وروستۍ زلزلې له امله تر سلو زیات...,0
3,په سوریه کې یو نوی زیږیدلی ماشوم له خټو څخه ژو...,د ترکیې او سوریې د زلزلې له امله د مړو شمیر ۸۴...,1
4,دغه زلزله چې شدت یې د ریښتر په کچه ۵.۶ ښودل شو...,د 7.8 درجې زلزلې د دوشنبې په ورځ په داسې حال ک...,0
...,...,...,...
5399,چین د علي‌ بابا انلاین تجاري‌ شرکت ته ډیره ستر...,چین د علي‌ بابا شرکت ته داسې نقدي‌ جزا ورکړه چ...,1
5400,ایران د کوریا برمته کړې ټانکر کښتۍ خوشې کړه,ایران د جنوري‌ په ۴ مه نیټه د چاپیریال د ککړول...,1
5401,په امریکا کې په یو سوداګریز مرکز وسله وال برید...,د امریکا د ټکساس ایالت په یو سوداګریز مرکز وسل...,1
5402,د امریکا د سویلي‌ کارولینا په ایالت کې په یو ک...,په امریکا کې په یو کور وسله وال برید شوی دی,1


In [6]:
## Download a tokenizer using the transformers library
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

In [7]:
## Create dataset
# First split dataset into train/dev/test subset
num_train = 3000
num_dev = 1500
num_test = len(df) - num_train - num_dev

train_sent1 = df['sent1'].tolist()[:num_train]
train_sent2 = df['sent2'].tolist()[:num_train]
train_labels = df['label'].tolist()[:num_train]

dev_sent1 = df['sent1'].tolist()[num_train:num_train+num_dev]
dev_sent2 = df['sent2'].tolist()[num_train:num_train+num_dev]
dev_labels = df['label'].tolist()[num_train:num_train+num_dev]

test_sent1 = df['sent1'].tolist()[num_train+num_dev:]
test_sent2 = df['sent2'].tolist()[num_train+num_dev:]
test_labels = df['label'].tolist()[num_train+num_dev:]

In [8]:
# Define Dataset Class
class PashtoPPDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [9]:
## Encode data
train_encodings = tokenizer(train_sent1, train_sent2, truncation=True, padding=True, max_length=256, return_tensors='pt')
dev_encodings = tokenizer(dev_sent1, dev_sent2, truncation=True, padding=True, max_length=256, return_tensors='pt')
test_encodings = tokenizer(test_sent1, test_sent2, truncation=True, padding=True, max_length=256, return_tensors='pt')

train_dataset = PashtoPPDataset(train_encodings, train_labels)
dev_dataset = PashtoPPDataset(dev_encodings, dev_labels)
test_dataset = PashtoPPDataset(test_encodings, test_labels)

In [10]:
# Check tokenization
idx = 10

sent_1 = df['sent1'].tolist()[idx]
sent_2 = df['sent2'].tolist()[idx]
print('Raw Sentence 1 : ', sent_1)
print('Raw Sentence 2 : ', sent_2)
print('Tokenized Sentence Pair : ', [tokenizer.decode(x) for x in train_encodings['input_ids'][idx]])

Raw Sentence 1 :  طالبان زموږ د امنیت شورا خبرو ته نه دي رسېدلي
Raw Sentence 2 :  امنیت شورا: طالبانو زموږ هیلې نه دي پوره کړي
Tokenized Sentence Pair :  ['<s>', 'طالبان', 'زموږ', 'د', 'امنیت', 'شورا', 'خبرو', 'ته', 'نه', 'دي', 'رسېدل', 'ي', '</s>', '</s>', 'امنیت', 'شورا', ':', 'طالبانو', 'زموږ', 'هی', 'لې', 'نه', 'دي', 'پوره', 'کړي', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>

In [11]:
# you can chenge these values and check the difference in outputs
batch_size = 32
num_epochs = 3
learning_rate = 2e-5

# Load Model
model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=2)

# Define Arguments
training_args = TrainingArguments(
    per_device_train_batch_size=batch_size,
    num_train_epochs=num_epochs,
    logging_dir="./logs",
    logging_steps=5,
    output_dir="./output",
    evaluation_strategy="epoch",
)

# Define Optimizer
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Define Scheduler
num_training_steps = len(train_dataset) // training_args.per_device_train_batch_size * training_args.num_train_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Set Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer,
    optimizers=(optimizer, scheduler),
)

# Start Fine Tuning
trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss
1,0.2725,0.123385
2,0.3223,0.128567
3,0.1454,0.10219


TrainOutput(global_step=282, training_loss=0.3394542907675107, metrics={'train_runtime': 388.3006, 'train_samples_per_second': 23.178, 'train_steps_per_second': 0.726, 'total_flos': 1026749782440000.0, 'train_loss': 0.3394542907675107, 'epoch': 3.0})

In [13]:
# Display loss per epoch
log_history = trainer.state.log_history

# just for formatting
for entry in log_history:
    if 'learning_rate' in entry:
        entry['learning_rate'] = format(entry['learning_rate'], '.5f')

log_history

[{'loss': 0.7004, 'learning_rate': '0.00002', 'epoch': 0.05, 'step': 5},
 {'loss': 0.6758, 'learning_rate': '0.00002', 'epoch': 0.11, 'step': 10},
 {'loss': 0.7262, 'learning_rate': '0.00002', 'epoch': 0.16, 'step': 15},
 {'loss': 0.6992, 'learning_rate': '0.00002', 'epoch': 0.21, 'step': 20},
 {'loss': 0.6861, 'learning_rate': '0.00002', 'epoch': 0.27, 'step': 25},
 {'loss': 0.6865, 'learning_rate': '0.00002', 'epoch': 0.32, 'step': 30},
 {'loss': 0.6746, 'learning_rate': '0.00002', 'epoch': 0.37, 'step': 35},
 {'loss': 0.6346, 'learning_rate': '0.00002', 'epoch': 0.43, 'step': 40},
 {'loss': 0.6178, 'learning_rate': '0.00002', 'epoch': 0.48, 'step': 45},
 {'loss': 0.5541, 'learning_rate': '0.00002', 'epoch': 0.53, 'step': 50},
 {'loss': 0.4633, 'learning_rate': '0.00002', 'epoch': 0.59, 'step': 55},
 {'loss': 0.4279, 'learning_rate': '0.00002', 'epoch': 0.64, 'step': 60},
 {'loss': 0.3687, 'learning_rate': '0.00002', 'epoch': 0.69, 'step': 65},
 {'loss': 0.282, 'learning_rate': '0.00

In [14]:
## Check Result

# On trained data
prediction = trainer.predict(train_dataset)
predicted_labels = prediction.predictions.argmax(axis=1)
accuracy = (predicted_labels == train_labels).sum() / len(train_labels)
print()
print('### Train ###')
print('Predict Paraphrase : {}, Non-Paraphrase : {}'.format(sum(predicted_labels), len(predicted_labels)-sum(predicted_labels)))
print('Actual Paraphrase : {}, Non-Paraphrase : {}'.format(sum(train_labels), len(train_labels)-sum(train_labels)))
print('Accuracy : {:.4f}'.format(accuracy))
print()

# On dev data
prediction = trainer.predict(dev_dataset)
predicted_labels = prediction.predictions.argmax(axis=1)
accuracy = (predicted_labels == dev_labels).sum() / len(dev_labels)
print()
print('### Dev ###')
print('Predict Paraphrase : {}, Non-Paraphrase : {}'.format(sum(predicted_labels), len(predicted_labels)-sum(predicted_labels)))
print('Actual Paraphrase : {}, Non-Paraphrase : {}'.format(sum(dev_labels), len(dev_labels)-sum(dev_labels)))
print('Accuracy : {:.4f}'.format(accuracy))
print()

# On test data
prediction = trainer.predict(test_dataset)
predicted_labels = prediction.predictions.argmax(axis=1)
accuracy = (predicted_labels == test_labels).sum() / len(test_labels)
print()
print('### Test ###')
print('Predict Paraphrase : {}, Non-Paraphrase : {}'.format(sum(predicted_labels), len(predicted_labels)-sum(predicted_labels)))
print('Actual Paraphrase : {}, Non-Paraphrase : {}'.format(sum(test_labels), len(test_labels)-sum(test_labels)))
print('Accuracy : {:.4f}'.format(accuracy))
print()


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



### Train ###
Predict Paraphrase : 1357, Non-Paraphrase : 1643
Actual Paraphrase : 1308, Non-Paraphrase : 1692
Accuracy : 0.9457




### Dev ###
Predict Paraphrase : 975, Non-Paraphrase : 525
Actual Paraphrase : 972, Non-Paraphrase : 528
Accuracy : 0.9673




### Test ###
Predict Paraphrase : 657, Non-Paraphrase : 232
Actual Paraphrase : 662, Non-Paraphrase : 227
Accuracy : 0.9809



In [17]:
from transformers import TrainingArguments
from sklearn.metrics import classification_report  # Import the classification_report function

# Assuming you have defined the 'test_dataset', 'trainer', and 'test_preds_raw' variables
test_preds = np.argmax(test_preds_raw, axis=-1)
print(classification_report(test_labels, test_preds, digits=3))

              precision    recall  f1-score   support

           0      0.953     0.974     0.963       227
           1      0.991     0.983     0.987       662

    accuracy                          0.981       889
   macro avg      0.972     0.978     0.975       889
weighted avg      0.981     0.981     0.981       889

