In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from tqdm import tqdm
import os
from sklearn.metrics import classification_report, f1_score
from torch.utils.data import Dataset
import collections
import emoji
import re
from emot.emo_unicode import EMOTICONS_EMO
import nltk
nltk.download('words')
import nltk, string, re, spacy,unicodedata, random
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import ToktokTokenizer

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def deEmojify(string):
    emoji_pattern = re.compile("["
                              u"\U0001F600-\U0001F64F"  # emoticons
                              u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                              u"\U0001F680-\U0001F6FF"  # transport & map symbols
                              u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                              u"\U00002500-\U00002BEF"  # chinese char
                              u"\U00002702-\U000027B0"
                              u"\U00002702-\U000027B0"
                              u"\U000024C2-\U0001F251"
                              u"\U0001f926-\U0001f937"
                              u"\U00010000-\U0010ffff"
                              u"\u2640-\u2642"
                              u"\u2600-\u2B55"
                              u"\u200d"
                              u"\u23cf"
                              u"\u23e9"
                              u"\u231a"
                              u"\ufe0f"  # dingbats
                              u"\u3030"
                              "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)
        
def preprocess(text):
    text = str(text)
    text = deEmojify(text) #remove emojis
    text = re.sub(r'([\.\'\"\/\-\_\--])',' ', text) # remove punctuations , removes @USER / some abbreviatins
    to_remove_url = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
      '[!*,]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    text = re.sub(to_remove_url,'',text)  # remove url patterns
    text = re.sub(" \d+", " ", text)
    text = text.replace(","," ")
    text = re.sub(r'(?:^| )\w(?:$| )', ' ', text).strip()
    punctuation='!!"$%&()*+-/:;<=>?[\]^_{|}~.'
    text = ''.join(ch for ch in text if ch not in set(punctuation))
    
    tokenizer = ToktokTokenizer()
    # convert sentence into token of words
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    text = ' '.join(ch for ch in tokens)
    return text 

def clean(df):
    df['text'] = df['text'].apply(lambda x: preprocess(x))


In [2]:
train = pd.read_csv('tamil_train.csv')
val = pd.read_csv('tamil_dev.csv')
train.drop(['Unnamed: 0'], axis=1, inplace=True)
val.drop(['Unnamed: 0'], axis=1, inplace=True)
train.dropna()
train['label'] = pd.Categorical(train.label)
val.dropna()
val['label'] = pd.Categorical(val.label)
train['text'].apply(str)
clean(train)
clean(val)
train

Unnamed: 0,text,label
0,மோவி வேற லேவில் லா எரிகா பொகுது,Not_offensive
1,லோவ் அஜித் குமார் விவேகம் மோவி இங்கி மேஜி பட் ...,not-Tamil
2,படம் நல்ல காமெடி படாம இருகும் போலை,Not_offensive
3,கார்த்திக் சுப்பராஜ் அன்னி இந்த படம் வெற்றி அட...,Not_offensive
4,கவுண்டர் தேவர் சார்பாக வெற்றி பெற வாழ்த்துக்கள்,Not_offensive
...,...,...
35134,டிரெண்டிங் நம்பர் #2 இதுக்கு நம்மலாம் காரணம்னு...,Not_offensive
35135,மோவி ஸ்கிரிப்ட் சூப்பர் அதுவும் ஹிப் ஹாப் தமிழ...,Not_offensive
35136,ஜஸ்ட் லிக்ஸ் போர் லிக்ஸ்,Not_offensive
35137,ஆலோ லே லோ கண்டா லே லோ,not-Tamil


In [3]:
class tamil_Offensive_Dataset(Dataset):
    def __init__(self, encodings, labels, bpe = False):
        self.encodings = encodings
        self.labels = labels
        self.is_bpe_tokenized = bpe

    def __getitem__(self, idx):
        if not self.is_bpe_tokenized:
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        else:
            item = {
                'input_ids': torch.LongTensor(self.encodings[idx].ids),
                'attention_mask': torch.LongTensor(self.encodings[idx].attention_mask)
            }
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [4]:
from transformers import BertTokenizer, BertForSequenceClassification
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=3)
model_name = 'Mbert_base_cased_tamil_weighted'

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

In [5]:
 # Optimiser
from transformers import AdamW
optimizer = AdamW(model.parameters(), lr=1e-5)



In [6]:
label_mapping = {
        'Not_offensive': 0, 
        'not-Tamil': 1, 
        'Offensive': 2, 
    }

In [9]:
# Collecting Text and Labels
train_batch_sentences = list(str(train['text']))
train_batch_labels =  [label_mapping[x] for x in train['label']]
dev_batch_sentences = list(val['text'])
dev_batch_labels =  [label_mapping[x] for x in val['label']]

In [10]:
# Convert to Tensor
if 'parameters' in tokenizer.__dict__.keys() and tokenizer.__dict__['_parameters']['model'] == 'ByteLevelBPE':
    train_encodings = tokenizer.encode_batch(train_batch_sentences)
    dev_encodings = tokenizer.encode_batch(dev_batch_sentences)
else:
    train_encodings = tokenizer(train_batch_sentences, padding='max_length', truncation=True, max_length=64, return_tensors="pt")
    dev_encodings = tokenizer(dev_batch_sentences, padding='max_length', truncation=True, max_length=64, return_tensors="pt")

train_labels = torch.tensor(train_batch_labels)
dev_labels = torch.tensor(dev_batch_labels)

In [11]:
# Defining Datasets
train_dataset = tamil_Offensive_Dataset(train_encodings, train_labels, bpe = False)
dev_dataset = tamil_Offensive_Dataset(dev_encodings, dev_labels, bpe = False)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
best_val_f1 = 0
count = 0

In [31]:
# Alternate Loss Fn
# Weighted Manual Loss Function
from sklearn.utils import class_weight
import torch.nn as nn

weights = class_weight.compute_class_weight(class_weight = 'balanced',classes = np.unique(train_batch_labels),y = train_batch_labels)
weights = np.exp(weights)/np.sum(np.exp(weights))
class_weights = torch.FloatTensor(weights).to(device)
loss_function = nn.CrossEntropyLoss(weight=class_weights, reduction='mean')

In [34]:
# Dataloaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=16, shuffle=False)
loss_weighted = True

In [33]:
for epoch in range(10):
    train_preds = []
    train_labels = []
    total_train_loss = 0
    model.train()
    print("==========================================================")
    print("Epoch {}".format(epoch))
    print("Train")
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        if loss_weighted:
            loss = loss_function(outputs[1], labels)
        else:
            loss = outputs[0]
        loss.backward()
        optimizer.step()

        for logits in outputs[1].detach().cpu().numpy():
            train_preds.append(np.argmax(logits))
        for logits in labels.cpu().numpy():
            train_labels.append(logits)
        total_train_loss += loss.item()/len(train_loader)

    print("Dev")
    dev_preds = []
    model.eval()
    total_val_loss = 0
    with torch.set_grad_enabled(False):
        for batch in tqdm(dev_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            if loss_weighted:
                loss = loss_function(outputs[1], labels)
            else:
                loss = outputs[0]
            total_val_loss += loss.item()/len(dev_loader)

            for logits in outputs[1].cpu().numpy():
                dev_preds.append(np.argmax(logits))

    y_true = dev_batch_labels
    y_pred = dev_preds
    target_names = label_mapping.keys()
    train_report = classification_report(train_labels, train_preds, target_names=target_names)
    report = classification_report(y_true, y_pred, target_names=target_names)
    val_f1 = f1_score(y_true, y_pred, average='macro')

    if val_f1 > best_val_f1:
        PATH = 'finetuned_models/' + model_name + '.pth'
        torch.save(model.state_dict(), PATH)
        model.save_pretrained(os.path.join('finetuned_berts/', model_name))
        best_val_f1 = val_f1
        count = 0
    else:
        count += 1

    print(train_report)
    print(report)
    print("Epoch {}, Train Loss = {}, Val Loss = {}, Val F1 = {}, Best Val f1 = {}, stagnant = {}".format(epoch, total_train_loss, total_val_loss, val_f1, best_val_f1, count))
    if count == 5:
        print("No increase for 5 epochs, Stopping ...")
        break

Epoch 0
Train


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
100%|██████████| 2197/2197 [3:39:30<00:00,  5.99s/it]    


Dev


100%|██████████| 275/275 [05:04<00:00,  1.11s/it]


               precision    recall  f1-score   support

Not_offensive       0.84      0.51      0.64     25425
    not-Tamil       0.12      0.86      0.21      1454
    Offensive       0.42      0.47      0.44      8260

     accuracy                           0.52     35139
    macro avg       0.46      0.61      0.43     35139
 weighted avg       0.71      0.52      0.58     35139

               precision    recall  f1-score   support

Not_offensive       0.91      0.63      0.75      3193
    not-Tamil       0.41      0.94      0.57       172
    Offensive       0.43      0.75      0.55      1023

     accuracy                           0.67      4388
    macro avg       0.58      0.77      0.62      4388
 weighted avg       0.78      0.67      0.69      4388

Epoch 0, Train Loss = 0.7306731233960809, Val Loss = 0.5184727655486623, Val F1 = 0.6221883514508227, Best Val f1 = 0.6221883514508227, stagnant = 0
Epoch 1
Train


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
100%|██████████| 2197/2197 [2:57:13<00:00,  4.84s/it]  


Dev


100%|██████████| 275/275 [05:23<00:00,  1.18s/it]


               precision    recall  f1-score   support

Not_offensive       0.90      0.66      0.76     25425
    not-Tamil       0.27      0.91      0.42      1454
    Offensive       0.49      0.68      0.57      8260

     accuracy                           0.67     35139
    macro avg       0.55      0.75      0.58     35139
 weighted avg       0.77      0.67      0.70     35139

               precision    recall  f1-score   support

Not_offensive       0.90      0.74      0.81      3193
    not-Tamil       0.57      0.87      0.69       172
    Offensive       0.50      0.74      0.60      1023

     accuracy                           0.74      4388
    macro avg       0.66      0.78      0.70      4388
 weighted avg       0.80      0.74      0.76      4388

Epoch 1, Train Loss = 0.5004196689113577, Val Loss = 0.4856082540699705, Val F1 = 0.6994401585597435, Best Val f1 = 0.6994401585597435, stagnant = 0
Epoch 2
Train


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
100%|██████████| 2197/2197 [3:03:13<00:00,  5.00s/it]  


Dev


100%|██████████| 275/275 [05:26<00:00,  1.19s/it]


               precision    recall  f1-score   support

Not_offensive       0.90      0.72      0.80     25425
    not-Tamil       0.37      0.95      0.53      1454
    Offensive       0.52      0.70      0.60      8260

     accuracy                           0.73     35139
    macro avg       0.60      0.79      0.64     35139
 weighted avg       0.79      0.73      0.74     35139

               precision    recall  f1-score   support

Not_offensive       0.90      0.78      0.84      3193
    not-Tamil       0.56      0.91      0.69       172
    Offensive       0.55      0.72      0.62      1023

     accuracy                           0.77      4388
    macro avg       0.67      0.80      0.72      4388
 weighted avg       0.80      0.77      0.78      4388

Epoch 2, Train Loss = 0.41567704193043864, Val Loss = 0.45846494287760414, Val F1 = 0.7159201609499668, Best Val f1 = 0.7159201609499668, stagnant = 0
Epoch 3
Train


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
100%|██████████| 2197/2197 [3:07:46<00:00,  5.13s/it]  


Dev


100%|██████████| 275/275 [05:39<00:00,  1.23s/it]


               precision    recall  f1-score   support

Not_offensive       0.91      0.75      0.82     25425
    not-Tamil       0.47      0.96      0.63      1454
    Offensive       0.54      0.75      0.63      8260

     accuracy                           0.76     35139
    macro avg       0.64      0.82      0.69     35139
 weighted avg       0.81      0.76      0.77     35139

               precision    recall  f1-score   support

Not_offensive       0.90      0.79      0.84      3193
    not-Tamil       0.43      0.94      0.59       172
    Offensive       0.58      0.68      0.63      1023

     accuracy                           0.77      4388
    macro avg       0.64      0.80      0.69      4388
 weighted avg       0.81      0.77      0.78      4388

Epoch 3, Train Loss = 0.3602962276347671, Val Loss = 0.42805837181650797, Val F1 = 0.6853552587950832, Best Val f1 = 0.7159201609499668, stagnant = 1
Epoch 4
Train


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
100%|██████████| 2197/2197 [3:09:27<00:00,  5.17s/it]  


Dev


100%|██████████| 275/275 [05:54<00:00,  1.29s/it]


               precision    recall  f1-score   support

Not_offensive       0.92      0.76      0.83     25425
    not-Tamil       0.51      0.97      0.67      1454
    Offensive       0.56      0.76      0.65      8260

     accuracy                           0.77     35139
    macro avg       0.66      0.83      0.72     35139
 weighted avg       0.82      0.77      0.78     35139

               precision    recall  f1-score   support

Not_offensive       0.90      0.81      0.85      3193
    not-Tamil       0.65      0.85      0.74       172
    Offensive       0.56      0.71      0.63      1023

     accuracy                           0.78      4388
    macro avg       0.70      0.79      0.74      4388
 weighted avg       0.81      0.78      0.79      4388

Epoch 4, Train Loss = 0.3201461789092453, Val Loss = 0.530585306044668, Val F1 = 0.7378064799537162, Best Val f1 = 0.7378064799537162, stagnant = 0
Epoch 5
Train


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
100%|██████████| 2197/2197 [3:07:20<00:00,  5.12s/it]  


Dev


100%|██████████| 275/275 [05:43<00:00,  1.25s/it]


               precision    recall  f1-score   support

Not_offensive       0.92      0.77      0.84     25425
    not-Tamil       0.57      0.98      0.72      1454
    Offensive       0.57      0.78      0.65      8260

     accuracy                           0.78     35139
    macro avg       0.69      0.84      0.74     35139
 weighted avg       0.82      0.78      0.79     35139

               precision    recall  f1-score   support

Not_offensive       0.93      0.75      0.83      3193
    not-Tamil       0.47      0.95      0.63       172
    Offensive       0.54      0.78      0.64      1023

     accuracy                           0.76      4388
    macro avg       0.65      0.82      0.70      4388
 weighted avg       0.82      0.76      0.78      4388

Epoch 5, Train Loss = 0.3170101687965574, Val Loss = 0.43074587411585863, Val F1 = 0.7000060288081768, Best Val f1 = 0.7378064799537162, stagnant = 1
Epoch 6
Train


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
100%|██████████| 2197/2197 [3:04:52<00:00,  5.05s/it]  


Dev


100%|██████████| 275/275 [05:21<00:00,  1.17s/it]


               precision    recall  f1-score   support

Not_offensive       0.93      0.79      0.85     25425
    not-Tamil       0.59      0.98      0.74      1454
    Offensive       0.59      0.79      0.67      8260

     accuracy                           0.80     35139
    macro avg       0.70      0.85      0.75     35139
 weighted avg       0.83      0.80      0.81     35139

               precision    recall  f1-score   support

Not_offensive       0.89      0.84      0.86      3193
    not-Tamil       0.59      0.88      0.71       172
    Offensive       0.61      0.67      0.64      1023

     accuracy                           0.80      4388
    macro avg       0.70      0.80      0.74      4388
 weighted avg       0.81      0.80      0.81      4388

Epoch 6, Train Loss = 0.28386176458363216, Val Loss = 0.5356363963733684, Val F1 = 0.7369041674057678, Best Val f1 = 0.7378064799537162, stagnant = 2
Epoch 7
Train


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
100%|██████████| 2197/2197 [2:57:22<00:00,  4.84s/it]  


Dev


100%|██████████| 275/275 [05:15<00:00,  1.15s/it]


               precision    recall  f1-score   support

Not_offensive       0.94      0.79      0.86     25425
    not-Tamil       0.62      0.99      0.76      1454
    Offensive       0.60      0.82      0.69      8260

     accuracy                           0.81     35139
    macro avg       0.72      0.87      0.77     35139
 weighted avg       0.84      0.81      0.81     35139

               precision    recall  f1-score   support

Not_offensive       0.89      0.87      0.88      3193
    not-Tamil       0.73      0.82      0.77       172
    Offensive       0.63      0.66      0.64      1023

     accuracy                           0.82      4388
    macro avg       0.75      0.78      0.76      4388
 weighted avg       0.82      0.82      0.82      4388

Epoch 7, Train Loss = 0.26751943477601087, Val Loss = 0.6131653035703026, Val F1 = 0.7630105453228063, Best Val f1 = 0.7630105453228063, stagnant = 0
Epoch 8
Train


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
100%|██████████| 2197/2197 [3:00:37<00:00,  4.93s/it]  


Dev


100%|██████████| 275/275 [05:21<00:00,  1.17s/it]


               precision    recall  f1-score   support

Not_offensive       0.94      0.81      0.87     25425
    not-Tamil       0.63      0.99      0.77      1454
    Offensive       0.62      0.81      0.70      8260

     accuracy                           0.82     35139
    macro avg       0.73      0.87      0.78     35139
 weighted avg       0.85      0.82      0.82     35139

               precision    recall  f1-score   support

Not_offensive       0.91      0.79      0.85      3193
    not-Tamil       0.71      0.86      0.78       172
    Offensive       0.55      0.76      0.64      1023

     accuracy                           0.79      4388
    macro avg       0.73      0.80      0.75      4388
 weighted avg       0.82      0.79      0.80      4388

Epoch 8, Train Loss = 0.24767054148320616, Val Loss = 0.5634147431489757, Val F1 = 0.7548162286079766, Best Val f1 = 0.7630105453228063, stagnant = 1
Epoch 9
Train


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
100%|██████████| 2197/2197 [2:59:44<00:00,  4.91s/it]  


Dev


100%|██████████| 275/275 [05:14<00:00,  1.14s/it]

               precision    recall  f1-score   support

Not_offensive       0.94      0.81      0.87     25425
    not-Tamil       0.68      0.99      0.81      1454
    Offensive       0.62      0.84      0.71      8260

     accuracy                           0.83     35139
    macro avg       0.75      0.88      0.80     35139
 weighted avg       0.86      0.83      0.83     35139

               precision    recall  f1-score   support

Not_offensive       0.91      0.81      0.86      3193
    not-Tamil       0.70      0.88      0.78       172
    Offensive       0.58      0.75      0.65      1023

     accuracy                           0.80      4388
    macro avg       0.73      0.81      0.76      4388
 weighted avg       0.82      0.80      0.81      4388

Epoch 9, Train Loss = 0.23409138983945316, Val Loss = 0.6157536917196762, Val F1 = 0.7626767301208378, Best Val f1 = 0.7630105453228063, stagnant = 2





In [3]:
from transformers import BertTokenizer, BertForSequenceClassification
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=3)
model.load_state_dict(torch.load('finetuned_models/Mbert_base_cased_tamil_weighted.pth'))

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

<All keys matched successfully>

In [4]:
class test_tamil_Offensive_Dataset(Dataset):
        def __init__(self, encodings):
            self.encodings = encodings

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(1) # Label is irrelevant, since we need predictions
            return item

        def __len__(self):
            return len(self.encodings["input_ids"])

In [5]:
model.eval()
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
test_sentence = "இந்த மாதிரி கேடுகெட்ட ஜென்மங்கள் எப்படி நீதியை நிலை நாட்டும்"
test_encodings = tokenizer(test_sentence, padding='max_length', truncation=True, max_length=64, return_tensors="pt")
test = test_tamil_Offensive_Dataset(test_encodings)
test_loader = DataLoader(test, batch_size=1, shuffle=False)

In [8]:
test_preds = []
with torch.set_grad_enabled(False):
        for batch in tqdm(test_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            
            for logits in outputs[1].cpu().numpy():
                test_preds.append(np.argmax(logits))

print(list(label_mapping.keys())[test_preds[0]])

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  9.45it/s]

Offensive



