In [3]:
import pandas as pd
data_train = pd.read_csv('sarcasm_tam_train.csv')
data_val = pd.read_csv('sarcasm_tam_dev.csv')
data_test = pd.read_csv('sarcasm_tam_test_without_labels.csv')

In [4]:
text_train = data_train['Text']
text_val = data_val['Text']

class_label_train = data_train['labels']
class_label_val = data_val['labels']

text_test = data_test['Text']

In [5]:
class_label_train.value_counts(), class_label_val.value_counts()

(labels
 Non-sarcastic    21740
 Sarcastic         7830
 Name: count, dtype: int64,
 labels
 Non-sarcastic    4630
 Sarcastic        1706
 Name: count, dtype: int64)

In [8]:
import numpy as np
classes_list = ["Non-sarcastic", "Sarcastic"]
label_index_train = class_label_train.apply(classes_list.index)
label_index_val = class_label_val.apply(classes_list.index)

In [9]:
import numpy as np
import ktrain
from ktrain import text




In [11]:
# MODEL_NAME = 'distilbert-base-multilingual-cased'
# MODEL_NAME = 'google-bert/bert-base-multilingual-cased'
MODEL_NAME = 'roberta-base'
t = text.Transformer(MODEL_NAME, maxlen=30, classes=classes_list)
trn = t.preprocess_train(np.array(text_train), np.array(class_label_train))
val = t.preprocess_test(np.array(text_val), np.array(class_label_val))
model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=32)

preprocessing train...
language: ta
train sequence lengths:
	mean : 10
	95percentile : 23
	99percentile : 43




Is Multi-Label? False
preprocessing test...
language: ta
test sequence lengths:
	mean : 10
	95percentile : 21
	99percentile : 41


In [12]:
learner.fit_onecycle(5e-5, 10) 



begin training using onecycle policy with max lr of 5e-05...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1357504f650>

In [13]:
learner.validate(class_names=t.get_classes()) 

               precision    recall  f1-score   support

Non-sarcastic       0.83      0.89      0.86      4630
    Sarcastic       0.63      0.52      0.57      1706

     accuracy                           0.79      6336
    macro avg       0.73      0.70      0.72      6336
 weighted avg       0.78      0.79      0.78      6336



array([[4099,  531],
       [ 811,  895]], dtype=int64)

In [14]:
learner.view_top_losses(n=1, preproc=t)

----------
id:4136 | loss:6.71 | true:Sarcastic | pred:Non-sarcastic)



In [15]:
predictor = ktrain.get_predictor(learner.model, preproc=t)

In [16]:
predictor.predict(text_test[0])

'Non-sarcastic'

In [17]:
predictor.predict_proba(text_test[0])

array([0.99867237, 0.0013277 ], dtype=float32)

In [18]:
predictor.get_classes()

['Non-sarcastic', 'Sarcastic']

In [19]:
predictor.explain(text_test[0])

Contribution?,Feature
8.06,Highlighted in text (sum)
0.707,<BIAS>


In [20]:
text_test[1]

'விழுப்புரம்  வன்னிய கவுண்டர் சார்பாக வாழ்த்துக்கள்'

In [21]:
predictor.save('Roberta_TAMIL_SARCASM_DETECTION')

In [22]:
reloaded_predictor = ktrain.load_predictor('Roberta_TAMIL_SARCASM_DETECTION')

In [23]:
predict = reloaded_predictor.predict(text_test.values)

In [24]:
data_test['label'] = predict

In [25]:
new_data_test = data_test[['ID','label']]

In [26]:
import csv
new_data_test.to_csv("Tamil_roberta_sarcasm_detection.tsv", sep="\t", quoting=csv.QUOTE_NONE)

In [27]:
check_submission = pd.read_csv("Tamil_roberta_sarcasm_detection.tsv", sep="\t")

In [28]:
check_submission

Unnamed: 0.1,Unnamed: 0,ID,label
0,0,Id_01,Non-sarcastic
1,1,Id_02,Non-sarcastic
2,2,Id_03,Non-sarcastic
3,3,Id_04,Non-sarcastic
4,4,Id_05,Sarcastic
...,...,...,...
6333,6333,Id_6334,Non-sarcastic
6334,6334,Id_6335,Sarcastic
6335,6335,Id_6336,Non-sarcastic
6336,6336,Id_6337,Non-sarcastic


In [29]:
check_submission['label'].value_counts()

label
Non-sarcastic    4827
Sarcastic        1511
Name: count, dtype: int64

In [30]:
model.summary()

Model: "tf_roberta_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 roberta (TFRobertaMainLaye  multiple                  124055040 
 r)                                                              
                                                                 
 classifier (TFRobertaClass  multiple                  592130    
 ificationHead)                                                  
                                                                 
Total params: 124647170 (475.49 MB)
Trainable params: 124647170 (475.49 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
