In [1]:
import pandas as pd
data_train = pd.read_csv('sarcasm_tam_train.csv')
data_val = pd.read_csv('sarcasm_tam_dev.csv')
data_test = pd.read_csv('sarcasm_tam_test_without_labels.csv')

In [2]:
text_train = data_train['Text']
text_val = data_val['Text']

class_label_train = data_train['labels']
class_label_val = data_val['labels']

text_test = data_test['Text']

In [3]:
class_label_train.value_counts(), class_label_val.value_counts()

(labels
 Non-sarcastic    21740
 Sarcastic         7830
 Name: count, dtype: int64,
 labels
 Non-sarcastic    4630
 Sarcastic        1706
 Name: count, dtype: int64)

In [4]:
import numpy as np
classes_list = ["Non-sarcastic", "Sarcastic"]
label_index_train = class_label_train.apply(classes_list.index)
label_index_val = class_label_val.apply(classes_list.index)

In [5]:
import numpy as np
import ktrain
from ktrain import text




In [6]:
MODEL_NAME = 'distilbert-base-multilingual-cased'
#MODEL_NAME = 'google-bert/bert-base-multilingual-cased'
t = text.Transformer(MODEL_NAME, maxlen=30, classes=classes_list)
trn = t.preprocess_train(np.array(text_train), np.array(class_label_train))
val = t.preprocess_test(np.array(text_val), np.array(class_label_val))
model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=32)



preprocessing train...
language: id
train sequence lengths:
	mean : 10
	95percentile : 23
	99percentile : 43




Is Multi-Label? False
preprocessing test...
language: id
test sequence lengths:
	mean : 10
	95percentile : 21
	99percentile : 41


In [7]:
learner.fit_onecycle(5e-5, 10) 



begin training using onecycle policy with max lr of 5e-05...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1f18106ef10>

In [8]:
learner.validate(class_names=t.get_classes()) 

               precision    recall  f1-score   support

Non-sarcastic       0.84      0.89      0.86      4630
    Sarcastic       0.64      0.54      0.58      1706

     accuracy                           0.79      6336
    macro avg       0.74      0.71      0.72      6336
 weighted avg       0.79      0.79      0.79      6336



array([[4116,  514],
       [ 790,  916]], dtype=int64)

In [9]:
learner.view_top_losses(n=1, preproc=t)

----------
id:3023 | loss:8.78 | true:Sarcastic | pred:Non-sarcastic)



In [10]:
predictor = ktrain.get_predictor(learner.model, preproc=t)

In [11]:
predictor.predict(text_test[0])

'Non-sarcastic'

In [12]:
predictor.predict_proba(text_test[0])

array([9.9974340e-01, 2.5654837e-04], dtype=float32)

In [13]:
predictor.get_classes()

['Non-sarcastic', 'Sarcastic']

In [14]:
predictor.explain(text_test[0])

Contribution?,Feature
12.58,Highlighted in text (sum)
0.276,<BIAS>


In [15]:
text_test[1]

'விழுப்புரம்  வன்னிய கவுண்டர் சார்பாக வாழ்த்துக்கள்'

In [16]:
predictor.save('DistilBert_TAMIL_SARCASM_DETECTION')

In [17]:
reloaded_predictor = ktrain.load_predictor('DistilBert_TAMIL_SARCASM_DETECTION')

In [18]:
predict = reloaded_predictor.predict(text_test.values)

In [19]:
data_test['label'] = predict

In [20]:
new_data_test = data_test[['ID','label']]

In [21]:
import csv
new_data_test.to_csv("Tamil_distilbert_sarcasm_detection.tsv", sep="\t", quoting=csv.QUOTE_NONE)

In [22]:
check_submission = pd.read_csv("Tamil_distilbert_sarcasm_detection.tsv", sep="\t")

In [23]:
check_submission

Unnamed: 0.1,Unnamed: 0,ID,label
0,0,Id_01,Non-sarcastic
1,1,Id_02,Non-sarcastic
2,2,Id_03,Non-sarcastic
3,3,Id_04,Non-sarcastic
4,4,Id_05,Non-sarcastic
...,...,...,...
6333,6333,Id_6334,Non-sarcastic
6334,6334,Id_6335,Sarcastic
6335,6335,Id_6336,Non-sarcastic
6336,6336,Id_6337,Non-sarcastic


In [24]:
check_submission['label'].value_counts()

label
Non-sarcastic    4853
Sarcastic        1485
Name: count, dtype: int64

In [25]:
model.summary()

Model: "tf_distil_bert_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMa  multiple                  134734080 
 inLayer)                                                        
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
 dropout_39 (Dropout)        multiple                  0         
                                                                 
Total params: 135326210 (516.23 MB)
Trainable params: 135326210 (516.23 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
