In [30]:
import pandas as pd
data_train = pd.read_csv('sarcasm_mal_train.csv')
data_val = pd.read_csv('sarcasm_mal_dev.csv')
data_test = pd.read_csv('sarcasm_mal_test_without_labels.csv')

In [31]:
text_train = data_train['Text']
text_val = data_val['Text']

class_label_train = data_train['labels']
class_label_val = data_val['labels']

text_test = data_test['Text']

In [32]:
class_label_train.value_counts(), class_label_val.value_counts()

(labels
 Non-sarcastic    10689
 Sarcastic         2499
 Name: count, dtype: int64,
 labels
 Non-sarcastic    2305
 Sarcastic         521
 Name: count, dtype: int64)

In [33]:
import numpy as np
classes_list = ["Non-sarcastic", "Sarcastic"]
label_index_train = class_label_train.apply(classes_list.index)
label_index_val = class_label_val.apply(classes_list.index)

In [34]:
import numpy as np
import ktrain
from ktrain import text

In [35]:
MODEL_NAME = 'distilbert-base-multilingual-cased'
t = text.Transformer(MODEL_NAME, maxlen=30, classes=classes_list)
trn = t.preprocess_train(np.array(text_train), np.array(class_label_train))
val = t.preprocess_test(np.array(text_val), np.array(class_label_val))
model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=32)



preprocessing train...
language: ml
train sequence lengths:
	mean : 10
	95percentile : 21
	99percentile : 47




Is Multi-Label? False
preprocessing test...
language: ml
test sequence lengths:
	mean : 10
	95percentile : 21
	99percentile : 45


In [36]:
learner.fit_onecycle(5e-5, 12) 



begin training using onecycle policy with max lr of 5e-05...


Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.src.callbacks.History at 0x26de208f590>

In [37]:
learner.validate(class_names=t.get_classes()) 

               precision    recall  f1-score   support

Non-sarcastic       0.88      0.94      0.91      2305
    Sarcastic       0.62      0.41      0.50       521

     accuracy                           0.85      2826
    macro avg       0.75      0.68      0.70      2826
 weighted avg       0.83      0.85      0.83      2826



array([[2173,  132],
       [ 306,  215]], dtype=int64)

In [38]:
learner.view_top_losses(n=1, preproc=t)

----------
id:1257 | loss:10.02 | true:Sarcastic | pred:Non-sarcastic)



In [39]:
predictor = ktrain.get_predictor(learner.model, preproc=t)

In [40]:
predictor.predict(text_test[0])

'Non-sarcastic'

In [41]:
predictor.predict_proba(text_test[0])

array([9.9995136e-01, 4.8589402e-05], dtype=float32)

In [42]:
predictor.get_classes()

['Non-sarcastic', 'Sarcastic']

In [43]:
predictor.explain(text_test[0])

Contribution?,Feature
12.784,Highlighted in text (sum)
0.519,<BIAS>


In [44]:
text_test[1]

'ഗീതു മോഹൻദാസ് മലയാള സിനിമക്കു നൽകുന്ന വമ്പൻ ഗിഫ്റ്റ് തന്നെ ആവും മൂത്തോന്'

In [45]:
predictor.save('DistilBert_Malayalam_SARCASM_DETECTION')

In [46]:
reloaded_predictor = ktrain.load_predictor('DistilBert_Malayalam_SARCASM_DETECTION')

In [47]:
predict = reloaded_predictor.predict(text_test.values)

In [48]:
data_test['label'] = predict

In [49]:
new_data_test = data_test[['ID','label']]

In [50]:
import csv
new_data_test.to_csv("Malayalam_distilbert_sarcasm_detection.tsv", sep="\t", quoting=csv.QUOTE_NONE)

In [51]:
check_submission = pd.read_csv("Malayalam_distilbert_sarcasm_detection.tsv", sep="\t")

In [52]:
check_submission

Unnamed: 0.1,Unnamed: 0,ID,label
0,0,Id_01,Non-sarcastic
1,1,Id_02,Sarcastic
2,2,Id_03,Non-sarcastic
3,3,Id_04,Non-sarcastic
4,4,Id_05,Non-sarcastic
...,...,...,...
2821,2821,Id_2822,Non-sarcastic
2822,2822,Id_2823,Non-sarcastic
2823,2823,Id_2824,Non-sarcastic
2824,2824,Id_2825,Non-sarcastic


In [53]:
check_submission['label'].value_counts()

label
Non-sarcastic    2443
Sarcastic         383
Name: count, dtype: int64

In [54]:
model.summary()

Model: "tf_distil_bert_for_sequence_classification_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMa  multiple                  134734080 
 inLayer)                                                        
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
 dropout_99 (Dropout)        multiple                  0         
                                                                 
Total params: 135326210 (516.23 MB)
Trainable params: 135326210 (516.23 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
