In [1]:
import pandas as pd
data_train = pd.read_csv('sarcasm_mal_train.csv')
data_val = pd.read_csv('sarcasm_mal_dev.csv')
data_test = pd.read_csv('sarcasm_mal_test_without_labels.csv')

In [2]:
text_train = data_train['Text']
text_val = data_val['Text']

class_label_train = data_train['labels']
class_label_val = data_val['labels']

text_test = data_test['Text']

In [3]:
class_label_train.value_counts(), class_label_val.value_counts()

(labels
 Non-sarcastic    10689
 Sarcastic         2499
 Name: count, dtype: int64,
 labels
 Non-sarcastic    2305
 Sarcastic         521
 Name: count, dtype: int64)

In [4]:
import numpy as np
classes_list = ["Non-sarcastic", "Sarcastic"]
label_index_train = class_label_train.apply(classes_list.index)
label_index_val = class_label_val.apply(classes_list.index)

In [5]:
import numpy as np
import ktrain
from ktrain import text




In [6]:
# MODEL_NAME = 'distilbert-base-multilingual-cased'
MODEL_NAME = 'google-bert/bert-base-multilingual-cased'
t = text.Transformer(MODEL_NAME, maxlen=30, classes=classes_list)
trn = t.preprocess_train(np.array(text_train), np.array(class_label_train))
val = t.preprocess_test(np.array(text_val), np.array(class_label_val))
model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=32)



preprocessing train...
language: ml
train sequence lengths:
	mean : 10
	95percentile : 21
	99percentile : 47




Is Multi-Label? False
preprocessing test...
language: ml
test sequence lengths:
	mean : 10
	95percentile : 21
	99percentile : 45


In [7]:
learner.fit_onecycle(5e-5, 10) 



begin training using onecycle policy with max lr of 5e-05...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x219db8858d0>

In [8]:
learner.validate(class_names=t.get_classes()) 

               precision    recall  f1-score   support

Non-sarcastic       0.88      0.93      0.91      2305
    Sarcastic       0.60      0.44      0.51       521

     accuracy                           0.84      2826
    macro avg       0.74      0.69      0.71      2826
 weighted avg       0.83      0.84      0.83      2826



array([[2152,  153],
       [ 291,  230]], dtype=int64)

In [9]:
learner.view_top_losses(n=1, preproc=t)

----------
id:844 | loss:6.71 | true:Sarcastic | pred:Non-sarcastic)



In [10]:
predictor = ktrain.get_predictor(learner.model, preproc=t)

In [11]:
predictor.predict(text_test[0])

'Non-sarcastic'

In [12]:
predictor.predict_proba(text_test[0])

array([0.9987226 , 0.00127735], dtype=float32)

In [13]:
predictor.get_classes()

['Non-sarcastic', 'Sarcastic']

In [14]:
predictor.explain(text_test[0])

Contribution?,Feature
10.923,Highlighted in text (sum)
0.587,<BIAS>


In [15]:
text_test[1]

'ഗീതു മോഹൻദാസ് മലയാള സിനിമക്കു നൽകുന്ന വമ്പൻ ഗിഫ്റ്റ് തന്നെ ആവും മൂത്തോന്'

In [16]:
predictor.save('GoogleBert_Malayalam_SARCASM_DETECTION')

In [17]:
reloaded_predictor = ktrain.load_predictor('GoogleBert_Malayalam_SARCASM_DETECTION')

In [18]:
predict = reloaded_predictor.predict(text_test.values)

In [19]:
data_test['label'] = predict

In [20]:
new_data_test = data_test[['ID','label']]

In [21]:
import csv
new_data_test.to_csv("Malayalam_Googlebert_sarcasm_detection.tsv", sep="\t", quoting=csv.QUOTE_NONE)

In [22]:
check_submission = pd.read_csv("Malayalam_Googlebert_sarcasm_detection.tsv", sep="\t")

In [23]:
check_submission

Unnamed: 0.1,Unnamed: 0,ID,label
0,0,Id_01,Non-sarcastic
1,1,Id_02,Non-sarcastic
2,2,Id_03,Non-sarcastic
3,3,Id_04,Non-sarcastic
4,4,Id_05,Non-sarcastic
...,...,...,...
2821,2821,Id_2822,Non-sarcastic
2822,2822,Id_2823,Non-sarcastic
2823,2823,Id_2824,Non-sarcastic
2824,2824,Id_2825,Non-sarcastic


In [24]:
check_submission['label'].value_counts()

label
Non-sarcastic    2419
Sarcastic         407
Name: count, dtype: int64

In [25]:
model.summary()

Model: "tf_bert_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  177853440 
                                                                 
 dropout_75 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 177854978 (678.46 MB)
Trainable params: 177854978 (678.46 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
