In [1]:
import pandas as pd
import numpy as np
import ktrain
from ktrain import text

data_train = pd.read_csv('Subtask C/trainC.csv')
data_val = pd.read_csv('Subtask C/valC.csv')
data_test = pd.read_csv('Subtask C/SubTask-C-(index,tweet)test.csv')




In [2]:
train_text = data_train['tweet']
val_text = data_val['tweet']
test_text = data_test['tweet']

class_label_train = data_train['label']
class_label_val = data_val['label']

In [3]:
class_label_train.value_counts(), class_label_val.value_counts()

(label
 individual      1074
 organization     856
 community        284
 Name: count, dtype: int64,
 label
 individual      230
 organization    183
 community        61
 Name: count, dtype: int64)

In [4]:
classes_list = ["individual", "organization", "community"]
label_index_train = class_label_train.apply(classes_list.index)
label_index_val = class_label_val.apply(classes_list.index)

In [19]:
# MODEL_NAME = 'google-bert/bert-base-multilingual-cased'   
MODEL_NAME = 'distilbert-base-multilingual-cased'
t = text.Transformer(MODEL_NAME, maxlen=30, classes=classes_list)
trn = t.preprocess_train(np.array(train_text), np.array(class_label_train))
test = t.preprocess_test(np.array(val_text), np.array(class_label_val)) 
model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, val_data=test, batch_size=32)



preprocessing train...
language: ne
train sequence lengths:
	mean : 26
	95percentile : 47
	99percentile : 52




Is Multi-Label? False
preprocessing test...
language: ne
test sequence lengths:
	mean : 26
	95percentile : 47
	99percentile : 53


In [20]:
from tensorflow.keras.callbacks import ModelCheckpoint
filepath = "Subtask-C distilbert"
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

In [21]:
learner.fit_onecycle(5e-5, 12, verbose=2, callbacks=callbacks_list)



begin training using onecycle policy with max lr of 5e-05...
Epoch 1/12

Epoch 1: val_accuracy improved from -inf to 0.56751, saving model to Subtask-C distilbert
70/70 - 644s - loss: 0.9870 - accuracy: 0.4955 - val_loss: 0.9299 - val_accuracy: 0.5675 - 644s/epoch - 9s/step
Epoch 2/12

Epoch 2: val_accuracy improved from 0.56751 to 0.63291, saving model to Subtask-C distilbert
70/70 - 469s - loss: 0.8761 - accuracy: 0.6174 - val_loss: 0.8461 - val_accuracy: 0.6329 - 469s/epoch - 7s/step
Epoch 3/12

Epoch 3: val_accuracy did not improve from 0.63291
70/70 - 359s - loss: 0.7675 - accuracy: 0.6743 - val_loss: 0.8933 - val_accuracy: 0.6160 - 359s/epoch - 5s/step
Epoch 4/12

Epoch 4: val_accuracy improved from 0.63291 to 0.63713, saving model to Subtask-C distilbert
70/70 - 602s - loss: 0.6199 - accuracy: 0.7561 - val_loss: 0.9104 - val_accuracy: 0.6371 - 602s/epoch - 9s/step
Epoch 5/12

Epoch 5: val_accuracy improved from 0.63713 to 0.64768, saving model to Subtask-C distilbert
70/70 - 5

<keras.src.callbacks.History at 0x1f4066f5550>

In [22]:
filepath = "Subtask-C distilbert"
model.load_weights(filepath)

learner.validate(class_names=t.get_classes())

predictor = ktrain.get_predictor(learner.model, preproc=t)

predict = predictor.predict(test_text.values)

              precision    recall  f1-score   support

   community       0.52      0.25      0.33        61
  individual       0.67      0.73      0.70       230
organization       0.63      0.68      0.66       183

    accuracy                           0.65       474
   macro avg       0.61      0.55      0.56       474
weighted avg       0.64      0.65      0.64       474



In [23]:
label_mapping = {
    "individual": 0,
    "organization": 1,
    "community": 2
}
numeric_predictions = [label_mapping[label] for label in predict]

df = pd.DataFrame({
    'index': data_test['index'],
    'label': numeric_predictions
})
print(df)

     index  label
0    50008      0
1    50013      1
2    50017      0
3    50018      1
4    50036      1
..     ...    ...
470  53138      1
471  53153      1
472  53155      0
473  53156      1
474  53159      1

[475 rows x 2 columns]


In [24]:
df['label'].value_counts()

label
0    267
1    191
2     17
Name: count, dtype: int64

In [25]:
df.to_csv('subtask-C_distilbert_prediction', index=False) 

In [26]:
predict

['individual',
 'organization',
 'individual',
 'organization',
 'organization',
 'organization',
 'organization',
 'individual',
 'individual',
 'individual',
 'organization',
 'individual',
 'individual',
 'organization',
 'organization',
 'individual',
 'organization',
 'individual',
 'organization',
 'organization',
 'individual',
 'individual',
 'individual',
 'individual',
 'individual',
 'individual',
 'individual',
 'community',
 'organization',
 'individual',
 'individual',
 'individual',
 'individual',
 'organization',
 'individual',
 'community',
 'individual',
 'individual',
 'individual',
 'individual',
 'organization',
 'organization',
 'individual',
 'individual',
 'organization',
 'community',
 'individual',
 'individual',
 'individual',
 'individual',
 'organization',
 'individual',
 'individual',
 'organization',
 'individual',
 'organization',
 'individual',
 'individual',
 'individual',
 'individual',
 'organization',
 'organization',
 'organization',
 'organization