<a href="https://colab.research.google.com/github/Anjana2002/Language-Identification-for-Malayalam-English-Code-Mixed-Text/blob/main/modeling/electra_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

ELECTRA MODELING

In [4]:
!pip install transformers
!pip install simpletransformers



In [5]:
import pandas as pd
import ast
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
import torch
import logging
from simpletransformers.ner import NERModel

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [6]:
df = pd.read_csv('ner_dataset.csv')
df["words"] = df["words"].fillna("").astype(str)
df.head()


Unnamed: 0,sentence_id,words,labels
0,1,nayanthara,mal
1,1,kettiyathin,mal
2,1,shesham,mal
3,1,kanunnavar,mal
4,2,00:12,univ


In [7]:
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

print(f"Training size: {len(train_df)}")
print(f"Validation size: {len(val_df)}")
print(f"Test size: {len(test_df)}")

# Extract unique labels from the dataset
unique_labels = list(df["labels"].unique())
print("Unique labels in the dataset:", unique_labels)

Training size: 263461
Validation size: 32933
Test size: 32933
Unique labels in the dataset: ['mal', 'univ', 'eng', 'mix', 'acr']


In [8]:
model_args = {
    "overwrite_output_dir": True,
    "reprocess_input_data": True,
    "save_steps": -1,
    "num_train_epochs": 5,
    "train_batch_size": 16,
    "eval_batch_size": 16,
    "max_seq_length": 128,
    "learning_rate": 2e-5,
    "save_model_every_epoch": False,
    "save_eval_checkpoints": False,
    "evaluate_during_training": True,
    "output_dir": "outputs/",
    "best_model_dir": "outputs/best_model/",
    "labels_list": unique_labels,  # Add this line with your labels
}

electra = NERModel(
    model_type="electra",
    model_name="google/electra-small-discriminator",
    args=model_args,
    use_cuda=torch.cuda.is_available()
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of ElectraForTokenClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
train_data = train_df[["sentence_id", "words", "labels"]]
val_data = val_df[["sentence_id", "words", "labels"]]
print(train_data.head())
print(val_data.head())

        sentence_id   words labels
199781        33943  aanenn    mal
131447        21490       %   univ
204812        34786     dhe    mal
27121          4977    next    eng
85254         14281   robin    eng
        sentence_id      words labels
258702        44189        njn    mal
56139          9908      react    eng
198375        33702          ü§£   univ
324160        53398  thonnunnü§ó    mal
263637        45105      üòçüòçüòÇüòÇüòÄ   univ


In [10]:
# Ensure labels are consistent with the defined `labels_list`
assert set(train_data["labels"].unique()).issubset(set(unique_labels)), "Mismatch in train labels"
assert set(val_data["labels"].unique()).issubset(set(unique_labels)), "Mismatch in val labels"

In [11]:
electra.train_model(train_data, eval_data=val_data)

model.safetensors:   0%|          | 0.00/54.2M [00:00<?, ?B/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

  scaler = amp.GradScaler()


Running Epoch 1 of 5:   0%|          | 0/3305 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1392 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1392 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/3305 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1392 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1392 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1392 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/3305 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1392 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1392 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/3305 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1392 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1392 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1392 [00:00<?, ?it/s]

Running Epoch 5 of 5:   0%|          | 0/3305 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1392 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1392 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1392 [00:00<?, ?it/s]

(16525,
 defaultdict(list,
             {'global_step': [2000,
               3305,
               4000,
               6000,
               6610,
               8000,
               9915,
               10000,
               12000,
               13220,
               14000,
               16000,
               16525],
              'train_loss': [0.1984468400478363,
               0.09629452228546143,
               0.09229183197021484,
               0.05234600976109505,
               0.28107237815856934,
               0.06263644993305206,
               0.027373692020773888,
               0.031025348231196404,
               0.09229131042957306,
               0.050657447427511215,
               0.028435805812478065,
               0.0585484579205513,
               0.08199293911457062],
              'eval_loss': [0.12207553298520353,
               0.10635896539399213,
               0.1016251949926948,
               0.09585492171269532,
               0.08562152856431002,
 

In [14]:
example_sentence = "njn enn avide poyi, videoyil njn kandu comedyu love 123 wait"
predictions, raw_outputs = electra.predict([example_sentence])  # Pass it as a list of strings
print("Predictions:", predictions)

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

Predictions: [[{'njn': 'mal'}, {'enn': 'mal'}, {'avide': 'mal'}, {'poyi,': 'mal'}, {'videoyil': 'mal'}, {'njn': 'mal'}, {'kandu': 'mal'}, {'comedyu': 'mix'}, {'love': 'eng'}, {'123': 'univ'}, {'wait': 'eng'}]]


  with amp.autocast():


In [15]:
result, model_outputs, predictions = electra.eval_model(val_data)

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1392 [00:00<?, ?it/s]

  with amp.autocast():


In [16]:

print("Evaluation Results:")
print(f"Precision: {result['precision']:.4f}")
print(f"Recall: {result['recall']:.4f}")
print(result)

Evaluation Results:
Precision: 0.9532
Recall: 0.9511
{'eval_loss': 0.08360449999842483, 'precision': 0.9531983568075117, 'recall': 0.9511400651465798, 'f1_score': 0.9521680986315423}
