<a href="https://colab.research.google.com/github/Anjana2002/Language-Identification-for-Malayalam-English-Code-Mixed-Text/blob/main/modeling/bert_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

BERT MODELING


In [2]:
!pip install transformers
!pip install simpletransformers


Collecting simpletransformers
  Downloading simpletransformers-0.70.1-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting datasets (from simpletransformers)
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorboardx (from simpletransformers)
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting streamlit (from simpletransformers)
  Downloading streamlit-1.40.2-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets->simpletransformers)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets->

In [3]:
import pandas as pd
import ast
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
import torch

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [4]:
df = pd.read_csv('annotated_comments.csv')
df.head()

Unnamed: 0,video_id,text,annotated
0,63i4ZQcvpcQ,nayanthara kettiyathin shesham kanunnavar,"[('nayanthara', 'mal'), ('kettiyathin', 'mal')..."
1,63i4ZQcvpcQ,00:12 which is that song playing in the backgr...,"[('00:12', 'univ'), ('which', 'eng'), ('is', '..."
2,63i4ZQcvpcQ,2024 july 21n kaanunna njan🥲🥲,"[('2024', 'univ'), ('july', 'eng'), ('21n', 'u..."
3,63i4ZQcvpcQ,15:57 what she is saying,"[('15:57', 'univ'), ('what', 'eng'), ('she', '..."
4,63i4ZQcvpcQ,😊 ara e script oke ezhuthiye..nice dialogues a...,"[('😊', 'univ'), ('ara', 'mal'), ('e', 'undef')..."


In [5]:
#rename video_id to sentence_id
df = df.rename(columns={'video_id':'sentence_id'})
df['sentence_id'] = range(1, len(df) + 1)
#tokenize text
df['text'] = df['text'].apply(word_tokenize)
df.head()

Unnamed: 0,sentence_id,text,annotated
0,1,"[nayanthara, kettiyathin, shesham, kanunnavar]","[('nayanthara', 'mal'), ('kettiyathin', 'mal')..."
1,2,"[00:12, which, is, that, song, playing, in, th...","[('00:12', 'univ'), ('which', 'eng'), ('is', '..."
2,3,"[2024, july, 21n, kaanunna, njan🥲🥲]","[('2024', 'univ'), ('july', 'eng'), ('21n', 'u..."
3,4,"[15:57, what, she, is, saying]","[('15:57', 'univ'), ('what', 'eng'), ('she', '..."
4,5,"[😊, ara, e, script, oke, ezhuthiye, .., nice, ...","[('😊', 'univ'), ('ara', 'mal'), ('e', 'undef')..."


In [6]:
formatted_data = []

for _, row in df.iterrows():
    sentence_id = row["sentence_id"]

    # Convert the `annotated` string to a list of tuples
    annotations = ast.literal_eval(row["annotated"]) if isinstance(row["annotated"], str) else row["annotated"]

    # Iterate over the annotations and extract words and labels
    for item in annotations:
        if isinstance(item, tuple) and len(item) == 2:  # Ensure it is a valid (word, label) tuple
            token, label = item
            if label != "undef":  # Exclude tokens with the label "undef"
                formatted_data.append({"sentence_id": sentence_id, "words": token, "labels": label})
            # formatted_data.append({"sentence_id": sentence_id, "words": token, "labels": label})

# Convert the formatted data to a DataFrame
df = pd.DataFrame(formatted_data)

# Save the formatted dataset to a new CSV file
df.to_csv("ner_dataset.csv", index=False)
print("Formatted dataset saved as 'ner_dataset.csv'")

Formatted dataset saved as 'ner_dataset.csv'


In [7]:
df.head()

Unnamed: 0,sentence_id,words,labels
0,1,nayanthara,mal
1,1,kettiyathin,mal
2,1,shesham,mal
3,1,kanunnavar,mal
4,2,00:12,univ


In [8]:

# Split into training (80%), validation (10%), and test (10%)
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

print(f"Training size: {len(train_df)}")
print(f"Validation size: {len(val_df)}")
print(f"Test size: {len(test_df)}")


Training size: 263461
Validation size: 32933
Test size: 32933


In [9]:
# Extract unique labels from the dataset
unique_labels = list(df["labels"].unique())
print("Unique labels in the dataset:", unique_labels)


Unique labels in the dataset: ['mal', 'univ', 'eng', 'mix', 'acr']


In [10]:
# !pip install --upgrade torch transformers simpletransformers



BERT


In [11]:
from simpletransformers.ner import NERModel
import logging

# Enable logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Define model configuration
model_args = {
    "overwrite_output_dir": True,
    "reprocess_input_data": True,
    "save_steps": -1,
    "num_train_epochs": 5,
    "train_batch_size": 16,
    "eval_batch_size": 16,
    "max_seq_length": 128,
    "learning_rate": 2e-5,
    "save_model_every_epoch": False,
    "save_eval_checkpoints": False,
    "evaluate_during_training": True,
    "output_dir": "outputs/",
    "best_model_dir": "outputs/best_model/",
    "labels_list": unique_labels,  # Add this line with your labels
}
model_1 = NERModel(
    model_type="bert",
    model_name="bert-base-uncased",
    args=model_args,
    use_cuda=torch.cuda.is_available()
)
print("Transformer model for NER initialized successfully!")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Transformer model for NER initialized successfully!


In [12]:
train_data = train_df[["sentence_id", "words", "labels"]]
val_data = val_df[["sentence_id", "words", "labels"]]
print(train_data.head())
print(val_data.head())
print(train_data.dtypes)
print(train_data["words"].apply(type).value_counts())

        sentence_id   words labels
199781        33943  aanenn    mal
131447        21490       %   univ
204812        34786     dhe    mal
27121          4977    next    eng
85254         14281   robin    eng
        sentence_id      words labels
258702        44189        njn    mal
56139          9908      react    eng
198375        33702          🤣   univ
324160        53398  thonnunn🤗    mal
263637        45105      😍😍😂😂😀   univ
sentence_id     int64
words          object
labels         object
dtype: object
words
<class 'str'>    263461
Name: count, dtype: int64


In [13]:
# Ensure labels are consistent with the defined `labels_list`
assert set(train_data["labels"].unique()).issubset(set(unique_labels)), "Mismatch in train labels"
assert set(val_data["labels"].unique()).issubset(set(unique_labels)), "Mismatch in val labels"
model_1.train_model(train_data, eval_data=val_data)



  0%|          | 0/3 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

  scaler = amp.GradScaler()


Running Epoch 1 of 5:   0%|          | 0/3305 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1392 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1392 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/3305 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1392 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1392 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1392 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/3305 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1392 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1392 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/3305 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1392 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1392 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1392 [00:00<?, ?it/s]

Running Epoch 5 of 5:   0%|          | 0/3305 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1392 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1392 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1392 [00:00<?, ?it/s]

(16525,
 defaultdict(list,
             {'global_step': [2000,
               3305,
               4000,
               6000,
               6610,
               8000,
               9915,
               10000,
               12000,
               13220,
               14000,
               16000,
               16525],
              'train_loss': [0.12584254145622253,
               0.046456217765808105,
               0.04634566605091095,
               0.0728599950671196,
               0.07951227575540543,
               0.06504024565219879,
               0.05031076818704605,
               0.0485786609351635,
               0.09683764725923538,
               0.02533101662993431,
               0.048125267028808594,
               0.031401701271533966,
               0.03203465789556503],
              'eval_loss': [0.10208054551018078,
               0.09183744696197575,
               0.08328628469101612,
               0.0790141496074974,
               0.07958522867784791,
  

In [14]:

example_sentence = "njn enn avide poyi, videoyil njn kandu comedyu love 123 wait"
predictions, raw_outputs = model_1.predict([example_sentence])  # Pass it as a list of strings
print("Predictions:", predictions)

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

Predictions: [[{'njn': 'mal'}, {'enn': 'mal'}, {'avide': 'mal'}, {'poyi,': 'mal'}, {'videoyil': 'mal'}, {'njn': 'mal'}, {'kandu': 'mal'}, {'comedyu': 'mix'}, {'love': 'eng'}, {'123': 'univ'}, {'wait': 'eng'}]]


  with amp.autocast():
