In [1]:
!pip install transformers datasets seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=6e3c4d37037737fe4f4a9c1edea660897260c85804f8244ca28d5415776cb6bb
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from datasets import load_dataset
import torch
import numpy as np
from seqeval.metrics import classification_report


In [3]:
dataset = load_dataset("Achuth7Achu/Malayalam_ner_tagged")
print(dataset["train"][0])  # Check the first example


ner_tagged_dataset.csv:   0%|          | 0.00/26.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/91047 [00:00<?, ? examples/s]

{'sentence': 'ക്യാന്\u200dസറിനോട് പോരാടുന്ന കിസി, മാനി എന്നിവരുടെ ജീവിതമാണ് ചിത്രം പറയുന്നത്.', 'ner_tags': 'O,O,O,O,CARDINAL,O,O,O,O,O'}


In [4]:
def process_dataset(example):
    example["ner_tags"] = example["ner_tags"].split(",")  # Convert comma-separated string to list
    example["sentence"] = example["sentence"].split()  # Convert sentence to list of words
    return example

dataset = dataset.map(process_dataset)
print(dataset["train"][0])  # Check updated format


Map:   0%|          | 0/91047 [00:00<?, ? examples/s]

{'sentence': ['ക്യാന്\u200dസറിനോട്', 'പോരാടുന്ന', 'കിസി,', 'മാനി', 'എന്നിവരുടെ', 'ജീവിതമാണ്', 'ചിത്രം', 'പറയുന്നത്.'], 'ner_tags': ['O', 'O', 'O', 'O', 'CARDINAL', 'O', 'O', 'O', 'O', 'O']}


In [5]:
model_name = "l3cube-pune/malayalam-bert"
tokenizer = AutoTokenizer.from_pretrained(model_name)


tokenizer_config.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/6.41M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [6]:
# Get all unique labels
unique_labels = set(tag for row in dataset["train"]["ner_tags"] for tag in row)
label2id = {label: idx for idx, label in enumerate(sorted(unique_labels))}
id2label = {idx: label for label, idx in label2id.items()}
print(label2id)  # Check label mapping


{'CARDINAL': 0, 'DATE': 1, 'EVENT': 2, 'FAC': 3, 'GPE': 4, 'LANGUAGE': 5, 'LAW': 6, 'LOC': 7, 'MONEY': 8, 'NORP': 9, 'O': 10, 'ORDINAL': 11, 'ORG': 12, 'PERCENT': 13, 'PERSON': 14, 'PRODUCT': 15, 'QUANTITY': 16, 'TIME': 17, 'WORK_OF_ART': 18}


In [None]:
def convert_labels(example):
    example["ner_tags"] = [label2id[label] for label in example["ner_tags"]]  # Convert to IDs
    return example

dataset = dataset.map(convert_labels)
print(dataset["train"][0])  # Check label conversion


In [None]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["sentence"], truncation=True, padding="max_length", is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Get word IDs
        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Ignore special tokens
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])  # Assign label
            else:
                label_ids.append(-100)  # Ignore subwords
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)


In [None]:
num_labels = len(label2id)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=num_labels)


In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",      # Save model checkpoints here
    evaluation_strategy="epoch", # Evaluate at the end of every epoch
    learning_rate=2e-5,          # Standard fine-tuning LR for BERT
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=8,
    num_train_epochs=6,          # Train for 3 epochs (adjust as needed)
    weight_decay=0.01,           # Helps regularization
    logging_dir="./logs",        # Log directory
    logging_steps=500,           # Log loss every 500 steps
    save_strategy="epoch",       # Save checkpoint after each epoch
    report_to="none"             # Disable automatic logging
)


In [None]:
from transformers import Trainer, DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["train"],  # (Use a separate validation set if available)
    tokenizer=tokenizer,
    data_collator=data_collator
)


In [None]:
trainer.train()

In [None]:
model.save_pretrained("fine_tuned_malayalam_ner2")
tokenizer.save_pretrained("fine_tuned_malayalam_ner2")


In [14]:
pip install huggingface_hub

Note: you may need to restart the kernel to use updated packages.


In [19]:
from huggingface_hub import notebook_login
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [20]:
from huggingface_hub import HfApi

model.push_to_hub("AksharaBalan/malayalam-ner-model2")
tokenizer.push_to_hub("AksharaBalan/malayalam-ner-model2")


model.safetensors:   0%|          | 0.00/948M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/AksharaBalan/malayalam-ner-model/commit/f13bd9a3f813ab415ab8b77701b64cce5f872ab1', commit_message='Upload tokenizer', commit_description='', oid='f13bd9a3f813ab415ab8b77701b64cce5f872ab1', pr_url=None, repo_url=RepoUrl('https://huggingface.co/AksharaBalan/malayalam-ner-model', endpoint='https://huggingface.co', repo_type='model', repo_id='AksharaBalan/malayalam-ner-model'), pr_revision=None, pr_num=None)