In [17]:
from datasets import load_dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
import torch
import ast  # To safely convert string representation of lists back to Python lists

# Load dataset
dataset = load_dataset("AksharaBalan/malayalam-ner-dataset-naamapadam")

# Manually create a validation split (10% for validation)
dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)
dataset["validation"] = dataset.pop("test")

# Print dataset info
print(dataset)
print(dataset["train"][0])

# Load model and tokenizer
model_name = "l3cube-pune/malayalam-bert"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Automatically determine number of labels
unique_labels = set()
for row in dataset["train"]:
    unique_labels.update(ast.literal_eval(row["ner_tags"]))
num_labels = len(unique_labels)

# Load model with correct label size
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

def tokenize_and_align_labels(examples):
    # Convert string representations back to lists
    examples["tokens"] = [ast.literal_eval(x) for x in examples["tokens"]]
    examples["ner_tags"] = [ast.literal_eval(x) for x in examples["ner_tags"]]

    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        padding="max_length",
        max_length=128,
        is_split_into_words=True
    )

    all_labels = []
    for i in range(len(examples["tokens"])):  # Loop through each example
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Get word IDs
        label_ids = []
        for word_id in word_ids:
            if word_id is None:
                label_ids.append(-100)  # Ignore special tokens
            else:
                label_ids.append(examples["ner_tags"][i][word_id])  # Assign label
        
        all_labels.append(label_ids)

    tokenized_inputs["labels"] = all_labels
    return tokenized_inputs

# Apply tokenization
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

# Data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./malayalam-ner-finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    push_to_hub=True,  # Enable model pushing
    hub_model_id="AksharaBalan/malayalam-bert-finetuned-ner",
    report_to="none"  # Disabling WandB logging
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Start Fine-Tuning
trainer.train()

# Save and push the model
model.save_pretrained("./malayalam-ner-finetuned")
tokenizer.save_pretrained("./malayalam-ner-finetuned")
trainer.push_to_hub()


DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 45000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 5000
    })
})
{'tokens': "['അനന്ത്', 'നാഗി', 'െല', 'അചബൽ', 'ചൌക്കിൽ', 'ക്രമസമാധാന', 'ചുമതലയുണ്ടായിരുന്ന', 'സി', '.', 'ആർ', '.', 'പി', '.', 'എഫ്', 'സംഘത്തിനു', 'നേരെ', 'തീവ്രവാദികൾ', 'വെടിയുതിർക്കുകയായിരുന്നു', '.']", 'ner_tags': '[0, 5, 0, 5, 6, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]'}


Some weights of BertForTokenClassification were not initialized from the model checkpoint at l3cube-pune/malayalam-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/45000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.3455,0.37115
2,0.323,0.337661
3,0.2676,0.339289


No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/AksharaBalan/malayalam-bert-finetuned-ner/commit/ba8ad85b34643c7051fea91fdb1a60a445ab00b6', commit_message='End of training', commit_description='', oid='ba8ad85b34643c7051fea91fdb1a60a445ab00b6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/AksharaBalan/malayalam-bert-finetuned-ner', endpoint='https://huggingface.co', repo_type='model', repo_id='AksharaBalan/malayalam-bert-finetuned-ner'), pr_revision=None, pr_num=None)

In [18]:
# Save and push the model
model.save_pretrained("./malayalam-ner-finetuned-bert")
tokenizer.save_pretrained("./malayalam-ner-finetuned-bert")
trainer.push_to_hub()

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/AksharaBalan/malayalam-bert-finetuned-ner/commit/ba8ad85b34643c7051fea91fdb1a60a445ab00b6', commit_message='End of training', commit_description='', oid='ba8ad85b34643c7051fea91fdb1a60a445ab00b6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/AksharaBalan/malayalam-bert-finetuned-ner', endpoint='https://huggingface.co', repo_type='model', repo_id='AksharaBalan/malayalam-bert-finetuned-ner'), pr_revision=None, pr_num=None)

**TAKING INFERENCE**

In [19]:
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

# Model checkpoint from Hugging Face
model_name = "AksharaBalan/malayalam-bert-finetuned-ner"

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Load NER pipeline
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")


tokenizer_config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/6.41M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/981 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/948M [00:00<?, ?B/s]

Device set to use cuda:0


In [22]:
sample_text = "സിനിമാതാരം മോഹൻലാൽ, പ്രശസ്തനായ നടൻ മമ്മൂട്ടിയോടൊപ്പം കൊച്ചിയിലെ ലുലുമാളിൽ ഏഷ്യാനെറ്റ് ന്യൂസിന്റെ അഭിമുഖത്തിൽ പങ്കെടുത്തു."

# Get NER predictions
predictions = ner_pipeline(sample_text)

# Print results
for entity in predictions:
    print(f"Entity: {entity['word']}, Label: {entity['entity_group']}, Score: {entity['score']:.4f}")


Entity: സിനിമാതാരം, Label: LABEL_0, Score: 0.9922
Entity: മോഹൻലാൽ, Label: LABEL_1, Score: 0.9534
Entity: , പ്രശസ്തനായ നടൻ, Label: LABEL_0, Score: 0.9923
Entity: മമ്മൂട്ടിയോടൊപ്പം, Label: LABEL_1, Score: 0.8746
Entity: കൊച്ചിയിലെ ലുലുമാളിൽ, Label: LABEL_0, Score: 0.7634
Entity: ഏഷ്യാനെറ്റ്, Label: LABEL_3, Score: 0.7398
Entity: ന്യൂസിന്റെ, Label: LABEL_4, Score: 0.4759
Entity: അഭിമുഖത്തിൽ പങ്കെടുത്തു., Label: LABEL_0, Score: 0.9943


In [21]:
# Get label mappings from the model
label_mapping = model.config.id2label
print(label_mapping)


{0: 'LABEL_0', 1: 'LABEL_1', 2: 'LABEL_2', 3: 'LABEL_3', 4: 'LABEL_4', 5: 'LABEL_5', 6: 'LABEL_6'}
