<a href="https://colab.research.google.com/github/Akshara-Balan/Finetuning-with-NER/blob/main/Fine_tuning_with_NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install evaluate
!pip install transformers datasets seqeval torch accelerate
!pip install scikit-learn

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=a6b1809fcc1f02814782b6881ec4a9f1647e8bf49dc0b0a0711f12da877979bc
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packa

In [None]:
import numpy as np
from datasets import load_dataset, Dataset
import evaluate
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
import gc

def setup_training(model_name="bert-base-multilingual-cased", batch_size=4):
    """Build the model, tokenizer, datasets and training arguments for NER fine-tuning.

    Loads the "ml" configuration of ``ai4bharat/naamapadam``, keeps the first
    1000 training rows, tokenizes them with word-to-subtoken label alignment,
    and splits the tokenized rows 80/20 into train and validation sets.

    Args:
        model_name: Checkpoint name passed to the tokenizer and model loaders.
        batch_size: Per-device batch size used for both training and evaluation.

    Returns:
        A tuple ``(model, tokenizer, train_dataset, validation_dataset,
        training_args, label_list, id_to_label)``, or ``None`` when the dataset
        cannot be loaded or tokenized (the error is printed in that case).
    """
    import inspect  # local import: only used for the TrainingArguments compatibility check

    # Load the dataset without streaming
    try:
        dataset = load_dataset("ai4bharat/naamapadam", "ml")
        # Take a subset of the dataset
        subset_size = 1000  # Adjust this number based on your memory constraints
        full_train = dataset['train']
        # select() keeps the column schema (including any declared label names);
        # rebuilding the split from a sliced dict would discard it.
        train_subset = full_train.select(range(min(subset_size, len(full_train))))
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return None

    # Load tokenizer with lower memory footprint
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

    # Create label mapping. Prefer the tag names declared by the dataset schema so
    # that metrics and printed predictions use real tag strings; otherwise fall
    # back to the raw values observed in the subset.
    tag_feature = getattr(train_subset.features["ner_tags"], "feature", None)
    class_names = getattr(tag_feature, "names", None)
    if class_names:
        label_list = list(class_names)
        # Raw tags are already class indices when names are declared.
        label_to_id = {idx: idx for idx in range(len(label_list))}
        id_to_label = dict(enumerate(label_list))
    else:
        label_list = sorted({tag for example in train_subset for tag in example["ner_tags"]})
        label_to_id = {label: i for i, label in enumerate(label_list)}
        # seqeval works on string tags, so stringify whatever the raw values are.
        id_to_label = {i: str(label) for i, label in enumerate(label_list)}

    def tokenize_and_align_labels(examples, max_length=128):  # Reduced from 512
        """Tokenize pre-split words and give each row a per-subtoken label list.

        Only the first subtoken of every word carries the word's label; special
        tokens, padding and continuation subtokens get -100.
        """
        tokenized_inputs = tokenizer(
            examples["tokens"],
            truncation=True,
            padding="max_length",
            max_length=max_length,
            is_split_into_words=True,
            return_overflowing_tokens=True,
        )

        # One input sentence may yield several rows (overflow chunks); this maps
        # every produced row back to the sentence it came from.
        sample_map = tokenized_inputs.pop("overflow_to_sample_mapping")
        all_new_labels = []

        for i in range(len(tokenized_inputs["input_ids"])):
            label = examples["ner_tags"][sample_map[i]]
            previous_word_idx = None
            new_labels = []

            for word_idx in tokenized_inputs.word_ids(batch_index=i):
                if word_idx is None:
                    new_labels.append(-100)
                elif word_idx != previous_word_idx:
                    if word_idx < len(label):
                        new_labels.append(label_to_id.get(label[word_idx], 0))
                    else:
                        new_labels.append(0)
                    previous_word_idx = word_idx
                else:
                    new_labels.append(-100)

            all_new_labels.append(new_labels)

        tokenized_inputs["labels"] = all_new_labels
        return tokenized_inputs

    # Process dataset with memory-efficient batching. Failures are reported here
    # (instead of being swallowed inside the mapped function) so that a broken
    # dataset never reaches the trainer.
    try:
        tokenized_dataset = train_subset.map(
            tokenize_and_align_labels,
            batched=True,
            batch_size=32,  # Smaller batch size for processing
            # The number of rows can change (overflow chunks), so every original
            # column has to be dropped, not just the two known ones.
            remove_columns=train_subset.column_names,
        )
    except Exception as e:
        print(f"Error in tokenization: {e}")
        return None

    # Split dataset
    train_test_ratio = 0.2
    dataset_dict = tokenized_dataset.train_test_split(test_size=train_test_ratio, seed=42)
    train_dataset = dataset_dict['train']
    validation_dataset = dataset_dict['test']

    # Clear memory
    gc.collect()
    torch.cuda.empty_cache()

    # Initialize model with gradient checkpointing; the label maps are stored in
    # the model config so the saved checkpoint carries the tag names.
    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(label_list),
        id2label=id_to_label,
        label2id={name: idx for idx, name in id_to_label.items()},
    )
    model.gradient_checkpointing_enable()  # Reduce memory usage during training

    # Newer transformers releases name this argument `eval_strategy`; older ones
    # only know `evaluation_strategy`. Use whichever the installed version accepts.
    accepted_args = inspect.signature(TrainingArguments.__init__).parameters
    eval_key = "eval_strategy" if "eval_strategy" in accepted_args else "evaluation_strategy"

    # Training arguments optimized for memory efficiency
    training_args = TrainingArguments(
        output_dir="./ner_results",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=100,
        report_to=[],
        run_name=None,
        gradient_accumulation_steps=4,  # Reduce memory usage
        fp16=torch.cuda.is_available(),  # Mixed precision needs a CUDA device
        optim="adamw_torch",
        **{eval_key: "epoch"},
    )

    return model, tokenizer, train_dataset, validation_dataset, training_args, label_list, id_to_label

# Training: metric computation, Trainer setup, and saving the fine-tuned model
def train_model(model, tokenizer, train_dataset, validation_dataset, training_args, id_to_label):
    """Fine-tune ``model`` with the Hugging Face ``Trainer`` and save the result.

    Args:
        model: Token-classification model to train.
        tokenizer: Tokenizer used by the data collator and saved with the model.
        train_dataset: Tokenized training rows.
        validation_dataset: Tokenized rows evaluated with seqeval metrics.
        training_args: ``TrainingArguments`` passed to the ``Trainer``.
        id_to_label: Mapping from class index to tag; values are stringified
            before being handed to seqeval.

    Returns:
        The ``Trainer`` after training and saving to
        ``./fine_tuned_malayalam_ner``, or ``None`` if training or saving
        raised (the error is printed).
    """
    # Metric computation
    metric = evaluate.load("seqeval")

    def compute_metrics(eval_pred):
        """Turn logits and label ids into tag sequences and score them with seqeval."""
        try:
            logits, labels = eval_pred
            predictions = np.argmax(logits, axis=-1)

            true_labels = []
            true_predictions = []
            for prediction, label in zip(predictions, labels):
                # Positions labelled -100 (special tokens, padding, continuation
                # subtokens) are excluded from scoring.
                kept = [(p, l) for p, l in zip(prediction, label) if l != -100]
                # seqeval operates on string tags, so coerce defensively.
                true_labels.append([str(id_to_label[l]) for _, l in kept])
                true_predictions.append([str(id_to_label[p]) for p, _ in kept])

            # No `scheme` argument: "plain" is not a seqeval scheme name and made
            # every evaluation fall into the error branch below.
            results = metric.compute(predictions=true_predictions, references=true_labels)
            return {
                "precision": results["overall_precision"],
                "recall": results["overall_recall"],
                "f1": results["overall_f1"],
                "accuracy": results["overall_accuracy"],
            }
        except Exception as e:
            # Best effort: report the problem but let training continue.
            print(f"Metric computation error: {e}")
            return {"error": str(e)}

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=validation_dataset,
        compute_metrics=compute_metrics,
        data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer)
    )

    try:
        print("Starting training...")
        trainer.train()
        print("Training completed successfully")

        print("Saving model...")
        trainer.save_model("./fine_tuned_malayalam_ner")
        tokenizer.save_pretrained("./fine_tuned_malayalam_ner")
        print("Model saved successfully")

        return trainer
    except Exception as e:
        print(f"Training error: {e}")
        return None

# Main execution
def main():
    """Run the pipeline end to end: prepare the components, then train.

    Returns ``None`` in every case; the outcome is reported on stdout. Nothing
    is trained when ``setup_training`` returns ``None``.
    """
    # Start with a smaller batch size
    per_device_batch = 4

    # Setup training components
    bundle = setup_training(batch_size=per_device_batch)
    if bundle is None:
        return

    (model, tokenizer, train_dataset, validation_dataset,
     training_args, _label_list, id_to_label) = bundle

    # Train model
    trainer = train_model(
        model,
        tokenizer,
        train_dataset,
        validation_dataset,
        training_args,
        id_to_label,
    )

    outcome = "Training failed" if trainer is None else "Training completed successfully"
    print(outcome)

# Entry point: run the full setup-and-train pipeline when this cell (or the
# exported script) is executed directly.
if __name__ == "__main__":
    main()

README.md:   0%|          | 0.00/8.65k [00:00<?, ?B/s]

naamapadam.py:   0%|          | 0.00/2.86k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/61.8M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/81.5k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/315k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/716652 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/974 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3618 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

Starting training...




In [None]:
def test_model_with_examples(examples=5, model_dir="./fine_tuned_malayalam_ner",
                             dataset=None, tokenizer=None, id_to_label=None, split="train"):
    """Print true vs. predicted tags for a few examples using the saved model.

    The function no longer relies on notebook globals: anything not passed in
    is loaded here, so the cell works on a fresh kernel once a model has been
    saved to ``model_dir``.

    Args:
        examples: Maximum number of examples to print.
        model_dir: Directory holding the fine-tuned model (and its tokenizer).
        dataset: A ``DatasetDict`` (indexed by ``split``) or a single dataset.
            When ``None``, ``ai4bharat/naamapadam`` ("ml") is loaded.
        tokenizer: Tokenizer to use; loaded from ``model_dir`` when ``None``.
        id_to_label: Mapping from class index to tag. When ``None``, the tag
            names declared by the dataset schema are used if present, otherwise
            the mapping stored in the model config.
        split: Split to read when ``dataset`` is a ``DatasetDict``.
    """
    print("\nTesting model with examples:")
    model = AutoModelForTokenClassification.from_pretrained(model_dir)
    model.eval()  # Set model to evaluation mode

    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_dir)
    if dataset is None:
        dataset = load_dataset("ai4bharat/naamapadam", "ml")
    samples = dataset[split] if isinstance(dataset, dict) else dataset

    if id_to_label is None:
        tag_feature = getattr(samples.features["ner_tags"], "feature", None)
        class_names = getattr(tag_feature, "names", None)
        if class_names:
            # NOTE(review): assumes the model was trained with class indices in
            # the dataset's declared order; confirm against the training run.
            id_to_label = dict(enumerate(class_names))
        else:
            id_to_label = {int(idx): tag for idx, tag in model.config.id2label.items()}

    for i in range(min(examples, len(samples))):
        example = samples[i]

        # Keep the words as a list: the tokenizer is called with
        # is_split_into_words=True, which requires pre-split input.
        raw_words = example["tokens"] if "tokens" in example else example["sentence"]
        words = raw_words.split() if isinstance(raw_words, str) else list(raw_words)
        sentence = " ".join(words)

        encoded = tokenizer(words,
                            return_tensors="pt",
                            truncation=True,
                            is_split_into_words=True)

        with torch.no_grad():
            outputs = model(**encoded)
            predictions = torch.argmax(outputs.logits, dim=2)

        token_predictions = [id_to_label.get(p, str(p)) for p in predictions[0].tolist()]

        # Group subtoken predictions by the word they belong to
        votes_by_word = {}
        for w_id, tag in zip(encoded.word_ids(), token_predictions):
            if w_id is not None:
                votes_by_word.setdefault(w_id, []).append(tag)

        # Align predictions with original words by majority vote; ties go to the
        # earliest subtoken's tag so the output is deterministic.
        aligned_predictions = []
        for word_idx in range(len(words)):
            word_predictions = votes_by_word.get(word_idx)
            if word_predictions:
                aligned_predictions.append(max(word_predictions, key=word_predictions.count))
            else:
                aligned_predictions.append("O")  # Default if no prediction (e.g. truncated word)

        print(f"Example {i}:")
        print("Sentence:", sentence)
        print("True tags:", [id_to_label.get(tag, "Unknown") for tag in example["ner_tags"]])
        print("Predicted:", aligned_predictions)
        print()

# Run a qualitative check of the saved model on a few dataset examples.
# (torch is re-imported here; it is also imported in the setup cell above.)
import torch
test_model_with_examples()
