<a href="https://colab.research.google.com/github/ChetankUMD/ComplexNER-TeamACS/blob/main/bert%2Bcrf_multiconer_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!git clone https://github.com/kmkurn/pytorch-crf.git
!pip install ./pytorch-crf

Cloning into 'pytorch-crf'...
remote: Enumerating objects: 680, done.[K
remote: Counting objects: 100% (111/111), done.[K
remote: Compressing objects: 100% (47/47), done.[K
remote: Total 680 (delta 41), reused 87 (delta 37), pack-reused 569 (from 1)[K
Receiving objects: 100% (680/680), 110.95 KiB | 4.62 MiB/s, done.
Resolving deltas: 100% (306/306), done.
Processing ./pytorch-crf
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pytorch-crf
  Building wheel for pytorch-crf (setup.py) ... [?25l[?25hdone
  Created wheel for pytorch-crf: filename=pytorch_crf-0.7.2-py3-none-any.whl size=6410 sha256=a3e0d63a85a85b46eb9d720c813f5ef71f581785c7852ea05713e2042ef7615a
  Stored in directory: /root/.cache/pip/wheels/3f/71/a9/58ef9f02d4052a15b0a5ebe4b70b16e1738654e2d2979fb4fc
Successfully built pytorch-crf
Installing collected packages: pytorch-crf
Successfully installed pytorch-crf-0.7.2


In [None]:
!pip install torch transformers datasets seqeval tqdm matplotlib
!pip install 'transformers[sentencepiece]'

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl

In [None]:
# Imports
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel, DataCollatorForTokenClassification, TrainingArguments, get_scheduler
from datasets import load_dataset
from torchcrf import CRF
from seqeval.metrics import classification_report
from tqdm import tqdm
import os
from google.colab import drive
import gzip

In [None]:
# Mount Google Drive so checkpoints and plots outlive the Colab runtime
drive.mount('/content/drive')

# Directory (inside Google Drive) where checkpoints and figures are written
google_drive_dir = "/content/drive/My Drive/NER_checkpoints"
os.makedirs(google_drive_dir, exist_ok=True)

# Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Set this to a gzip-compressed checkpoint path to resume training, e.g.
# "/content/drive/My Drive/NER_checkpoints/latest_checkpoint.pt.gz"
resume_from_checkpoint = None
start_epoch = 0

# Load the English configuration of the MultiCoNER v2 dataset
dataset = load_dataset("MultiCoNER/multiconer_v2", "English (EN)")

# Shuffle every split deterministically and keep SUBSET_FRACTION of it.
# 1.0 keeps all examples; lower it (e.g. 0.3) for quicker experimentation.
SUBSET_FRACTION = 1.0
SHUFFLE_SEED = 42
for split_name in ("train", "validation", "test"):
    shuffled_split = dataset[split_name].shuffle(seed=SHUFFLE_SEED)
    keep_count = int(len(shuffled_split) * SUBSET_FRACTION)
    dataset[split_name] = shuffled_split.select(range(keep_count))

# Get label names (index -> tag string) from the dataset schema
label_list = dataset["train"].features["ner_tags_index"].feature.names
num_labels = len(label_list)

# Load tokenizer
model_name = "bert-large-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenization and label alignment function
def tokenize_and_align_labels_with_crf(examples):
    """Tokenize pre-split words and build a per-token label row for each example.

    Args:
        examples: batch mapping with "tokens" (lists of words) and
            "ner_tags_index" (lists of per-word label ids).

    Returns:
        The tokenizer output (padded/truncated to 128 tokens) with an extra
        "labels" entry. Only the first sub-token of each word carries the
        word's label id; special/padding tokens and continuation sub-tokens
        are set to -100.
    """
    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=128,
    )

    aligned_labels = []
    for row, word_tags in enumerate(examples["ner_tags_index"]):
        word_ids = encoded.word_ids(batch_index=row)
        # Pair every position with the word id of the position before it so
        # the start of a new word can be detected without a running variable.
        preceding_ids = [None] + word_ids[:-1]
        aligned_labels.append([
            word_tags[current] if current is not None and current != before else -100
            for current, before in zip(word_ids, preceding_ids)
        ])

    encoded["labels"] = aligned_labels
    return encoded

# Tokenize every split and attach the aligned labels
tokenized_datasets = dataset.map(tokenize_and_align_labels_with_crf, batched=True)

# For each split: drop the raw columns, then expose only the model inputs
# as torch tensors
for split_name in ("train", "validation", "test"):
    tokenized_datasets[split_name] = tokenized_datasets[split_name].remove_columns(
        ["id", "sample_id", "tokens", "ner_tags", "ner_tags_index"]
    )
    tokenized_datasets[split_name].set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "labels"],
    )

# DataLoader setup: only the training loader shuffles
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="pt")
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    collate_fn=data_collator,
    batch_size=16,
    shuffle=True,
)
val_dataloader, test_dataloader = (
    DataLoader(tokenized_datasets[split_name], batch_size=16, collate_fn=data_collator)
    for split_name in ("validation", "test")
)


# Define the model with CRF
class BertCRFNER(nn.Module):
    """Pretrained encoder + dropout + linear emission layer + CRF for tagging.

    Args:
        model_name: identifier passed to ``AutoModel.from_pretrained``.
        num_labels: number of tag classes (size of the emission/CRF layers).
    """

    def __init__(self, model_name, num_labels):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.5)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
        self.crf = CRF(num_labels, batch_first=True)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return the training loss when labels are given, else decoded tags.

        Args:
            input_ids: token ids forwarded to the encoder.
            attention_mask: attention mask; its boolean form is also used as
                the CRF mask.
            labels: optional label ids where -100 marks ignored positions.

        Returns:
            With ``labels``: the negated value returned by the CRF layer.
            Without ``labels``: whatever ``self.crf.decode`` returns for the
            masked emissions.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        emissions = self.classifier(self.dropout(encoded.last_hidden_state))
        token_mask = attention_mask.bool()

        if labels is None:
            return self.crf.decode(emissions, mask=token_mask)

        # The CRF needs a valid class index at every position, so ignored
        # positions (-100) are mapped to class 0 on a copy of the labels.
        # NOTE(review): those positions stay inside the CRF mask whenever
        # attention_mask is 1, so they are scored as class 0 — confirm this
        # is intended.
        crf_tags = labels.masked_fill(labels == -100, 0)
        return -self.crf(emissions, crf_tags, mask=token_mask)

# Training hyperparameters
num_epochs, learning_rate, weight_decay = 5, 3e-5, 0.01

# Build the model and move it onto the selected device
improved_model = BertCRFNER(model_name=model_name, num_labels=num_labels)
improved_model.to(device)

# AdamW optimizer plus a "linear" schedule without warm-up, sized for one
# scheduler step per training batch over all epochs
optimizer = torch.optim.AdamW(
    improved_model.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay,
)
total_steps = num_epochs * len(train_dataloader)
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Per-epoch loss history
train_losses, val_losses = [], []

# File the training loop overwrites with the latest state after each epoch
latest_checkpoint_path = os.path.join(google_drive_dir, "latest_checkpoint.pt.gz")

# Optionally restore a previous run from a gzip-compressed checkpoint
if resume_from_checkpoint:
    with gzip.open(resume_from_checkpoint, 'rb') as checkpoint_file:
        checkpoint = torch.load(checkpoint_file, map_location=device)

    # Model, optimizer and scheduler are restored from their saved state dicts
    improved_model.load_state_dict(checkpoint['model_state_dict'])
    improved_model.to(device)
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    scheduler.load_state_dict(checkpoint['scheduler_state_dict'])

    # The stored epoch becomes the first epoch index of the resumed loop
    start_epoch = checkpoint['epoch']

    # Loss history is optional in the checkpoint; fall back to empty lists
    train_losses = checkpoint.get('train_losses', [])
    val_losses = checkpoint.get('val_losses', [])

    print(f"Resuming training from epoch {start_epoch}")
# Training and Validation Loop
for epoch in range(start_epoch, num_epochs):
    # ---- Training phase ----
    improved_model.train()
    total_train_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch + 1}"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        loss = improved_model(input_ids, attention_mask, labels=labels)
        total_train_loss += loss.item()
        loss.backward()
        # Clip the global gradient norm before applying the update
        torch.nn.utils.clip_grad_norm_(improved_model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()  # the schedule advances once per batch

    # Average training loss for the epoch (mean over batches)
    avg_train_loss = total_train_loss / len(train_dataloader)
    train_losses.append(avg_train_loss)
    print(f"Epoch {epoch + 1}: Training Loss = {avg_train_loss}")

    # ---- Validation phase ----
    improved_model.eval()
    total_val_loss = 0
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # The model returns either the loss or the decoded tags, so the
            # batch is passed through it twice: once for each output.
            loss = improved_model(input_ids, attention_mask, labels=labels)
            total_val_loss += loss.item()

            preds = improved_model(input_ids, attention_mask)
            predictions.extend(preds)
            true_labels.extend(labels.cpu().numpy().tolist())

    # Average validation loss for the epoch (mean over batches)
    avg_val_loss = total_val_loss / len(val_dataloader)
    val_losses.append(avg_val_loss)
    print(f"Epoch {epoch + 1}: Validation Loss = {avg_val_loss}")

    # Overwrite the latest checkpoint. 'epoch' stores the index of the next
    # epoch to run. The loss histories are stored as well because the resume
    # code reads 'train_losses' / 'val_losses' from the checkpoint; without
    # them a resumed run would start with empty histories.
    with gzip.open(latest_checkpoint_path, 'wb') as f:
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': improved_model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'train_losses': train_losses,
            'val_losses': val_losses,
        }, f)

    print(f"Checkpoint saved at: {latest_checkpoint_path}")

    # Decode and print classification report: keep only positions whose
    # gold label is not -100 and map ids to tag names.
    decoded_predictions = []
    decoded_labels = []
    for preds, labels in zip(predictions, true_labels):
        valid_preds = [p for p, l in zip(preds, labels) if l != -100]
        valid_labels = [l for l in labels if l != -100]
        decoded_predictions.append([label_list[p] for p in valid_preds])
        decoded_labels.append([label_list[l] for l in valid_labels])

    print(f"Classification Report for Epoch {epoch + 1}:")
    print(classification_report(decoded_labels, decoded_predictions))

# Plot loss curves.
# The x-axis is derived from the number of recorded losses rather than from
# num_epochs: if training was interrupted (or resumed without a stored
# history) the lists are shorter than num_epochs and plotting against
# range(1, num_epochs + 1) raises a shape-mismatch error.
plt.figure(figsize=(10, 5))
plt.plot(range(1, len(train_losses) + 1), train_losses, label='Training Loss', marker='o')
plt.plot(range(1, len(val_losses) + 1), val_losses, label='Validation Loss', marker='o')
plt.title('Training and Validation Losses')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
loss_curve_path = os.path.join(google_drive_dir, 'loss_curve.png')
plt.savefig(loss_curve_path)
plt.close()

# Report the directory the files were actually written to
print(f"Training completed. Checkpoints and loss curve saved in {google_drive_dir}")

Mounted at /content/drive
Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/4.45k [00:00<?, ?B/s]

multiconer_v2.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/1.98M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/117k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/29.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16778 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/871 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/249980 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Map:   0%|          | 0/16778 [00:00<?, ? examples/s]

Map:   0%|          | 0/871 [00:00<?, ? examples/s]

Map:   0%|          | 0/249980 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Training Epoch 1: 100%|██████████| 1049/1049 [22:37<00:00,  1.29s/it]


Epoch 1: Training Loss = 117.65626361508728


Evaluating: 100%|██████████| 55/55 [00:47<00:00,  1.17it/s]


Epoch 1: Validation Loss = 65.69587013938211


  _warn_prf(average, modifier, msg_start, len(result))


Checkpoint saved at: /content/drive/My Drive/NER_checkpoints/latest_checkpoint.pt.gz
Classification Report for Epoch 1:
                       precision    recall  f1-score   support

AerospaceManufacturer       0.38      0.60      0.46        10
  AnatomicalStructure       0.65      0.76      0.70        17
              ArtWork       0.50      0.38      0.43        13
               Artist       0.74      0.83      0.78       212
              Athlete       0.68      0.77      0.72        79
      CarManufacturer       0.20      0.15      0.17        13
               Cleric       0.45      0.33      0.38        15
             Clothing       0.67      0.60      0.63        10
              Disease       0.46      0.33      0.39        18
                Drink       0.27      0.27      0.27        11
             Facility       0.51      0.67      0.58        52
                 Food       0.46      0.32      0.37        19
      HumanSettlement       0.78      0.81      0.79       1

Training Epoch 2: 100%|██████████| 1049/1049 [22:35<00:00,  1.29s/it]


Epoch 2: Training Loss = 53.443223507774796


Evaluating: 100%|██████████| 55/55 [00:46<00:00,  1.17it/s]


Epoch 2: Validation Loss = 60.406124184348364
Checkpoint saved at: /content/drive/My Drive/NER_checkpoints/latest_checkpoint.pt.gz
Classification Report for Epoch 2:
                       precision    recall  f1-score   support

AerospaceManufacturer       0.50      0.60      0.55        10
  AnatomicalStructure       0.69      0.65      0.67        17
              ArtWork       0.70      0.54      0.61        13
               Artist       0.78      0.83      0.80       212
              Athlete       0.70      0.80      0.75        79
      CarManufacturer       0.47      0.54      0.50        13
               Cleric       0.45      0.33      0.38        15
             Clothing       0.45      0.50      0.48        10
              Disease       0.44      0.44      0.44        18
                Drink       0.50      0.55      0.52        11
             Facility       0.68      0.69      0.69        52
                 Food       0.64      0.47      0.55        19
      HumanSet

Training Epoch 3: 100%|██████████| 1049/1049 [22:39<00:00,  1.30s/it]


Epoch 3: Training Loss = 33.29585610126972


Evaluating: 100%|██████████| 55/55 [00:47<00:00,  1.17it/s]


Epoch 3: Validation Loss = 60.80253365256569
Checkpoint saved at: /content/drive/My Drive/NER_checkpoints/latest_checkpoint.pt.gz
Classification Report for Epoch 3:
                       precision    recall  f1-score   support

AerospaceManufacturer       0.57      0.80      0.67        10
  AnatomicalStructure       0.69      0.65      0.67        17
              ArtWork       0.50      0.23      0.32        13
               Artist       0.73      0.87      0.80       212
              Athlete       0.76      0.78      0.77        79
      CarManufacturer       0.54      0.54      0.54        13
               Cleric       0.50      0.33      0.40        15
             Clothing       0.64      0.70      0.67        10
              Disease       0.50      0.61      0.55        18
                Drink       0.60      0.82      0.69        11
             Facility       0.64      0.79      0.71        52
                 Food       0.83      0.53      0.65        19
      HumanSett

Training Epoch 4: 100%|██████████| 1049/1049 [22:35<00:00,  1.29s/it]


Epoch 4: Training Loss = 21.13793125279866


Evaluating: 100%|██████████| 55/55 [00:47<00:00,  1.17it/s]


Epoch 4: Validation Loss = 69.80272369384765
Checkpoint saved at: /content/drive/My Drive/NER_checkpoints/latest_checkpoint.pt.gz
Classification Report for Epoch 4:
                       precision    recall  f1-score   support

AerospaceManufacturer       0.62      0.80      0.70        10
  AnatomicalStructure       0.67      0.71      0.69        17
              ArtWork       0.46      0.46      0.46        13
               Artist       0.81      0.79      0.80       212
              Athlete       0.81      0.75      0.78        79
      CarManufacturer       0.62      0.62      0.62        13
               Cleric       0.50      0.40      0.44        15
             Clothing       0.67      0.80      0.73        10
              Disease       0.56      0.50      0.53        18
                Drink       0.58      0.64      0.61        11
             Facility       0.67      0.79      0.73        52
                 Food       0.50      0.58      0.54        19
      HumanSett

Training Epoch 5:  15%|█▍        | 155/1049 [03:23<19:36,  1.32s/it]


KeyboardInterrupt: 

In [None]:
# Evaluate the model on the test dataloader once training has finished

# Switch to evaluation mode before predicting
improved_model.eval()

# Accumulators for decoded tag-id sequences and the gold label rows
all_predictions = []
all_true_labels = []

# No gradients are needed while predicting
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Testing"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Without labels the model returns its decoded predictions
        preds = improved_model(input_ids, attention_mask)

        all_predictions += preds
        all_true_labels += labels.cpu().numpy().tolist()

# Keep only positions whose gold label is not -100, then map ids to tag names
decoded_predictions, decoded_labels = [], []
for preds, labels in zip(all_predictions, all_true_labels):
    valid_labels = list(filter(lambda gold: gold != -100, labels))
    valid_preds = [pred for pred, gold in zip(preds, labels) if gold != -100]
    decoded_predictions.append([label_list[pred] for pred in valid_preds])
    decoded_labels.append([label_list[gold] for gold in valid_labels])

# Print classification report
print("Classification Report on Test Data:")
print(classification_report(decoded_labels, decoded_predictions))

Testing:   2%|▏         | 298/15624 [02:06<1:48:29,  2.35it/s]


KeyboardInterrupt: 

In [None]:
# Evaluate on a fixed fraction of the test split.
# The subset is kept in its own variable instead of overwriting
# dataset["test"]: overwriting made the cell non-idempotent, because every
# re-run took the fraction of the already reduced split.
TEST_FRACTION = 0.1
test_subset = dataset["test"].shuffle(seed=42).select(
    range(int(len(dataset["test"]) * TEST_FRACTION))
)

# Reapply tokenization to the reduced test dataset
tokenized_datasets["test"] = test_subset.map(tokenize_and_align_labels_with_crf, batched=True)
tokenized_datasets["test"] = tokenized_datasets["test"].remove_columns(["id", "sample_id", "tokens", "ner_tags", "ner_tags_index"])
tokenized_datasets["test"].set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Recreate the test dataloader with the smaller dataset
test_dataloader = DataLoader(tokenized_datasets["test"], batch_size=16, collate_fn=data_collator)

# Set the model to evaluation mode
improved_model.eval()

# Lists to store predictions and true labels
all_predictions = []
all_true_labels = []

# Disable gradient computation during testing
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Testing"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Without labels the model returns its decoded predictions
        preds = improved_model(input_ids, attention_mask)

        # Extend predictions and true labels
        all_predictions.extend(preds)
        all_true_labels.extend(labels.cpu().numpy().tolist())

# Keep only positions whose gold label is not -100, then map ids to tag names
decoded_predictions = []
decoded_labels = []
for preds, labels in zip(all_predictions, all_true_labels):
    valid_preds = [p for p, l in zip(preds, labels) if l != -100]
    valid_labels = [l for l in labels if l != -100]
    decoded_predictions.append([label_list[p] for p in valid_preds])
    decoded_labels.append([label_list[l] for l in valid_labels])

# Print classification report (the header reflects the configured fraction)
print(f"Classification Report on Test Data ({TEST_FRACTION:.0%} subset):")
print(classification_report(decoded_labels, decoded_predictions))

Map:   0%|          | 0/24998 [00:00<?, ? examples/s]

Testing: 100%|██████████| 1563/1563 [11:00<00:00,  2.37it/s]


Classification Report on Test Data (10% subset):
                       precision    recall  f1-score   support

AerospaceManufacturer       0.35      0.63      0.45        82
  AnatomicalStructure       0.65      0.71      0.68       567
              ArtWork       0.40      0.56      0.46       149
               Artist       0.71      0.81      0.76      5696
              Athlete       0.77      0.76      0.77      2676
      CarManufacturer       0.58      0.60      0.59       297
               Cleric       0.50      0.49      0.50       489
             Clothing       0.58      0.61      0.60       224
              Disease       0.65      0.63      0.64       566
                Drink       0.52      0.55      0.53       203
             Facility       0.62      0.67      0.65      1605
                 Food       0.50      0.52      0.51       545
      HumanSettlement       0.86      0.87      0.87      4205
     MedicalProcedure       0.60      0.59      0.59       383
   Me

In [None]:
#         EN
# micro 0.61
# macro 0.53


SyntaxError: invalid syntax (<ipython-input-7-87b0812ace09>, line 1)

In [None]:
# Debugging probe: try to read the checkpoint file directly with torch.load
# (no gzip wrapper) and list the keys of the loaded object.
# NOTE(review): resume_from_checkpoint is set to None in the setup cell, so
# this only works after it has been pointed at a real file in the kernel.
checkpoint = torch.load(resume_from_checkpoint, map_location=device)
print("Checkpoint Keys:", checkpoint.keys())


  checkpoint = torch.load(resume_from_checkpoint, map_location=device)


RuntimeError: PytorchStreamReader failed locating file data/2: file not found

In [None]:
# Round-trip sanity check: save only the model weights to an uncompressed
# file on Drive and load them straight back with torch.load.
# Note that this rebinds the global `checkpoint` to the weights-only dict.
test_path = "/content/drive/My Drive/NER_checkpoints/test_checkpoint.pt"
torch.save({'model_state_dict': improved_model.state_dict()}, test_path)
checkpoint = torch.load(test_path, map_location=device)
print("Test checkpoint loaded successfully!")


  checkpoint = torch.load(test_path, map_location=device)


Test checkpoint loaded successfully!


In [None]:
# Debugging probe: read the checkpoint through gzip, the same way the resume
# code does.
# NOTE(review): this relies on resume_from_checkpoint holding a path; it is
# None in the setup cell, which gzip.open rejects.
with gzip.open(resume_from_checkpoint, 'rb') as f:
        checkpoint = torch.load(f, map_location=device)

TypeError: filename must be a str or bytes object, or a file

In [None]:
from datasets import load_dataset

# Load the MultiCoNER v2 dataset
dataset = load_dataset("MultiCoNER/multiconer_v2", "English (EN)")

# Extract unique entities from the training set
def extract_entities(dataset):
    """Collect the lower-cased tokens of the train split whose tag id is not 0.

    Args:
        dataset: mapping with a "train" entry that iterates over examples
            holding parallel "tokens" and "ner_tags_index" sequences.

    Returns:
        A set of lower-cased token strings.
    """
    # Tag id 0 is skipped as the non-entity tag.
    # NOTE(review): assumes index 0 is 'O' in this label set — confirm.
    return {
        token.lower()
        for example in dataset["train"]
        for token, tag in zip(example["tokens"], example["ner_tags_index"])
        if tag != 0
    }

entities = extract_entities(dataset)

# Save entities to a gazetteer file, one per line in sorted order.
# The encoding is fixed to UTF-8 so tokens with non-ASCII characters are
# written identically regardless of the platform's default locale.
gazetteer_path = "/content/drive/My Drive/gazetteer.txt"
with open(gazetteer_path, "w", encoding="utf-8") as file:
    for entity in sorted(entities):
        file.write(entity + "\n")

print(f"Gazetteer saved at {gazetteer_path}")


Gazetteer saved at /content/drive/My Drive/gazetteer.txt


In [None]:
# Debugging probe: print the first two bytes of the checkpoint file to
# identify its container format. b'PK' is the ZIP signature, whereas a gzip
# stream starts with b'\x1f\x8b'.
with open(resume_from_checkpoint, 'rb') as f:
    magic_bytes = f.read(2)
print(f"Magic bytes: {magic_bytes}")

Magic bytes: b'PK'


In [None]:
import zipfile
import io
import torch

# Location of the ZIP archive that wraps the checkpoint
resume_from_checkpoint = "/content/drive/My Drive/NER_checkpoints/latest_checkpoint.zip"

# Read the "checkpoint.pt" member into memory, then deserialize it from an
# in-memory buffer
with zipfile.ZipFile(resume_from_checkpoint, 'r') as zf:
    checkpoint_bytes = zf.read("checkpoint.pt")
checkpoint = torch.load(io.BytesIO(checkpoint_bytes), map_location=device)

# Restore model, optimizer and scheduler state, then the epoch counter
improved_model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
start_epoch = checkpoint['epoch']

print(f"Checkpoint loaded successfully. Resuming from epoch {start_epoch}.")



FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/My Drive/NER_checkpoints/latest_checkpoint.zip'

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Requires the train_losses and val_losses lists filled by the training loop

# Epoch axes (1-based) and the two series shared by the figures below
train_epochs = range(1, len(train_losses) + 1)
val_epochs = range(1, len(val_losses) + 1)
loss_series = (
    (train_epochs, train_losses, 'Training Loss'),
    (val_epochs, val_losses, 'Validation Loss'),
)

# 1. Basic line plot of both losses
plt.figure(figsize=(12, 6))
for epochs, losses, series_label in loss_series:
    plt.plot(epochs, losses, label=series_label, marker='o')
plt.title('Training and Validation Losses Across Epochs', fontsize=16)
plt.xlabel('Epoch', fontsize=12)
plt.ylabel('Loss', fontsize=12)
plt.legend()
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig('/content/drive/My Drive/NER_checkpoints/loss_comparison.png')
plt.close()

# 2. Same comparison drawn with seaborn (the style is set after the figure
#    is created, before anything is drawn on it)
plt.figure(figsize=(14, 7))
sns.set_style("whitegrid")
for epochs, losses, series_label in loss_series:
    sns.lineplot(x=epochs, y=losses, label=series_label, marker='o')
plt.title('Training vs Validation Loss Progression', fontsize=16)
plt.xlabel('Epoch', fontsize=12)
plt.ylabel('Loss', fontsize=12)
plt.tight_layout()
plt.savefig('/content/drive/My Drive/NER_checkpoints/seaborn_loss_plot.png')
plt.close()

# 3. Side-by-side panels: training loss on the left, validation on the right
plt.figure(figsize=(15, 8))
panels = (
    (train_epochs, train_losses, 'Training Loss', 'blue', 'Training Loss Progression'),
    (val_epochs, val_losses, 'Validation Loss', 'red', 'Validation Loss Progression'),
)
for position, (epochs, losses, series_label, colour, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(epochs, losses, label=series_label, color=colour, marker='o')
    plt.title(panel_title, fontsize=14)
    plt.xlabel('Epoch', fontsize=10)
    plt.ylabel('Loss', fontsize=10)
    plt.grid(True, linestyle='--', alpha=0.7)

plt.tight_layout()
plt.savefig('/content/drive/My Drive/NER_checkpoints/detailed_loss_analysis.png')
plt.close()

# 4. Epoch-over-epoch percentage change, defined from the second epoch on
train_loss_pct_change = [
    ((current - previous) / previous) * 100
    for previous, current in zip(train_losses, train_losses[1:])
]
val_loss_pct_change = [
    ((current - previous) / previous) * 100
    for previous, current in zip(val_losses, val_losses[1:])
]

plt.figure(figsize=(12, 6))
plt.plot(train_epochs[1:], train_loss_pct_change, label='Training Loss % Change', marker='o')
plt.plot(val_epochs[1:], val_loss_pct_change, label='Validation Loss % Change', marker='o')
plt.title('Percentage Change in Losses Across Epochs', fontsize=16)
plt.xlabel('Epoch', fontsize=12)
plt.ylabel('Percentage Change (%)', fontsize=12)
plt.axhline(y=0, color='r', linestyle='--')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig('/content/drive/My Drive/NER_checkpoints/loss_percentage_change.png')
plt.close()

# Summary statistics: first vs. last recorded epoch
print("Training Loss Statistics:")
first_train, last_train = train_losses[0], train_losses[-1]
print(f"Initial Loss: {first_train:.4f}")
print(f"Final Loss: {last_train:.4f}")
print(f"Total Loss Reduction: {first_train - last_train:.4f}")
print(f"Percentage Loss Reduction: {((first_train - last_train) / first_train) * 100:.2f}%")

print("\nValidation Loss Statistics:")
first_val, last_val = val_losses[0], val_losses[-1]
print(f"Initial Loss: {first_val:.4f}")
print(f"Final Loss: {last_val:.4f}")
print(f"Total Loss Change: {last_val - first_val:.4f}")

Training Loss Statistics:
Initial Loss: 117.6563
Final Loss: 21.1379
Total Loss Reduction: 96.5183
Percentage Loss Reduction: 82.03%

Validation Loss Statistics:
Initial Loss: 65.6959
Final Loss: 69.8027
Total Loss Change: 4.1069


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import confusion_matrix

# Assuming you have decoded_labels and decoded_predictions from your previous testing code

# # 1. Precision, Recall, F1-Score Visualization
# def plot_entity_performance():
#     # Extract metrics from classification report
#     precision = {}
#     recall = {}
#     f1_score = {}

#     for label in label_list:
#         # Find the corresponding row in the classification report
#         for row in classification_report(decoded_labels, decoded_predictions, output_dict=True)[''][''].items():
#             if row[0] == label:
#                 precision[label] = row[1]['precision']
#                 recall[label] = row[1]['recall']
#                 f1_score[label] = row[1]['f1-score']
#                 break

#     # Sort entities by F1-score
#     sorted_entities = sorted(f1_score.items(), key=lambda x: x[1], reverse=True)

#     # Prepare data for plotting
#     entities = [x[0] for x in sorted_entities]
#     f1_scores = [x[1] for x in sorted_entities]

#     # Create a horizontal bar plot
#     plt.figure(figsize=(12, 10))
#     plt.barh(entities, f1_scores, color='skyblue')
#     plt.title('F1-Scores by Entity (Sorted)', fontsize=16)
#     plt.xlabel('F1-Score', fontsize=12)
#     plt.ylabel('Entity', fontsize=12)
#     plt.xlim(0, 1)

#     # Add value labels
#     for i, v in enumerate(f1_scores):
#         plt.text(v, i, f' {v:.2f}', va='center', fontsize=10)

#     plt.tight_layout()
#     plt.savefig('/content/drive/My Drive/NER_checkpoints/entity_f1_scores.png')
#     plt.close()

def plot_entity_performance():
    """Save a horizontal bar chart of per-entity F1 scores, sorted by score.

    Reads the notebook globals `decoded_labels` and `decoded_predictions` and
    writes the figure to the Drive checkpoint folder; nothing is returned and
    the figure is closed without being displayed.

    NOTE(review): `classification_report` is not imported in this cell. The
    nested tag sequences passed to it suggest it is seqeval's version from an
    earlier cell -- confirm. seqeval keys its report by entity type (BIO prefix
    stripped), so looking up the BIO tags from `label_list` in the report finds
    nothing. The per-entity rows are therefore taken from the report itself.
    """
    # Extract metrics from classification report
    report = classification_report(decoded_labels, decoded_predictions, output_dict=True)

    # Summary rows that must not be plotted as if they were entities
    aggregate_rows = {'micro avg', 'macro avg', 'weighted avg', 'accuracy'}
    f1_score = {
        name: stats['f1-score']
        for name, stats in report.items()
        if name not in aggregate_rows and isinstance(stats, dict)
    }

    # Sort entities by F1-score
    sorted_entities = sorted(f1_score.items(), key=lambda x: x[1], reverse=True)

    # Prepare data for plotting
    entities = [x[0] for x in sorted_entities]
    f1_scores = [x[1] for x in sorted_entities]

    # Create a horizontal bar plot
    plt.figure(figsize=(12, 10))
    plt.barh(entities, f1_scores, color='skyblue')
    plt.title('F1-Scores by Entity (Sorted)', fontsize=16)
    plt.xlabel('F1-Score', fontsize=12)
    plt.ylabel('Entity', fontsize=12)
    plt.xlim(0, 1)

    # Add value labels just past the end of each bar
    for i, v in enumerate(f1_scores):
        plt.text(v, i, f' {v:.2f}', va='center', fontsize=10)

    plt.tight_layout()
    plt.savefig('/content/drive/My Drive/NER_checkpoints/entity_f1_scores.png')
    plt.close()

# 2. Confusion Matrix for Top Entities
def plot_confusion_matrix():
    """Save a token-level confusion matrix for the 10 most frequent true tags.

    Reads the notebook globals `decoded_labels` and `decoded_predictions`
    (iterated here as sequences of per-sentence tag sequences). Every tag
    outside the top 10 is folded into a single 'Other' bucket. The figure is
    written to the Drive checkpoint folder and closed without being displayed.
    """
    # Local import: this cell does not import Counter at the top
    from collections import Counter

    # Flatten the predictions and labels
    flat_true = [label for sublist in decoded_labels for label in sublist]
    flat_pred = [pred for sublist in decoded_predictions for pred in sublist]

    # Select top 10 most frequent entities in one counting pass
    top_labels = [label for label, _ in Counter(flat_true).most_common(10)]
    top_label_set = set(top_labels)
    axis_labels = top_labels + ['Other']

    # Create confusion matrix for top labels.
    # `labels=` pins the row/column order to the tick labels used below; without
    # it sklearn orders the classes by sorted value, so ticks and cells disagree.
    cm = confusion_matrix(
        [label if label in top_label_set else 'Other' for label in flat_true],
        [label if label in top_label_set else 'Other' for label in flat_pred],
        labels=axis_labels
    )

    # Plot confusion matrix
    plt.figure(figsize=(12, 10))
    sns.heatmap(cm, annot=True, fmt='d', cmap='YlGnBu',
                xticklabels=axis_labels,
                yticklabels=axis_labels)
    plt.title('Confusion Matrix for Top 10 Entities', fontsize=16)
    plt.xlabel('Predicted Label', fontsize=12)
    plt.ylabel('True Label', fontsize=12)
    plt.tight_layout()
    plt.savefig('/content/drive/My Drive/NER_checkpoints/confusion_matrix.png')
    plt.close()

# 3. Detailed Entity Performance Radar Chart
def plot_entity_performance_radar():
    """Save a radar chart of precision, recall and F1 for the top 10 entities by support.

    Reads the notebook globals `decoded_labels` and `decoded_predictions`. The
    figure is written to the Drive checkpoint folder and closed without being
    displayed. If the report contains no per-entity rows, a message is printed
    and no figure is produced.

    NOTE(review): `classification_report` is not imported in this cell. The
    nested tag sequences passed to it suggest it is seqeval's version from an
    earlier cell -- confirm. seqeval keys its report by entity type (BIO prefix
    stripped), so filtering on the BIO tags in `label_list` selects nothing and
    the old `precision[0]` raised IndexError. The per-entity rows are therefore
    taken from the report itself.
    """
    # Prepare data
    classification_dict = classification_report(decoded_labels, decoded_predictions, output_dict=True)

    # Summary rows that must not be treated as entities
    aggregate_rows = {'micro avg', 'macro avg', 'weighted avg', 'accuracy'}
    entities = [
        name for name, stats in classification_dict.items()
        if name and name not in aggregate_rows and isinstance(stats, dict)
    ]
    if not entities:
        # Nothing to draw; closing the polygon below needs at least one point
        print("No per-entity rows in the classification report; skipping radar chart.")
        return

    # Select top 10 entities by support
    entities.sort(key=lambda x: classification_dict[x]['support'], reverse=True)
    top_entities = entities[:10]

    # Extract metrics
    precision = [classification_dict[entity]['precision'] for entity in top_entities]
    recall = [classification_dict[entity]['recall'] for entity in top_entities]
    f1 = [classification_dict[entity]['f1-score'] for entity in top_entities]

    # Radar chart
    plt.figure(figsize=(10, 10))
    angles = np.linspace(0, 2*np.pi, len(top_entities), endpoint=False)

    # Close the plot: repeat the first point so each outline ends where it started
    precision.append(precision[0])
    recall.append(recall[0])
    f1.append(f1[0])
    angles = np.concatenate((angles, [angles[0]]))

    plt.polar(angles, precision, 'o-', label='Precision')
    plt.fill(angles, precision, alpha=0.25)
    plt.polar(angles, recall, 's-', label='Recall')
    plt.fill(angles, recall, alpha=0.25)
    plt.polar(angles, f1, '^-', label='F1-Score')
    plt.fill(angles, f1, alpha=0.25)

    # Drop the duplicated closing angle when labelling the spokes
    plt.thetagrids(angles[:-1] * 180/np.pi, top_entities)
    plt.title('Performance Metrics for Top 10 Entities', fontsize=16)
    plt.legend(loc='upper right', bbox_to_anchor=(1.3, 1.0))
    plt.tight_layout()
    plt.savefig('/content/drive/My Drive/NER_checkpoints/entity_radar_chart.png')
    plt.close()

# Execute all visualization functions
# Each function saves its figure as a PNG under the Drive checkpoint folder and
# closes it with plt.close(), so nothing is rendered inline by these calls.
plot_entity_performance()
plot_confusion_matrix()
plot_entity_performance_radar()

# Print out some additional insights
def print_entity_insights():
    """Print the five best and five worst entities by F1 score.

    Reads the notebook globals `decoded_labels` and `decoded_predictions`.
    Output goes to stdout only; nothing is returned.

    NOTE(review): `classification_report` is not imported in this cell. The
    nested tag sequences passed to it suggest it is seqeval's version from an
    earlier cell -- confirm. seqeval keys its report by entity type (BIO prefix
    stripped), so filtering on the BIO tags in `label_list` selects nothing.
    The per-entity rows are therefore taken from the report itself.
    """
    # Get classification report as dictionary
    report_dict = classification_report(decoded_labels, decoded_predictions, output_dict=True)

    # Summary rows that must not be ranked as if they were entities
    aggregate_rows = {'micro avg', 'macro avg', 'weighted avg', 'accuracy'}

    # Build the (entity, f1) pairs once and rank them in both directions
    entity_f1 = [
        (entity, stats['f1-score'])
        for entity, stats in report_dict.items()
        if entity not in aggregate_rows and isinstance(stats, dict)
    ]

    # Top performing entities
    print("\nTop 5 Best Performing Entities:")
    top_entities = sorted(entity_f1, key=lambda x: x[1], reverse=True)[:5]

    for entity, f1 in top_entities:
        print(f"{entity}: F1-Score = {f1:.2f}")

    # Bottom performing entities
    print("\nBottom 5 Performing Entities:")
    bottom_entities = sorted(entity_f1, key=lambda x: x[1])[:5]

    for entity, f1 in bottom_entities:
        print(f"{entity}: F1-Score = {f1:.2f}")

print_entity_insights()

IndexError: list index out of range

<Figure size 1000x1000 with 0 Axes>

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

# Visualize entity-specific performance
def plot_entity_performance(decoded_predictions, decoded_labels, label_list):
    """Display a horizontal bar chart of token-level F1 per label.

    Args:
        decoded_predictions: sequence of per-sentence predicted label sequences.
        decoded_labels: sequence of per-sentence gold label sequences.
        label_list: label names indexed by label id; only consulted for labels
            that are not already strings.

    The sequences are flattened and scored with sklearn's
    `classification_report`, so scores are per token rather than per span.
    """
    # Flatten predictions and labels
    flat_true = [l for sublist in decoded_labels for l in sublist]
    flat_pred = [p for sublist in decoded_predictions for p in sublist]

    # Identify unique labels in predictions and labels.
    # Sorted so the ids and their display names stay aligned and reproducible.
    unique_labels = sorted(set(flat_true) | set(flat_pred))
    # Decoded sequences may already contain tag strings; indexing label_list with
    # a string raised "list indices must be integers or slices, not str".
    filtered_label_list = [
        label if isinstance(label, str) else label_list[label]
        for label in unique_labels
    ]

    # Generate classification report; `labels=` ties each target name to its label
    report = classification_report(
        flat_true, flat_pred,
        labels=unique_labels, target_names=filtered_label_list, output_dict=True
    )

    # Extract and sort F1 scores
    f1_scores = {label: report[label]["f1-score"] for label in filtered_label_list if label in report}
    sorted_entities = sorted(f1_scores.items(), key=lambda x: x[1], reverse=True)

    entities = [x[0] for x in sorted_entities]
    scores = [x[1] for x in sorted_entities]

    # Plot F1 scores
    plt.figure(figsize=(12, 8))
    plt.barh(entities, scores, color='skyblue')
    plt.title("F1 Scores by Entity", fontsize=16)
    plt.xlabel("F1 Score", fontsize=12)
    plt.ylabel("Entity", fontsize=12)
    plt.xlim(0, 1)
    plt.tight_layout()
    plt.show()

# Plot confusion matrix over every label present in the data
def plot_confusion_matrix(decoded_predictions, decoded_labels, label_list):
    """Display a token-level confusion matrix heatmap.

    Args:
        decoded_predictions: sequence of per-sentence predicted label sequences.
        decoded_labels: sequence of per-sentence gold label sequences.
        label_list: label names indexed by label id; only consulted for labels
            that are not already strings.

    Rows are true labels and columns are predicted labels, covering every label
    that occurs in either input.
    """
    # Flatten predictions and labels
    flat_true = [l for sublist in decoded_labels for l in sublist]
    flat_pred = [p for sublist in decoded_predictions for p in sublist]

    # Identify unique labels in predictions and labels.
    # Sorted so the axis order is reproducible between runs.
    unique_labels = sorted(set(flat_true) | set(flat_pred))
    # Decoded sequences may already contain tag strings; indexing label_list with
    # a string raised "list indices must be integers or slices, not str".
    filtered_label_list = [
        label if isinstance(label, str) else label_list[label]
        for label in unique_labels
    ]

    # Generate confusion matrix in the same order as the tick labels
    cm = confusion_matrix(flat_true, flat_pred, labels=unique_labels)

    # Plot confusion matrix
    plt.figure(figsize=(15, 10))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=filtered_label_list, yticklabels=filtered_label_list)
    plt.title("Confusion Matrix", fontsize=16)
    plt.xlabel("Predicted Labels", fontsize=12)
    plt.ylabel("True Labels", fontsize=12)
    plt.tight_layout()
    plt.show()

# Print detailed classification report
def print_classification_report(decoded_predictions, decoded_labels, label_list):
    """Print sklearn's token-level classification report.

    Args:
        decoded_predictions: sequence of per-sentence predicted label sequences.
        decoded_labels: sequence of per-sentence gold label sequences.
        label_list: label names indexed by label id; only consulted for labels
            that are not already strings.
    """
    # Flatten predictions and labels
    flat_true = [l for sublist in decoded_labels for l in sublist]
    flat_pred = [p for sublist in decoded_predictions for p in sublist]

    # Identify unique labels in predictions and labels.
    # Sorted so the ids and their display names stay aligned and reproducible.
    unique_labels = sorted(set(flat_true) | set(flat_pred))
    # Decoded sequences may already contain tag strings; indexing label_list with
    # a string raised "list indices must be integers or slices, not str".
    filtered_label_list = [
        label if isinstance(label, str) else label_list[label]
        for label in unique_labels
    ]

    # Print classification report; `labels=` ties each target name to its label
    print("Detailed Classification Report:\n")
    print(classification_report(
        flat_true, flat_pred,
        labels=unique_labels, target_names=filtered_label_list
    ))

# Call visualization functions
# The two plot_* calls display their figures with plt.show(); the last call
# prints a text report to stdout.
plot_entity_performance(decoded_predictions, decoded_labels, label_list)
plot_confusion_matrix(decoded_predictions, decoded_labels, label_list)
print_classification_report(decoded_predictions, decoded_labels, label_list)


TypeError: list indices must be integers or slices, not str

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from datasets import load_dataset
import pandas as pd

# Load the dataset
# `load_dataset` returns the splits of ONE configuration, so `dataset.keys()`
# yields split names rather than languages. Each language is its own
# configuration and is loaded separately, giving {language: {split: Dataset}}.
# NOTE(review): this loads every configuration, which may be slow, and may
# include a combined multilingual configuration -- confirm that is wanted.
# NOTE(review): recent `datasets` releases may require `trust_remote_code=True`
# for script-based datasets -- confirm for the installed version.
from datasets import get_dataset_config_names

DATASET_NAME = "MultiCoNER/multiconer_v2"

# Extract all languages in the dataset
languages = get_dataset_config_names(DATASET_NAME)
dataset = {lang: load_dataset(DATASET_NAME, lang) for lang in languages}

# Function to count entities by language
def count_entities_by_language(dataset, languages):
    """Count how often each tag index occurs in every language's train split.

    Args:
        dataset: mapping language -> mapping with a "train" entry that iterates
            over examples carrying a "ner_tags_index" sequence.
        languages: iterable of keys of `dataset` to process.

    Returns:
        dict mapping each language to a Counter of tag index -> occurrences.
    """
    entity_counts = {lang: Counter() for lang in languages}

    for lang in languages:
        for example in dataset[lang]["train"]:
            entity_counts[lang].update(example["ner_tags_index"])

    return entity_counts

# Function to map entity indices to names
def map_entity_indices_to_names(entity_counts, label_list):
    """Re-key per-language tag counts from tag index to tag name.

    Args:
        entity_counts: mapping language -> mapping of tag index -> count.
        label_list: tag names indexed by tag index.

    Returns:
        dict mapping each language to a dict of tag name -> count.
    """
    named_entity_counts = {
        lang: {label_list[index]: count for index, count in counts.items()}
        for lang, counts in entity_counts.items()
    }
    return named_entity_counts

# Get label names from one language (they are the same across languages)
# Features live on a split, not on the per-language mapping of splits.
# NOTE(review): assumes `ner_tags_index` is declared as a sequence of ClassLabel;
# if it is a plain integer column there is no `.feature.names` -- confirm.
first_language = list(languages)[0]
label_list = dataset[first_language]["train"].features["ner_tags_index"].feature.names

# Count entities for all languages
entity_counts = count_entities_by_language(dataset, languages)

# Map entity indices to names
named_entity_counts = map_entity_indices_to_names(entity_counts, label_list)

# Convert entity counts to a DataFrame for visualization
# Rows are tag names, columns are languages; tags missing from a language become 0.
entity_counts_df = pd.DataFrame(named_entity_counts).fillna(0)

# Plot 1: Total entity distribution across languages
# Summing down the rows gives one total per language (column).
entity_counts_sum = entity_counts_df.sum(axis=0).sort_values(ascending=False)
plt.figure(figsize=(12, 6))
entity_counts_sum.plot(kind="bar", color="skyblue")
plt.title("Total Entity Distribution Across Languages", fontsize=16)
plt.ylabel("Total Count", fontsize=12)
plt.xlabel("Language", fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig("entity_distribution_languages.png")
plt.show()

# Plot 2: Entity distribution within a specific language (e.g., English)
language = "English (EN)"
plt.figure(figsize=(12, 6))
entity_counts_df[language].sort_values(ascending=False).plot(kind="bar", color="lightcoral")
plt.title(f"Entity Distribution in {language}", fontsize=16)
plt.ylabel("Count", fontsize=12)
plt.xlabel("Entity", fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig(f"entity_distribution_{language}.png")
plt.show()

# Plot 3: Heatmap of entity presence across languages
# Vectorised comparison gives the same 0/1 table as the element-wise lambda
# without relying on the deprecated DataFrame.applymap.
binary_presence_df = (entity_counts_df > 0).astype(int)
plt.figure(figsize=(14, 8))
sns.heatmap(binary_presence_df, annot=True, cmap="YlGnBu", cbar=False, linewidths=0.5)
plt.title("Entity Presence Across Languages", fontsize=16)
plt.ylabel("Entity", fontsize=12)
plt.xlabel("Language", fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig("entity_presence_heatmap.png")
plt.show()

print("EDA visualizations saved and displayed.")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from datasets import load_dataset
import pandas as pd

# NOTE(review): this cell repeats the preceding EDA cell line for line; it
# reloads the data and overwrites the same PNG files. Keep only one copy.

# Load the dataset
# `load_dataset` returns the splits of ONE configuration, so `dataset.keys()`
# yields split names rather than languages. Each language is its own
# configuration and is loaded separately, giving {language: {split: Dataset}}.
# NOTE(review): this loads every configuration, which may be slow, and may
# include a combined multilingual configuration -- confirm that is wanted.
# NOTE(review): recent `datasets` releases may require `trust_remote_code=True`
# for script-based datasets -- confirm for the installed version.
from datasets import get_dataset_config_names

DATASET_NAME = "MultiCoNER/multiconer_v2"

# Extract all languages in the dataset
languages = get_dataset_config_names(DATASET_NAME)
dataset = {lang: load_dataset(DATASET_NAME, lang) for lang in languages}

# Function to count entities by language
def count_entities_by_language(dataset, languages):
    """Count how often each tag index occurs in every language's train split.

    Args:
        dataset: mapping language -> mapping with a "train" entry that iterates
            over examples carrying a "ner_tags_index" sequence.
        languages: iterable of keys of `dataset` to process.

    Returns:
        dict mapping each language to a Counter of tag index -> occurrences.
    """
    entity_counts = {lang: Counter() for lang in languages}

    for lang in languages:
        for example in dataset[lang]["train"]:
            entity_counts[lang].update(example["ner_tags_index"])

    return entity_counts

# Function to map entity indices to names
def map_entity_indices_to_names(entity_counts, label_list):
    """Re-key per-language tag counts from tag index to tag name.

    Args:
        entity_counts: mapping language -> mapping of tag index -> count.
        label_list: tag names indexed by tag index.

    Returns:
        dict mapping each language to a dict of tag name -> count.
    """
    named_entity_counts = {
        lang: {label_list[index]: count for index, count in counts.items()}
        for lang, counts in entity_counts.items()
    }
    return named_entity_counts

# Get label names from one language (they are the same across languages)
# Features live on a split, not on the per-language mapping of splits.
# NOTE(review): assumes `ner_tags_index` is declared as a sequence of ClassLabel;
# if it is a plain integer column there is no `.feature.names` -- confirm.
first_language = list(languages)[0]
label_list = dataset[first_language]["train"].features["ner_tags_index"].feature.names

# Count entities for all languages
entity_counts = count_entities_by_language(dataset, languages)

# Map entity indices to names
named_entity_counts = map_entity_indices_to_names(entity_counts, label_list)

# Convert entity counts to a DataFrame for visualization
# Rows are tag names, columns are languages; tags missing from a language become 0.
entity_counts_df = pd.DataFrame(named_entity_counts).fillna(0)

# Plot 1: Total entity distribution across languages
# Summing down the rows gives one total per language (column).
entity_counts_sum = entity_counts_df.sum(axis=0).sort_values(ascending=False)
plt.figure(figsize=(12, 6))
entity_counts_sum.plot(kind="bar", color="skyblue")
plt.title("Total Entity Distribution Across Languages", fontsize=16)
plt.ylabel("Total Count", fontsize=12)
plt.xlabel("Language", fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig("entity_distribution_languages.png")
plt.show()

# Plot 2: Entity distribution within a specific language (e.g., English)
language = "English (EN)"
plt.figure(figsize=(12, 6))
entity_counts_df[language].sort_values(ascending=False).plot(kind="bar", color="lightcoral")
plt.title(f"Entity Distribution in {language}", fontsize=16)
plt.ylabel("Count", fontsize=12)
plt.xlabel("Entity", fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig(f"entity_distribution_{language}.png")
plt.show()

# Plot 3: Heatmap of entity presence across languages
# Vectorised comparison gives the same 0/1 table as the element-wise lambda
# without relying on the deprecated DataFrame.applymap.
binary_presence_df = (entity_counts_df > 0).astype(int)
plt.figure(figsize=(14, 8))
sns.heatmap(binary_presence_df, annot=True, cmap="YlGnBu", cbar=False, linewidths=0.5)
plt.title("Entity Presence Across Languages", fontsize=16)
plt.ylabel("Entity", fontsize=12)
plt.xlabel("Language", fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig("entity_presence_heatmap.png")
plt.show()

print("EDA visualizations saved and displayed.")
