1. Imports and Setup:

Place all imports (e.g., torch, transformers, etc.) and environment setup (e.g., GPU configuration) at the top of the notebook.

In [None]:
# Import libraries
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset

# Pick the compute device: CUDA when a GPU is visible, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Sanity check: create a small random tensor, move it, and report where it lives
x = torch.randn(3, 3).to(device)
print("Tensor is on device:", x.device)

2. Define the Model:

In [None]:
# Label vocabulary for the classification head (example label mapping)
label_to_id = {
    "O": 0,
    "PERSON": 1,
    "ORG": 2,
    "PHONE": 3,
    "EMAIL": 4,
}

# Load the pretrained checkpoint with one output class per label, then move it to the device
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_to_id),
)
model.to(device)

# Confirm which device holds the model weights
print("Model is on device:", next(model.parameters()).device)

3. Load Training Data:

In [None]:
import importlib
import training_data

# Re-execute training_data so edits made to the module since it was first
# imported take effect without restarting the kernel
importlib.reload(training_data)

# Read the examples from the freshly reloaded module
TRAIN_DATA = training_data.TRAIN_DATA

# Report how many examples are available
print("Number of training examples:", len(TRAIN_DATA))

4. Preprocess Training and Validation Data:

In [None]:
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

# Load the pre-trained tokenizer (BERT, uncased)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Hold out 20% of the examples for validation; the fixed seed keeps the split repeatable
train_data, val_data = train_test_split(
    TRAIN_DATA,
    test_size=0.2,
    random_state=42,
)

# Define the preprocess_data function
def preprocess_data(data, tokenizer, label_to_id, debug_examples=5, ignore_index=-100):
    """Tokenize annotated texts and align character-span labels to tokens.

    Args:
        data: Iterable of ``(text, annotations)`` pairs, where
            ``annotations["entities"]`` is an iterable of
            ``(start, end, label)`` character spans into ``text``.
        tokenizer: Callable tokenizer that accepts ``return_offsets_mapping=True``
            and ``return_tensors="pt"`` and returns a mapping whose
            ``"input_ids"`` and ``"offset_mapping"`` entries carry a leading
            batch axis of size 1.
        label_to_id: Mapping from label name to integer class id. Must contain
            ``"O"`` and every label used in ``data``.
        debug_examples: Number of leading examples whose tokenization is
            printed for inspection. Use 0 to silence the output.
        ignore_index: Label given to tokens with an empty character span
            (special tokens and padding) so the loss skips them. -100 is the
            default ``ignore_index`` of ``torch.nn.CrossEntropyLoss``.

    Returns:
        A list with one tokenizer output per example. ``"offset_mapping"`` is
        removed and a 1-D ``"labels"`` tensor (one id per token) is added.

    Raises:
        KeyError: If an entity label is missing from ``label_to_id``.
    """
    tokenized_data = []

    # Iterate over the data that was passed in, not a notebook-level global,
    # so the training and validation splits are each processed in full.
    for example_idx, (text, annotations) in enumerate(data):
        tokenized = tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=128,
            return_offsets_mapping=True,
            return_tensors="pt",
        )

        # Debugging aid: show the tokenization of the first few examples only
        if example_idx < debug_examples:
            print("Text:", text)
            print("Tokenized Input IDs:", tokenized["input_ids"])
            print("Offsets:", tokenized["offset_mapping"])
            print("Annotations:", annotations)

        offsets = tokenized["offset_mapping"][0].tolist()

        # Real tokens start as "O"; zero-width tokens (special tokens and
        # padding) are masked out of the loss.
        labels = [
            ignore_index if token_start == token_end else label_to_id["O"]
            for token_start, token_end in offsets
        ]

        # Align labels with tokens: a token is tagged when its character span
        # lies entirely inside the entity span.
        for start, end, label in annotations["entities"]:
            label_id = label_to_id[label]
            for idx, (token_start, token_end) in enumerate(offsets):
                if token_start == token_end:
                    # A (0, 0) offset would otherwise match any entity that
                    # starts at character 0.
                    continue
                if token_start >= start and token_end <= end:
                    labels[idx] = label_id

        # Remove offset mapping (not needed for training)
        tokenized.pop("offset_mapping")

        # Add labels to the tokenized data
        tokenized["labels"] = torch.tensor(labels)

        tokenized_data.append(tokenized)

    return tokenized_data

# Run the same preprocessing over each split
tokenized_train_data = preprocess_data(
    data=train_data,
    tokenizer=tokenizer,
    label_to_id=label_to_id,
)
tokenized_val_data = preprocess_data(
    data=val_data,
    tokenizer=tokenizer,
    label_to_id=label_to_id,
)


5. Create DataLoader:

In [None]:
class NERDataset(Dataset):
    """Dataset over a sequence of pre-tokenized examples (mappings of name -> tensor)."""

    def __init__(self, data):
        # Keep a reference to the examples; copies are made per item on access
        self.data = data

    def __len__(self):
        """Return the number of stored examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of detached tensor copies."""
        example = self.data[idx]
        copied = {}
        for name, tensor in example.items():
            # clone + detach so callers cannot modify the stored tensors
            copied[name] = tensor.clone().detach()
        return copied

def collate_fn(batch):
    """Stack per-example tensors into batch tensors.

    ``input_ids`` and ``attention_mask`` have their leading axis squeezed
    before stacking; ``labels`` are stacked as they are.

    Args:
        batch: Sequence of dicts with ``'input_ids'``, ``'attention_mask'``
            and ``'labels'`` tensors.

    Returns:
        Dict with the same three keys, each holding one stacked tensor.
    """
    id_rows, mask_rows, label_rows = [], [], []
    for example in batch:
        id_rows.append(example['input_ids'].squeeze(0))
        mask_rows.append(example['attention_mask'].squeeze(0))
        label_rows.append(example['labels'])

    return {
        'input_ids': torch.stack(id_rows),
        'attention_mask': torch.stack(mask_rows),
        'labels': torch.stack(label_rows),
    }

# Wrap the preprocessed examples in Dataset objects. The loaders below need
# these, and they were previously never defined (NameError on a fresh kernel).
train_dataset = NERDataset(tokenized_train_data)
val_dataset = NERDataset(tokenized_val_data)

# Create DataLoaders with the custom collate_fn; only the training split is shuffled
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)

6. Train the Model:

In [None]:
# Optimizer
optimizer = AdamW(model.parameters(), lr=3e-5)

# Number of full passes over the training set
num_epochs = 30

# Gradient accumulation steps
accumulation_steps = 4  # Simulate a larger batch size by accumulating gradients

# Training loop with gradient accumulation
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    for step, batch in enumerate(train_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # Scale the loss so the accumulated gradient averages over the accumulated batches
        loss = outputs.loss / accumulation_steps
        loss.backward()

        # Update weights after accumulating gradients, and also on the last
        # batch so a trailing partial group is not dropped
        if (step + 1) % accumulation_steps == 0 or (step + 1) == len(train_loader):
            optimizer.step()
            optimizer.zero_grad()

        # Log the unscaled batch loss; the scaled value understates it by a
        # factor of accumulation_steps
        print(f"Epoch {epoch + 1}, Step {step + 1}, Loss: {outputs.loss.item()}")
        # GPU memory statistics only mean something when training on CUDA
        if device.type == "cuda":
            print(f"Allocated memory: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
            print(f"Reserved memory: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")

# Validation: average the per-batch loss over the held-out split with gradients disabled
model.eval()
val_loss = 0
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        val_loss += outputs.loss.item()

# Mean over batches
val_loss /= len(val_loader)
print(f"Validation Loss: {val_loss}")

7. Save the Model:

In [None]:
# Save the fine-tuned model with label mappings
model.config.id2label = {v: k for k, v in label_to_id.items()}  # Map IDs to labels
model.config.label2id = label_to_id  # Map labels to IDs

model.save_pretrained("ner_model")
tokenizer.save_pretrained("ner_model")
print("Model saved to 'ner_model'")

# Build the inference pipeline from the saved artifacts. This defines
# `ner_pipeline` (previously used without being created) and checks that the
# saved directory can be loaded back.
ner_pipeline = pipeline(
    "token-classification",
    model="ner_model",
    tokenizer="ner_model",
    device=device,
)

# Smoke-test the pipeline on one sentence (`text` was previously undefined)
text = "John Doe works at Acme Corp. His email is john.doe@acme.com and phone is 123-456-7890."
entities = ner_pipeline(text)
print("Raw Predictions:", entities)

8. Test the Model:

In [None]:
# Post-process predictions
def post_process_predictions(entities):
    """Merge adjacent word-piece predictions that share a label.

    Consecutive predictions are merged when they carry the same ``"entity"``
    label and the later one starts exactly where the earlier one ends. The
    merged entry concatenates the words, extends ``"end"`` and keeps the
    highest ``"score"``.

    Args:
        entities: Iterable of prediction dicts with at least the keys
            ``"entity"``, ``"word"``, ``"start"``, ``"end"`` and ``"score"``.

    Returns:
        A new list of merged prediction dicts. The input dicts are not
        modified.
    """
    merged_entities = []
    current = None

    for entity in entities:
        # Work on a shallow copy so the caller's predictions stay untouched
        piece = dict(entity)

        # Drop only the leading word-piece continuation marker
        word = piece["word"]
        if word.startswith("##"):
            word = word[2:]
        piece["word"] = word

        is_continuation = (
            current is not None
            and piece["entity"] == current["entity"]
            and piece["start"] == current["end"]
        )
        if is_continuation:
            # Merge consecutive tokens
            current["word"] += piece["word"]
            current["end"] = piece["end"]
            current["score"] = max(current["score"], piece["score"])
        else:
            if current is not None:
                merged_entities.append(current)
            current = piece

    # Flush the last entity still being built
    if current is not None:
        merged_entities.append(current)

    return merged_entities

# Extra sample sentences for a qualitative check of the fine-tuned model
texts = [
    "John Doe works at Acme Corp. His email is john.doe@acme.com and phone is 123-456-7890.",
    "Reach out to Jane Smith at jane.smith@domain.org or call 987-654-3210.",
    "Contact Alice Johnson at alice.johnson@example.com or 555-123-4567.",
    "Michael Brown's phone number is 800-555-0199 and email is michael.brown@domain.com.",
    "Sarah Connor works at Skynet. Her email is sarah.connor@skynet.com.",
]

# Run inference on each sentence and show the merged predictions next to the input
for text in texts:
    cleaned_entities = post_process_predictions(ner_pipeline(text))
    print(f"Text: {text}")
    print(f"Entities: {cleaned_entities}")

9. Save Results to JSON:

In [None]:
import json

# Collect one record per input sentence
results = []

for text in texts:
    cleaned_entities = post_process_predictions(ner_pipeline(text))

    # json cannot encode np.float32, so cast each score to a builtin float
    for entity in cleaned_entities:
        entity["score"] = float(entity["score"])

    results.append({"text": text, "entities": cleaned_entities})

# Write the collected records to disk
with open("ner_results.json", "w") as f:
    json.dump(results, f, indent=4)

print("Results saved to ner_results.json")