1. Imports and Setup:

Place all imports (e.g., torch, transformers, etc.) and environment setup (e.g., GPU configuration) at the top of the notebook.

In [46]:
# Import libraries
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset

# Use GPU if available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# Verify tensor placement
x = torch.randn(3, 3).to(device)
print("Tensor is on device:", x.device)

Using device: cuda
Tensor is on device: cuda:0


2. Define the Model:

In [47]:
# Define the model
label_to_id = {"O": 0, "PERSON": 1, "ORG": 2, "PHONE": 3, "EMAIL": 4}  # Example label mapping
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(label_to_id))
model.to(device)

# Verify model placement
print("Model is on device:", next(model.parameters()).device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model is on device: cuda:0


3. Load Training Data:

In [48]:
from training_data import TRAIN_DATA

# Print the number of training examples
print("Number of training examples:", len(TRAIN_DATA))

Number of training examples: 24


4. Preprocess Training and Validation Data

In [55]:
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

# Tokenizer (using a pre-trained tokenizer, e.g., BERT)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Split data into training and validation sets
train_data, val_data = train_test_split(TRAIN_DATA, test_size=0.2, random_state=42)

# Define the preprocess_data function
def preprocess_data(data, tokenizer, label_to_id):
    tokenized_data = []

    for text, annotations in data:
        # Tokenize the text
        tokenized = tokenizer(
            text,
            padding="max_length",  # Pad to the maximum length
            truncation=True,       # Truncate if the text is too long
            max_length=128,        # Maximum sequence length
            return_offsets_mapping=True,  # Get character offsets
            return_tensors="pt"    # Return PyTorch tensors
        )

        # Initialize labels with "O" (outside any entity)
        labels = [label_to_id["O"]] * len(tokenized["input_ids"][0])

        # Align labels with tokens
        offsets = tokenized["offset_mapping"][0].tolist()
        for start, end, label in annotations["entities"]:
            for idx, (token_start, token_end) in enumerate(offsets):
                if token_start >= start and token_end <= end:
                    labels[idx] = label_to_id[label]

        # Remove offset mapping (not needed for training)
        tokenized.pop("offset_mapping")

        # Add labels to the tokenized data
        tokenized["labels"] = torch.tensor(labels)

        tokenized_data.append(tokenized)

    return tokenized_data

# Preprocess training and validation data
tokenized_train_data = preprocess_data(train_data, tokenizer, label_to_id)
tokenized_val_data = preprocess_data(val_data, tokenizer, label_to_id)


5. Create DataLoader:

In [56]:
class NERDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return {key: val.clone().detach() for key, val in self.data[idx].items()}

def collate_fn(batch):
    input_ids = torch.stack([item['input_ids'].squeeze(0) for item in batch])
    attention_mask = torch.stack([item['attention_mask'].squeeze(0) for item in batch])
    labels = torch.stack([item['labels'] for item in batch])
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

# Create DataLoaders with the custom collate_fn
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)

6. Train the Model:

In [57]:
# Optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

# Gradient accumulation steps
accumulation_steps = 4  # Simulate a larger batch size by accumulating gradients

# Training loop with gradient accumulation
for epoch in range(50):  # Train for 20 epochs instead of 10
    model.train()
    optimizer.zero_grad()
    for step, batch in enumerate(train_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss / accumulation_steps  # Scale loss by accumulation steps
        loss.backward()

        # Update weights after accumulating gradients
        if (step + 1) % accumulation_steps == 0 or (step + 1) == len(train_loader):
            optimizer.step()
            optimizer.zero_grad()

        print(f"Epoch {epoch + 1}, Step {step + 1}, Loss: {loss.item()}")
        print(f"Allocated memory: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
        print(f"Reserved memory: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")

# Validation loop
model.eval()
val_loss = 0
for batch in val_loader:
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels = batch["labels"].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        val_loss += outputs.loss.item()

val_loss /= len(val_loader)
print(f"Validation Loss: {val_loss}")

Epoch 1, Step 1, Loss: 0.03443974629044533
Allocated memory: 2516.46 MB
Reserved memory: 4106.00 MB
Epoch 2, Step 1, Loss: 0.03235198184847832
Allocated memory: 2516.46 MB
Reserved memory: 4106.00 MB
Epoch 3, Step 1, Loss: 0.030262792482972145
Allocated memory: 2516.46 MB
Reserved memory: 4106.00 MB
Epoch 4, Step 1, Loss: 0.02982369065284729
Allocated memory: 2516.46 MB
Reserved memory: 4106.00 MB
Epoch 5, Step 1, Loss: 0.028297170996665955
Allocated memory: 2516.46 MB
Reserved memory: 4106.00 MB
Epoch 6, Step 1, Loss: 0.02617095224559307
Allocated memory: 2516.46 MB
Reserved memory: 4106.00 MB
Epoch 7, Step 1, Loss: 0.024763695895671844
Allocated memory: 2516.46 MB
Reserved memory: 4106.00 MB
Epoch 8, Step 1, Loss: 0.02251618355512619
Allocated memory: 2516.46 MB
Reserved memory: 4106.00 MB
Epoch 9, Step 1, Loss: 0.021297059953212738
Allocated memory: 2516.46 MB
Reserved memory: 4106.00 MB
Epoch 10, Step 1, Loss: 0.020143218338489532
Allocated memory: 2516.46 MB
Reserved memory: 4106.

7. Save the Model:

In [59]:
# Save the fine-tuned model with label mappings
model.config.id2label = {v: k for k, v in label_to_id.items()}  # Map IDs to labels
model.config.label2id = label_to_id  # Map labels to IDs

model.save_pretrained("ner_model")
tokenizer.save_pretrained("ner_model")
print("Model saved to 'ner_model'")

Model saved to 'ner_model'


8. Test the Model:

In [60]:
# Post-process predictions
def post_process_predictions(entities):
    merged_entities = []
    temp_entity = None

    for entity in entities:
        # Remove subword tokens (e.g., "##")
        entity["word"] = entity["word"].replace("##", "")

        if temp_entity and entity["entity"] == temp_entity["entity"] and entity["start"] == temp_entity["end"]:
            # Merge consecutive tokens
            temp_entity["word"] += entity["word"]
            temp_entity["end"] = entity["end"]
            temp_entity["score"] = max(temp_entity["score"], entity["score"])
        else:
            if temp_entity:
                merged_entities.append(temp_entity)
            temp_entity = entity

    if temp_entity:
        merged_entities.append(temp_entity)

    # Remove duplicates and overlapping predictions
    unique_entities = []
    seen = set()
    for entity in merged_entities:
        key = (entity["start"], entity["end"], entity["entity"])
        if key not in seen:
            unique_entities.append(entity)
            seen.add(key)

    return unique_entities

# Test the model on additional examples
texts = [
    "John Doe works at Acme Corp. His email is john.doe@acme.com and phone is 123-456-7890.",
    "Reach out to Jane Smith at jane.smith@domain.org or call 987-654-3210.",
    "Contact Alice Johnson at alice.johnson@example.com or 555-123-4567.",
    "Michael Brown's phone number is 800-555-0199 and email is michael.brown@domain.com.",
    "Sarah Connor works at Skynet. Her email is sarah.connor@skynet.com."
]

for text in texts:
    entities = ner_pipeline(text)
    cleaned_entities = post_process_predictions(entities)
    print(f"Text: {text}")
    print(f"Entities: {cleaned_entities}")

Text: John Doe works at Acme Corp. His email is john.doe@acme.com and phone is 123-456-7890.
Entities: [{'entity': 'PERSON', 'score': np.float32(0.44597924), 'index': 1, 'word': 'john', 'start': 0, 'end': 4}, {'entity': 'PERSON', 'score': np.float32(0.37545097), 'index': 2, 'word': 'doe', 'start': 5, 'end': 8}, {'entity': 'EMAIL', 'score': np.float32(0.7598781), 'index': 13, 'word': '.doe@acme.com', 'start': 46, 'end': 59}, {'entity': 'PHONE', 'score': np.float32(0.7718025), 'index': 23, 'word': '123-456-7890', 'start': 73, 'end': 85}]
Text: Reach out to Jane Smith at jane.smith@domain.org or call 987-654-3210.
Entities: [{'entity': 'EMAIL', 'score': np.float32(0.3876797), 'index': 5, 'word': 'smith', 'start': 18, 'end': 23}, {'entity': 'EMAIL', 'score': np.float32(0.83082944), 'index': 7, 'word': 'jane.smith@domain.org', 'start': 27, 'end': 48}, {'entity': 'PHONE', 'score': np.float32(0.86973166), 'index': 16, 'word': '987-654-3210', 'start': 57, 'end': 69}]
Text: Contact Alice Johnso

9. Save and Load JSON:

In [61]:
import json

# Store results in a list
results = []

for text in texts:
    entities = ner_pipeline(text)
    cleaned_entities = post_process_predictions(entities)

    # Convert np.float32 to float for JSON serialization
    for entity in cleaned_entities:
        entity["score"] = float(entity["score"])

    results.append({"text": text, "entities": cleaned_entities})

# Save to a JSON file
with open("ner_results.json", "w") as f:
    json.dump(results, f, indent=4)

print("Results saved to ner_results.json")

Results saved to ner_results.json
