<a href="https://colab.research.google.com/github/CogNetSys/ModernBERT/blob/main/ModernBERT_Experiment_12_large.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install Hugging Face Transformers and other dependencies
!pip install transformers==4.48.0
!pip install torch
!pip install scikit-learn
!pip install tqdm
!pip install seaborn
!pip install nltk
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m110.0 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
# Validate installation
!python3 -m spacy validate

⠙ Loading compatibility table...[2K[38;5;2m✔ Loaded compatibility table[0m
[1m
[38;5;4mℹ spaCy installation: /usr/local/lib/python3.11/dist-packages/spacy[0m

NAME             SPACY            VERSION                            
en_core_web_sm   >=3.7.2,<3.8.0   [38;5;2m3.7.1[0m   [38;5;2m✔[0m



In [3]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AutoModel, AutoTokenizer
import re
import random
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import nltk
import spacy

# Download NLTK data
nltk.download('wordnet')

# Set random seeds for reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Define device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


Using device: cuda


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Define comprehensive entity lists
persons = [
    "John Doe", "Alice Smith", "Maria Garcia", "Bob Johnson", "Charlie Lee",
    "David Brown", "Emma Wilson", "Frank Moore", "Grace Taylor", "Henry Anderson"
]
aliases = [
    "Johnny", "Ally", "Mia", "Bobby", "Chuck",
    "Dave", "Em", "Frankie", "Gracie", "Hank"
]
titles = [
    "Dr.", "Prof.", "Mr.", "Ms.", "Mrs.",
    "CEO", "CTO", "Manager", "Director", "Lead"
]
roles = [
    "Software Engineer", "Data Scientist", "Product Manager", "Sales Executive", "HR Specialist",
    "Marketing Coordinator", "Financial Analyst", "Customer Support Representative", "DevOps Engineer", "UX Designer"
]
organizations = [
    "Acme Corp", "Global Tech", "Finance Department", "HR Team", "IT Services",
    "Research Division", "Marketing Department", "Sales Team", "Operations Unit", "Customer Support"
]
business_names = organizations.copy()  # Assuming business names align with organizations
business_ids = [
    "BUS123456", "BUS234567", "BUS345678", "BUS456789", "BUS567890",
    "BUS678901", "BUS789012", "BUS890123", "BUS901234", "BUS012345"
]
locations = [
    "New York", "Los Angeles", "Chicago", "Houston", "Phoenix",
    "San Francisco", "Boston", "Seattle", "Denver", "Miami"
]
ips = [
    "192.168.1.1", "10.0.0.5", "172.16.0.3", "192.168.0.100", "10.0.1.25",
    "192.168.1.255", "10.0.0.8", "172.16.0.45", "192.168.0.55", "10.0.1.99"
]
mac_addresses = [
    "00:1A:2B:3C:4D:5E", "11:22:33:44:55:66", "AA:BB:CC:DD:EE:FF",
    "12:34:56:78:9A:BC", "DE:F0:12:34:56:78",
    "98:76:54:32:10:FE", "AB:CD:EF:12:34:56", "12:AB:34:CD:56:EF",
    "FE:DC:BA:98:76:54", "65:43:21:09:87:65"
]
projects = [
    "Project Phoenix", "Apollo", "Zeus", "Hermes", "Athena",
    "Project Titan", "Orion", "Elysium", "Nebula", "Vortex"
]
dates = [
    "Monday", "Tuesday", "Wednesday", "Saturday", "Sunday",
    "April 5th", "July 20th", "September 15th", "December 1st", "January 10th"
]
times = [
    "10:00 AM", "2:30 PM", "5:45 PM", "9:15 AM", "1:00 PM",
    "3:20 PM", "4:50 PM", "11:30 AM", "6:00 PM", "8:15 PM"
]
durations = [
    "2 hours", "30 minutes", "45 minutes", "1 hour", "3 hours",
    "15 minutes", "1.5 hours", "4 hours", "25 minutes", "50 minutes"
]
events = [
    "Annual Meeting", "Quarterly Review", "Product Launch", "Team Building Retreat", "Client Presentation",
    "Security Audit", "System Upgrade", "Sales Conference", "Marketing Workshop", "HR Training"
]
emails = [
    "john.doe@example.com", "alice.smith@globaltech.com", "maria.garcia@acmecorp.com",
    "bob.johnson@finance.dept.com", "charlie.lee@itservices.com"
]
phones = [
    "+1-202-555-0156", "+1-303-555-0198", "+1-404-555-0133", "+1-505-555-0177", "+1-606-555-0111",
    "+1-707-555-0144", "+1-808-555-0188", "+1-909-555-0122", "+1-101-555-0166", "+1-212-555-0100"
]
urls = [
    "https://acme.com/login", "https://globaltech.com/dashboard", "https://finance.dept.com/reports",
    "https://hrteam.com/profile", "https://itservices.com/support",
    "https://research.division.com/data", "https://marketing.dept.com/campaigns", "https://sales.team.com/leads",
    "https://operations.unit.com/status", "https://customersupport.com/help"
]
devices = [
    "Laptop-01", "Server-12", "Router-5", "Firewall-3", "Workstation-7",
    "Tablet-4", "Smartphone-9", "Printer-2", "Scanner-6", "NAS-8"
]
device_ids = devices.copy()  # Assuming device IDs align with devices
passwords = [
    "P@ssw0rd!", "Secure#123", "Admin@2024", "User*Pass1", "Qwerty!234",
    "Welcome#1", "Passw0rd$", "Login*123", "MyPass#456", "Access@789"
]
access_keys = [
    "AK12345XYZ", "AK67890ABC", "AK54321DEF", "AK09876GHI", "AK11223JKL",
    "AK44556MNO", "AK77889PQR", "AK99000STU", "AK13579VWX", "AK24680YZA"
]
social_security_numbers = [
    "123-45-6789", "987-65-4321", "555-55-5555", "111-22-3333", "444-55-6666",
    "777-88-9999", "222-33-4444", "333-44-5555", "666-77-8888", "999-00-1111"
]
credit_cards = [
    "4111-1111-1111-1111", "5500-0000-0000-0004", "3400-0000-0000-009", "3000-0000-0000-04",
    "6011-0000-0000-0004", "2014-0000-0000-009", "3088-0000-0000-0009", "3600-0000-0000-0008",
    "3800-0000-0000-0028", "6304-0000-0000-0003"
]
bank_accounts = [
    "BA123456789", "BA987654321", "BA555555555", "BA111222333", "BA444555666",
    "BA777888999", "BA000111222", "BA333444555", "BA666777888", "BA999000111"
]
license_plates = [
    "ABC-1234", "XYZ-5678", "LMN-9012", "DEF-3456", "GHI-7890",
    "JKL-2345", "MNO-6789", "PQR-0123", "STU-4567", "VWX-8901"
]
hazmats = [
    "Hazmat Material A", "Hazmat Substance B", "Hazmat Agent C", "Hazmat Compound D", "Hazmat Material E",
    "Hazmat Substance F", "Hazmat Agent G", "Hazmat Compound H", "Hazmat Material I", "Hazmat Substance J"
]
money = [
    "$1000", "$2500", "$500", "$750", "$1200",
    "$3000", "$450", "$600", "$800", "$950"
]
currencies = [
    "USD", "EUR", "GBP", "JPY", "AUD",
    "CAD", "CHF", "CNY", "SEK", "NZD"
]
invoices = [
    "INV1001", "INV1002", "INV1003", "INV1004", "INV1005",
    "INV1006", "INV1007", "INV1008", "INV1009", "INV1010"
]
transactions = [
    "TXN5001", "TXN5002", "TXN5003", "TXN5004", "TXN5005",
    "TXN5006", "TXN5007", "TXN5008", "TXN5009", "TXN5010"
]
accounts = [
    "ACCT3001", "ACCT3002", "ACCT3003", "ACCT3004", "ACCT3005",
    "ACCT3006", "ACCT3007", "ACCT3008", "ACCT3009", "ACCT3010"
]
ticket_ids = [
    "TICKET1001", "TICKET1002", "TICKET1003", "TICKET1004", "TICKET1005",
    "TICKET1006", "TICKET1007", "TICKET1008", "TICKET1009", "TICKET1010"
]
issue_types = [
    "Login Issue", "Password Reset", "Account Lockout", "Data Breach", "System Downtime",
    "Payment Failure", "Feature Request", "Bug Report", "Access Denied", "Performance Lag"
]
priorities = [
    "Low", "Medium", "High", "Critical", "Urgent",
    "Low", "Medium", "High", "Critical", "Urgent"
]
resolution_statuses = [
    "Resolved", "Unresolved", "In Progress", "Pending", "Escalated",
    "Resolved", "Unresolved", "In Progress", "Pending", "Escalated"
]
leads = [
    "Lead1001", "Lead1002", "Lead1003", "Lead1004", "Lead1005",
    "Lead1006", "Lead1007", "Lead1008", "Lead1009", "Lead1010"
]
opportunities = [
    "Opp2001", "Opp2002", "Opp2003", "Opp2004", "Opp2005",
    "Opp2006", "Opp2007", "Opp2008", "Opp2009", "Opp2010"
]
campaigns = [
    "Camp3001", "Camp3002", "Camp3003", "Camp3004", "Camp3005",
    "Camp3006", "Camp3007", "Camp3008", "Camp3009", "Camp3010"
]
discount_codes = [
    "DISC10", "DISC20", "DISC30", "DISC40", "DISC50",
    "DISC60", "DISC70", "DISC80", "DISC90", "DISC100"
]
custom1 = [
    "CustomEntity1", "CustomEntity2", "CustomEntity3", "CustomEntity4", "CustomEntity5",
    "CustomEntity6", "CustomEntity7", "CustomEntity8", "CustomEntity9", "CustomEntity10"
]
custom2 = [
    "CustomEntityA", "CustomEntityB", "CustomEntityC", "CustomEntityD", "CustomEntityE",
    "CustomEntityF", "CustomEntityG", "CustomEntityH", "CustomEntityI", "CustomEntityJ"
]
username = [
    "user123", "admin456", "guest789", "member012", "user345",
    "admin678", "guest901", "member234", "user567", "admin890"
]
address_line1 = [
    "123 Maple Street", "456 Oak Avenue", "789 Pine Road", "321 Birch Lane", "654 Cedar Blvd",
    "987 Spruce Drive", "213 Elm Street", "546 Ash Avenue", "879 Fir Road", "132 Willow Lane"
]
city = locations.copy()  # Assuming city aligns with locations
state = [
    "NY", "CA", "IL", "TX", "AZ",
    "MA", "WA", "CO", "FL", "NJ"
]


In [5]:
# Define 'O' label
O_label = "O"

# Comprehensive entity_label_map with initial mappings
entity_label_map = {
    "person": {"B": "B-PER", "I": "I-PER"},
    "alias": {"B": "B-ALIAS", "I": "I-ALIAS"},
    "title": {"B": "B-TITLE", "I": "I-TITLE"},
    "role": {"B": "B-ROLE", "I": "I-ROLE"},
    "organization": {"B": "B-ORG", "I": "I-ORG"},
    "business_name": {"B": "B-BUS", "I": "I-BUS"},
    "business_id": {"B": "B-BUSID", "I": "I-BUSID"},
    "location": {"B": "B-LOC", "I": "I-LOC"},
    "ip": {"B": "B-IP", "I": "I-IP"},
    "mac_address": {"B": "B-MAC", "I": "I-MAC"},
    "project": {"B": "B-PROJ", "I": "I-PROJ"},
    "date": {"B": "B-DATE", "I": "I-DATE"},
    "time": {"B": "B-TIME", "I": "I-TIME"},
    "duration": {"B": "B-DUR", "I": "I-DUR"},
    "event": {"B": "B-EVENT", "I": "I-EVENT"},
    "email": {"B": "B-EMAIL", "I": "I-EMAIL"},
    "phone": {"B": "B-PHONE", "I": "I-PHONE"},
    "url": {"B": "B-URL", "I": "I-URL"},
    "device": {"B": "B-DEV", "I": "I-DEV"},
    "device_id": {"B": "B-DEV_ID", "I": "I-DEV_ID"},
    "password": {"B": "B-PASS", "I": "I-PASS"},
    "access_key": {"B": "B-KEY", "I": "I-KEY"},
    "social_security": {"B": "B-SSN", "I": "I-SSN"},
    "credit_card": {"B": "B-CC", "I": "I-CC"},
    "bank_account": {"B": "B-BANK", "I": "I-BANK"},
    "license_plate": {"B": "B-PLATE", "I": "I-PLATE"},
    "hazmat": {"B": "B-HAZMAT", "I": "I-HAZMAT"},
    "money": {"B": "B-MONEY", "I": "I-MONEY"},
    "currency": {"B": "B-CUR", "I": "I-CUR"},
    "invoice": {"B": "B-INVOICE", "I": "I-INVOICE"},
    "transaction": {"B": "B-TRANS", "I": "I-TRANS"},
    "account": {"B": "B-ACCT", "I": "I-ACCT"},
    "ticket_id": {"B": "B-TICKET", "I": "I-TICKET"},
    "issue_type": {"B": "B-ISSUE", "I": "I-ISSUE"},
    "priority": {"B": "B-PRIORITY", "I": "I-PRIORITY"},
    "resolution_status": {"B": "B-STATUS", "I": "I-STATUS"},
    "lead": {"B": "B-LEAD", "I": "I-LEAD"},
    "opportunity": {"B": "B-OPP", "I": "I-OPP"},
    "campaign": {"B": "B-CAMP", "I": "I-CAMP"},
    "discount_code": {"B": "B-DISC", "I": "I-DISC"},
    "custom1": {"B": "B-CUST1", "I": "I-CUST1"},
    "custom2": {"B": "B-CUST2", "I": "I-CUST2"},
    "username": {"B": "B-USERNAME", "I": "I-USERNAME"},
    "address_line1": {"B": "B-ADDR1", "I": "I-ADDR1"},
    "city": {"B": "B-CITY", "I": "I-CITY"},
    "state": {"B": "B-STATE", "I": "I-STATE"},
    # Add other labels as needed
}


In [6]:
# Define anomaly and normal templates
anomaly_scenarios = [
    "Unauthorized login attempt detected for user {username} from IP {ip}.",
    "Suspicious activity from IP address {ip} detected in {city}.",
    "Multiple failed login attempts for {username} on device {device}.",
    "Unexpected shutdown of the main {organization} server in {location}.",
    "{username} reported a security breach in the {organization} affecting project {project}.",
    "Intrusion detected in the {location} server room by IP {ip}.",
    "Anomaly detected: unusual access patterns from IP {ip} targeting {project}.",
    "Alert: {username} accessed restricted data without authorization from {city}.",
    "System compromised: {organization} data integrity at risk due to {device}.",
    "Abnormal behavior observed from user {username} at {city} accessing {url}.",
    "Security alert: {username} attempted unauthorized access to {business_name} using {access_key}.",
    "Hazmat spill reported at {address_line1}, {city}, {state} by {username}.",
    "Emergency response initiated for {hazmat} incident at {location}.",
    "Data leak detected involving {credit_card} from {device}.",
    "{username} changed their password using device {device_id} from IP {ip}.",
    "Multiple transactions flagged: {transaction} from {bank_account}.",
    "Invalid access key {access_key} used by {username} from {ip}.",
    "{username}'s social security number {social_security} was exposed during {event}.",
    "License plate {license_plate} associated with unauthorized entry at {location}.",
    "Customer {username} reported issue type {issue_type} with ticket {ticket_id}.",
    "Anomaly in financial report: {money} discrepancy detected in {account}."
]

normal_scenarios = [
    "{username} accessed the secure server from IP {ip}.",
    "The server located in {city} was rebooted at {date} {time}.",
    "{username} updated their password successfully from device {device}.",
    "System maintenance scheduled in {city} on {date} for {duration}.",
    "Backup completed successfully for {project} using {device}.",
    "{username} joined the {organization} team as a {role}.",
    "{username} left the {organization}.",
    "New project {project} has been initiated by {username}.",
    "Meeting scheduled with {username} in {city} on {date} at {time}.",
    "{username} submitted the quarterly report to {organization} via {url}.",
    "{username} received an invoice {invoice} for project {project}.",
    "Transaction {transaction} of {money} approved for account {account}.",
    "Marketing campaign {campaign} launched with discount code {discount_code}.",
    "Customer support ticket {ticket_id} assigned to {username} with priority {priority}.",
    "{username} attended the {event} held at {location}.",
    "Sales opportunity {opportunity} created by {username} in {campaign}.",
    "{username} updated contact information including email {email} and phone {phone}.",
    "Finance department reconciled bank account {bank_account} with transactions {transaction}.",
    "{username} accessed CRM system using username {username} and device ID {device_id}.",
    "Lead {lead} converted to opportunity {opportunity} by {username}.",
    "HR team updated employee {username}'s role to {role}."
]


In [30]:
from transformers import AutoTokenizer

# Initialize the tokenizer with ModernBERT
model_name = "answerdotai/modernbert-large"  # Replace with "answerdotai/modernbert-base" if available
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [31]:
def generate_sample(anomaly=False, entity_lists=None, tokenizer=None, entity_label_map=None, O_label="O"):
    """
    Generates a single synthetic data sample.

    Args:
        anomaly (bool): Whether to generate an anomaly sample.
        entity_lists (dict): Dictionary containing all entity lists.
        tokenizer: The tokenizer instance.
        entity_label_map (dict): Mapping for entity labels.
        O_label (str): The label for non-entity tokens.

    Returns:
        Dict: A dictionary containing 'text', 'ner_labels', 'anomaly_label', and 'entities'.
    """
    if entity_lists is None:
        entity_lists = {}

    if entity_label_map is None:
        raise ValueError("entity_label_map must be provided")

    if tokenizer is None:
        raise ValueError("tokenizer must be provided")

    if anomaly:
        template = random.choice(anomaly_scenarios)
        anomaly_label = 1
    else:
        template = random.choice(normal_scenarios)
        anomaly_label = 0

    # Find all placeholders in the template
    placeholders = re.findall(r"\{(.*?)\}", template)
    unique_placeholders = list(set(placeholders))

    # Initialize entity selections
    selected_entities = {}

    # Select entities for each placeholder type
    for placeholder in unique_placeholders:
        if placeholder in entity_lists and len(entity_lists[placeholder]) > 0:
            selected_entities[placeholder] = random.choice(entity_lists[placeholder])
        else:
            selected_entities[placeholder] = "Unknown"  # Fallback for undefined placeholders

    # Replace placeholders with selected entities
    filled_text = template
    for placeholder, entity in selected_entities.items():
        filled_text = filled_text.replace(f"{{{placeholder}}}", entity)

    # Prepare entities list with character offsets
    entities = []
    for placeholder, entity in selected_entities.items():
        # Find all occurrences of the entity in text to handle multiple instances
        start_indices = [m.start() for m in re.finditer(re.escape(entity), filled_text)]
        for start_char in start_indices:
            end_char = start_char + len(entity)
            entities.append({
                "text": entity,
                "type": placeholder,
                "start_char": start_char,
                "end_char": end_char
            })

    # Tokenize and assign labels
    ner_labels = tokenize_and_align_labels(filled_text, entities, tokenizer, entity_label_map, O_label)

    return {
        "text": filled_text,
        "ner_labels": ner_labels,
        "anomaly_label": anomaly_label,
        "entities": entities  # Include entities for validation
    }

def tokenize_and_align_labels(text, entities, tokenizer, entity_label_map, O_label="O"):
    """
    Tokenizes the text and aligns the NER labels with the tokenized output using character offsets.

    Args:
        text (str): The input text.
        entities (List[Dict]): A list of entities with 'text', 'type', 'start_char', and 'end_char'.
        tokenizer: The tokenizer instance.
        entity_label_map (dict): Mapping for entity labels.
        O_label (str): The label for non-entity tokens.

    Returns:
        List[str]: A list of BIO labels aligned with the tokenized text.
    """
    # Initialize labels as 'O'
    encoding = tokenizer(text, return_offsets_mapping=True, add_special_tokens=False)
    offsets = encoding['offset_mapping']
    labels = [O_label] * len(encoding['input_ids'])

    # Process each entity
    for entity in entities:
        entity_type = entity["type"].lower()
        start_char = entity["start_char"]
        end_char = entity["end_char"]
        # Assign labels to tokens
        for idx, (token_start, token_end) in enumerate(offsets):
            if token_start >= end_char:
                break
            if token_end <= start_char:
                continue
            if token_start >= start_char and token_end <= end_char:
                if token_start == start_char:
                    label_key = entity_label_map.get(entity_type, {}).get("B", O_label)
                    labels[idx] = label_key
                else:
                    label_key = entity_label_map.get(entity_type, {}).get("I", O_label)
                    labels[idx] = label_key

    return labels

def generate_dataset(num_samples=5000, anomaly_ratio=0.3, seed=None, entity_lists=None, tokenizer=None, entity_label_map=None, O_label="O"):
    """
    Generates a synthetic dataset.

    Args:
        num_samples (int): Total number of samples to generate.
        anomaly_ratio (float): Proportion of samples that are anomalies.
        seed (int, optional): Random seed for reproducibility.
        entity_lists (dict): Dictionary containing all entity lists.
        tokenizer: The tokenizer instance.
        entity_label_map (dict): Mapping for entity labels.
        O_label (str): The label for non-entity tokens.

    Returns:
        List[Dict]: A list of synthetic data samples.
    """
    if seed is not None:
        random.seed(seed)

    dataset = []
    for _ in tqdm(range(num_samples), desc="Generating Synthetic Data"):
        is_anomaly = random.random() < anomaly_ratio
        sample = generate_sample(
            anomaly=is_anomaly,
            entity_lists=entity_lists,
            tokenizer=tokenizer,
            entity_label_map=entity_label_map,
            O_label=O_label
        )
        dataset.append(sample)

    # Shuffle the dataset to mix anomaly and normal samples
    random.shuffle(dataset)

    return dataset


In [32]:
# Create a dictionary of all entities for easy access
entity_lists = {
    "person": persons,
    "alias": aliases,
    "title": titles,
    "role": roles,
    "organization": organizations,
    "business_name": business_names,
    "business_id": business_ids,
    "location": locations,
    "ip": ips,
    "mac_address": mac_addresses,
    "project": projects,
    "date": dates,
    "time": times,
    "duration": durations,
    "event": events,
    "email": emails,
    "phone": phones,
    "url": urls,
    "device": devices,
    "device_id": device_ids,
    "password": passwords,
    "access_key": access_keys,
    "social_security": social_security_numbers,
    "credit_card": credit_cards,
    "bank_account": bank_accounts,
    "license_plate": license_plates,
    "hazmat": hazmats,
    "money": money,
    "currency": currencies,
    "invoice": invoices,
    "transaction": transactions,
    "account": accounts,
    "ticket_id": ticket_ids,
    "issue_type": issue_types,
    "priority": priorities,
    "resolution_status": resolution_statuses,
    "lead": leads,
    "opportunity": opportunities,
    "campaign": campaigns,
    "discount_code": discount_codes,
    "custom1": custom1,
    "custom2": custom2,
    "username": username,
    "address_line1": address_line1,
    "city": city,
    "state": state
}

# Generate synthetic training data
synthetic_training_data = generate_dataset(
    num_samples=5000,
    anomaly_ratio=0.3,
    seed=42,
    entity_lists=entity_lists,
    tokenizer=tokenizer,
    entity_label_map=entity_label_map,
    O_label=O_label
)

# Display first 5 samples for verification
for i, sample in enumerate(synthetic_training_data[:5], 1):
    print(f"Sample {i}:")
    print(f"Text: {sample['text']}")
    print(f"NER Labels: {sample['ner_labels']}")
    print(f"Anomaly Label: {sample['anomaly_label']}\n")


Generating Synthetic Data: 100%|██████████| 5000/5000 [00:01<00:00, 4927.02it/s]

Sample 1:
Text: guest901 attended the Annual Meeting held at New York.
NER Labels: ['B-USERNAME', 'I-USERNAME', 'I-USERNAME', 'O', 'O', 'O', 'I-EVENT', 'O', 'O', 'O', 'I-LOC', 'O']
Anomaly Label: 0

Sample 2:
Text: Emergency response initiated for Hazmat Substance B incident at Denver.
NER Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'I-HAZMAT', 'I-HAZMAT', 'I-HAZMAT', 'I-HAZMAT', 'O', 'O', 'O', 'O']
Anomaly Label: 1

Sample 3:
Text: Anomaly detected: unusual access patterns from IP 172.16.0.3 targeting Apollo.
NER Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-IP', 'I-IP', 'I-IP', 'I-IP', 'I-IP', 'I-IP', 'O', 'O', 'O']
Anomaly Label: 1

Sample 4:
Text: Anomaly in financial report: $500 discrepancy detected in ACCT3009.
NER Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-MONEY', 'O', 'O', 'O', 'O', 'I-ACCT', 'I-ACCT', 'I-ACCT', 'O']
Anomaly Label: 1

Sample 5:
Text: Marketing campaign Camp3006 launched with discount code DISC80.
NER Labels: ['O', 'O', 'O', 'O', 'I-CAMP', 'I-CAM




In [33]:
# Create label_to_id mapping
def create_label_to_id_map(label_map, O_label="O"):
    """
    Creates a mapping from label strings to unique integer IDs.

    Args:
        label_map (Dict[str, Dict[str, str]]): Updated entity label map.
        O_label (str): The label for non-entity tokens.

    Returns:
        Dict[str, int]: Mapping from label strings to unique IDs.
    """
    unique_labels = set()
    for entity, sub_map in label_map.items():
        if isinstance(sub_map, dict):
            unique_labels.update(sub_map.values())
        else:
            unique_labels.add(sub_map)  # Handle 'O' if present

    unique_labels.add(O_label)  # Ensure 'O' is included

    sorted_labels = sorted(unique_labels)  # Sorting for consistency
    label_to_id = {label: idx for idx, label in enumerate(sorted_labels)}
    return label_to_id

# Create the label_to_id mapping
label_to_id = create_label_to_id_map(entity_label_map, O_label=O_label)
print("Label to ID mapping:", label_to_id)


Label to ID mapping: {'B-ACCT': 0, 'B-ADDR1': 1, 'B-ALIAS': 2, 'B-BANK': 3, 'B-BUS': 4, 'B-BUSID': 5, 'B-CAMP': 6, 'B-CC': 7, 'B-CITY': 8, 'B-CUR': 9, 'B-CUST1': 10, 'B-CUST2': 11, 'B-DATE': 12, 'B-DEV': 13, 'B-DEV_ID': 14, 'B-DISC': 15, 'B-DUR': 16, 'B-EMAIL': 17, 'B-EVENT': 18, 'B-HAZMAT': 19, 'B-INVOICE': 20, 'B-IP': 21, 'B-ISSUE': 22, 'B-KEY': 23, 'B-LEAD': 24, 'B-LOC': 25, 'B-MAC': 26, 'B-MONEY': 27, 'B-OPP': 28, 'B-ORG': 29, 'B-PASS': 30, 'B-PER': 31, 'B-PHONE': 32, 'B-PLATE': 33, 'B-PRIORITY': 34, 'B-PROJ': 35, 'B-ROLE': 36, 'B-SSN': 37, 'B-STATE': 38, 'B-STATUS': 39, 'B-TICKET': 40, 'B-TIME': 41, 'B-TITLE': 42, 'B-TRANS': 43, 'B-URL': 44, 'B-USERNAME': 45, 'I-ACCT': 46, 'I-ADDR1': 47, 'I-ALIAS': 48, 'I-BANK': 49, 'I-BUS': 50, 'I-BUSID': 51, 'I-CAMP': 52, 'I-CC': 53, 'I-CITY': 54, 'I-CUR': 55, 'I-CUST1': 56, 'I-CUST2': 57, 'I-DATE': 58, 'I-DEV': 59, 'I-DEV_ID': 60, 'I-DISC': 61, 'I-DUR': 62, 'I-EMAIL': 63, 'I-EVENT': 64, 'I-HAZMAT': 65, 'I-INVOICE': 66, 'I-IP': 67, 'I-ISSUE': 68

In [34]:
# Create label_to_id mapping
def create_label_to_id_map(label_map, O_label="O"):
    """
    Creates a mapping from label strings to unique integer IDs.

    Args:
        label_map (Dict[str, Dict[str, str]]): Updated entity label map.
        O_label (str): The label for non-entity tokens.

    Returns:
        Dict[str, int]: Mapping from label strings to unique IDs.
    """
    unique_labels = set()
    for entity, sub_map in label_map.items():
        if isinstance(sub_map, dict):
            unique_labels.update(sub_map.values())
        else:
            unique_labels.add(sub_map)  # Handle 'O' if present

    unique_labels.add(O_label)  # Ensure 'O' is included

    sorted_labels = sorted(unique_labels)  # Sorting for consistency
    label_to_id = {label: idx for idx, label in enumerate(sorted_labels)}
    return label_to_id

# Create the label_to_id mapping
label_to_id = create_label_to_id_map(entity_label_map, O_label=O_label)
print("Label to ID mapping:", label_to_id)


Label to ID mapping: {'B-ACCT': 0, 'B-ADDR1': 1, 'B-ALIAS': 2, 'B-BANK': 3, 'B-BUS': 4, 'B-BUSID': 5, 'B-CAMP': 6, 'B-CC': 7, 'B-CITY': 8, 'B-CUR': 9, 'B-CUST1': 10, 'B-CUST2': 11, 'B-DATE': 12, 'B-DEV': 13, 'B-DEV_ID': 14, 'B-DISC': 15, 'B-DUR': 16, 'B-EMAIL': 17, 'B-EVENT': 18, 'B-HAZMAT': 19, 'B-INVOICE': 20, 'B-IP': 21, 'B-ISSUE': 22, 'B-KEY': 23, 'B-LEAD': 24, 'B-LOC': 25, 'B-MAC': 26, 'B-MONEY': 27, 'B-OPP': 28, 'B-ORG': 29, 'B-PASS': 30, 'B-PER': 31, 'B-PHONE': 32, 'B-PLATE': 33, 'B-PRIORITY': 34, 'B-PROJ': 35, 'B-ROLE': 36, 'B-SSN': 37, 'B-STATE': 38, 'B-STATUS': 39, 'B-TICKET': 40, 'B-TIME': 41, 'B-TITLE': 42, 'B-TRANS': 43, 'B-URL': 44, 'B-USERNAME': 45, 'I-ACCT': 46, 'I-ADDR1': 47, 'I-ALIAS': 48, 'I-BANK': 49, 'I-BUS': 50, 'I-BUSID': 51, 'I-CAMP': 52, 'I-CC': 53, 'I-CITY': 54, 'I-CUR': 55, 'I-CUST1': 56, 'I-CUST2': 57, 'I-DATE': 58, 'I-DEV': 59, 'I-DEV_ID': 60, 'I-DISC': 61, 'I-DUR': 62, 'I-EMAIL': 63, 'I-EVENT': 64, 'I-HAZMAT': 65, 'I-INVOICE': 66, 'I-IP': 67, 'I-ISSUE': 68

In [35]:
class JointNERAnomalyDataset(Dataset):
    def __init__(self, data, tokenizer, label_to_id, max_length=128):
        """
        Initializes the dataset.

        Args:
            data (List[Dict]): The synthetic dataset.
            tokenizer: The tokenizer instance.
            label_to_id (Dict[str, int]): Mapping from label strings to IDs.
            max_length (int): Maximum sequence length.
        """
        self.data = data
        self.tokenizer = tokenizer
        self.label_to_id = label_to_id
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        text = item['text']
        ner_labels = item['ner_labels']
        anomaly_label = item['anomaly_label']
        entities = item.get('entities', [])  # Retrieve entities if available

        # Tokenize the input text
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_offsets_mapping=True,
            return_tensors='pt',
            is_split_into_words=False
        )

        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()
        offsets = encoding['offset_mapping'].squeeze().tolist()

        # Initialize labels as 'O'
        labels = [O_label] * len(input_ids)

        # Assign labels based on entities
        for entity in entities:
            entity_type = entity["type"].lower()  # Ensure lowercase for consistency
            start_char = entity["start_char"]
            end_char = entity["end_char"]
            for idx_token, (token_start, token_end) in enumerate(offsets):
                if token_start >= end_char:
                    break
                if token_end <= start_char:
                    continue
                if token_start >= start_char and token_end <= end_char:
                    if token_start == start_char:
                        labels[idx_token] = entity_label_map[entity_type]["B"]
                    else:
                        labels[idx_token] = entity_label_map[entity_type]["I"]

        # Convert labels to IDs
        ner_label_ids = [self.label_to_id.get(label, self.label_to_id[O_label]) for label in labels]

        # Handle padding labels
        if len(ner_label_ids) < self.max_length:
            ner_label_ids += [self.label_to_id[O_label]] * (self.max_length - len(ner_label_ids))
        elif len(ner_label_ids) > self.max_length:
            ner_label_ids = ner_label_ids[:self.max_length]

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'ner_labels': torch.tensor(ner_label_ids, dtype=torch.long),
            'anomaly_labels': torch.tensor(anomaly_label, dtype=torch.long)
        }


In [36]:
# Initialize the dataset
combined_dataset = JointNERAnomalyDataset(
    data=synthetic_training_data,
    tokenizer=tokenizer,
    label_to_id=label_to_id,
    max_length=128  # Adjust as needed
)

# Split into training and validation sets (80-20 split)
train_size = int(0.8 * len(combined_dataset))
val_size = len(combined_dataset) - train_size
train_dataset, val_dataset = random_split(combined_dataset, [train_size, val_size])

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")


Training samples: 4000
Validation samples: 1000


In [37]:
batch_size = 16

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


In [38]:
# Iterate through the DataLoader to verify no KeyErrors occur
try:
    for batch in train_loader:
        print("Batch successfully loaded:")
        print({
            'input_ids': batch['input_ids'].shape,
            'attention_mask': batch['attention_mask'].shape,
            'ner_labels': batch['ner_labels'].shape,
            'anomaly_labels': batch['anomaly_labels'].shape
        })
        break  # Only verify the first batch
except KeyError as e:
    print(f"KeyError encountered: {e}. Please ensure all entity types are mapped.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")


Batch successfully loaded:
{'input_ids': torch.Size([16, 128]), 'attention_mask': torch.Size([16, 128]), 'ner_labels': torch.Size([16, 128]), 'anomaly_labels': torch.Size([16])}


In [39]:
import torch.nn as nn

class JointNERAnomalyModel(nn.Module):
    def __init__(self, base_model, num_ner_labels, num_anomaly_labels):
        super(JointNERAnomalyModel, self).__init__()
        self.base_model = base_model
        self.hidden_size = base_model.config.hidden_size

        # NER head
        self.ner_classifier = nn.Linear(self.hidden_size, num_ner_labels)

        # Anomaly Detection head
        self.anomaly_classifier = nn.Linear(self.hidden_size, num_anomaly_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state  # For NER

        # Use the CLS token's hidden state for anomaly classification
        cls_output = sequence_output[:, 0, :]  # [batch_size, hidden_size]

        ner_logits = self.ner_classifier(sequence_output)
        anomaly_logits = self.anomaly_classifier(cls_output)

        return ner_logits, anomaly_logits



In [40]:
# Load ModernBERT model
base_model = AutoModel.from_pretrained(model_name)

# Define number of labels
num_ner_labels = len(label_to_id)  # Total unique NER labels
num_anomaly_labels = 2  # Binary classification: Normal or Anomaly

# Initialize the joint model
model = JointNERAnomalyModel(base_model, num_ner_labels, num_anomaly_labels)

# Move model to GPU
model.to(device)

# Define loss functions
ner_loss_fn = nn.CrossEntropyLoss(ignore_index=label_to_id[O_label])  # Ignore 'O' label in loss
anomaly_loss_fn = nn.CrossEntropyLoss()

# Define optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)


model.safetensors:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

In [41]:
from transformers import get_linear_schedule_with_warmup

epochs = 3  # Adjust based on your requirements and Colab's runtime limits
total_steps = len(train_loader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)


In [42]:
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        ner_labels = batch['ner_labels'].to(device)
        anomaly_labels = batch['anomaly_labels'].to(device)

        ner_logits, anomaly_logits = model(input_ids, attention_mask)

        # Compute NER loss
        ner_loss = ner_loss_fn(ner_logits.view(-1, num_ner_labels), ner_labels.view(-1))

        # Compute Anomaly Detection loss
        anomaly_loss = anomaly_loss_fn(anomaly_logits, anomaly_labels)

        # Total loss
        loss = ner_loss + anomaly_loss
        total_loss += loss.item()

        # Backpropagation
        loss.backward()

        # Gradient clipping
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # Optimizer step
        optimizer.step()

        # Scheduler step
        scheduler.step()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} Average Loss: {avg_loss}")


Training Epoch 1: 100%|██████████| 250/250 [05:48<00:00,  1.40s/it]


Epoch 1 Average Loss: 0.34516885855142027


Training Epoch 2: 100%|██████████| 250/250 [05:45<00:00,  1.38s/it]


Epoch 2 Average Loss: 0.0005255920746130868


Training Epoch 3: 100%|██████████| 250/250 [05:45<00:00,  1.38s/it]

Epoch 3 Average Loss: 0.00032167482742806897





In [43]:
def evaluate(model, dataloader, label_to_id, O_label="O"):
    """
    Evaluates the model on the given dataloader.

    Args:
        model: The trained model.
        dataloader: DataLoader for evaluation.
        label_to_id: Dictionary mapping labels to IDs.
        O_label (str): The label for non-entity tokens.

    Returns:
        None
    """
    model.eval()
    ner_preds = []
    ner_true = []
    anomaly_preds = []
    anomaly_true = []

    id_to_label = {v: k for k, v in label_to_id.items()}

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            ner_labels = batch['ner_labels'].to(device)
            anomaly_labels = batch['anomaly_labels'].to(device)

            ner_logits, anomaly_logits = model(input_ids, attention_mask)

            # NER Predictions
            ner_pred = torch.argmax(ner_logits, dim=-1).cpu().numpy()
            ner_true_labels = ner_labels.cpu().numpy()
            ner_preds.extend(ner_pred.flatten())
            ner_true.extend(ner_true_labels.flatten())

            # Anomaly Predictions
            anomaly_pred = torch.argmax(anomaly_logits, dim=-1).cpu().numpy()
            anomaly_true_labels = anomaly_labels.cpu().numpy()
            anomaly_preds.extend(anomaly_pred)
            anomaly_true.extend(anomaly_true_labels)

    # Remove padding tokens ('O' label)
    o_label_id = label_to_id.get(O_label, -1)
    valid_indices = [i for i, label in enumerate(ner_true) if label != o_label_id]
    filtered_ner_preds = [ner_preds[i] for i in valid_indices]
    filtered_ner_true = [ner_true[i] for i in valid_indices]

    # Map label IDs back to labels
    filtered_ner_preds_labels = [id_to_label.get(id, O_label) for id in filtered_ner_preds]
    filtered_ner_true_labels = [id_to_label.get(id, O_label) for id in filtered_ner_true]

    # NER Classification Report
    print("NER Classification Report:")
    print(classification_report(filtered_ner_true_labels, filtered_ner_preds_labels, digits=4))

    # Anomaly Detection Classification Report
    print("Anomaly Detection Classification Report:")
    print(classification_report(anomaly_true, anomaly_preds, digits=4))

    # Plot Confusion Matrix for Anomaly Detection
    plt.figure(figsize=(6,6))
    cm = confusion_matrix(anomaly_true, anomaly_preds)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Normal', 'Anomaly'], yticklabels=['Normal', 'Anomaly'])
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.title('Anomaly Detection Confusion Matrix')
    plt.show()


In [44]:
from google.colab import drive
drive.mount('/content/drive')
import os

# Create the directory if it doesn't exist
model_dir = "/content/drive/MyDrive/models"
os.makedirs(model_dir, exist_ok=True)

# Save the model to Google Drive
torch.save(model.state_dict(), os.path.join(model_dir, "joint_ner_anomaly_model_lg.pth"))
print(f"Model saved to {model_dir}/joint_ner_anomaly_model.pth")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Model saved to /content/drive/MyDrive/models/joint_ner_anomaly_model.pth
