# Data Preprocessing
Raw data is unstructured

https://www.kaggle.com/datasets/shivamaggarwal513/dlai-annotated-named-entity-recognition/data

In [None]:
import spacy
import random
from spacy.tokens import DocBin

def read_data_files(sentences_file, labels_file):
    """
    Read the sentences and labels files using ISO-8859-1 encoding
    """
    with open(sentences_file, 'r', encoding='iso-8859-1') as f:
        sentences = [line.strip() for line in f if not line.startswith('//')]
    
    with open(labels_file, 'r', encoding='iso-8859-1') as f:
        labels = [line.strip().split() for line in f if not line.startswith('//')]
    
    return sentences, labels

def validate_data(sentences, token_labels):
    """Validate data compatibility and fix basic issues"""
    valid_pairs = []
    skipped = 0
    
    for i, (sentence, labels) in enumerate(zip(sentences, token_labels)):
        tokens = sentence.split()
        
        # Check if tokens and labels match in length
        if len(tokens) != len(labels):
            print(f"Example {i}: token count ({len(tokens)}) != label count ({len(labels)})")
            skipped += 1
            continue
        
        # Check for empty sentences
        if not sentence.strip():
            print(f"Example {i}: empty sentence")
            skipped += 1
            continue
        
        valid_pairs.append((sentence, labels))
    
    print(f"Validation complete: {len(valid_pairs)} valid pairs, {skipped} skipped")
    return valid_pairs

def convert_to_spacy_format(sentences, token_labels):
    """Convert IOB formatted sentences and labels to spaCy training format"""
    training_data = []
    
    for sentence, labels in zip(sentences, token_labels):
        tokens = sentence.split()
        
        # Calculate token offsets
        offset = 0
        spans = []
        for token in tokens:
            spans.append((offset, offset + len(token)))
            offset += len(token) + 1  # +1 for the space
        
        # Extract entity spans using IOB tags
        entities = []
        current_entity = None
        
        for i, (label, (start, end)) in enumerate(zip(labels, spans)):
            if label.startswith('B-'):
                # End any active entity
                if current_entity:
                    entity_start, entity_type = current_entity
                    entity_end = spans[i-1][1]
                    entities.append((entity_start, entity_end, entity_type))
                
                # Start new entity
                current_entity = (start, label[2:])
            
            elif label.startswith('I-'):
                # Continue entity if there's an active one
                if not current_entity:
                    # This is an I- tag without a preceding B- tag
                    # Start a new entity as if it were a B- tag
                    current_entity = (start, label[2:])
            
            elif label == 'O' and current_entity:
                # End active entity
                entity_start, entity_type = current_entity
                entity_end = spans[i-1][1]
                entities.append((entity_start, entity_end, entity_type))
                current_entity = None
        
        # Add final entity if sentence ends with one
        if current_entity:
            entity_start, entity_type = current_entity
            entity_end = spans[-1][1]
            entities.append((entity_start, entity_end, entity_type))
        
        training_data.append((sentence, {"entities": entities}))
    
    return training_data

def split_data(data, train_ratio=0.8):
    """Split data into training and evaluation sets"""
    random.seed(42)  # For reproducibility
    random.shuffle(data)
    split_point = int(len(data) * train_ratio)
    return data[:split_point], data[split_point:]

def create_spacy_docs(data, output_file, nlp):
    """Create and save spaCy DocBin objects for training"""
    db = DocBin()
    skipped_entities = 0
    
    for text, annotations in data:
        doc = nlp.make_doc(text)
        ents = []
        token_to_span = {}  # Track which token belongs to which span
        
        # Sort entities by start position
        sorted_entities = sorted(annotations["entities"], key=lambda x: (x[0], -x[1]))
        
        for start, end, label in sorted_entities:
            span = doc.char_span(start, end, label=label, alignment_mode="strict")
            
            if span is None:
                # Try with different alignment modes
                span = doc.char_span(start, end, label=label, alignment_mode="contract")
                if span is None:
                    span = doc.char_span(start, end, label=label, alignment_mode="expand")
                    if span is None:
                        skipped_entities += 1
                        continue
            
            # Check for overlapping entities
            is_overlapping = False
            for token in span:
                if token.i in token_to_span:
                    is_overlapping = True
                    break
            
            if not is_overlapping:
                ents.append(span)
                for token in span:
                    token_to_span[token.i] = len(ents) - 1
        
        # Set entities and add to DocBin
        doc.ents = ents
        db.add(doc)
    
    db.to_disk(output_file)
    print(f"Saved to {output_file} (skipped {skipped_entities} problematic entities)")


In [3]:
# Paths
sentences_file = "data/sentences.txt"
labels_file = "data/labels.txt"
output_dir = "corpus"
config_path = "config.cfg"

# Load data with correct ISO-8859-1 encoding
sentences, token_labels = read_data_files(sentences_file, labels_file)
print(f"Loaded {len(sentences)} sentences and {len(token_labels)} label sets")

# Validate and clean data
valid_data = validate_data(sentences, token_labels)

# Map labels to standard spaCy labels
label_mapping = {
    'art': 'WORK_OF_ART',
    'eve': 'EVENT',
    'geo': 'LOC',
    'gpe': 'GPE',
    'nat': 'NORP',
    'org': 'ORG',
    'per': 'PERSON',
    'tim': 'DATE' 
}

# Convert IOB tags to spaCy format with label mapping
def preprocess_data(sentences, token_labels):
    processed_data = []
    
    # Convert to spaCy format
    raw_data = convert_to_spacy_format([s for s, _ in valid_data], [l for _, l in valid_data])
    
    # Apply preprocessing to each example
    for text, annotations in raw_data:
        # Basic text normalization (remove extra whitespace)
        text = ' '.join(text.split())
        
        # Map labels to standard spaCy labels
        entities = []
        for start, end, label in annotations["entities"]:
            standard_label = label_mapping.get(label, label)
            entities.append((start, end, standard_label))
        
        processed_data.append((text, {"entities": entities}))
    
    # Remove duplicates (based on text)
    seen_texts = set()
    deduplicated_data = []
    for text, annot in processed_data:
        if text not in seen_texts:
            seen_texts.add(text)
            deduplicated_data.append((text, annot))
    
    print(f"Removed {len(processed_data) - len(deduplicated_data)} duplicate examples")
    return deduplicated_data

# Apply preprocessing
training_data = preprocess_data(sentences, token_labels)
print(f"Processed {len(training_data)} examples to spaCy format")

# Split data
train_data, eval_data = split_data(training_data)
print(f"Split into {len(train_data)} training and {len(eval_data)} evaluation examples")

# Initialize spaCy
nlp = spacy.blank("en")

# Create and save DocBin objects
create_spacy_docs(train_data, f"{output_dir}/train.spacy", nlp)
create_spacy_docs(eval_data, f"{output_dir}/eval.spacy", nlp)

# Extract unique labels (now using standard spaCy labels)
unique_labels = set()
for _, annotations in training_data:
    for _, _, label in annotations["entities"]:
        unique_labels.add(label)

label_list = sorted(list(unique_labels))

Loaded 47959 sentences and 47959 label sets
Example 76: token count (37) != label count (38)
Example 10051: token count (52) != label count (54)
Example 19817: token count (28) != label count (29)
Example 47591: token count (30) != label count (29)
Validation complete: 47955 valid pairs, 4 skipped
Removed 384 duplicate examples
Processed 47571 examples to spaCy format
Split into 38056 training and 9515 evaluation examples
Saved to corpus/train.spacy (skipped 0 problematic entities)
Saved to corpus/eval.spacy (skipped 0 problematic entities)
