In [17]:
import json
import hashlib
import time
import uuid
import spacy
import os
import re
import logging

# Configure logging: set the root logger to INFO so the converter's warnings
# and errors are emitted, and create this module's named logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load spaCy's small English pipeline once at import time. The code in this
# file only reads token text, character offsets and trailing whitespace from
# the documents it produces.
# NOTE(review): the whole pipeline is loaded although only tokenization is
# consumed here - consider excluding unused components; confirm before changing.
nlp = spacy.load("en_core_web_sm")

# Hash helper used for Prodigy's `_input_hash` / `_task_hash` fields
# (described by the original author as based on Prodigy's approach).
def create_hash(text):
    """Return the low 32 bits of the MD5 digest of *text* as an unsigned int.

    The text is encoded with the default codec (UTF-8) before hashing, so the
    result is stable across runs and always lies in ``[0, 2**32)``.
    """
    digest = hashlib.md5(text.encode()).digest()
    # The last four digest bytes, read big-endian, are exactly the full
    # 128-bit digest value reduced modulo 2**32.
    return int.from_bytes(digest[-4:], "big")

def find_token_span(doc, start, end):
    """
    Map a character span onto token indices of *doc*.

    Two strategies are tried in order:

    1. Containment - the first token whose character range fully encloses
       ``[start, end]`` is returned as both the first and the last token.
    2. Overlap - otherwise, every token sharing at least one character with
       ``[start, end)`` is collected and the lowest and highest of their
       indices are returned.

    Each element of *doc* must expose ``idx`` (start character offset) and
    ``text``. Returns a ``(token_start, token_end)`` tuple of inclusive token
    indices, or ``(None, None)`` when no token matches.
    """
    # Strategy 1: a single token that covers the whole span.
    for position, tok in enumerate(doc):
        tok_begin = tok.idx
        tok_finish = tok_begin + len(tok.text)
        if tok_begin <= start and end <= tok_finish:
            return position, position

    # Strategy 2: all tokens whose range intersects the span. The
    # intersection is non-empty exactly when its start precedes its end.
    overlapping = [
        position
        for position, tok in enumerate(doc)
        if max(start, tok.idx) < min(end, tok.idx + len(tok.text))
    ]
    if not overlapping:
        return None, None

    # Positions were gathered in ascending order, so the ends of the list
    # are the smallest and largest matching indices.
    return overlapping[0], overlapping[-1]

def doccano_to_prodigy(doccano_jsonl):
    """
    Convert parsed Doccano records into Prodigy ``ner_manual`` task dicts.

    Parameters:
        doccano_jsonl: iterable of dicts. Each needs a "text" string and may
            carry a "label" list of ``[start, end, label]`` triples giving
            character offsets into the text. Records without "text" are
            skipped with a warning; a missing, empty or ``null`` "label"
            yields a task with no spans.

    Returns:
        A list with one dict per converted record, holding the original text,
        its tokens, the spans (character offsets as given, plus inclusive
        token indices) and Prodigy bookkeeping fields. Labels that cannot be
        mapped onto any token are left out and reported in a warning.
    """
    prodigy_data = []

    for entry in doccano_jsonl:
        if "text" not in entry:
            logger.warning("Entry missing 'text' field")
            continue

        # Preserve original text exactly
        full_text = entry["text"]
        # `or []` also covers an explicit `"label": null`, which would
        # otherwise make the loop below fail on iterating None.
        full_labels = entry.get("label") or []

        # Only tokenization is needed, so skip the tagger/parser/NER
        # components; make_doc produces the same tokens as nlp(text).
        doc = nlp.make_doc(full_text)

        tokens = [
            {
                "text": token.text,
                "start": token.idx,
                "end": token.idx + len(token.text),
                "id": i,
                "ws": token.whitespace_ != "",
            }
            for i, token in enumerate(doc)
        ]

        # Convert labels to Prodigy-style spans
        spans = []
        skipped_labels = []

        for start, end, label in full_labels:
            token_start, token_end = find_token_span(doc, start, end)

            # A label that touches no token cannot be represented as a span.
            if token_start is None or token_end is None:
                skipped_labels.append((start, end, label))
                continue

            spans.append({
                "start": start,
                "end": end,
                "token_start": token_start,
                "token_end": token_end,
                "label": label,
            })

        # Log any skipped labels for debugging
        if skipped_labels:
            logger.warning("Skipped labels: %s", skipped_labels)

        # Prodigy metadata: hashes derive from the text and the spans; the
        # annotator and session ids are fresh random UUIDs for every task.
        prodigy_data.append({
            "text": full_text,
            "_input_hash": create_hash(full_text),
            "_task_hash": create_hash(str(spans)),
            "_is_binary": False,
            "tokens": tokens,
            "_view_id": "ner_manual",
            "spans": spans,
            "answer": "accept",
            "_timestamp": int(time.time()),
            "_annotator_id": str(uuid.uuid4()),
            "_session_id": str(uuid.uuid4()),
        })

    return prodigy_data

def _convert_file(input_path, output_path):
    """Convert one Doccano JSONL file and write the Prodigy JSONL result."""
    doccano_data = []
    with open(input_path, "r", encoding="utf-8") as source:
        for line in source:
            if not line.strip():
                continue
            try:
                doccano_data.append(json.loads(line))
            except json.JSONDecodeError:
                logger.error(f"Skipping invalid line in {input_path}: {line.strip()}")

    # Convert multiple entries and write to file
    prodigy_json_entries = doccano_to_prodigy(doccano_data)

    # dirname is empty when output_path has no directory part, and
    # os.makedirs("") would raise.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as target:
        target.writelines(json.dumps(entry) + "\n" for entry in prodigy_json_entries)


def convert_files(input_dir, output_dir):
    """Convert Doccano JSONL files to Prodigy JSONL format.

    Walks *input_dir* recursively and converts every ``*.jsonl`` file,
    writing the result to the same relative path under *output_dir*
    (directories are created as needed). Blank lines are ignored and lines
    that are not valid JSON are logged and skipped.

    A failure while converting one file is logged together with the file's
    path and does not stop the remaining files from being processed.

    Returns:
        None. Progress is reported with ``print`` for each converted file.
    """
    for root, _, files in os.walk(input_dir):
        for filename in files:
            if not filename.endswith(".jsonl"):
                continue

            input_path = os.path.join(root, filename)
            try:
                # Mirror the input's relative location below output_dir.
                relative_path = os.path.relpath(input_path, input_dir)
                relative_path = os.path.splitext(relative_path)[0]
                output_path = os.path.join(output_dir, relative_path + ".jsonl")

                _convert_file(input_path, output_path)
            except Exception:
                # Top-level boundary for a batch job: record the failure
                # (with traceback) and carry on with the next file.
                logger.exception("Error in conversion of %s", input_path)
                continue

            print(f"Conversion complete! File saved to {output_path}")

def main(
    input_dir="../Doccano_Annotations/JSON_Reports_Annotated",
    output_dir="../Prodigy_Annotations/Doccano_to_Prodigy/JSON_Reports_Annotated",
):
    """Run the Doccano-to-Prodigy conversion over a directory tree.

    Parameters:
        input_dir: directory containing the Doccano JSONL exports. Defaults
            to the project's annotated JSON reports folder.
        output_dir: directory that receives the converted Prodigy JSONL
            files. Defaults to the matching Prodigy annotations folder.

    Calling ``main()`` with no arguments behaves exactly as before.
    """
    convert_files(input_dir, output_dir)


if __name__ == "__main__":
    main()

Conversion complete! File saved to ../Prodigy_Annotations/Doccano_to_Prodigy/JSON_Reports_Annotated\9013540851\414653.jsonl
Conversion complete! File saved to ../Prodigy_Annotations/Doccano_to_Prodigy/JSON_Reports_Annotated\9013540851\433023.jsonl
Conversion complete! File saved to ../Prodigy_Annotations/Doccano_to_Prodigy/JSON_Reports_Annotated\9013612611\311711.jsonl
Conversion complete! File saved to ../Prodigy_Annotations/Doccano_to_Prodigy/JSON_Reports_Annotated\9013612611\537699.jsonl
Conversion complete! File saved to ../Prodigy_Annotations/Doccano_to_Prodigy/JSON_Reports_Annotated\9013612611\537705.jsonl
Conversion complete! File saved to ../Prodigy_Annotations/Doccano_to_Prodigy/JSON_Reports_Annotated\9013612611\569928.jsonl
