In [None]:
#| default_exp nlp

# nlp

> Utility functions for NLP work.

## token annotation widget

In [None]:
#| export
import anywidget
import traitlets

In [None]:
#| export
class TokenAnnotator(anywidget.AnyWidget):
    """Notebook widget for labelling character spans of a text.

    The widget renders ``text`` (lines starting with ``#``, ``##`` or ``###``
    are displayed as headings), one radio button per entry of ``labels``, and a
    highlighted ``<mark>`` for every entry of ``annotations``.

    Selecting text with the mouse appends a new annotation using the currently
    checked label; clicking a highlight removes the annotation(s) starting at
    that position. Both actions write the updated list back to the synced
    ``annotations`` trait, so it can be read from Python.

    Traits:
        text: The document to annotate.
        annotations: List of dicts with the keys ``start`` and ``end``
            (character offsets into ``text``) and ``class`` (the label).
            Annotations created in the UI additionally carry a ``text`` key
            holding the selected string.
        labels: Label names offered as radio buttons. With an empty list the
            widget only displays annotations; selecting text adds nothing.
    """

    _esm = """
    function render({ model, el }) {

        // Accessors that always read the current synced state
        const getText = () => model.get('text');
        const getAnnotations = () => model.get('annotations');
        const getLabels = () => model.get('labels');

        // Radio inputs sharing a name form a single group per document, so
        // every rendered widget gets its own group name. With a shared name,
        // checking a label in one widget would uncheck it in all the others.
        const radioGroup = 'annotation-class-' + Math.random().toString(36).slice(2);

        // Control panel holding one radio button per label
        const controls = document.createElement('div');
        controls.className = "annotation-labels";

        getLabels().forEach((cls, index) => {
            const label = document.createElement('label');
            const input = document.createElement('input');
            input.type = 'radio';
            input.name = radioGroup;
            input.value = cls;
            input.checked = index === 0;  // first label is selected by default
            label.appendChild(input);
            label.appendChild(document.createTextNode(' ' + cls));
            controls.appendChild(label);
            controls.appendChild(document.createTextNode(' '));
        });
        el.appendChild(controls);

        // The text is injected through innerHTML, so markup characters must be
        // escaped first. Unescaped, a '<' in the document would be parsed as a
        // tag and shift every character offset behind it.
        function escapeHtml(text) {
            return text
                .replace(/&/g, '&amp;')
                .replace(/</g, '&lt;')
                .replace(/>/g, '&gt;');
        }

        // Wrap markdown-style heading lines in heading tags. The heading text
        // (including the leading '#' and the newline) is kept, so character
        // offsets in the rendered text equal offsets in the original text.
        function preprocessText(text) {
            return escapeHtml(text)
                .replace(/^(# .+\\n?)/gm, '<h1>$1</h1>')
                .replace(/^(## .+\\n?)/gm, '<h2>$1</h2>')
                .replace(/^(### .+\\n?)/gm, '<h3>$1</h3>');
        }

        // Content area showing the annotatable text
        const textDiv = document.createElement('div');
        textDiv.innerHTML = preprocessText(getText());
        textDiv.className = "annotation-container";
        el.appendChild(textDiv);

        // Each class gets the next palette colour the first time it is seen
        const colors = ['#ffd1dc', '#90EE90', '#87CEEB', '#FFB347', '#C8A2C8', '#FFE4B5'];
        let colorIndex = 0;
        const classColors = new Map();

        function getColorForClass(className) {
            if (!classColors.has(className)) {
                classColors.set(className, colors[colorIndex++ % colors.length]);
            }
            return classColors.get(className);
        }

        // Translate a (text node, offset) pair into a character offset counted
        // over all text nodes of the content area.
        function getOriginalTextPosition(node, offset) {
            let currentPos = 0;
            const walker = document.createTreeWalker(textDiv, NodeFilter.SHOW_TEXT);

            let currentNode = walker.nextNode();
            while (currentNode) {
                if (currentNode === node) {
                    return currentPos + offset;
                }
                currentPos += currentNode.textContent.length;
                currentNode = walker.nextNode();
            }
            // node is not a text node inside the content area
            return offset;
        }

        // Remove a highlight and drop the annotation(s) starting at its position
        function unwrapMark(mark) {
            const startPos = getOriginalTextPosition(mark.firstChild, 0);
            const annotations = getAnnotations().filter(a => a.start !== startPos);
            mark.replaceWith(...mark.childNodes);
            model.set('annotations', annotations);
            model.save_changes();
        }

        // Build a clickable highlight element for the given class
        function createMark(cls) {
            const markElement = document.createElement('mark');
            markElement.style.backgroundColor = getColorForClass(cls);
            markElement.title = String(cls);
            markElement.addEventListener('click', e => {
                e.preventDefault();
                unwrapMark(markElement);
            });
            return markElement;
        }

        // Inverse of getOriginalTextPosition: find the text node and local
        // offset for a character offset. A position on the border between two
        // text nodes is ambiguous. For the START of a span the following node
        // must be used (isStart = true), for the END the preceding one.
        // Resolving a start to the end of the preceding node would make the
        // range begin inside a neighbouring heading or mark, and
        // surroundContents rejects such partially selected elements.
        function findNodeAndOffsetAtPosition(container, targetPosition, isStart = false) {
            let currentPos = 0;
            const walker = document.createTreeWalker(container, NodeFilter.SHOW_TEXT);

            let lastNode = null;
            let node = walker.nextNode();

            while (node) {
                const nodeEnd = currentPos + node.textContent.length;
                const found = isStart ? nodeEnd > targetPosition : nodeEnd >= targetPosition;
                if (found) {
                    return {
                        node: node,
                        offset: targetPosition - currentPos
                    };
                }
                currentPos = nodeEnd;
                lastNode = node;
                node = walker.nextNode();
            }

            // Position exactly at the end of the text
            if (lastNode && targetPosition === currentPos) {
                return {
                    node: lastNode,
                    offset: lastNode.textContent.length
                };
            }

            return null;
        }

        // Highlight the annotations the widget was created with
        getAnnotations().forEach(annotation => {
            const startLoc = findNodeAndOffsetAtPosition(textDiv, annotation.start, true);
            const endLoc = findNodeAndOffsetAtPosition(textDiv, annotation.end);

            if (startLoc && endLoc) {
                const range = document.createRange();
                range.setStart(startLoc.node, startLoc.offset);
                range.setEnd(endLoc.node, endLoc.offset);

                try {
                    range.surroundContents(createMark(annotation.class));
                } catch (e) {
                    console.error('Cannot wrap initial annotation that crosses multiple nodes', e);
                }
            }
        });

        // Turn a mouse selection into a new annotation
        textDiv.addEventListener('mouseup', () => {
            const selection = window.getSelection();
            if (selection.rangeCount === 0 || selection.toString().trim().length === 0) {
                return;
            }

            // Look the checked label up inside this widget only; a
            // document-wide query would return the radio of whichever widget
            // comes first on the page. Without labels the widget is read-only.
            const checked = controls.querySelector('input:checked');
            if (!checked) {
                return;
            }
            const selectedClass = checked.value;

            const range = selection.getRangeAt(0);
            // Ignore selections that reach outside the text area; their
            // offsets cannot be mapped onto the text.
            if (!textDiv.contains(range.commonAncestorContainer)) {
                return;
            }

            try {
                const startPos = getOriginalTextPosition(range.startContainer, range.startOffset);
                const endPos = getOriginalTextPosition(range.endContainer, range.endOffset);

                const annotations = [...getAnnotations(), {
                    start: startPos,
                    end: endPos,
                    text: selection.toString(),
                    class: selectedClass
                }];

                // Throws when the selection only partially covers an element;
                // in that case the model is left untouched.
                range.surroundContents(createMark(selectedClass));
                selection.removeAllRanges();

                model.set('annotations', annotations);
                model.save_changes();
            } catch (e) {
                console.error('Cannot wrap selection that crosses multiple nodes', e);
            }
        });
    }
    export default { render };
    """

    _css = """
    .annotation-container {
        border: 1px solid #ccc;
        margin: 8px;
        white-space: pre-wrap;
        line-height: 1.6;

        /* Subtle lined paper effect using background-image */
        background-image: linear-gradient(transparent 1.5em, #f0f0f0 1.5em);
        background-size: 100% 1.6em;
        padding: 0.1em 1em;
    }

    .annotation-labels {
      display: flex;
      margin: 4px;
      gap: 1rem;
    }

    .annotation-labels label {
      display: flex;
      align-items: center;
      gap: 0.5rem; /* Space between radio button and label text */
      padding: 0.5rem 1rem;
      border: 1px solid #ccc;
      border-radius: 0.5rem;
      background: #f9f9f9;
      cursor: pointer;
    }

    /* The rules below are scoped to the container so that they cannot
       restyle marks or headings elsewhere on the page. */
    .annotation-container mark {
        padding: 2px;
        border-radius: 2px;
        cursor: pointer;
    }

    .annotation-container mark:hover {
        filter: brightness(0.9);
    }

    .annotation-container h1,
    .annotation-container h2,
    .annotation-container h3 {
        /* Remove default heading margins since we're keeping newlines */
        margin: 0;
        font-weight: bold;
    }

    /* Different sizes for different heading levels */
    .annotation-container h1 { font-size: 1.8em; }
    .annotation-container h2 { font-size: 1.5em; }
    .annotation-container h3 { font-size: 1.3em; }
    """

    # Traitlets synced between the Python object and the browser view
    text = traitlets.Unicode().tag(sync=True)      # document to annotate
    annotations = traitlets.List([]).tag(sync=True)  # dicts with start / end / class
    labels = traitlets.List([]).tag(sync=True)     # label names shown as radio buttons

A small example that includes a pre-computed annotation.

In [None]:
# Build a widget with three selectable labels and one seed annotation:
# characters 19-28 of the text ("Microsoft") are pre-labelled as 'organization'.
a = TokenAnnotator(text='John Smith visited Microsoft in Seattle last week.', 
               labels=['person', 'location', 'organization'],
               annotations=[{'start': 19, 'end': 28, 'class': 'organization'}])
# Last expression of the cell: renders the widget
a

TokenAnnotator(annotations=[{'start': 19, 'end': 28, 'class': 'organization'}], labels=['person', 'location', …

We can access the annotations from Python!

In [None]:
# `annotations` is a synced trait, so spans added or removed in the widget show up here
a.annotations

[{'start': 19, 'end': 28, 'class': 'organization'}]

... it also works for longer documents:

In [None]:
x = '# Punkt 9: Kontrakt med Erhvervshus Nord 2024-2026\n\n### EMN-2023-01106\n## Bilag\n\n### Udkast - Operatøraftale om erhvervsudvikling i 2024-2026\n\n\n-----\n\n### 9 (Åben) Kontrakt med Erhvervshus Nord 2024-2026\n\n**Sags ID: EMN-2023-01106**\n\n**Ansvarligt center: Direktionssekretariat**\n\n**Beslutningskompetence**\nØU/BR\n\n**Sagsfremstilling**\nDer er udarbejdet en 3 årig aftale, der løber fra 1. januar 2024 – 31. december 2026.\n\nDette sker ved at arbejde målrettet gennem:\n\n  - erhvervsservice over for virksomhederne i Frederikshavn Kommune.\n\n  - erhvervsudvikling hvor Erhvervshus Nord gennemfører en aktiv\nerhvervsudviklingsindsats over for de enkelte virksomheder med\n\nunderstøtning af kommunens strategier, politikker og konkrete indsatser\nherunder Recycling City og den grønne omstilling.\n\n  - erhvervsstrukturudvikling, hvor Erhvervshus Nord arbejder Strategisk med den\nlangsigtede erhvervsstrukturudvikling i Frederikshavn Kommune, gennem\n\nløbende dialog med kommunen samt konkrete indsatser for at tiltrække\nvirksomheder og investeringer med stor positiv erhvervsøkonomisk afsmitning i\nlokalsamfundet.\n\n  - inddragelse og samarbejder med øvrige erhvervsfremmeaktører.\n\nFor løsning af erhvervsserviceopgaverne betales en ydelse på 4.400.000 kr. inkl. moms,\nbeløbet pristalsreguleres med virkning fra 2025. Betalingen sker i 2 årlige rater med\nførste halvdel ved årets begyndelse og anden halvdel pr. 1. juli, svarende til 2.200.000 kr.\npr. rate. Der reserveres desuden et årligt kommunalt beløb på 500.000 kr. til eventuelle\nsærlige indsatser. Beløb op til 500.000 kr. udbetales, hvis der i forlængelse af de årlige\nmøder aftales specielle indsatser med Økonomiudvalget.\n\nErhvervshus Nord ønsker, at der i aftalen indgår, at basisaftalen og aftalen om særlige\nindsatser pristalsreguleres i 2024, 2025 og 2026.\n\n**Indstilling**\nDirektionssekretariatet indstiller, at\n1. 
aftale for 2024-2026 mellem kommunen og Frederikshavn Erhvervsråd\ngodkendes\n2. basisaftalen pristalsreguleres med virkning fra 2024\n3. det eventuelle beløb til særlig indsatser ikke pristalsreguleres.\n\n**Tidligere beslutninger:**\n\n\n-----\n\nUdvalg: Økonomiudvalget 2022-2025\n\nDato: 26-04-2023\n\nAnbefales med den bemærkning, at der betales en ydelse på 4.400.000 kr. ekskl. moms.\n\n**Beslutninger:**\n\nØkonomiudvalgets indstilling godkendt.\n\nAfbud fra Martin Tøttrup Kelkelund (A). I stedet deltog Marie Gade Madsen (A).\nAfbud fra Almina Nikontovic (A). I stedet deltog Bent Hieronymus Pedersen (A).\n\n**Bilag**\n\n1. Udkast - Operatøraftale om erhvervsudvikling i 2024-2026 (DokumentID: 7146136 - EMN-2023-01106)\n\n\n-----\n\n'
# Same widget on the long document `x` defined above, with one seed annotation
# covering characters 2316-2340 labelled as 'person'.
a2 = TokenAnnotator(text = x, labels=['person', 'organization', 'date'], annotations=[{'start': 2316, 'end': 2340, 'class': 'person'}])
# Last expression of the cell: renders the widget
a2

TokenAnnotator(annotations=[{'start': 2316, 'end': 2340, 'class': 'person'}], labels=['person', 'organization'…

In [None]:
# Read the (synced) annotation list of the second widget back from Python
a2.annotations

[{'start': 2316, 'end': 2340, 'class': 'person'}]

### Weak Labelling with RegEx Patterns

In [None]:
#| export
import re
from typing import List, Dict, Any

In [None]:
#| export
def annotate(text: str, patterns: Dict[str, str], use_groups: bool = False) -> Dict[str, Any]:
    """
    Annotate text with the matches of several regular expressions.

    Every non-empty match of every pattern becomes one annotation. By default
    the span of the full match is used.

    Args:
        text: The text to search.
        patterns: Mapping from class (label) name to a regex pattern string.
        use_groups: If True and a pattern contains capture groups, the span of
            the first capture group is used instead of the full match. When
            that group did not take part in a match, the full match is used.

    Returns:
        A dict with the keys "text" (the input text) and "annotations" (a list
        of {"class", "start", "end"} dicts sorted by start offset). It can be
        unpacked directly into TokenAnnotator(**result).

    Example:
        annotate("Hi there, John Doe!", {
            "greeting": r'Hi|Hello'
        })
    """
    annotations = []
    for cls, pattern in patterns.items():
        for m in re.finditer(pattern, text, re.UNICODE):
            start, end = m.span()

            # m.start(1) is -1 when group 1 exists but did not participate
            # (e.g. an optional group); keep the full match in that case.
            if use_groups and m.re.groups and m.start(1) != -1:
                start, end = m.span(1)

            # Patterns such as r'\d*' also match the empty string at every
            # position; a zero-length span labels nothing, so skip it.
            if start == end:
                continue

            annotations.append({"class": cls, "start": start, "end": end})

    return {
        "text": text,
        "annotations": sorted(annotations, key=lambda ann: ann["start"])
    }

In [None]:
# Weakly label a sentence with two patterns: runs of two or more capitalised
# words become 'person', and 'Hi' / 'Hello' become 'greeting'.
doc = annotate("Hi there, John Doe!", {
    "person": r'[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+',
    "greeting": r'Hi|Hello'
})
# Last expression of the cell: shows the resulting dict
doc

{'text': 'Hi there, John Doe!',
 'annotations': [{'class': 'greeting', 'start': 0, 'end': 2},
  {'class': 'person', 'start': 10, 'end': 18}]}

We can pass this doc to the annotator class to visualize. 

Note: if we provide a list of labels to TokenAnnotator, we can also edit the weak annotations.

In [None]:
# `doc` holds exactly the keys `text` and `annotations`, which unpack into the
# widget's keyword arguments; no `labels` are passed here.
TokenAnnotator(**doc)

TokenAnnotator(annotations=[{'class': 'greeting', 'start': 0, 'end': 2}, {'class': 'person', 'start': 10, 'end…

### Patterns

A collection of common RegEx patterns that often come in handy. Useful for stuff like weak annotations.

Let's start with a pattern to match people's names.

In [None]:
#| export

# Start of a name part: one letter from A-Z or the U+00C0-U+017F range,
# optionally followed by an apostrophe and another such letter (the N' in N'Golo).
base_start = r'[A-Z\u00C0-\u017F](?:\'[A-Z\u00C0-\u017F])?'
# Remainder of a name part: one or more letters from a-z or the U+00C0-U+017F range.
base_rest = r'[a-z\u00C0-\u017F]+'

# One name part, optionally extended by hyphenated segments (Jean-Paul).
name_part = base_start + base_rest + '(?:[-][A-Z\u00C0-\u017F][a-z\u00C0-\u017F]+)*'

# Zero or more lowercase particles (van, der, ...), each preceded by whitespace.
particles = r'(?:\s+(?:' + '|'.join((
    'van', 'der', 'de', 'den', 'von', 'und', 'le', 'la', 'di', 'el', 'al',
    'bin', 'ibn', 'mac', 'mc', 'dos', 'das', 'do', 'da', 'of', 'af', 'av',
)) + r'))*'

# Full name: a name part followed by at least one more name part, each of which
# may be preceded by particles. The whole name is captured as group 1.
re_person = '(' + name_part + '(?:' + particles + r'\s+' + name_part + ')+)'

Got Sonnet to give some examples of people's names to test :)

In [None]:
# Sample sentence with hyphenated, apostrophised, accented and particle-containing names
text = "Here are some people: Jean-Paul O'Connor, María-José García, Søren Kierkegaard, N'Golo Kanté, Smith-Jones, and François van der Meer"
# Weakly label the names with `re_person` and display the result in the widget
TokenAnnotator(**annotate(text, {'person': re_person}))

TokenAnnotator(annotations=[{'class': 'person', 'start': 22, 'end': 40}, {'class': 'person', 'start': 42, 'end…

Email pattern

In [None]:
# todo: email pattern

URL pattern

In [None]:
# todo

### export

In [None]:
#| hide
# Run nbdev's export step; presumably this writes the `#| export` cells to the
# module named by `#| default_exp` at the top of the notebook - confirm against the nbdev docs.
import nbdev; nbdev.nbdev_export()