# Data Annotation

To build an NER model, we first need a training data set. NASA ADS archive data can serve as the basis, but it has to be mapped into a format that is compatible with spaCy.

In [44]:
from pathlib import Path
import re
import json

In [2]:
# Root folder of the project data, given relative to the working directory.
DATA_ROOT = Path("../data")
# Sub-folder of DATA_ROOT named "ads_abstracts".
ADS_DATA = DATA_ROOT.joinpath("ads_abstracts")

In [3]:
# Path.glob yields entries in arbitrary, OS-dependent order; sorting keeps
# abstract_files[0] pointing at the same file from run to run.
abstract_files = sorted(ADS_DATA.glob("*.json"))

In [11]:
# Load the "response" object of the first batch file. The encoding is passed
# explicitly because the abstracts contain non-ASCII characters and open()
# would otherwise fall back to the platform's locale encoding.
with open(abstract_files[0], encoding="utf-8") as f:
    abstract_batch = json.load(f).get("response")

In [None]:
# Print the first document whose joined keywords contain ": individual:"
# (compared case-insensitively) and stop; `doc` stays bound to that document.
for doc in abstract_batch.get("docs"):
    joined_keywords = ' - '.join(doc.get("keyword", []))
    if ": individual:" not in joined_keywords.lower():
        continue
    print(doc)
    break

{'arxiv_class': ['Astrophysics - Solar and Stellar Astrophysics', 'Astrophysics - Astrophysics of Galaxies'], 'alternate_bibcode': ['2017arXiv170404592S'], 'identifier': ['2017arXiv170404592S', '2017ApJS..230....3S', '10.3847/1538-4365/aa6d76', '1704.04592', '2017arXiv170404592S', '10.3847/1538-4365/aa6d76'], 'first_author': 'Sung, Hwankyung', 'abstract': 'We present deep wide-field optical CCD photometry and mid-infrared Spitzer/IRAC and MIPS 24 μm data for about 100,000 stars in the young open cluster IC 1805. The members of IC 1805 were selected from their location in the various color-color and color-magnitude diagrams, and the presence of Hα emission, mid-infrared excess emission, and X-ray emission. The reddening law toward IC 1805 is nearly normal (R <SUB> V </SUB> = 3.05 ± 0.06). However, the distance modulus of the cluster is estimated to be 11.9 ± 0.2 mag (d=2.4+/- 0.2 kpc) from the reddening-free color-magnitude diagrams, which is larger than the distance to the nearby massi

In [21]:
# Display the keyword list of the document selected by the loop above.
doc.get("keyword")

['open clusters and associations: individual: IC 1805',
 'stars: formation',
 'stars: pre-main sequence',
 'Astrophysics - Solar and Stellar Astrophysics',
 'Astrophysics - Astrophysics of Galaxies']

In [72]:
def extract_astronomical_keyword_entities(doc: dict) -> list[str]:
    """
    Extracts astronomical entities from the document keywords.

    A keyword contributes an entity when it contains the marker
    "individual: " (in any letter case); the entity is the text after the
    last occurrence of that marker, with surrounding whitespace removed.

    Args:
        doc: A document dict; its "keyword" entry, if present, is read as a
            list of keyword strings.

    Returns:
        The extracted entity names in keyword order. Keywords without the
        marker, or with nothing after it, contribute no entry.
    """
    # Detect and split with the same case-insensitive pattern. Checking the
    # lowercased keyword but splitting the original on a lowercase marker
    # would return the whole keyword for input like "Stars: Individual: X".
    marker = re.compile(r"individual: ", re.IGNORECASE)

    entities = []
    # `or []` also covers documents where "keyword" is present but None.
    for keyword in doc.get("keyword") or []:
        parts = marker.split(keyword)
        if len(parts) < 2:
            continue  # marker not present in this keyword
        entity = parts[-1].strip()
        if entity:
            entities.append(entity)
    return entities

def find_entities_in_text(
    text: str,
    entities: list[str],
) -> dict[str, list[tuple[int, int]]]:
    """
    Finds the specified entities in the text and returns
    the start and end index of each occurrence of each entity.

    Args:
        text: The string to search within.
        entities: The entity names to look for.

    Returns:
        A dict mapping each entity that occurs at least once to the list of
        (start, end) tuples produced by find_exact_matches. Entities with no
        occurrence are left out of the dict.
    """
    found_entities = {}
    for entity in entities:
        # An empty match list is falsy, so unmatched entities are skipped.
        if locations := find_exact_matches(text, entity):
            found_entities[entity] = locations
    return found_entities


def find_exact_matches(text: str, search_string: str) -> list[tuple[int, int]]:
    """
    Finds whole occurrences of the exact string in the text.

    An occurrence only counts when it is not directly attached to a word
    character (letter, digit or underscore) on either side. It may therefore
    sit at the very start or end of the text, or be surrounded by spaces or
    punctuation. Searching for "4 Nor" matches neither inside "34 Nor" nor
    inside "4 Normae".

    Args:
        text: The string to search within.
        search_string: The exact string to search for.

    Returns:
        A list of (start, end) tuples in order of appearance, one per
        non-overlapping match. Both values index into `text`, and the end
        index is inclusive (it points at the last matched character).
        An empty search string yields an empty list.
    """
    if not search_string:
        return []

    # The search string is escaped so regex special characters match literally.
    # Negative lookarounds are used instead of a positive lookahead for a
    # following non-word character: they consume nothing, still succeed at the
    # start and end of the text, and reject matches embedded in a longer token.
    pattern = re.compile(r"(?<!\w)" + re.escape(search_string) + r"(?!\w)")

    # match.end() is exclusive; subtract one to return an inclusive end index.
    return [(match.start(), match.end() - 1) for match in pattern.finditer(text)]

def search_title_for_entities(
    doc: dict, entities: list[str]
) -> tuple[str, dict[str, list[tuple[int, int]]]]:
    """
    Searches the document title for the specified entities and returns their occurrences.

    Args:
        doc: A document dict; its "title" entry is read as a list of strings
            and joined with single spaces into one searchable text.
        entities: The entity names to look for.

    Returns:
        A (text, found_entities) tuple: the joined title text that was
        searched, and the result of find_entities_in_text for that text.
        The text is returned as well because the positions in found_entities
        refer to the joined string, not to the original list.
    """
    # `or []` also covers documents where "title" is present but None.
    text = ' '.join(doc.get("title") or [])
    found_entities = find_entities_in_text(text, entities)
    return text, found_entities

def search_abstract_for_entities(
    doc: dict, entities: list[str]
) -> tuple[str, dict[str, list[tuple[int, int]]]]:
    """
    Searches the document abstract for the specified entities and returns their occurrences.

    Args:
        doc: A document dict; its "abstract" entry is read as a single string.
        entities: The entity names to look for.

    Returns:
        A (text, found_entities) tuple: the abstract text that was searched
        (an empty string when the document has no abstract), and the result
        of find_entities_in_text for that text.
    """
    # `or ""` also covers documents where "abstract" is present but None.
    text = doc.get("abstract") or ""
    found_entities = find_entities_in_text(text, entities)
    return text, found_entities

def build_data_model(doc):
    """
    Builds the data structure for training a NER model.

    Args:
        doc: A document dict; the "id", "title", "abstract" and "keyword"
            entries are read.

    Returns:
        None when the keywords yield no entities. Otherwise a dict with:
          - "doc_id", "title", "abstract", "keywords": copied from the document
          - "objects": entity names extracted from the keywords
          - "title_entities" / "abstract_entities": positions of each entity
            found in the title / abstract
          - "spacy_ner_data": a list with one [text, annotations] item for
            each of title and abstract that contains at least one entity
    """
    entities = extract_astronomical_keyword_entities(doc)
    if not entities:
        return None

    data_model = {
        "doc_id": doc.get("id"),
        "title": doc.get("title", ""),
        "abstract": doc.get("abstract", ""),
        "keywords": doc.get("keyword", []),
        "objects": entities,
    }

    title, title_entities = search_title_for_entities(doc, entities)
    abstract, abstract_entities = search_abstract_for_entities(doc, entities)

    data_model["title_entities"] = title_entities
    data_model["abstract_entities"] = abstract_entities

    # build_spacy_ner_data returns a single [text, annotations] pair, so it is
    # appended as one item. Using extend() here would flatten the pairs into
    # an alternating list of strings and dicts.
    spacy_ner_data = []
    if title_entities:
        spacy_ner_data.append(build_spacy_ner_data(title, title_entities))
    if abstract_entities:
        spacy_ner_data.append(build_spacy_ner_data(abstract, abstract_entities))
    data_model["spacy_ner_data"] = spacy_ner_data

    return data_model

def build_spacy_ner_data(text: str, entities: dict[str, list[tuple[int, int]]]) -> list:
    """
    Builds the data structure for training a spaCy NER model.

    Args:
        text: The text the entity positions refer to.
        entities: Maps each entity name to its (start, end) positions in
            `text`, where the end index is inclusive.

    Returns:
        A two-element list [text, {"entities": spans}], where every span is a
        (start, end, "ASTRO_OBJ") tuple with an exclusive end offset, so that
        text[start:end] is exactly the entity string.
    """
    # spaCy character offsets are end-exclusive, while the incoming positions
    # carry an inclusive end index; shift the end by one to convert.
    spans = [
        (start, end + 1, "ASTRO_OBJ")
        for locations in entities.values()
        for start, end in locations
    ]
    return [text, {"entities": spans}]

In [73]:
# Run the pipeline on the sample document and display the resulting data model.
build_data_model(doc)

{'doc_id': '14926346',
 'title': ['An Optical and Infrared Photometric Study of the Young Open Cluster IC 1805 in the Giant H II Region W4 <SUP>†</SUP>'],
 'abstract': 'We present deep wide-field optical CCD photometry and mid-infrared Spitzer/IRAC and MIPS 24 μm data for about 100,000 stars in the young open cluster IC 1805. The members of IC 1805 were selected from their location in the various color-color and color-magnitude diagrams, and the presence of Hα emission, mid-infrared excess emission, and X-ray emission. The reddening law toward IC 1805 is nearly normal (R <SUB> V </SUB> = 3.05 ± 0.06). However, the distance modulus of the cluster is estimated to be 11.9 ± 0.2 mag (d=2.4+/- 0.2 kpc) from the reddening-free color-magnitude diagrams, which is larger than the distance to the nearby massive star-forming region W3(OH) measured from the radio VLBA astrometry. We also determined the age of IC 1805 ({τ }<SUB>{MSTO</SUB>}=3.5 Myr). In addition, we critically compared the age and 