In [1]:
!pip install transformers
!pip install fitz
!pip install spacy
!pip install PyMuPDF
!pip install torch

Collecting fitz
  Downloading fitz-0.0.1.dev2-py2.py3-none-any.whl.metadata (816 bytes)
Collecting configobj (from fitz)
  Downloading configobj-5.0.8-py2.py3-none-any.whl.metadata (3.4 kB)
Collecting configparser (from fitz)
  Downloading configparser-7.0.0-py3-none-any.whl.metadata (5.4 kB)
Collecting nipype (from fitz)
  Downloading nipype-1.8.6-py3-none-any.whl.metadata (6.6 kB)
Collecting pyxnat (from fitz)
  Downloading pyxnat-1.6.2-py3-none-any.whl.metadata (5.3 kB)
Collecting prov>=1.5.2 (from nipype->fitz)
  Downloading prov-2.0.1-py3-none-any.whl.metadata (3.6 kB)
Collecting rdflib>=5.0.0 (from nipype->fitz)
  Downloading rdflib-7.0.0-py3-none-any.whl.metadata (11 kB)
Collecting simplejson>=3.8.0 (from nipype->fitz)
  Downloading simplejson-3.19.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.1 kB)
Collecting traits!=5.0,<6.4,>=4.6 (from nipype->fitz)
  Downloading traits-6.3.2-cp310-cp310-manylinux_2_5_x86_64.ma

In [2]:
import re
import spacy
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import pandas as pd

DEFINING REGEX PATTERNS, VALIDATING THE MATCH

In [9]:
nlp = spacy.load("en_core_web_sm")

supernova_name_pattern = re.compile(r'\bSN\d{4}[A-Za-z]*\b')
type_pattern = re.compile(r'\bType [I|II][a-c|p|n|l|b]?\b')
luminosity_pattern = re.compile(r'-?\d+\.\d+ mag')
host_galaxy_pattern = re.compile(r'NGC \d+|IC \d+|Messier \d+|UGC \d+|ESO \d+-\d+|PGC \d+|Mrk \d+|SDSS J\d+|RX J\d+|2MASX J\d+|A\d+|PKS \d+')
redshift_pattern = re.compile(r'z=\d+\.\d+')
distance_pattern = re.compile(r'\d+\.\d+ Mpc')

def validate_entity(entity, label):
    supernova_name_pattern = re.compile(r'^SN\d{4}[A-Za-z]*$')
    type_pattern = re.compile(r'^Type [I|II][a-c|p|n|l|b]?$')
    luminosity_pattern = re.compile(r'^-?\d+\.\d+ mag$')
    host_galaxy_pattern = re.compile(r'^(NGC \d+|IC \d+|Messier \d+|UGC \d+|ESO \d+-\d+|PGC \d+|Mrk \d+|SDSS J\d+|RX J\d+|2MASX J\d+|PKS \d+)$')
    redshift_pattern = re.compile(r'^z=\d+\.\d+$')
    distance_pattern = re.compile(r'^\d+\.\d+ Mpc$')


    if label == "Supernova Name" and supernova_name_pattern.match(entity):
        return True
    elif label == "Type" and type_pattern.match(entity):
        return True
    elif label == "Luminosity" and luminosity_pattern.match(entity):
        return True
    elif label == "Host Galaxy" and host_galaxy_pattern.match(entity):
        return True
    elif label == "Redshift" and redshift_pattern.match(entity):
        return True
    elif label == "Distance" and distance_pattern.match(entity):
        return True


    return False

DEFINING RULE BASED ANNOTATION

In [10]:
def initial_annotation(text):
    annotations = []

    supernova_names = supernova_name_pattern.findall(text)
    types = type_pattern.findall(text)
    luminosities = luminosity_pattern.findall(text)
    host_galaxies = host_galaxy_pattern.findall(text)
    redshifts = redshift_pattern.findall(text)
    distances = distance_pattern.findall(text)

    annotations.extend([("Supernova Name", name) for name in supernova_names if validate_entity(name, "Supernova Name")])
    annotations.extend([("Type", type_) for type_ in types if validate_entity(type_, "Type")])
    annotations.extend([("Luminosity", lum) for lum in luminosities if validate_entity(lum, "Luminosity")])
    annotations.extend([("Host Galaxy", galaxy) for galaxy in host_galaxies if validate_entity(galaxy, "Host Galaxy")])
    annotations.extend([("Redshift", redshift) for redshift in redshifts if validate_entity(redshift, "Redshift")])
    annotations.extend([("Distance", distance) for distance in distances if validate_entity(distance, "Distance")])

    return annotations

TOKENIZING AND CHUNKING

In [11]:
def tokenize_and_chunk(text, tokenizer, max_length=512):
    tokens = tokenizer(text, truncation=True, max_length=max_length, return_overflowing_tokens=False, return_tensors='pt')
    input_ids = tokens['input_ids']
    chunks = [tokenizer.decode(input_ids[0][i:i + max_length], skip_special_tokens=True) for i in range(0, len(input_ids[0]), max_length)]
    return chunks

NER ANNOTATION

In [12]:
def ner_annotation(text):
    tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
    model = AutoModelForTokenClassification.from_pretrained("allenai/scibert_scivocab_uncased")
    nlp_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)


    chunks = tokenize_and_chunk(text, tokenizer)
    all_results = []

    for chunk in chunks:
        if chunk.strip():
            results = nlp_pipeline(chunk)
            for result in results:
                word = result['word']
                entity = result['entity']


                label = None
                if "SN" in word:
                    label = "Supernova Name"
                elif "Type" in word:
                    label = "Type"
                elif "mag" in word:
                    label = "Luminosity"
                elif "z=" in word:
                    label = "Redshift"
                elif "Mpc" in word:
                    label = "Distance"
                elif any(prefix in word for prefix in ["NGC", "IC", "Messier", "UGC", "ESO", "PGC", "Mrk", "SDSS", "RX", "2MASX", "PKS"]):
                    label = "Host Galaxy"

                if label and validate_entity(word, label):
                    all_results.append((word, label))

    return all_results

COMBINING THE ANNOTATIONS

In [13]:
def combined_annotation(text):
    if not text.strip():
        return []

    rule_based_anns = initial_annotation(text)
    ner_anns = ner_annotation(text)

    combined_anns = list(set(rule_based_anns + ner_anns))

    valid_annotations = [ann for ann in combined_anns if validate_combined_annotation(ann)]

    return valid_annotations


def validate_combined_annotation(annotation):
    return True

FUNCTION TO EXTRACT TEXT FROM PDF

In [14]:
def extract_text_from_pdf(pdf_path):
    import fitz
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        text += page.get_text()
    return text

def process_pdfs(pdf_paths):
    all_annotations = []

    for pdf_path in pdf_paths:
        text = extract_text_from_pdf(pdf_path)
        if text.strip():
            annotations = combined_annotation(text)
            all_annotations.extend(annotations)

    return all_annotations


pdf_paths = ['/content/sample_data/Supernovae_dataset/0303428v1.pdf','/content/sample_data/Supernovae_dataset/0601261v1.pdf','/content/sample_data/Supernovae_dataset/0612072v1.pdf','/content/sample_data/Supernovae_dataset/1211.1378v1.pdf','/content/sample_data/Supernovae_dataset/1803.01875v2.pdf']
annotations = process_pdfs(pdf_paths)


for ann in annotations:
    print(ann)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at allenai/scibert

('Supernova Name', 'SN1974G')
('Luminosity', '22.1 mag')
('Luminosity', '0.04 mag')
('Supernova Name', 'SN1989B')
('Supernova Name', 'SN1986G')
('Luminosity', '0.6 mag')
('Luminosity', '0.01 mag')
('Supernova Name', 'SN1972E')
('Type', 'Type Ia')
('Supernova Name', 'SN1960F')
('Luminosity', '0.12 mag')
('Supernova Name', 'SN1999by')
('Luminosity', '19.5 mag')
('Supernova Name', 'SN1990N')
('Supernova Name', 'SN1998bu')
('Luminosity', '0.7 mag')
('Supernova Name', 'SN1998eq')
('Luminosity', '0.06 mag')
('Supernova Name', 'SN1991bg')
('Luminosity', '0.02 mag')
('Supernova Name', 'SN1991T')
('Supernova Name', 'SN1992bi')
('Supernova Name', 'SN1895B')
('Luminosity', '0.2 mag')
('Luminosity', '0.25 mag')
('Supernova Name', 'SN1981B')
('Supernova Name', 'SN1937C')
('Supernova Name', 'SN1999em')
('Supernova Name', 'SN1987A')
('Type', 'Type Ia')
('Type', 'Type I')
('Supernova Name', 'SN1987A')
('Type', 'Type Ib')
('Luminosity', '20.7 mag')
('Supernova Name', 'SN2017egm')
('Host Galaxy', 'NGC 3

CREATING LABELS FROM ANNOTATIONS

TRAINING THE MODEL