In [2]:
!pip install pdfplumber
!pip install transformers
!pip install datasets

Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [22]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pdfplumber
import re
import string
from nltk.corpus import stopwords

import nltk
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [23]:
# Load the BERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")

# Initialize the pipeline for named entity recognition
nlp_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [24]:
# Extract text from PDFs
def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text()
    return text

#Text cleaning
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = " ".join([word for word in text.split() if word not in stop_words])
    return text


In [25]:
#Regex patterns for rule-based annotation
supernova_name_pattern = re.compile(r'\bSN\d{4}[A-Za-z]*\b')
type_pattern = re.compile(r'\bType [I|II][a-c|p|n|l|b]?\b')
luminosity_pattern = re.compile(r'-?\d+\.\d+ mag')
host_galaxy_pattern = re.compile(r'NGC \d+|IC \d+|Messier \d+|UGC \d+|ESO \d+-\d+|PGC \d+|Mrk \d+|SDSS J\d+|RX J\d+|2MASX J\d+|PKS \d+')
redshift_pattern = re.compile(r'z=\d+\.\d+')
distance_pattern = re.compile(r'\d+\.\d+ Mpc')

# Rule-based annotation
def initial_annotation(text):
    annotations = []
    supernova_names = supernova_name_pattern.findall(text)
    types = type_pattern.findall(text)
    luminosities = luminosity_pattern.findall(text)
    host_galaxies = host_galaxy_pattern.findall(text)
    redshifts = redshift_pattern.findall(text)
    distances = distance_pattern.findall(text)
    annotations.extend([("Supernova Name", name) for name in supernova_names])
    annotations.extend([("Type", type_) for type_ in types])
    annotations.extend([("Luminosity", lum) for lum in luminosities])
    annotations.extend([("Host Galaxy", galaxy) for galaxy in host_galaxies])
    annotations.extend([("Redshift", redshift) for redshift in redshifts])
    annotations.extend([("Distance", distance) for distance in distances])
    return annotations


In [26]:
# TOKENIZING AND CHUNKING
def tokenize_and_chunk(text, tokenizer, max_length=512):
    tokens = tokenizer(text, truncation=True, max_length=max_length, return_overflowing_tokens=False, return_tensors='pt')
    input_ids = tokens['input_ids']
    chunks = [tokenizer.decode(input_ids[0][i:i + max_length], skip_special_tokens=True) for i in range(0, len(input_ids[0]), max_length)]
    return chunks


In [27]:
# Annotate using BERT NER model with debugging
def ner_annotation(text):
    chunks = tokenize_and_chunk(text, tokenizer)
    all_results = []
    for chunk in chunks:
        if chunk.strip():
            tokens = tokenizer(chunk, return_tensors="pt")
            print("Token IDs:", tokens['input_ids'])
            try:
                results = nlp_pipeline(chunk)
                for result in results:
                    word = result['word']
                    entity = result['entity']
                    label = None
                    if "SN" in word:
                        label = "Supernova Name"
                    elif "Type" in word:
                        label = "Type"
                    elif "mag" in word:
                        label = "Luminosity"
                    elif "z=" in word:
                        label = "Redshift"
                    elif "Mpc" in word:
                        label = "Distance"
                    elif any(prefix in word for prefix in ["NGC", "IC", "Messier", "UGC", "ESO", "PGC", "Mrk", "SDSS", "RX", "2MASX", "PKS"]):
                        label = "Host Galaxy"
                    if label:
                        all_results.append((word, label))
            except IndexError as e:
                print(f"Error processing chunk: {chunk}")
                print(f"Error: {e}")
                continue
    return all_results


In [28]:
# Combining rule-based and BERT NER annotations
def combined_annotation(text):
    if not text.strip():
        return []

    rule_based_anns = initial_annotation(text)
    ner_anns = ner_annotation(text)

    combined_anns = list(set(rule_based_anns + ner_anns))

    return combined_anns


In [29]:
# Process the PDFs
all_annotations = []
pdf_paths = ['/content/sample_data/Supernovae_dataset/0303428v1.pdf',
             '/content/sample_data/Supernovae_dataset/0601261v1.pdf',
             '/content/sample_data/Supernovae_dataset/0611295v2.pdf',
             '/content/sample_data/Supernovae_dataset/0612072v1.pdf',
             '/content/sample_data/Supernovae_dataset/0706.4088v1.pdf',
             '/content/sample_data/Supernovae_dataset/0708.2749v1.pdf',
             '/content/sample_data/Supernovae_dataset/0801.3297v3.pdf',
             '/content/sample_data/Supernovae_dataset/0905.4125v1.pdf',
             '/content/sample_data/Supernovae_dataset/0907.4524v1.pdf',
             '/content/sample_data/Supernovae_dataset/0908.4277v1.pdf',
             '/content/sample_data/Supernovae_dataset/0910.5597v2.pdf',
             '/content/sample_data/Supernovae_dataset/1211.1378v1.pdf',
             '/content/sample_data/Supernovae_dataset/1803.01875v2.pdf',
             '/content/sample_data/Supernovae_dataset/1805.03207v1.pdf',
             '/content/sample_data/Supernovae_dataset/2103.05230v1.pdf',
             '/content/sample_data/Supernovae_dataset/2105.00665v2.pdf',
             '/content/sample_data/Supernovae_dataset/2312.04621v2.pdf',
             '/content/sample_data/Supernovae_dataset/2407.03721v1.pdf',
             '/content/sample_data/Supernovae_dataset/2408.06287v1.pdf',
             '/content/sample_data/Supernovae_dataset/9805201v1.pdf']

for pdf_path in pdf_paths:
    text = extract_text_from_pdf(pdf_path)
    if text.strip():
        annotations = combined_annotation(text)
        all_annotations.extend(annotations)


Token IDs: tensor([[  101,  3998,  2475,  8223,  6282,  1015,  2615,  2620, 18827, 14142,
         14142,  1013,  6522,  1011,  2030, 27110,  1024,  6819,  2595,  2527,
          9854,  2522, 25855,  6483,  2007,  3565, 13455,  2063, 16897,  2566,
         13728, 26878,  2487,  1998,  4422,  1052,  1012, 12940,  2475,  1015,
          5584,  2407,  1010,  5623,  8256,  2120,  5911,  1010,  2118, 11253,
          2662,  1010,  8256,  1010,  6187,  6365,  2581, 11387,  1010,  3915,
          1016,  2470,  2082,  1997, 12799,  1998, 28625, 15638,  1010,  1996,
          2827,  2120,  2118,  1010,  3081, 13124,  2121, 16428,  1010, 12755,
          3636,  1010,  2552, 24441,  2487,  1010,  2660, 10061,  1012,  2058,
         10760, 19707,  2102,  3207, 21869,  1010,  3565, 13455, 11106, 10696,
         21382, 20800,  8883, 14045, 15794, 29122, 14122, 11452,  1011, 11865,
         23223, 27896, 14192,  5243, 26210, 23496, 18413, 29181,  7911, 13306,
         10521, 26897,  2015,  1012, 1518

In [30]:
from sklearn.preprocessing import LabelEncoder
# Store the entities as records
all_extracted_records = []
current_record = {"Supernova Name": None, "Luminosity": None, "Type": None, "Host Galaxy": None, "Redshift": None, "Distance": None}
for label, entity in all_annotations:
    if label == "Supernova Name":
        if current_record["Supernova Name"] is not None:
            all_extracted_records.append(current_record)
            current_record = {"Supernova Name": None, "Luminosity": None, "Type": None, "Host Galaxy": None, "Redshift": None, "Distance": None}
        current_record["Supernova Name"] = entity
    elif label in current_record:
        current_record[label] = entity
if current_record["Supernova Name"] is not None:
    all_extracted_records.append(current_record)

# Handle missing values and label encoding
for record in all_extracted_records:
    for key in record.keys():
        if record[key] is None:
            if key in ["Luminosity", "Redshift", "Distance"]:
                record[key] = 0.0
            else:
                record[key] = "Unknown"

label_encoders = {}
for key in ["Supernova Name", "Type", "Host Galaxy", "Redshift"]:
    encoder = LabelEncoder()
    values = [record[key] for record in all_extracted_records]
    label_encoders[key] = encoder.fit(values)
    for record in all_extracted_records:
        record[key] = encoder.transform([record[key]])[0]

for record in all_extracted_records:
    record["Luminosity"] = float(record["Luminosity"].replace(" mag", "")) if isinstance(record["Luminosity"], str) else float(record["Luminosity"])
    record["Distance"] = float(record["Distance"].replace(" Mpc", "")) if isinstance(record["Distance"], str) else float(record["Distance"])


In [31]:
# Predict from PDF using the BERT model
def predict_from_pdf(pdf_path):
    text = extract_text_from_pdf(pdf_path)
    predictions = combined_annotation(text)
    return predictions

# Example of use with debug output
pdf_path = '/content/sample_data/Supernovae_dataset/0303428v1.pdf'
predictions = predict_from_pdf(pdf_path)

print(predictions)

for prediction in predictions:
    if isinstance(prediction, tuple):
        entity, label = prediction[:2]
        print(f"Entity: {entity}, Label: {label}")
    else:
        print(f"Prediction: {prediction}")


Token IDs: tensor([[  101,  3998,  2475,  8223,  6282,  1015,  2615,  2620, 18827, 14142,
         14142,  1013,  6522,  1011,  2030, 27110,  1024,  6819,  2595,  2527,
          9854,  2522, 25855,  6483,  2007,  3565, 13455,  2063, 16897,  2566,
         13728, 26878,  2487,  1998,  4422,  1052,  1012, 12940,  2475,  1015,
          5584,  2407,  1010,  5623,  8256,  2120,  5911,  1010,  2118, 11253,
          2662,  1010,  8256,  1010,  6187,  6365,  2581, 11387,  1010,  3915,
          1016,  2470,  2082,  1997, 12799,  1998, 28625, 15638,  1010,  1996,
          2827,  2120,  2118,  1010,  3081, 13124,  2121, 16428,  1010, 12755,
          3636,  1010,  2552, 24441,  2487,  1010,  2660, 10061,  1012,  2058,
         10760, 19707,  2102,  3207, 21869,  1010,  3565, 13455, 11106, 10696,
         21382, 20800,  8883, 14045, 15794, 29122, 14122, 11452,  1011, 11865,
         23223, 27896, 14192,  5243, 26210, 23496, 18413, 29181,  7911, 13306,
         10521, 26897,  2015,  1012, 1518