<a href="https://colab.research.google.com/github/DheebaT/FinalProject/blob/main/FINAL_PROJECT_NER_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
!pip install fitz
!pip install spacy
!pip install PyMuPDF
!pip install torch

Collecting fitz
  Downloading fitz-0.0.1.dev2-py2.py3-none-any.whl.metadata (816 bytes)
Collecting configobj (from fitz)
  Downloading configobj-5.0.8-py2.py3-none-any.whl.metadata (3.4 kB)
Collecting configparser (from fitz)
  Downloading configparser-7.0.0-py3-none-any.whl.metadata (5.4 kB)
Collecting nipype (from fitz)
  Downloading nipype-1.8.6-py3-none-any.whl.metadata (6.6 kB)
Collecting pyxnat (from fitz)
  Downloading pyxnat-1.6.2-py3-none-any.whl.metadata (5.3 kB)
Collecting prov>=1.5.2 (from nipype->fitz)
  Downloading prov-2.0.1-py3-none-any.whl.metadata (3.6 kB)
Collecting rdflib>=5.0.0 (from nipype->fitz)
  Downloading rdflib-7.0.0-py3-none-any.whl.metadata (11 kB)
Collecting simplejson>=3.8.0 (from nipype->fitz)
  Downloading simplejson-3.19.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Collecting traits!=5.0,<6.4,>=4.6 (from nipype->fitz)
  Downloading traits-6.3.2-cp310-cp310-manylinux_2_5_x86_64.ma

In [28]:
import re
import spacy
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import pandas as pd
from nltk.corpus import stopwords

In [29]:
import nltk
nltk.download('stopwords')

# Load stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
import string
import re
# Function to clean text
def clean_text(text):
    text = text.lower()  # Lowercase text
    text = re.sub(r'\[.*?\]', '', text)  # Remove text in brackets
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>+', '', text)  # Remove HTML tags
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)  # Remove punctuation
    text = re.sub(r'\n', ' ', text)  # Remove newlines
    text = re.sub(r'\w*\d\w*', '', text)  # Remove words containing numbers
    text = " ".join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

DEFINING REGEX PATTERNS, VALIDATING THE MATCH

In [31]:
nlp = spacy.load("en_core_web_sm")

supernova_name_pattern = re.compile(r'\bSN\d{4}[A-Za-z]*\b')
type_pattern = re.compile(r'\bType [I|II][a-c|p|n|l|b]?\b')
luminosity_pattern = re.compile(r'-?\d+\.\d+ mag')
host_galaxy_pattern = re.compile(r'NGC \d+|IC \d+|Messier \d+|UGC \d+|ESO \d+-\d+|PGC \d+|Mrk \d+|SDSS J\d+|RX J\d+|2MASX J\d+|A\d+|PKS \d+')
redshift_pattern = re.compile(r'z=\d+\.\d+')
distance_pattern = re.compile(r'\d+\.\d+ Mpc')

def validate_entity(entity, label):
    supernova_name_pattern = re.compile(r'^SN\d{4}[A-Za-z]*$')
    type_pattern = re.compile(r'^Type [I|II][a-c|p|n|l|b]?$')
    luminosity_pattern = re.compile(r'^-?\d+\.\d+ mag$')
    host_galaxy_pattern = re.compile(r'^(NGC \d+|IC \d+|Messier \d+|UGC \d+|ESO \d+-\d+|PGC \d+|Mrk \d+|SDSS J\d+|RX J\d+|2MASX J\d+|PKS \d+)$')
    redshift_pattern = re.compile(r'^z=\d+\.\d+$')
    distance_pattern = re.compile(r'^\d+\.\d+ Mpc$')


    if label == "Supernova Name" and supernova_name_pattern.match(entity):
        return True
    elif label == "Type" and type_pattern.match(entity):
        return True
    elif label == "Luminosity" and luminosity_pattern.match(entity):
        return True
    elif label == "Host Galaxy" and host_galaxy_pattern.match(entity):
        return True
    elif label == "Redshift" and redshift_pattern.match(entity):
        return True
    elif label == "Distance" and distance_pattern.match(entity):
        return True


    return False

DEFINING RULE BASED ANNOTATION

In [32]:
def initial_annotation(text):
    annotations = []

    supernova_names = supernova_name_pattern.findall(text)
    types = type_pattern.findall(text)
    luminosities = luminosity_pattern.findall(text)
    host_galaxies = host_galaxy_pattern.findall(text)
    redshifts = redshift_pattern.findall(text)
    distances = distance_pattern.findall(text)

    annotations.extend([("Supernova Name", name) for name in supernova_names if validate_entity(name, "Supernova Name")])
    annotations.extend([("Type", type_) for type_ in types if validate_entity(type_, "Type")])
    annotations.extend([("Luminosity", lum) for lum in luminosities if validate_entity(lum, "Luminosity")])
    annotations.extend([("Host Galaxy", galaxy) for galaxy in host_galaxies if validate_entity(galaxy, "Host Galaxy")])
    annotations.extend([("Redshift", redshift) for redshift in redshifts if validate_entity(redshift, "Redshift")])
    annotations.extend([("Distance", distance) for distance in distances if validate_entity(distance, "Distance")])

    return annotations

TOKENIZING AND CHUNKING

In [33]:
def tokenize_and_chunk(text, tokenizer, max_length=512):
    tokens = tokenizer(text, truncation=True, max_length=max_length, return_overflowing_tokens=False, return_tensors='pt')
    input_ids = tokens['input_ids']
    chunks = [tokenizer.decode(input_ids[0][i:i + max_length], skip_special_tokens=True) for i in range(0, len(input_ids[0]), max_length)]
    return chunks

NER ANNOTATION

In [34]:
def ner_annotation(text):
    tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
    model = AutoModelForTokenClassification.from_pretrained("allenai/scibert_scivocab_uncased")
    nlp_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)


    chunks = tokenize_and_chunk(text, tokenizer)
    all_results = []

    for chunk in chunks:
        if chunk.strip():
            results = nlp_pipeline(chunk)
            for result in results:
                word = result['word']
                entity = result['entity']


                label = None
                if "SN" in word:
                    label = "Supernova Name"
                elif "Type" in word:
                    label = "Type"
                elif "mag" in word:
                    label = "Luminosity"
                elif "z=" in word:
                    label = "Redshift"
                elif "Mpc" in word:
                    label = "Distance"
                elif any(prefix in word for prefix in ["NGC", "IC", "Messier", "UGC", "ESO", "PGC", "Mrk", "SDSS", "RX", "2MASX", "PKS"]):
                    label = "Host Galaxy"

                if label and validate_entity(word, label):
                    all_results.append((word, label))

    return all_results

COMBINING THE ANNOTATIONS

In [35]:
def combined_annotation(text):
    if not text.strip():
        return []

    rule_based_anns = initial_annotation(text)
    ner_anns = ner_annotation(text)

    combined_anns = list(set(rule_based_anns + ner_anns))

    valid_annotations = [ann for ann in combined_anns if validate_combined_annotation(ann)]

    return valid_annotations


def validate_combined_annotation(annotation):
    return True

FUNCTION TO EXTRACT TEXT FROM PDF

In [36]:
def extract_text_from_pdf(pdf_path):
    import fitz
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        text += page.get_text()
    return text

def process_pdfs(pdf_paths):
    all_annotations = []

    for pdf_path in pdf_paths:
        text = extract_text_from_pdf(pdf_path)
        if text.strip():
            annotations = combined_annotation(text)
            all_annotations.extend(annotations)

    return all_annotations


pdf_paths = ['/content/sample_data/Supernovae_dataset/0303428v1.pdf',
             '/content/sample_data/Supernovae_dataset/0601261v1.pdf',
             '/content/sample_data/Supernovae_dataset/0611295v2.pdf',
             '/content/sample_data/Supernovae_dataset/0612072v1.pdf',
             '/content/sample_data/Supernovae_dataset/0706.4088v1.pdf',
             '/content/sample_data/Supernovae_dataset/0708.2749v1.pdf',
             '/content/sample_data/Supernovae_dataset/0801.3297v3.pdf',
             '/content/sample_data/Supernovae_dataset/0905.4125v1.pdf',
             '/content/sample_data/Supernovae_dataset/0907.4524v1.pdf',
             '/content/sample_data/Supernovae_dataset/0908.4277v1.pdf',
             '/content/sample_data/Supernovae_dataset/0910.5597v2.pdf',
             '/content/sample_data/Supernovae_dataset/1211.1378v1.pdf',
             '/content/sample_data/Supernovae_dataset/1803.01875v2.pdf',
             '/content/sample_data/Supernovae_dataset/1805.03207v1.pdf',
             '/content/sample_data/Supernovae_dataset/2103.05230v1.pdf',
             '/content/sample_data/Supernovae_dataset/2105.00665v2.pdf',
             '/content/sample_data/Supernovae_dataset/2312.04621v2.pdf',
             '/content/sample_data/Supernovae_dataset/2407.03721v1.pdf',
             '/content/sample_data/Supernovae_dataset/2408.06287v1.pdf',
             '/content/sample_data/Supernovae_dataset/9805201v1.pdf'
             ]
annotations = process_pdfs(pdf_paths)


for ann in annotations:
    print(ann)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at allenai/scibert

('Supernova Name', 'SN1960F')
('Supernova Name', 'SN1974G')
('Supernova Name', 'SN1992bi')
('Luminosity', '0.02 mag')
('Supernova Name', 'SN1991T')
('Luminosity', '0.25 mag')
('Supernova Name', 'SN1990N')
('Luminosity', '19.5 mag')
('Luminosity', '0.06 mag')
('Supernova Name', 'SN1986G')
('Supernova Name', 'SN1989B')
('Luminosity', '0.7 mag')
('Supernova Name', 'SN1972E')
('Supernova Name', 'SN1999by')
('Supernova Name', 'SN1981B')
('Luminosity', '0.01 mag')
('Supernova Name', 'SN1998bu')
('Supernova Name', 'SN1991bg')
('Supernova Name', 'SN1998eq')
('Luminosity', '0.6 mag')
('Type', 'Type Ia')
('Luminosity', '0.04 mag')
('Supernova Name', 'SN1895B')
('Supernova Name', 'SN1987A')
('Luminosity', '0.12 mag')
('Luminosity', '22.1 mag')
('Luminosity', '0.2 mag')
('Supernova Name', 'SN1999em')
('Supernova Name', 'SN1937C')
('Type', 'Type Ia')
('Type', 'Type I')
('Luminosity', '0.022 mag')
('Supernova Name', 'SN2002cx')
('Luminosity', '0.034 mag')
('Luminosity', '1.13 mag')
('Luminosity', '0

STORING ENTITIES AS RECORDS

In [37]:
all_extracted_records = []

current_record = {"Supernova Name": None, "Luminosity": None, "Type": None, "Host Galaxy": None, "Redshift": None, "Distance": None}

for label, entity in annotations:
    if label == "Supernova Name":

        if current_record["Supernova Name"] is not None:
            all_extracted_records.append(current_record)

            current_record = {"Supernova Name": None, "Luminosity": None, "Type": None, "Host Galaxy": None, "Redshift": None, "Distance": None}
        current_record["Supernova Name"] = entity
    elif label in current_record:
        current_record[label] = entity

if current_record["Supernova Name"] is not None:
    all_extracted_records.append(current_record)

for i, record in enumerate(all_extracted_records, start=1):
    print(f"Record {i}: {record}")

Record 1: {'Supernova Name': 'SN1960F', 'Luminosity': None, 'Type': None, 'Host Galaxy': None, 'Redshift': None, 'Distance': None}
Record 2: {'Supernova Name': 'SN1974G', 'Luminosity': None, 'Type': None, 'Host Galaxy': None, 'Redshift': None, 'Distance': None}
Record 3: {'Supernova Name': 'SN1992bi', 'Luminosity': '0.02 mag', 'Type': None, 'Host Galaxy': None, 'Redshift': None, 'Distance': None}
Record 4: {'Supernova Name': 'SN1991T', 'Luminosity': '0.25 mag', 'Type': None, 'Host Galaxy': None, 'Redshift': None, 'Distance': None}
Record 5: {'Supernova Name': 'SN1990N', 'Luminosity': '0.06 mag', 'Type': None, 'Host Galaxy': None, 'Redshift': None, 'Distance': None}
Record 6: {'Supernova Name': 'SN1986G', 'Luminosity': None, 'Type': None, 'Host Galaxy': None, 'Redshift': None, 'Distance': None}
Record 7: {'Supernova Name': 'SN1989B', 'Luminosity': '0.7 mag', 'Type': None, 'Host Galaxy': None, 'Redshift': None, 'Distance': None}
Record 8: {'Supernova Name': 'SN1972E', 'Luminosity': None,

HANDLING MISSING VALUES AND LABEL ENCODING

In [38]:
from sklearn.preprocessing import LabelEncoder
import numpy as np


for record in all_extracted_records:
    for key in record.keys():
        if record[key] is None:
            if key in ["Luminosity", "Redshift", "Distance"]:
                record[key] = 0.0
            else:
                record[key] = "Unknown"


label_encoders = {}
for key in ["Supernova Name", "Type", "Host Galaxy", "Redshift"]:
    encoder = LabelEncoder()
    values = [record[key] for record in all_extracted_records]
    label_encoders[key] = encoder.fit(values)
    for record in all_extracted_records:
        record[key] = encoder.transform([record[key]])[0]


for record in all_extracted_records:
    record["Luminosity"] = float(record["Luminosity"].replace(" mag", "")) if isinstance(record["Luminosity"], str) else float(record["Luminosity"])
    record["Distance"] = float(record["Distance"].replace(" Mpc", "")) if isinstance(record["Distance"], str) else float(record["Distance"])


X = np.array([[record["Supernova Name"], record["Type"], record["Luminosity"], record["Host Galaxy"], record["Redshift"], record["Distance"]] for record in all_extracted_records])
y = np.array([record["Type"] for record in all_extracted_records])


print("Feature Matrix (X):")
print(X)
print("\nLabels (y):")
print(y)


Feature Matrix (X):
[[2.000e+00 2.000e+00 0.000e+00 3.000e+00 0.000e+00 0.000e+00]
 [4.000e+00 2.000e+00 0.000e+00 3.000e+00 0.000e+00 0.000e+00]
 [1.200e+01 2.000e+00 2.000e-02 3.000e+00 0.000e+00 0.000e+00]
 [1.000e+01 2.000e+00 2.500e-01 3.000e+00 0.000e+00 0.000e+00]
 [9.000e+00 2.000e+00 6.000e-02 3.000e+00 0.000e+00 0.000e+00]
 [6.000e+00 2.000e+00 0.000e+00 3.000e+00 0.000e+00 0.000e+00]
 [8.000e+00 2.000e+00 7.000e-01 3.000e+00 0.000e+00 0.000e+00]
 [3.000e+00 2.000e+00 0.000e+00 3.000e+00 0.000e+00 0.000e+00]
 [1.500e+01 2.000e+00 0.000e+00 3.000e+00 0.000e+00 0.000e+00]
 [5.000e+00 2.000e+00 1.000e-02 3.000e+00 0.000e+00 0.000e+00]
 [1.300e+01 2.000e+00 0.000e+00 3.000e+00 0.000e+00 0.000e+00]
 [1.100e+01 2.000e+00 0.000e+00 3.000e+00 0.000e+00 0.000e+00]
 [1.400e+01 1.000e+00 4.000e-02 3.000e+00 0.000e+00 0.000e+00]
 [0.000e+00 2.000e+00 0.000e+00 3.000e+00 0.000e+00 0.000e+00]
 [7.000e+00 2.000e+00 2.000e-01 3.000e+00 0.000e+00 0.000e+00]
 [1.600e+01 2.000e+00 0.000e+00 3.0

TRAINING THE MODEL

In [53]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Bidirectional, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from tensorflow.keras.regularizers import l2
from imblearn.over_sampling import RandomOverSampler


seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_scaled, y)

model = Sequential()
model.add(Dense(64, activation='relu', input_dim=X_resampled.shape[1]))
model.add(Dense(32, activation='relu'))
model.add(Dense(len(label_encoders["Type"].classes_), activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

class_weights = {0: 2.0, 1: 3.0, 2: 0.5}

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(X_resampled, y_resampled, epochs=20, batch_size=32, validation_split=0.2, callbacks=[early_stopping], class_weight=class_weights)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 133ms/step - accuracy: 0.4590 - loss: 1.2029 - val_accuracy: 0.2069 - val_loss: 1.0160
Epoch 2/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.4590 - loss: 1.0363 - val_accuracy: 0.2069 - val_loss: 0.9958
Epoch 3/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - accuracy: 0.4969 - loss: 0.9201 - val_accuracy: 0.3448 - val_loss: 0.9690
Epoch 4/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.6050 - loss: 0.8353 - val_accuracy: 0.3448 - val_loss: 0.9329
Epoch 5/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.7561 - loss: 0.7710 - val_accuracy: 0.3448 - val_loss: 0.8898
Epoch 6/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.7561 - loss: 0.7200 - val_accuracy: 0.6552 - val_loss: 0.8461
Epoch 7/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━

In [54]:
y_pred = model.predict(X_scaled)
y_pred_classes = np.argmax(y_pred, axis=1)

print("Classification Report:")
print(classification_report(y, y_pred_classes))

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 150ms/step
Classification Report:
              precision    recall  f1-score   support

           0       0.50      1.00      0.67         2
           1       0.80      0.67      0.73         6
           2       1.00      0.98      0.99        48

    accuracy                           0.95        56
   macro avg       0.77      0.88      0.79        56
weighted avg       0.96      0.95      0.95        56



CONFUSION MATRIX

In [55]:
print("Confusion Matrix:")
print(confusion_matrix(y, y_pred_classes))

Confusion Matrix:
[[ 2  0  0]
 [ 2  4  0]
 [ 0  1 47]]


In [60]:
def predict_from_pdf(pdf_path):
    text = extract_text_from_pdf(pdf_path)
    predictions = combined_annotation(text)
    return predictions

pdf_path = '/content/Supernovae.pdf'
predictions = predict_from_pdf(pdf_path)

print(predictions)

for prediction in predictions:
    if isinstance(prediction, tuple):
        entity, label = prediction[:2]
        print(f"Entity: {entity}, Label: {label}")
    else:
        print(f"Prediction: {prediction}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'Luminosity': '24.5 mag', 'Host_Galaxy': 'A22', 'Type': 'Type Ia', 'Relations': []}
Prediction: Luminosity
Prediction: Host_Galaxy
Prediction: Type
Prediction: Relations
