In [31]:
import PyPDF2
from transformers import BertTokenizerFast, BertForTokenClassification
import torch


In [32]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page in a PDF as a single string.

    Pages are joined with a newline so that the last word of one page and
    the first word of the next stay separate when the result is later split
    on whitespace.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        str: The extracted text of all pages, in page order. Pages that
        yield no text contribute an empty string.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist (raised by ``open``).
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Extraction must happen while the file is still open, because the
        # reader pulls page content from the underlying stream.
        # ``or ''`` guards against a page whose extraction yields None/empty.
        page_texts = [page.extract_text() or '' for page in reader.pages]
    return '\n'.join(page_texts)

In [33]:
# Hugging Face Hub repository holding the checkpoint to load
# (presumably a fine-tuned NER model, judging by its name).
# Replace with your own model's repository name.
repo_name = "darmendarizp/eafit-ner-v1"

# Load the tokenizer and the token-classification model from the Hugging Face
# Model Hub. Both are module-level globals that get_companies() reads.
tokenizer = BertTokenizerFast.from_pretrained(repo_name)
model = BertForTokenClassification.from_pretrained(repo_name)



In [34]:
def get_companies(pdf_path):
    """Extract organisation names from a PDF using the loaded NER model.

    The PDF text is split on whitespace, every word is labelled by the
    token-classification model, and consecutive ``B-ORG`` / ``I-ORG`` words
    are grouped into company names. The whole document is processed in
    consecutive windows, not just the part that fits in one model input.

    Args:
        pdf_path: Path to the PDF file to analyse.

    Returns:
        list[str]: Company names in order of appearance (may be empty).
        The list is also printed, prefixed with ``"Companies found:"``.
    """
    words = extract_text_from_pdf(pdf_path).split()
    word_labels = _predict_word_labels(words)
    company_names = _collect_org_spans(words, word_labels)
    print("Companies found:", company_names)
    return company_names


def _predict_word_labels(words):
    """Return one predicted label string per word, covering the full list."""
    # model_max_length can be a huge sentinel when the tokenizer config does
    # not set it, so also cap by the model's position-embedding size.
    max_len = min(tokenizer.model_max_length, model.config.max_position_embeddings)
    word_labels = ["O"] * len(words)

    start = 0
    while start < len(words):
        # Every word yields at least one sub-token in practice, so at most
        # max_len words can fit; truncation trims the window to what fits.
        window = words[start:start + max_len]
        encoding = tokenizer(
            window,
            return_tensors="pt",
            is_split_into_words=True,
            truncation=True,
            max_length=max_len,
        )
        with torch.no_grad():
            logits = model(**encoding).logits
        label_ids = logits.argmax(dim=-1)[0].tolist()

        covered = 0
        previous_word_id = None
        for word_id, label_id in zip(encoding.word_ids(0), label_ids):
            # word_id is None for special tokens ([CLS], [SEP], padding);
            # a repeated word_id is a continuation sub-token of the same word.
            # Each word takes the label of its first sub-token.
            if word_id is None or word_id == previous_word_id:
                continue
            previous_word_id = word_id
            word_labels[start + word_id] = model.config.id2label[label_id]
            covered = word_id + 1

        # Resume right after the last word that made it into this window.
        # If no word produced a token, skip the window to guarantee progress.
        start += covered or len(window)

    return word_labels


def _collect_org_spans(words, word_labels):
    """Group consecutive ORG-labelled words into space-joined company names."""
    spans = []
    current = []
    for word, label in zip(words, word_labels):
        if label == 'B-ORG' or label == 'I-ORG':
            # A B- tag always opens a new entity, even right after another one.
            if label == 'B-ORG' and current:
                spans.append(" ".join(current))
                current = []
            current.append(word)
        elif current:
            spans.append(" ".join(current))
            current = []
    # Flush an entity that runs to the end of the document.
    if current:
        spans.append(" ".join(current))
    return spans


In [35]:
# Zero-padded lease identifiers "001" through "100", matching the file names.
numbers = [str(n).zfill(3) for n in range(1, 101)]

# Run company extraction on each lease agreement in numeric order.
for lease_id in numbers:
    pdf_path = f"../../data/asc_842/lease_agreements/lease{lease_id}.pdf"
    print(f"Extracting companies from {pdf_path}")
    get_companies(pdf_path)
    print("\n")

Extracting companies from ../../data/asc_842/lease_agreements/lease001.pdf
Companies found: []


Extracting companies from ../../data/asc_842/lease_agreements/lease002.pdf
Companies found: []


Extracting companies from ../../data/asc_842/lease_agreements/lease003.pdf
Companies found: []


Extracting companies from ../../data/asc_842/lease_agreements/lease004.pdf
Companies found: []


Extracting companies from ../../data/asc_842/lease_agreements/lease005.pdf
Companies found: []


Extracting companies from ../../data/asc_842/lease_agreements/lease006.pdf
Companies found: []


Extracting companies from ../../data/asc_842/lease_agreements/lease007.pdf
Companies found: []


Extracting companies from ../../data/asc_842/lease_agreements/lease008.pdf
Companies found: []


Extracting companies from ../../data/asc_842/lease_agreements/lease009.pdf
Companies found: []


Extracting companies from ../../data/asc_842/lease_agreements/lease010.pdf
Companies found: []


Extracting companies from ../.