In [15]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import pymupdf

In [16]:
def extract_text_from_pdf(pdf_path):
    doc = pymupdf.open(pdf_path)
    full_text = ''
    for page in doc:
        full_text += page.get_text()
    return full_text

In [17]:
pdf_path = f"../../data/asc_842/lease_agreements/lease001.pdf"
pdf_text = extract_text_from_pdf(pdf_path)

In [18]:
pdf_text

'EX-10.32 3 ex1032hvelease2022.htm EX-10.32\nExhibit 10.32\nLEASE AGREEMENT\nSTATE OF TEXAS        §\n                §\nCOUNTY OF ELLIS        §\nThis Lease Agreement (“Lease”) is made and entered into effective as of this 25  day of January, 2022 (“Effective Date”)\nby and between BarBell Real Estate, LLC, a Texas limited liability company (“Landlord”) and HVE, Inc, a Delaware\nCorporation (“Tenant”).\n    WHEREAS, Landlord is the current owner of the Leased Premises; and\n    WHEREAS, Landlord desires to lease the Leased Premises to Tenant, and Tenant desires to lease the Leased Premises\nfrom Landlord subject to the terms stated in this Lease Agreement and in any other documents duly referenced and\nincorporated herein.\n    NOW, THEREFORE, in consideration of the premises and the mutual covenants and agreements contained herein and\nfor other good and valuable consideration, the receipt and sufficiency of which are herby acknowledged and confessed by\nthe Parties, Landlord hereby 

In [19]:
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)

ner_results = nlp(pdf_text)


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [20]:
entities_found = 0

for result in ner_results:
    if result['entity'] == 'B-ORG' or result['entity'] == 'I-ORG':
        entities_found += 1
        print(result)

{'entity': 'I-ORG', 'score': 0.42792597, 'index': 44, 'word': '##EX', 'start': 83, 'end': 85}
{'entity': 'B-ORG', 'score': 0.99871266, 'index': 92, 'word': 'Bar', 'start': 276, 'end': 279}
{'entity': 'I-ORG', 'score': 0.9984137, 'index': 93, 'word': '##B', 'start': 279, 'end': 280}
{'entity': 'I-ORG', 'score': 0.9955655, 'index': 94, 'word': '##ell', 'start': 280, 'end': 283}
{'entity': 'I-ORG', 'score': 0.9991903, 'index': 95, 'word': 'Real', 'start': 284, 'end': 288}
{'entity': 'I-ORG', 'score': 0.99911636, 'index': 96, 'word': 'Estate', 'start': 289, 'end': 295}
{'entity': 'I-ORG', 'score': 0.9910499, 'index': 97, 'word': ',', 'start': 295, 'end': 296}
{'entity': 'I-ORG', 'score': 0.97094595, 'index': 98, 'word': 'LLC', 'start': 297, 'end': 300}
{'entity': 'B-ORG', 'score': 0.9711041, 'index': 107, 'word': 'Land', 'start': 338, 'end': 342}
{'entity': 'I-ORG', 'score': 0.9751914, 'index': 108, 'word': '##lord', 'start': 342, 'end': 346}
{'entity': 'I-ORG', 'score': 0.6087705, 'index'

In [21]:
print(f"Found {entities_found} entities in the text")

Found 28 entities in the text
