In [17]:
import ollama
from langchain_community.document_loaders import PyPDFLoader
import spacy
import re
from collections import defaultdict
import geotext

# Load a more comprehensive NER model
nlp = spacy.load("en_core_web_lg")

# Load the PDF and extract text
loader = PyPDFLoader("/Users/lijou/Documents/Documents/AI litterature/EU/EU_AI.pdf")
pages = loader.load()
pdf_text = '\n'.join([page.page_content for page in pages])
# Remove newline characters and extra white space
pdf_text = re.sub('\s+', ' ', pdf_text)

# Apply NER to the extracted text
doc = nlp(pdf_text)
entities = defaultdict(list)

# Use GeoText for better place name recognition
geo_text = geotext.GeoText(pdf_text)

# Filter entities and remove duplicates
for ent in doc.ents:
    if ent.label_ == "GPE":
        entities[ent.label_].append(ent.text)
    elif ent.label_ == "PERSON":
        # Use a regular expression to filter out invalid names
        if re.match(r"^[A-Z][a-z]* [A-Z][a-z]*$", ent.text):
            entities[ent.label_].append(ent.text)

# Adding GeoText results to GPE
entities["GPE"].extend(geo_text.countries)

# Remove duplicates by converting lists to sets
entities = {key: list(set(values)) for key, values in entities.items()}

# Create system message with structured format
system_message = f"Use the following document to answer the question. Here are the entities identified in the text:\nGPE (Countries): {', '.join(entities['GPE'])}\nPERSON (People): {', '.join(entities['PERSON'])}\nList all the countries and people named in the document."

question = 'List all the GPE and PERSON in this document'

messages = [
    {'role': 'system', 'content': system_message},
    {'role': 'user', 'content': question},
]

response = ollama.chat(model='llama3-gradient', messages=messages)
print(response['message']['content'])


Here are the entities identified in the document:

GPE (Countries):
1. South Korea
2. United States
3. United Kingdom
4. Canada
5. Australia
6. Singapore
7. France
8. Japan
9. Germany
10. Belgium
11. Finland
12. New Zealand
13. Poland
14. Slovenia
15. India

PERSON:
1. Andrea Renda
2. Matthew Linares
3. Alan F
4. Peter Cihon


In [19]:
print(entities)

{'GPE': ['South Korea', 'Bristol', 'Poland', 'Belgium', 'Finland', 'India', 'some Member States', 'ri', 'Singapore', 'New Zealand', 'Japan', 'the United States', 'France', 'AI', 'US', 'Slovenia', 'proach', 'China', 'behavio', 'United States', 'Australia', 'Canada', 'Germany', 'Biometr ics', 'UK', 'United Kingdom'], 'PERSON': ['Andrea Renda', 'Standardisation Standardi', 'Mihalis Kritikos', 'Peter Cihon', 'Matthew Linares', 'Alan F']}
