In [3]:
import spacy
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import AgglomerativeClustering
import json
from transformers import pipeline

  from tqdm.autonotebook import tqdm, trange


In [13]:
# Load the spaCy model for NER
nlp = spacy.load("en_core_web_sm")

# Load Sentence-BERT model for embeddings
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Example text input
text = """
The president of the united states Barack Obama has started to travel in ASIA.
Obama is still trying to prevent a war in Taiwan.
Barack as his wife call him, is really appreciated in Japan by the population.
The president is not going to Europe.
"""


### Step 1: Extract Entities from Text
doc = nlp(text)
entities = []
for ent in doc.ents:
    print(ent.label_, ent)
    # Only add entities that are commonly useful like PERSON, ORG, GPE (places), etc.
    if ent.label_ in ["PERSON", "ORG", "GPE"]:
        entities.append(ent.text)

# Remove duplicate entities
entities = list(set(entities))
print("entities:", entities)

### Step 2: Create Embeddings and Cluster Entities
# Generate embeddings for each entity using Sentence-BERT
embeddings = sbert_model.encode(entities)

# Cluster similar entities using Agglomerative Clustering
clustering_model = AgglomerativeClustering(n_clusters=None, distance_threshold=1.0)
clusters = clustering_model.fit_predict(embeddings)

PERSON Barack Obama
LOC ASIA
PERSON Obama
GPE Taiwan
PERSON Barack
GPE Japan
LOC Europe
entities: ['Barack Obama', 'Barack', 'Obama', 'Japan', 'Taiwan']


In [15]:
entities

['Barack Obama', 'Barack', 'Obama', 'Japan', 'Taiwan']