In [5]:
import spacy
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import AgglomerativeClustering
import json
from transformers import pipeline
import numpy as np 

In [6]:
entities

['Japan', 'Barack Obama', 'Obama', 'Taiwan', 'Barack', 'Olabama']

# Gliner

In [4]:
import spacy
from gliner_spacy.pipeline import GlinerSpacy

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

# Add GlinerSpacy pipeline component
try:
    nlp.add_pipe("gliner_spacy", config={"labels": ["person", "organization", "location"]})
except ValueError as e:
    print("Error adding the gliner_spacy pipeline:", e)

# Process the text
doc = nlp(
    "The president of USA is actually in Paris. He is accompanied by multiple other figures. "
    "Barack Obama will be with President Emmanuel Macron at the Elysee Palace. "
    "They will discuss the situation in Ukraine. Obama wants to discuss further with Macron "
    "about the relationship between the two countries."
)

# Print entities and their labels
if doc.ents:
    for ent in doc.ents:
        print(ent.text, ent.label_)
else:
    print("No entities were recognized.")


Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 69042.04it/s]
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


USA location
Paris location
Barack Obama person
Emmanuel Macron person
Elysee Palace location
Ukraine location
countries location


In [24]:
from itertools import groupby
# Group entities by their label
entities = {key: list(g) for key, g in groupby(sorted(doc.ents, key=lambda x: x.label_), lambda x: x.label_)}
print(entities)

{'CARDINAL': [two], 'FAC': [the Elysee Palace], 'GPE': [USA, Paris, Ukraine], 'PERSON': [Barack Obama, Emmanuel Macron, Obama, Macron]}


In [25]:
# Grouping 
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "meta-llama/Llama-2-7b-hf"  # Llama 2 model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)


Loading checkpoint shards: 100%|██████████| 2/2 [00:42<00:00, 21.46s/it]


In [26]:
import torch

# Check if MPS is available
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
print(f"Using device: {device}")

def resolve_entities(entities):
    prompt = (
        "Group the following entities that refer to the same thing:\n"
        + "\n".join(entities)
        + "\n\nOutput the groups as a list:"
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs, max_new_tokens=100, temperature=0.7, top_p=0.9
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response


Using device: mps


In [27]:
import torch
print(torch.backends.mps.is_available())  # Should return True if MPS is functional


True


In [6]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# 4-bit quantization configuration
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

# Force CPU usage
device = torch.device("cpu")

# Load Llama 2 model
model_name = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    torch_dtype=torch.float32
).to(device)

# Define entity resolution function
def resolve_entities(entities):
    prompt = (
        "Group the following entities that refer to the same thing:\n"
        + "\n".join(entities)
        + "\n\nOutput the groups as a list:"
    )
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to(device)
    
    with tqdm(total=1, desc="Generating response", bar_format='{l_bar}{bar}') as pbar:
        outputs = model.generate(
            inputs.input_ids, 
            max_new_tokens=100, 
            temperature=0.7, 
            top_p=0.9
        )
        pbar.update(1)
    
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Example entities
entities = [
    "Barack Obama",
    "President Obama",
    "President of USA",
    "Obama",
    "Donald Trump",
]

print(resolve_entities(entities))

# For clustering, use Sentence-BERT embeddings
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Add tqdm for embedding encoding
embeddings = sbert_model.encode(
    entities, 
    show_progress_bar=True
)

# Cluster similar entities
with tqdm(total=1, desc="Clustering Entities") as pbar:
    clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=1.5).fit(embeddings)
    pbar.update(1)

entity_groups = {}
for label, entity in zip(clustering.labels_, entities):
    entity_groups.setdefault(label, []).append(entity)
print("Entity Groups:", entity_groups)

ImportError: Using `bitsandbytes` 4-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`

In [11]:
from gliner_spacy.pipeline import GlinerSpacy


In [15]:
model = GlinerSpacy.from_pretrained("urchade/gliner_base")

AttributeError: type object 'GlinerSpacy' has no attribute 'from_pretrained'

In [22]:
from gliner import GLiNER
import time
from rich.console import Console
c = Console()
model = GLiNER.from_pretrained("urchade/gliner_base")

labels = ["individuals", "person", "location"]

with open("atlas.txt", "r") as file:
    text = file.read()

entities = model.predict_entities(text, labels)
with c.pager():
    c.print(entities)

Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 50686.45it/s]


ValueError: Converting from Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast convertors: ['AlbertTokenizer', 'BartTokenizer', 'BarthezTokenizer', 'BertTokenizer', 'BigBirdTokenizer', 'BlenderbotTokenizer', 'CamembertTokenizer', 'CLIPTokenizer', 'CodeGenTokenizer', 'ConvBertTokenizer', 'DebertaTokenizer', 'DebertaV2Tokenizer', 'DistilBertTokenizer', 'DPRReaderTokenizer', 'DPRQuestionEncoderTokenizer', 'DPRContextEncoderTokenizer', 'ElectraTokenizer', 'FNetTokenizer', 'FunnelTokenizer', 'GPT2Tokenizer', 'HerbertTokenizer', 'LayoutLMTokenizer', 'LayoutLMv2Tokenizer', 'LayoutLMv3Tokenizer', 'LayoutXLMTokenizer', 'LongformerTokenizer', 'LEDTokenizer', 'LxmertTokenizer', 'MarkupLMTokenizer', 'MBartTokenizer', 'MBart50Tokenizer', 'MPNetTokenizer', 'MobileBertTokenizer', 'MvpTokenizer', 'NllbTokenizer', 'OpenAIGPTTokenizer', 'PegasusTokenizer', 'Qwen2Tokenizer', 'RealmTokenizer', 'ReformerTokenizer', 'RemBertTokenizer', 'RetriBertTokenizer', 'RobertaTokenizer', 'RoFormerTokenizer', 'SeamlessM4TTokenizer', 'SqueezeBertTokenizer', 'T5Tokenizer', 'UdopTokenizer', 'WhisperTokenizer', 'XLMRobertaTokenizer', 'XLNetTokenizer', 'SplinterTokenizer', 'XGLMTokenizer', 'LlamaTokenizer', 'CodeLlamaTokenizer', 'GemmaTokenizer', 'Phi3Tokenizer']

In [None]:
- This politic of chaos is undeniably a real problem for kurds 
- Those guys are coming, bringing their poons