In [1]:
import spacy

# Load the English NLP model
nlp = spacy.load("en_core_web_sm")



In [2]:
from spacy.pipeline.entity_linker import DEFAULT_NEL_MODEL
from spacy.kb import KnowledgeBase

nlp.add_pipe("entityLinker", last=True)

<spacy_entity_linker.EntityLinker.EntityLinker at 0x7fa57ff167a0>

In [3]:
from urllib.parse import urlparse


def get_wikidata_id_from_url(url):
    path = urlparse(url).path
    wikidata_id = path.split("/")[-1]
    return wikidata_id

In [7]:
import json
from tqdm import tqdm

filepath = "./train_converted2.jsonl"

processed = []

with open(filepath, "r") as file:
    for line in file:
        # Load each line as a JSON object
        data_line = json.loads(line)
        # Extract the text and the annotations
        for item in tqdm(data_line):
            text = item["context"]
            doc = nlp(text)
            linked_ents = []
            ents = doc._.linkedEntities
            for ent in ents:
                linked_ents.append(
                    (
                        ent.original_alias,
                        ent.url,
                        ent.description,
                        get_wikidata_id_from_url(ent.url),
                    )
                )
            item["linked_ents"] = linked_ents
            processed.append(item)

# save to file
with open("./train2.jsonl", "w") as file:
    for line in processed:
        json.dump(line, file)
        file.write("\n")

  0%|          | 0/7939 [00:00<?, ?it/s]

100%|██████████| 7939/7939 [01:30<00:00, 87.81it/s] 


In [12]:
from spacy import displacy

doc = nlp(processed[0]['context'])
displacy.serve(doc, style="ent", port=5001)




Using the 'ent' visualizer
Serving on http://0.0.0.0:5001 ...

Shutting down server on port 5001.


In [6]:
from sklearn.cluster import DBSCAN
import numpy as np
from sklearn.metrics import pairwise_distances


def compute_clusters(vectors):
    try:
        labels = DBSCAN(eps=1, min_samples=2).fit_predict(vectors)
        unique_labels = np.unique(labels)
        vectors = np.array(vectors)
        selected_label = unique_labels[0]
        cluster_points = vectors[labels == selected_label]
        intra_cluster_distances = pairwise_distances(cluster_points)
        average_intra_cluster_distance = np.sum(intra_cluster_distances) / (
            len(cluster_points) * (len(cluster_points) - 1)
        )
        min_cohesion = average_intra_cluster_distance
        # Calculate intra-cluster distance for each cluster
        for cluster_label in unique_labels:
            if cluster_label == -1:
                cluster_points = vectors[labels == cluster_label]
                if (
                    len(cluster_points) > 1
                ):  # Ensure there are at least two points in the cluster for distance calculation
                    intra_cluster_distances = pairwise_distances(cluster_points)
                    average_intra_cluster_distance = np.sum(intra_cluster_distances) / (
                        len(cluster_points) * (len(cluster_points) - 1)
                    )
                    if average_intra_cluster_distance < min_cohesion:
                        min_cohesion = average_intra_cluster_distance
                        selected_label = cluster_label
                    # print(
                    #     f"Cluster {cluster_label}: Average Intra-Cluster Distance = {average_intra_cluster_distance}"
                    # )

                    # print(
                    #     f"Cluster {cluster_label} has too few points for intra-cluster distance calculation."
                    # )

        # Get the indices of the items in the most common cluster
        indices = np.arange(len(labels))  # Array of indices
        most_common_cluster_indices = indices[labels == selected_label]
        return most_common_cluster_indices
    except Exception as e:
        print(e)
        return []

In [None]:
import json
from tqdm import tqdm
import requests
from multiprocessing import Pool

filepath = "./train2.jsonl"

processed = []


def get_wikidata_embedding(wikidata_id):
    url = "http://localhost:5000/api/vector/"
    response = requests.get(url + wikidata_id)
    if response.status_code == 200:
        return response.json()["vector"]
    else:
        return []


def getRelatedEntities(entry):
    embeddings = []
    for linked in entry["linked_ents"]:
        embedding = get_wikidata_embedding(linked[3])
        if len(embedding) > 0:
            embeddings.append(embedding)
    related = compute_clusters(embeddings)
    most_related = []
    for index in related:
        most_related.append(entry["linked_ents"][index])
    entry["most_related"] = most_related
    return entry

if __name__ == "__main__":
    with open(filepath, "r") as file:
        dataset = []
        processed = []
        for line in tqdm(file):
            dataset.append(json.loads(line))
        
        # with Pool(2) as p:
        #     processed = list(tqdm(p.imap(getRelatedEntities, dataset), total=len(dataset)))
        for line in tqdm(dataset):
            processed.append(getRelatedEntities(line))
        
        
with open("./train4.jsonl", "w") as file:
    for line in processed:
        json.dump(line, file)
        file.write("\n")

In [None]:
getRelatedEntities(dataset[0])

In [40]:
compute_clusters(test_vects)

array([0, 3, 4, 8])

## Alternative for non converted DS

In [6]:
import spacy

# Load the English NLP model
nlp = spacy.load("en_core_web_sm")
from spacy.pipeline.entity_linker import DEFAULT_NEL_MODEL
from spacy.kb import KnowledgeBase

nlp.add_pipe("entityLinker", last=True)

from urllib.parse import urlparse


def get_wikidata_id_from_url(url):
    path = urlparse(url).path
    wikidata_id = path.split("/")[-1]
    return wikidata_id

In [8]:
import json
from tqdm import tqdm

filepath = "./ace2004-test-kilt.jsonl"

processed = []

with open(filepath, "r") as file:
    for line in tqdm(file):
        # Load each line as a JSON object
        data_line = json.loads(line)

        # Extract the text and the annotations
        text = data_line["input"]
        doc = nlp(text)
        linked_ents = []
        ents = doc._.linkedEntities
        for ent in ents:
            linked_ents.append(
                (
                    ent.original_alias,
                    ent.url,
                    ent.description,
                    get_wikidata_id_from_url(ent.url),
                )
            )
        data_line["linked_ents"] = linked_ents
        processed.append(data_line)

# save to file
with open("./ace2004-nerjsonl", "w") as file:
    for line in processed:
        json.dump(line, file)
        file.write("\n")

257it [00:08, 32.04it/s]


In [10]:
import json
from tqdm import tqdm
import requests
from multiprocessing import Pool
from sklearn.metrics import pairwise_distances
import numpy as np

filepath = "./msnbc-test-kilt-ner.jsonl"

processed = []

from sklearn.cluster import DBSCAN
import numpy as np


def compute_clusters(vectors):
    try:
        labels = DBSCAN(eps=1, min_samples=2).fit_predict(vectors)
        unique_labels = np.unique(labels)
        print(unique_labels)
        vectors = np.array(vectors)
        selected_label = unique_labels[0]
        cluster_points = vectors[labels == selected_label]
        intra_cluster_distances = pairwise_distances(cluster_points)
        average_intra_cluster_distance = np.sum(intra_cluster_distances) / (
            len(cluster_points) * (len(cluster_points) - 1)
        )
        min_cohesion = average_intra_cluster_distance
        # Calculate intra-cluster distance for each cluster
        for cluster_label in unique_labels:
            if cluster_label == -1:
                cluster_points = vectors[labels == cluster_label]
                print(len(cluster_points))
                if (
                    len(cluster_points) > 1
                ):  # Ensure there are at least two points in the cluster for distance calculation
                    intra_cluster_distances = pairwise_distances(cluster_points)
                    average_intra_cluster_distance = np.sum(intra_cluster_distances) / (
                        len(cluster_points) * (len(cluster_points) - 1)
                    )
                    if average_intra_cluster_distance < min_cohesion:
                        min_cohesion = average_intra_cluster_distance
                        selected_label = cluster_label
                    # print(
                    #     f"Cluster {cluster_label}: Average Intra-Cluster Distance = {average_intra_cluster_distance}"
                    # )

                    # print(
                    #     f"Cluster {cluster_label} has too few points for intra-cluster distance calculation."
                    # )

        # Get the indices of the items in the most common cluster
        indices = np.arange(len(labels))  # Array of indices
        most_common_cluster_indices = indices[labels == selected_label]
        return most_common_cluster_indices
    except Exception as e:
        print(e)
        return []


def get_wikidata_embedding(wikidata_id):
    url = "http://localhost:5000/api/vector/"
    response = requests.get(url + wikidata_id)
    if response.status_code == 200:
        return response.json()["vector"]
    else:
        return []


def getRelatedEntities(entry):
    embeddings = []
    for linked in entry["linked_ents"]:
        embedding = get_wikidata_embedding(linked[3])
        if len(embedding) > 0:
            embeddings.append(embedding)
    related = compute_clusters(embeddings)
    most_related = []
    for index in related:
        most_related.append(entry["linked_ents"][index])
    entry["most_related"] = most_related
    related_string = "<additional> "
    for related in entry["linked_ents"][:10]:
        if len(related) == 4:
            if related[2] is not None:
                related_string += related[2] + " "
    related_string += "</additional>"
    entry["input"] = entry["input"] + related_string
    return entry


if __name__ == "__main__":
    with open(filepath, "r") as file:
        dataset = []
        processed = []
        for line in tqdm(file):
            dataset.append(json.loads(line))

        # with Pool(2) as p:
        #     processed = list(tqdm(p.imap(getRelatedEntities, dataset), total=len(dataset)))
        for line in tqdm(dataset):
            processed.append(getRelatedEntities(line))


with open("./msnbc-ner2.jsonl", "w") as file:
    for line in processed:
        json.dump(line, file)
        file.write("\n")

 99%|█████████▉| 652/656 [00:44<00:00, 13.90it/s]

[-1  0  1  2]
21
[-1  0  1  2]
20
[-1  0  1  2  3]
19


100%|██████████| 656/656 [00:44<00:00, 14.78it/s]

[-1  0  1  2]
16
[-1  0  1]
17
[-1  0  1]
15



