#### Imports

In [1]:
import json
import os

import numpy as np
from tqdm import tqdm

from configs import ConfigPath
from data_preprocessing.text_splitter import TextSplitter
from utils.utils import read_json_file
from llms.embedding_model import EmbeddingModel
from knowledge_graph.loader import GraphLoader
from knowledge_graph.crud import GraphCrud
from configs.config import ConfigEnv

#### Initializations

In [2]:
# Read the labelled dataset file from the configured raw-data directory.
data = read_json_file(
    file_path=os.path.join(ConfigPath.RAW_DATA_DIR, "pqa_labeled.json"),
)

# Collaborators are created in this order: splitter, embedding model, graph CRUD client.
text_splitter = TextSplitter()
embedding_model = EmbeddingModel()
# All connection settings are taken from ConfigEnv rather than hard-coded here.
crud = GraphCrud(
    uri=ConfigEnv.NEO4J_URI,
    user=ConfigEnv.NEO4J_USER,
    password=ConfigEnv.NEO4J_PASSWORD,
    database=ConfigEnv.NEO4J_DB,
)

# The loader receives the dataset together with the three collaborators above.
graph_loader = GraphLoader(
    text_splitter=text_splitter,
    embedding_model=embedding_model,
    crud=crud,
    data=data,
)



2025-03-01 00:22:09,217 [DEBUG] embedding_model - CUDA is available, using GPU
2025-03-01 00:22:27,741 [DEBUG] embedding_model - Embedding model initialized: neuml/pubmedbert-base-embeddings


Connection successful!


In [4]:
# Fetch the stored embeddings through the graph loader. The next cell reads an
# 'id' and an 'embedding' entry from every element of the result.
records = graph_loader.get_embeddings_from_graph()

In [5]:
import numpy as np

# Two parallel lists in the same order as `records`: position k of `ids_list`
# corresponds to position k of `embeddings_list`.
ids_list = [record['id'] for record in records]
embeddings_list = [record['embedding'] for record in records]

# Turn the list of embeddings into a NumPy array for the similarity computation.
embeddings_array = np.asarray(embeddings_list)

In [6]:
# similarities
from sklearn.metrics.pairwise import cosine_similarity

def _compute_similarities(embeddings):
    """
    Computes the cosine similarity matrix for the embeddings.
    
    Args:
    - embeddings (numpy.ndarray): An array of embeddings.
    
    Returns:
    - numpy.ndarray: A cosine similarity matrix for the embeddings.
    """
    return cosine_similarity(embeddings)

# Pairwise cosine similarities between all embeddings; row/column order follows `embeddings_array`.
similarity_results = _compute_similarities(embeddings=embeddings_array)

In [8]:
def filter_similarities(similarity_matrix, node_ids, threshold=0.8):
    """
    Filters a cosine similarity matrix to find pairs of nodes with similarity above a threshold.

    Only the strict upper triangle of the matrix is inspected, so each unordered
    pair is reported once and a node is never paired with itself.

    Args:
        similarity_matrix (numpy.ndarray): The square cosine similarity matrix.
        node_ids (list): A list of node IDs corresponding to the rows/columns of the matrix.
        threshold (float): The similarity threshold (inclusive).

    Returns:
        list: A list of dictionaries, ordered by row index and then column index,
              where each dictionary represents a pair of nodes with similarity
              greater than or equal to the threshold and contains:
                - node1: The ID of the first node.
                - node2: The ID of the second node.
                - similarity: The cosine similarity between the two nodes, as a
                  plain Python float.

    Raises:
        ValueError: If a non-empty matrix is not of shape
            (len(node_ids), len(node_ids)).
    """
    matrix = np.asarray(similarity_matrix)

    # Nothing to compare when the matrix is empty.
    if matrix.size == 0:
        return []

    num_nodes = len(node_ids)
    # A mismatch would otherwise either silently ignore part of the matrix or
    # fail later with an IndexError, so reject it up front.
    if matrix.shape != (num_nodes, num_nodes):
        raise ValueError(
            f"similarity_matrix has shape {matrix.shape}, expected "
            f"({num_nodes}, {num_nodes}) to match node_ids"
        )

    # k=1 keeps only entries strictly above the diagonal: no self-pairs, no duplicates.
    above_threshold = np.triu(matrix >= threshold, k=1)
    # np.nonzero walks the mask in row-major order, i.e. the same (i, j) order
    # as a nested "for i / for j > i" loop.
    row_idx, col_idx = np.nonzero(above_threshold)
    # .tolist() converts NumPy scalars to built-in int/float values.
    similarities = matrix[row_idx, col_idx].tolist()

    return [
        {
            "node1": node_ids[i],
            "node2": node_ids[j],
            "similarity": float(similarity),
        }
        for i, j, similarity in zip(row_idx.tolist(), col_idx.tolist(), similarities)
    ]

# Keep node pairs whose similarity is at least 0.70; this explicit value overrides the function's 0.8 default.
filtered_results = filter_similarities(similarity_matrix=similarity_results, node_ids=ids_list, threshold=0.70)

In [10]:
# Number of node pairs that met the similarity threshold.
len(filtered_results)

53

### Node deletion

In [8]:
# Manual cleanup, left disabled: uncomment to delete the single node with the
# given ID through the loader's CRUD client.
# graph_loader.crud.delete_node(node_id="4:31b5014d-cb89-451c-ae23-47ad943d4cb4:3409")