# Sandbox
test functions and misc scripts

## print info about the directory

In [1]:
# #print eveything in this directory so who subdirectories and files
# import os

# def print_directory_contents(directory):
#     for root, dirs, files in os.walk(directory):
#         for name in dirs:
#             print(os.path.join(root, name))
#         for name in files:
#             print(os.path.join(root, name))

# # Example usage
# print_directory_contents(os.getcwd())

## update metadata

In [2]:
import GraphReasoning_Mod
import importlib

# Re-import all objects from the module
from GraphReasoning_Mod import *

from langchain_huggingface import HuggingFaceEndpoint

#Initialize the model endpoint
HOST_URL_INF = ":8080" #Mistral-NeMo-Instruct-2407
MAX_NEW_TOKENS = 1012

TEMPERATURE = 0.2
TIMEOUT = 120
TOP_P = .9

llm = HuggingFaceEndpoint(
    endpoint_url=HOST_URL_INF,
    task="text-generation",
    max_new_tokens=MAX_NEW_TOKENS,
    do_sample=False,
    temperature = TEMPERATURE,
    timeout=TIMEOUT,
    top_p=TOP_P
)

from langchain_huggingface import HuggingFaceEmbeddings

model_name = "dunzhang/stella_en_1.5B_v5" #"BAAI/bge-small-en-v1.5" #dunzhang/stella_en_1.5B_v5
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}
embd = HuggingFaceEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

import pickle
import os

# load pickled documents
pickle_file_path = './data/storage/full_all_documents.pkl'
if os.path.exists(pickle_file_path):
    with open(pickle_file_path, 'rb') as f:
        all_pdf_docs, all_yt_docs, all_blog_docs = pickle.load(f)
else:
    print("Pickle file not found.")

#check if the documents are loaded
print("Number of PDF documents:", len(all_pdf_docs))
print("Number of YouTube documents:", len(all_yt_docs))
print("Number of blog documents:", len(all_blog_docs))

#standardize the metadata
all_pdf_docs, all_yt_docs, all_blog_docs = standardize_document_metadata(all_pdf_docs, all_yt_docs, all_blog_docs)

#print example of each documents metadata
print("PDF document example:")
print(all_pdf_docs[0].metadata)
print("YouTube document example:")
print(all_yt_docs[0].metadata)
print("Blog document example:")
print(all_blog_docs[0].metadata)


# Combine all documents into a single list
all_docs = all_pdf_docs+  all_yt_docs+  all_blog_docs

print(f"Total number of documents: {len(all_docs)}")





Number of PDF documents: 2048
Number of YouTube documents: 442
Number of blog documents: 11
PDF document example:
{'source': 'Gilbert_Strang_Linear_Algebra_and_Its_Applicatio_230928_225121.pdf - page: 0', 'source_type': 'Textbook_PDF', 'title': 'Linear Algebra and its Applications (4th ed.)', 'author': 'Gilbert Strang'}
YouTube document example:
{'source': 'ZKUqtErZCiU', 'source_type': 'youtube', 'title': "What's the big idea of Linear Algebra?    **Course Intro**", 'author': ''}
Blog document example:
{'source': 'https://www.3blue1brown.com/topics/linear-algebra', 'source_type': 'blog', 'title': '3Blue1Brown', 'author': ''}
Total number of documents: 2501


In [3]:
G_new, node_embeddings, res= add_new_subgraph_from_docs(
    all_pdf_docs[10:11], llm, embd, 
    data_dir_output='./data/temp/', verbatim=False, size_threshold=10,
    chunk_size=2500, do_Louvain_on_new_graph=True,
    include_contextual_proximity=False, repeat_refine=0,
    similarity_threshold=0.95, do_simplify_graph=True,
    return_only_giant_component=False, save_common_graph=False,
    G_exisiting=None, existing_node_embeddings=None
    )

if G_new:
    print("----New subgraph created successfully.")
    print("Nodes:", G_new.nodes())
    print("Edges:", G_new.edges(data=True))
else:
    print("Failed to create a new subgraph.")

DEBUG make_graph_from_doc_batch concept list: [{'node_1': 'System of equations {1x + 2y = 3, 4x + 5y = 6}', 'node_2': 'Two unknowns x and y', 'edge': 'Involves', 'chunk_id': '03a28f59f00a4855b83109df3f286a0d', 'source': 'Gilbert_Strang_Linear_Algebra_and_Its_Applicatio_230928_225121.pdf - page: 10', 'metadata': {'source': 'Gilbert_Strang_Linear_Algebra_and_Its_Applicatio_230928_225121.pdf - page: 10', 'source_type': 'Textbook_PDF', 'title': 'Linear Algebra and its Applications (4th ed.)', 'author': 'Gilbert Strang', 'start_index': 0}}, {'node_1': 'Elimination method', 'node_2': 'System of equations {1x + 2y = 3, 4x + 5y = 6}', 'edge': 'Is applied to', 'chunk_id': '03a28f59f00a4855b83109df3f286a0d', 'source': 'Gilbert_Strang_Linear_Algebra_and_Its_Applicatio_230928_225121.pdf - page: 10', 'metadata': {'source': 'Gilbert_Strang_Linear_Algebra_and_Its_Applicatio_230928_225121.pdf - page: 10', 'source_type': 'Textbook_PDF', 'title': 'Linear Algebra and its Applications (4th ed.)', 'author'

In [4]:
import pandas as pd

# create a df from a csv file
file_path = 'C:\\Users\\jonathan.kasprisin\\gitlab\\DNoK_GraphRAG\\data\\temp\\graph_new_graph.csv'
df = pd.read_csv(file_path, sep='|')


# Extract unique nodes and their corresponding sources from both node_1 and node_2
unique_nodes_sources = pd.concat([
    df[['node_1', 'source', 'chunk_id']].rename(columns={'node_1': 'node'}),
    df[['node_2', 'source', 'chunk_id']].rename(columns={'node_2': 'node'})
]).drop_duplicates()

print("Unique nodes shape: ", unique_nodes_sources.shape)

# Merge metadata from df into unique_nodes_sources based on chunk_id
metadata_df = df[['chunk_id', 'metadata']].drop_duplicates()

unique_nodes_sources = unique_nodes_sources.merge(
    metadata_df,
    on='chunk_id',
    how='left'
)

print("Unique nodes merged with metadata shape: ", unique_nodes_sources.shape)


# Create new rows with source as node_1 and unique nodes as node_2
# Create new rows using dictionary comprehension for better readability
new_rows_df = unique_nodes_sources.assign(
    node_1=unique_nodes_sources['source'],
    node_2=unique_nodes_sources['node'],
    edge='is source document of'
)[['node_1', 'node_2', 'chunk_id', 'source', 'metadata', 'edge']]

# Append new rows to the original DataFrame
df = pd.concat([df, new_rows_df], ignore_index=True)

print("Updated DF shape: ", df.shape)
display(df.head())

Unique nodes shape:  (16, 3)
Unique nodes merged with metadata shape:  (16, 4)
Updated DF shape:  (41, 6)


Unnamed: 0,node_1,node_2,edge,chunk_id,source,metadata
0,"system of equations {1x + 2y = 3, 4x + 5y = 6}",two unknowns x and y,Involves,03a28f59f00a4855b83109df3f286a0d,Gilbert_Strang_Linear_Algebra_and_Its_Applicat...,{'source': 'Gilbert_Strang_Linear_Algebra_and_...
1,elimination method,"system of equations {1x + 2y = 3, 4x + 5y = 6}",Is applied to,03a28f59f00a4855b83109df3f286a0d,Gilbert_Strang_Linear_Algebra_and_Its_Applicat...,{'source': 'Gilbert_Strang_Linear_Algebra_and_...
2,subtracting 4 times the first equation from th...,eliminating x from the second equation,Results in,03a28f59f00a4855b83109df3f286a0d,Gilbert_Strang_Linear_Algebra_and_Its_Applicat...,{'source': 'Gilbert_Strang_Linear_Algebra_and_...
3,equation -3y = -6,value of y = 2,Yields,03a28f59f00a4855b83109df3f286a0d,Gilbert_Strang_Linear_Algebra_and_Its_Applicat...,{'source': 'Gilbert_Strang_Linear_Algebra_and_...
4,value of y = 2,equation 1x + 2(2) = 3,Is substituted into,03a28f59f00a4855b83109df3f286a0d,Gilbert_Strang_Linear_Algebra_and_Its_Applicat...,{'source': 'Gilbert_Strang_Linear_Algebra_and_...


In [5]:
#caluclate the simularity between node_1[0] and all node_1 all node_2

node_embeddings = {}

# Combine unique nodes from node_1 and node_2
unique_nodes = set(df['node_1'].unique()).union(set(df['node_2'].unique()))

# Calculate embeddings for all unique nodes
for node in unique_nodes:
    node_embeddings[node] = embd.embed_query(node)


# Create a list of nodes and corresponding embeddings matrix
nodes = list(node_embeddings.keys())
embeddings_matrix = np.array([node_embeddings[node] for node in nodes])

similarity_matrix = cosine_similarity(embeddings_matrix)





In [9]:
similarity_thresholds = [0.95, 0.9, 0.85]
for similarity_threshold in similarity_thresholds:
    print(f"Similarity threshold: {similarity_threshold}")
    similarity_matrix_over_threshold = similarity_matrix > similarity_threshold
    percentage_over_threshold = np.sum(similarity_matrix_over_threshold) / similarity_matrix.size * 100
    print(f"Percentage of node pairs over the similarity threshold: {percentage_over_threshold:.2f}%\n")

# Print nodes pairs that are over the threshold of 0.90
print("Nodes pairs that are over the threshold of 0.90:")
for i in range(similarity_matrix.shape[0]):
    for j in range(i+1, similarity_matrix.shape[1]):
        if similarity_matrix[i, j] > 0.90:
            print(f"Similarity between '{nodes[i]}' and '{nodes[j]}': {similarity_matrix[i, j]}")


Similarity threshold: 0.95
Percentage of node pairs over the similarity threshold: 6.25%

Similarity threshold: 0.9
Percentage of node pairs over the similarity threshold: 9.38%

Similarity threshold: 0.85
Percentage of node pairs over the similarity threshold: 17.97%

Nodes pairs that are over the threshold of 0.90:
Similarity between 'value of y = 2' and 'value of x = -1': 0.9058615695282606
Similarity between 'equation -3y = -6' and 'determinant formula': 0.9388589393885567
Similarity between 'equation -3y = -6' and 'solution for y (and x)': 0.9481471150735821
Similarity between 'determinant formula' and 'solution for y (and x)': 0.9441655305461931
