# Knowledge Graph Generation Using LangChain Libraries

## Set up the LLM and graph DB

In [1]:
# Install langchain_experimental into the kernel's environment; -q keeps pip's output quiet.
%pip install langchain_experimental -q

Note: you may need to restart the kernel to use updated packages.


In [None]:
from langchain_huggingface import HuggingFaceEndpoint
# Configuration for the text-generation inference endpoint.
import os

# NOTE(review): the default ":8080" has no scheme or host, so it looks like a
# placeholder. Set the HOST_URL_INF environment variable to the full endpoint
# URL instead of editing the notebook.
HOST_URL_INF = os.environ.get("HOST_URL_INF", ":8080")
# Upper bound on the number of tokens generated per request.
MAX_NEW_TOKENS = 2024


llm = HuggingFaceEndpoint(
    endpoint_url=HOST_URL_INF,
    task="text-generation",
    max_new_tokens=MAX_NEW_TOKENS,
    do_sample=False,  # sampling disabled
)
# Optional smoke test of the endpoint:
#print(llm.invoke("What is HuggingFace?"))

In [3]:
import pickle
import os

# Load the pickled document collections (PDF, YouTube and blog documents).
# NOTE: pickle.load can execute arbitrary code; only load files you created or trust.
pickle_file_path = '../../data/storage/full_all_documents.pkl'
if not os.path.exists(pickle_file_path):
    # Stop here with a clear error instead of printing a message and then
    # failing with a NameError on the first use of the document lists.
    raise FileNotFoundError(f"Pickle file not found: {pickle_file_path}")

with open(pickle_file_path, 'rb') as f:
    all_pdf_docs, all_yt_docs, all_blog_docs = pickle.load(f)

# Check that the documents are loaded
print("Number of PDF documents:", len(all_pdf_docs))
print("Number of YouTube documents:", len(all_yt_docs))
print("Number of blog documents:", len(all_blog_docs))

# Combine the three collections into one list for downstream processing
all_docs = all_pdf_docs + all_yt_docs + all_blog_docs
print("Total number of documents:", len(all_docs))

Number of PDF documents: 2048
Number of YouTube documents: 442
Number of blog documents: 11
Total number of documents: 2501


In [4]:
# Peek at the metadata of the first loaded document
sample_metadata = all_docs[0].metadata
print(sample_metadata)

{'source': 'C:\\Users\\jonathan.kasprisin\\github\\Learning\\KG_ilp\\data\\pdfs\\Gilbert_Strang_Linear_Algebra_and_Its_Applicatio_230928_225121.pdf', 'page': 0}


## Create a graph document from each document

In [5]:
from langchain_experimental.graph_transformers import LLMGraphTransformer
from tqdm import tqdm

llm_transformer = LLMGraphTransformer(llm=llm)
# NOTE: `dir` shadows the built-in dir(); the name is kept because later cells
# build their output paths from it.
dir = "../../data/langchain_KG/"
error_log_path = dir + "generation_error_log.txt"

# Create the output directory if it does not exist (no check-then-create race)
os.makedirs(dir, exist_ok=True)

# Initialize the error log file; "w" truncates any log left by a previous run
with open(error_log_path, "w", encoding="utf-8") as f:
    f.write("Error log for langchain graph document generation\n")

# Convert documents one at a time so that a failure on a single document is
# logged and skipped instead of aborting the whole run.
graph_documents = []
failed_count = 0
for doc in tqdm(all_docs, desc="Converting documents to graph documents", unit="doc"):
    try:
        graph_doc = llm_transformer.convert_to_graph_documents([doc])
        graph_documents.extend(graph_doc)
    except Exception as e:
        failed_count += 1
        # Explicit utf-8 so non-ASCII metadata or error text cannot raise an
        # encoding error inside this handler and kill the loop.
        with open(error_log_path, "a", encoding="utf-8") as f:
            f.write(f" 'doc.metadata': '{doc.metadata}', 'error': {e}\n")


print(f"Converted {len(graph_documents)} documents to graph documents.")
if failed_count:
    print(f"{failed_count} documents failed; see {error_log_path}")

# Show the first result as a sample; guard against an empty result list
if graph_documents:
    print(f"Nodes:{graph_documents[0].nodes}")
    print(f"Relationships:{graph_documents[0].relationships}")
else:
    print("No graph documents were produced.")

Converting documents to graph documents: 100%|██████████| 2501/2501 [10:02:49<00:00, 14.46s/doc]  

Converted 2479 documents to graph documents.
Nodes:[]
Relationships:[]





In [6]:
# Persist the converted graph documents in the output directory
pickle_file_path = dir + 'full_graph_documents.pkl'
with open(pickle_file_path, 'wb') as pkl_out:
    pickle.dump(graph_documents, pkl_out)

In [11]:
# Inspect the second graph document and the metadata of its source document
second_graph_doc = graph_documents[1]
print(second_graph_doc)
print(second_graph_doc.source.metadata)

nodes=[] relationships=[] source=Document(metadata={'source': 'C:\\Users\\jonathan.kasprisin\\github\\Learning\\KG_ilp\\data\\pdfs\\Gilbert_Strang_Linear_Algebra_and_Its_Applicatio_230928_225121.pdf', 'page': 1}, page_content='Linear Algebra and Its Applications\nFourth Edition\nGilbert Strang\ny\nx y z \x1e \x0c \nz\nAx b\x1e\nb\n0\nAy b\x1e\n0Az \x1e\n0')
{'source': 'C:\\Users\\jonathan.kasprisin\\github\\Learning\\KG_ilp\\data\\pdfs\\Gilbert_Strang_Linear_Algebra_and_Its_Applicatio_230928_225121.pdf', 'page': 1}


### Create graph with NetworkX

In [12]:
# Keep only nodes that have a non-empty type, and drop any document that is
# left with no nodes; dropped documents are recorded in dropped_docs.txt.
# NOTE: doc.nodes is reassigned in place, so the objects in graph_documents
# are modified as well.
filtered_graph_documents = []
# Open the log once in "w" mode so re-running this cell does not append
# duplicate entries; utf-8 keeps non-ASCII metadata from failing the write.
with open(dir+"dropped_docs.txt", "w", encoding="utf-8") as f:
    for doc in graph_documents:
        valid_nodes = [node for node in doc.nodes if node.type]
        if valid_nodes:
            doc.nodes = valid_nodes
            filtered_graph_documents.append(doc)
        else:
            f.write(f" 'Dropped doc.metadata': '{doc.source.metadata}'\n")



In [13]:
# Report how many graph documents survived the filtering step
kept_count = len(filtered_graph_documents)
print(f"Filtered {kept_count} documents to graph documents.")

Filtered 983 documents to graph documents.


In [14]:
from langchain_community.graphs.networkx_graph import NetworkxEntityGraph
import networkx as nx


def make_nxe_graph(graph_documents) -> NetworkxEntityGraph:
    """Build a NetworkxEntityGraph from a collection of graph documents.

    Args:
        graph_documents: Sized iterable of graph documents. Each one must
            expose ``nodes`` (objects with an ``id``), ``relationships``
            (objects with ``source.id``, ``target.id`` and ``type``) and
            ``source.metadata`` (used only in error messages).

    Returns:
        The populated NetworkxEntityGraph wrapper (not a bare ``nx.Graph``);
        the underlying networkx graph is reachable through ``._graph``.
    """
    print(f"Making nx graph from {len(graph_documents)} graph documents")
    graph_nxe = NetworkxEntityGraph()
    for doc in graph_documents:
        # Best effort: a document that fails is reported and skipped so the
        # remaining documents are still added.
        try:
            for node in doc.nodes:
                graph_nxe.add_node(node.id)
            for edge in doc.relationships:
                # Edges are added directly on the wrapped networkx graph, with
                # the relationship type stored in the "relation" edge attribute.
                graph_nxe._graph.add_edge(edge.source.id, edge.target.id, relation=edge.type)
        except Exception as e:
            print(f"Error adding document to nx graph: {doc.source.metadata}, {e}")
    print(f"nx graph built with {graph_nxe.get_number_of_nodes()} nodes.")
    return graph_nxe


# Build the entity graph from the filtered graph documents.
graph_nxe = make_nxe_graph(filtered_graph_documents)




Making nx graph from 983 graph documents
nx graph built with 10177 nodes.


In [15]:
# Show an example triple and the knowledge stored around its first element
node = 1  # NOTE(review): not used in this cell — confirm nothing else needs it before removing

try:
    triples = graph_nxe.get_triples()
    if triples:
        print(f"Example triple: {triples[0]}")
        entity = triples[0][0]
        print(f"Example node: {entity}")
        # Second argument is presumably the traversal depth — TODO confirm
        knowledge = graph_nxe.get_entity_knowledge(entity, 3)
        print(f"Example node knowledge: {knowledge}")
    else:
        # An empty graph would otherwise raise IndexError on triples[0],
        # which the AttributeError handler below does not catch.
        print("The graph contains no triples to display.")

except AttributeError:
    print("Unable to access node data. Check the class documentation for the correct method.")


Example triple: ('Positive Definite Matrices', 'Minima, Maxima, and Saddle Points', 'INCLUDES')
Example node: Positive Definite Matrices
Example node knowledge: ['Positive Definite Matrices INCLUDES Minima, Maxima, and Saddle Points', 'Positive Definite Matrices INCLUDES Tests for Positive Definiteness', 'Positive Definite Matrices INCLUDES Singular Value Decomposition', 'Singular Value Decomposition DESCRIBED_BY The SVD is closely associated with the eigenvalue-eigenvector factorizationQΛQT of a positive deﬁnite matrix', 'Singular Value Decomposition IS_EXPLAINED_IN Proof of the Singular Value Decomposition', 'Positive Definite Matrices INCLUDES Minimum Principles', 'Positive Definite Matrices INCLUDES The Finite Element Method']


In [16]:
# Export the wrapped networkx graph to a GraphML file in the output directory
graph_file_path = dir + 'langchain_full_kg.graphml'
nx_graph = graph_nxe._graph
nx.write_graphml(nx_graph, graph_file_path)


##