In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell runs, so edits to local source files take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

import sys
# Add the parent directory to the import path.
# NOTE(review): presumably so the local `itext2kg` package is importable from
# this notebook's folder — confirm against the repository layout.
sys.path.append("..")

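The same setup approach applies to Ollama: configure a chat model and an embedding model before running the rest of the notebook.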

Please set up your chat model by following this tutorial: https://python.langchain.com/v0.2/docs/integrations/chat/ollama/
Do the same for the embedding model: https://python.langchain.com/v0.2/docs/integrations/text_embedding/ollama/

In [2]:
from langchain_ollama import ChatOllama, OllamaEmbeddings

# Chat model accessed through Ollama; temperature=0 asks for the least random sampling.
llm = ChatOllama(model="deepseek-r1:32b", temperature=0)

# Embedding model, also accessed through Ollama.
embeddings = OllamaEmbeddings(model="nomic-embed-text:latest")

# iText2KG

* Use case: we aim to connect two scientific papers. 

* The objective is to detect the key concepts the two papers have in common, which allows us to identify the central themes, keywords, and topics that dominate each paper. These themes can then be linked to show overlaps or gaps in coverage, helping researchers identify areas where more study might be needed or where novel connections could be made.

## Document Distiller

### Scientific articles

In [4]:
from langchain.document_loaders import PyPDFLoader
from itext2kg.documents_distiller import DocumentsDistiller, Article
from pydantic import BaseModel, Field
from typing import List, Tuple


class ArticleResults(BaseModel):
    # Blueprint of the fields to extract from a scientific article. Each field
    # is a free-text string; the `description` tells the extractor what to put
    # in it. Documented with comments (not a docstring) so the model's
    # generated schema stays exactly as before.
    abstract: str = Field(
        description="Brief summary of the article's abstract",
    )
    key_findings: str = Field(
        description="The key findings of the article",
    )
    limitation_of_sota: str = Field(
        description="limitation of the existing work",
    )
    proposed_solution: str = Field(
        description="the proposed solution in details",
    )
    paper_limitations: str = Field(
        description="The limitations of the proposed solution of the paper",
    )

import os

# Directory that holds the demo PDFs. Set the ITEXT2KG_DEMO_DATA_DIR
# environment variable to point at your own copy; the default is the location
# that was originally hard-coded in this notebook.
DEMO_DATA_DIR = os.environ.get(
    "ITEXT2KG_DEMO_DATA_DIR",
    "/home/jovyan/my_code/itext2kg/datasets/demo_data/article/AD",
)

# Sample input data as a list of 4-tuples, structured as:
# (document's path, page_numbers_to_exclude, blueprint, document_type)
# Page numbers to exclude are matched against `metadata["page"] + 1`.
documents_information = [
    (os.path.join(DEMO_DATA_DIR, "19295912.pdf"), [11, 12], ArticleResults, 'scientific article'),
    (os.path.join(DEMO_DATA_DIR, "23273601.pdf"), list(range(14, 21)), ArticleResults, 'scientific article'),
]

def upload_and_distill(documents_information: List[Tuple[str, List[int], type, str]]):
    """Load each PDF, drop the excluded pages, and distill the rest with the LLM.

    Args:
        documents_information: One 4-tuple per document, in the order
            ``(path, page_numbers_to_exclude, blueprint, document_type)``.
            A page is skipped when ``metadata["page"] + 1`` is listed in
            ``page_numbers_to_exclude``. ``blueprint`` is the class handed to
            the distiller as ``output_data_structure``; ``document_type`` is a
            label interpolated into the prompt and into every output string.

    Returns:
        A list with one entry per input document. Each entry is a list of
        strings shaped like ``"<document_type>'s <key> - <value>"``, one for
        every non-empty field the distiller returned, with curly braces
        replaced by square brackets.
    """
    # The distiller only depends on the shared `llm`, so a single instance
    # serves every document.
    document_distiller = DocumentsDistiller(llm_model=llm)
    distilled_docs = []

    for path_, exclude_pages, blueprint, document_type in documents_information:
        # Exclude some pages (unnecessary pages, for example, the references).
        excluded = set(exclude_pages)
        pages = [
            page
            for page in PyPDFLoader(path_).load_and_split()
            if page.metadata["page"] + 1 not in excluded
        ]

        IE_query = f'''
        # DIRECTIVES : 
        - Act like an experienced information extractor.
        - You have a chunk of a {document_type}
        - If you do not find the right information, keep its place empty.
        '''

        # Distill document content with the query. Curly braces are swapped
        # for square brackets first — presumably so the text cannot be
        # mistaken for template placeholders downstream (confirm).
        distilled_doc = document_distiller.distill(
            documents=[page.page_content.replace("{", "[").replace("}", "]") for page in pages],
            IE_query=IE_query,
            output_data_structure=blueprint,
        )

        # Keep only the fields that actually carry a value and format them.
        distilled_docs.append([
            f"{document_type}'s {key} - {value}".replace("{", "[").replace("}", "]")
            for key, value in distilled_doc.items()
            if value
        ])

    return distilled_docs


In [6]:
# Run the load -> filter -> distill pipeline on every configured document.
# The result holds one list of "<document_type>'s <field> - <value>" strings per document.
distilled_docs = upload_and_distill(documents_information=documents_information)

## iText2KG for graph construction

In [None]:
from itext2kg import iText2KG


# Graph builder that uses the chat model and the embedding model configured above.
# NOTE: the variable reuses the identifier `itext2kg`, which is also the name
# of the package used in this notebook's imports.
itext2kg = iText2KG(llm_model = llm, embeddings_model = embeddings)

We construct the first knowledge graph from the first set of distilled sections (i.e., from the first article).

In [None]:
# Build the knowledge graph from the first document's distilled sections.
# NOTE(review): ent_threshold / rel_threshold look like similarity cut-offs used
# when merging entities / relations — confirm against the iText2KG documentation.
kg = itext2kg.build_graph(sections=distilled_docs[0], ent_threshold=0.7, rel_threshold=0.7)

[INFO] ------- Extracting Entities from the Document 1
{'entities': [{'label': 'Disease', 'name': "Alzheimer's disease"}, {'label': 'Process', 'name': 'Synaptic dysfunction'}, {'label': 'Molecule', 'name': 'Beta-amyloid oligomers (AbO)'}, {'label': 'Process', 'name': 'Gene expression changes'}, {'label': 'Protein', 'name': 'GluR2 subunit of AMPARs'}, {'label': 'Technique', 'name': 'Microarray analyses'}, {'label': 'Structure/Organism part', 'name': 'Synaptoneurosomes'}, {'label': 'RNA/DNA', 'name': 'ApoE mRNA'}, {'label': 'Enzyme/Protein', 'name': 'PKMζ'}, {'label': 'Receptor', 'name': 'M3 muscarinic acetylcholine receptors'}, {'label': 'Technique', 'name': 'In vitro translation experiments'}, {'label': 'Software/Tool', 'name': 'UTRScan and ARED tools'}]}
[INFO] ------- Extracting Relations from the Document 1
{'relationships': [{'startNode': {'label': 'Enzyme_Protein', 'name': 'pkmζ'}, 'endNode': {'label': 'Process', 'name': 'synaptic plasticity'}, 'name': 'is_involved_in'}, {'startNo

We construct the second graph, noting that we already have an existing knowledge graph (for the first article).

In [None]:
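# Currently commented out. Uncomment to build the second graph on top of `kg`
# using the second article's distilled sections:
# kg2 = itext2kg.build_graph(sections=distilled_docs[1], existing_knowledge_graph=kg, rel_threshold=0.7, ent_threshold=0.7)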

[INFO] ------- Extracting Entities from the Document 1
{'entities': [{'name': 'Cyber Threat Intelligence', 'label': 'Data Structure'}, {'name': 'Large Language Models', 'label': 'Methodology'}, {'name': 'Knowledge Graphs', 'label': 'Data Structure'}, {'name': 'Llama 2 7B chat', 'label': 'Model'}, {'name': 'Llama 70B chat', 'label': 'Model'}, {'name': 'Mistral', 'label': 'Model'}, {'name': 'Zephyr', 'label': 'Model'}, {'name': 'Prompt Engineering', 'label': 'Technique'}, {'name': 'Link Prediction', 'label': 'Technique'}]}
[Entity(name=large language models, label=Methodology, properties=embeddings=array([-0.00911226,  0.00577835, -0.02530644, ...,  0.00196522,
       -0.01056079,  0.0010359 ])), Entity(name=llama 70b chat, label=Model, properties=embeddings=array([-0.02101934, -0.0112489 , -0.0145149 , ..., -0.00411553,
       -0.00812611,  0.00033667])), Entity(name=zephyr, label=Model, properties=embeddings=array([-0.02932358, -0.00352876, -0.00856502, ...,  0.00673707,
       -0.0070

# Draw the graph
---

The final section involves visualizing the constructed knowledge graph using GraphIntegrator. The graph database Neo4j is accessed using specified credentials, and the resulting graph is visualized to provide a visual representation of the relationships and entities extracted from the document.

In [None]:
from itext2kg.graph_integration import GraphIntegrator

import os
from getpass import getpass

# Connection settings are read from the environment so that no secret is stored
# in the notebook. SECURITY: a database password used to be committed in this
# cell — treat that credential as leaked and rotate it.
URI = os.environ.get("NEO4J_URI", "neo4j+s://d7f45b5f.databases.neo4j.io")
USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
# Fall back to an interactive prompt (input is not echoed) when NEO4J_PASSWORD is unset.
PASSWORD = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")

# Push the knowledge graph to Neo4j and render it.
GraphIntegrator(uri=URI, username=USERNAME, password=PASSWORD).visualize_graph(knowledge_graph=kg)

NameError: name 'kg_' is not defined