In [1]:
%load_ext autoreload
%autoreload 2


import sys
sys.path.append("..")

Now, iText2KG is compatible with all language models supported by LangChain. 

To use iText2KG, you will need both a chat model and an embeddings model. 

For available chat models, refer to the options listed at: https://python.langchain.com/v0.2/docs/integrations/chat/. 
For embedding models, explore the choices at: https://python.langchain.com/v0.2/docs/integrations/text_embedding/. 

This notebook will show you how to run iText2KG using Mistral, Ollama, and OpenAI models. 

**Please ensure that you install the necessary package for each chat model before use.**

# Mistral

For Mistral, please set up your model using the tutorial here: https://python.langchain.com/v0.2/docs/integrations/chat/mistralai/. Similarly, for the embedding model, follow the setup guide here: https://python.langchain.com/v0.2/docs/integrations/text_embedding/mistralai/ .

In [3]:
from langchain_mistralai import ChatMistralAI
from langchain_mistralai import MistralAIEmbeddings

mistral_api_key = "##"
mistral_llm_model = ChatMistralAI(
    api_key = mistral_api_key,
    model="mistral-large-latest",
    temperature=0,
    max_retries=2,
)


mistral_embeddings_model = MistralAIEmbeddings(
    model="mistral-embed",
    api_key = mistral_api_key
)

ModuleNotFoundError: No module named 'langchain_mistralai'

# OpenAI

The same applies for OpenAI. 

please setup your model using the tutorial : https://python.langchain.com/v0.2/docs/integrations/chat/openai/
The same for embedding model : https://python.langchain.com/v0.2/docs/integrations/text_embedding/openai/

In [None]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

openai_api_key = "your_api_key"

openai_llm_model = ChatOpenAI(
    api_key = openai_api_key,
    model="gpt-4o",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

openai_embeddings_model = OpenAIEmbeddings(
    api_key = openai_api_key ,
    model="text-embedding-3-large",
)

TypeError: Client.__init__() got an unexpected keyword argument 'proxies'

# ZhipuAI

In [None]:
from langchain_community.chat_models import ChatZhipuAI


# 设置 API key（推荐用环境变量）
zhipu_api_key = "your_api_key"

# 初始化智谱大语言模型
zhipu_llm_model = ChatZhipuAI(
    api_key=zhipu_api_key,
    model="glm-4",  # 可选模型：glm-3-turbo / glm-4 / glm-4v
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)



# Ollama

The same applies for Ollama. 

please setup your model using the tutorial : https://python.langchain.com/v0.2/docs/integrations/chat/ollama/
The same for embedding model : https://python.langchain.com/v0.2/docs/integrations/text_embedding/openai/

In [2]:
from langchain_ollama import ChatOllama, OllamaEmbeddings

llm = ChatOllama(
    model="llama3",
    temperature=0,
)

embeddings = OllamaEmbeddings(
    model="llama3",
)

# iText2KG

* Use case: we aim to connect two scientific papers. 

* The objective is to detect common key concepts between the two papers and allowing for the identification of central themes, keywords, and topics that dominate each paper. These themes could be linked to show overlaps or gaps in coverage, helping researchers identify areas where more study might be needed or where novel connections could be made.

## Document Distiller

### Scientific articles

In [5]:
from langchain.document_loaders import PyPDFLoader
from itext2kg.documents_distiller import DocumentsDistiller, Article
from pydantic import BaseModel, Field
from typing import List, Tuple


class ArticleResults(BaseModel):
    abstract:str = Field(description="Brief summary of the article's abstract")
    key_findings:str = Field(description="The key findings of the article")
    limitation_of_sota : str=Field(description="limitation of the existing work")
    proposed_solution : str = Field(description="the proposed solution in details")
    paper_limitations : str=Field(description="The limitations of the proposed solution of the paper")

# Sample input data as a list of triplets
# It is structured in this manner : (document's path, page_numbers_to_exclude, blueprint, document_type)
documents_information = [
    ("../datasets/llm-tikg.pdf", [11,10], ArticleResults, 'scientific article'),
    ("../datasets/actionable-cyber-threat.pdf", [12,11,10], ArticleResults, 'scientific article')
]

"""
将每个文档根据提示词蒸馏得到distilled_doc,
最终返回distilled_docs
"""

def upload_and_distill(documents_information: List[Tuple[str, List[int], BaseModel]]):
    distilled_docs = []
    
    for path_, exclude_pages, blueprint, document_type in documents_information:
        
        loader = PyPDFLoader(path_)
        pages = loader.load_and_split()
        pages = [page for page in pages if page.metadata["page"]+1 not in exclude_pages] # Exclude some pages (unecessary pages, for example, the references)
        document_distiller = DocumentsDistiller(llm_model=llm)
        
        IE_query = f'''
        # DIRECTIVES : 
        - Act like an experienced information extractor.
        - You have a chunk of a {document_type}
        - If you do not find the right information, keep its place empty.
        '''
        
        # Distill document content with query
        distilled_doc = document_distiller.distill(
            documents=[page.page_content.replace("{", '[').replace("}", "]") for page in pages],
            IE_query=IE_query,
            output_data_structure=blueprint
        )
        
        # Filter and format distilled document results
        distilled_docs.append([
            f"{document_type}'s {key} - {value}".replace("{", "[").replace("}", "]") 
            for key, value in distilled_doc.items() 
            if value and value != []
        ])
    
    return distilled_docs


In [6]:
def my_upload_and_distill(documents_information: List[Tuple[str, List[int], BaseModel]]):
    distilled_docs = []
    
    for path_, exclude_pages, blueprint, document_type in documents_information:
        
        loader = PyPDFLoader(path_)
        pages = loader.load_and_split()
        pages = [page for page in pages if page.metadata["page"]+1 not in exclude_pages] # Exclude some pages (unecessary pages, for example, the references)
        document_distiller = DocumentsDistiller(llm_model=llm)
        
        IE_query = f'''
        # DIRECTIVES : 
        - Act like an experienced information extractor.
        - You have a chunk of a {document_type}
        - If you do not find the right information, keep its place empty.
        '''
        
        # Distill document content with query
        distilled_doc = document_distiller.distill(
            documents=[page.page_content.replace("{", '[').replace("}", "]") for page in pages],
            IE_query=IE_query,
            output_data_structure=blueprint
        )
        
        # Filter and format distilled document results
        distilled_docs.append([
            f"{document_type}'s {key} - {value}".replace("{", "[").replace("}", "]") 
            for key, value in distilled_doc.items() 
            if value and value != []
        ])
    
    return distilled_docs

my_documents_information = [
    ("../datasets/scientific_articles/llm-tikg.pdf", [2,3,4,5,6,7,8,9,10,11], ArticleResults, 'scientific article')
]

my_distilled_docs = my_upload_and_distill(my_documents_information)

# Save distilled_docs to a Markdown file
with open("distilled_docs.md", "w", encoding="utf-8") as md_file:
    md_file.write("# Distilled Documents\n\n")
    for doc in my_distilled_docs:
        md_file.write("\n".join(doc) + "\n\n")

print("Distilled documents saved to distilled_docs.md")

Distilled documents saved to distilled_docs.md


In [6]:
distilled_docs = upload_and_distill(documents_information=documents_information)

## iText2KG for graph construction

In [7]:
from itext2kg import iText2KG


itext2kg = iText2KG(llm_model = llm, embeddings_model = embeddings)

We construct the first knowledge graph of the first distilled documents (for the first article)

In [8]:
kg = itext2kg.build_graph(sections=distilled_docs[0], ent_threshold=0.7, rel_threshold=0.7)
"""
success = False
attempts = 0

while not success:
    try:
        kg = itext2kg.build_graph(sections=distilled_docs[0], ent_threshold=0.7, rel_threshold=0.7)
        success = True
    except Exception as e:
        attempts += 1
        print(f"Attempt {attempts} failed with error: {e}") 
"""


[INFO] ------- Extracting Entities from the Document 1
[INFO] ------- Extracting Relations from the Document 1
[INFO] Verification of invented entities
[DEBUG] Processing relationship: {'startNode': {'label': 'Malware', 'name': 'identical malware techniques'}, 'endNode': {'label': 'Attackers', 'name': 'malware and attackers'}, 'name': 'employs'}
[INFO] ------- Extracting Entities from the Document 2
[INFO] ------- Extracting Relations from the Document 2
[INFO] Verification of invented entities
[DEBUG] Processing relationship: {'startNode': {'label': 'Key_Findings', 'name': 'The main results and discoveries presented in the article (second instance)'}, 'endNode': {'label': 'Proposed_Solution', 'name': 'The new approach, method, or technique proposed in the article to address the limitations of sota (second instance)'}, 'name': 'Addresses'}
[DEBUG] Processing relationship: {'startNode': {'label': 'Limitation_of_SOTA', 'name': 'The limitations of the current state of the art (sota) metho

'\nsuccess = False\nattempts = 0\n\nwhile not success:\n    try:\n        kg = itext2kg.build_graph(sections=distilled_docs[0], ent_threshold=0.7, rel_threshold=0.7)\n        success = True\n    except Exception as e:\n        attempts += 1\n        print(f"Attempt {attempts} failed with error: {e}") \n'

We construct the second graph, noting that we already have an existing knowledge graph (for the first article).

In [9]:
kg2 = itext2kg.build_graph(sections=distilled_docs[1], existing_knowledge_graph=kg, rel_threshold=0.7, ent_threshold=0.7)

[INFO] ------- Extracting Entities from the Document 1
[INFO] ------- Extracting Relations from the Document 1
[INFO] Verification of invented entities
[DEBUG] Processing relationship: {'startNode': {'label': 'Data_Structure', 'name': 'Cyber threat intelligence report'}, 'endNode': {'label': 'Methodology', 'name': 'Few shot prompting and fine tuning'}, 'name': 'Uses'}
[DEBUG] Processing relationship: {'startNode': {'label': 'Person', 'name': 'John Doe'}, 'endNode': {'label': 'Technique', 'name': 'Neural networks'}, 'name': 'Studies'}
[INFO] ------- Extracting Entities from the Document 2
[INFO] Wohoo! Entity was matched --- [entity types in the few shot examples or fine tuning data:Ontology] --merged--> [few shot prompting and fine tuning:Methodology]
[INFO] ------- Extracting Relations from the Document 2
[INFO] Verification of invented entities
[DEBUG] Processing relationship: {'startNode': {'label': 'Fine_tuned_model', 'name': '7b chat model'}, 'endNode': {'label': 'Entity_types', '

# Draw the graph
---

The final section involves visualizing the constructed knowledge graph using GraphIntegrator. The graph database Neo4j is accessed using specified credentials, and the resulting graph is visualized to provide a visual representation of the relationships and entities extracted from the document.

In [None]:
from itext2kg.graph_integration import GraphIntegrator


URI = "bolt://localhost:7687"
USERNAME = "neo4j"
PASSWORD = "your_password"

GraphIntegrator(uri=URI, username=USERNAME, password=PASSWORD).visualize_graph(knowledge_graph=kg2)

ServiceUnavailable: Couldn't connect to localhost:7687 (resolved to ()):
Failed to establish connection to ResolvedIPv4Address(('127.0.0.1', 7687)) (reason [WinError 10061] 由于目标计算机积极拒绝，无法连接。)
Failed to establish connection to ResolvedIPv6Address(('::1', 7687, 0, 0)) (reason [WinError 10061] 由于目标计算机积极拒绝，无法连接。)