### Import necessary libraries

In [1]:
from llama_index.readers.obsidian import ObsidianReader
import openai
from llama_index.core.memory.chat_memory_buffer import MessageRole
from llama_index.core import SimpleDirectoryReader, KnowledgeGraphIndex, VectorStoreIndex
from llama_index.core.graph_stores import SimpleGraphStore
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core import Document, PropertyGraphIndex
from llama_index.core.storage.index_store import SimpleIndexStore
from llama_index.core.vector_stores import SimpleVectorStore
from llama_index.core import Settings
from IPython.display import Markdown, display
from llama_index.llms.ollama import Ollama
import time
import os
from llama_index.core.llms import ChatMessage
from llama_index.core import StorageContext
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core.memory import ChatMemoryBuffer
import logging
import sys
import ipywidgets as widgets
import json
from llama_index.core.callbacks import CallbackManager
from llama_index.core.callbacks import LlamaDebugHandler
from llama_index.core import ServiceContext
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import KnowledgeGraphRAGRetriever
from llama_index.core.indices.property_graph import (
    SimpleLLMPathExtractor,
    SchemaLLMPathExtractor,
    DynamicLLMPathExtractor,
)
import yaml
from llama_index.core import (
    load_index_from_storage,
    load_indices_from_storage,
    load_graph_from_storage,
)
import nest_asyncio

# Let asyncio-based LlamaIndex calls run inside the notebook's already-running
# event loop.
nest_asyncio.apply()

# Send INFO-and-above log records to stdout. The root logger also receives an
# explicit stdout handler; if basicConfig installed one as well, each record
# is printed twice.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)
logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))

### Set LLM with Ollama (Tinyllama)

In [3]:
# TinyLlama served through Ollama; request_timeout is presumably in seconds
# (confirm against the Ollama integration docs).
llm = Ollama(request_timeout=120.0, model="tinyllama:latest")

# Register the model and the chunk size as LlamaIndex-wide defaults.
Settings.llm = llm
Settings.chunk_size = 512

### Set embedding model (Bge-base-en-v1.5)

In [None]:
# Register BAAI/bge-base-en-v1.5 (via the HuggingFace embedding wrapper) as the
# LlamaIndex-wide default embedding model.
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")
# Disabled alternative: OpenAI's text-embedding-3-small model.
#Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")


### Define data directory

In [2]:
# Load the source documents from the local data folder.
# NOTE(review): absolute, machine-specific path -- adjust before running this
# notebook on another machine.
raw_data = SimpleDirectoryReader(
    input_dir=r"/Users/arthursarazin/Documents/RAGdesign/data",
).load_data()

### Utility function

In [None]:
def process_document(doc):
    """Log the id of ``doc`` to stdout and hand the same object back.

    Intended as a per-document hook: it is called once for each document and
    leaves the document untouched.

    Args:
        doc: Any object exposing a ``doc_id`` attribute.

    Returns:
        The ``doc`` object that was passed in, unchanged.
    """
    document_id = doc.doc_id
    print(f"Processing document: {document_id}")
    return doc

# Construct stores and indexes

## Vector 

Data will be transformed and stored as vectors using the embedding model. The vector store will be used to retrieve relevant documents based on the query. A vector in this context is a numerical representation of the text data, which allows for efficient similarity search. A vector index will be created to facilitate this process.

### Construct vector store and index

In [None]:
# Build the vector index from the documents loaded by SimpleDirectoryReader.
# The previous argument (`simple_glossary`) is not defined anywhere in this
# notebook and raised NameError; `raw_data` is the document list loaded in the
# "Define data directory" cell.
# NOTE(review): confirm `raw_data` is the intended corpus for this index.
vector_index = VectorStoreIndex.from_documents(
    documents=raw_data,
    show_progress=True,
)

### Save index

In [6]:
# Give the index an explicit id, then write its storage context to the local
# "vector" directory.
vector_index.set_index_id("vector_index")
vector_index.storage_context.persist(persist_dir="vector")

## Property Graph

Data will be transformed into a property graph format. A property graph is a data structure that consists of nodes (entities) and edges (relationships) with properties. This format allows for complex queries and relationships to be represented and queried efficiently. The property graph store will be used to retrieve relevant documents based on the query. A property graph index will be created to facilitate this process.

### Define a knowledge graph extractor for property graph structure

To define what entities and relation types to extract from the text, we will use a knowledge graph extractor. This extractor will identify and extract relevant entities and relationships from the text data, which will then be used to construct the property graph store and index. 

In this blueprint, the entities are defined by the titles of the Obsidian Markdown files, and the relations are defined by the links between these files. The extractor will identify these entities and relationships and create a property graph structure that can be queried efficiently.



In [None]:
def extract_metadata_and_title(file_path):
    """Return the title and YAML front-matter metadata of a Markdown note.

    The title is the file's base name with a trailing ``.md`` removed. Front
    matter is the text between a leading ``---`` and the next ``---``.

    Args:
        file_path: Path of the note to read (decoded as UTF-8).

    Returns:
        A ``(title, metadata)`` tuple:

        * ``(title, dict)`` when the front matter parses to a mapping;
        * ``(title, None)`` when there is no front matter, it is not
          terminated, it is empty, or it parses to something other than a
          mapping (callers iterate over it with ``.items()``);
        * ``(None, None)`` when parsing the front matter fails; an error
          message is printed in that case.
    """
    file_name = os.path.basename(file_path)
    # Strip only the trailing extension: a blanket str.replace would also
    # remove ".md" appearing in the middle of the file name.
    title = file_name[:-len(".md")] if file_name.endswith(".md") else file_name

    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    if not content.startswith("---"):
        return title, None

    parts = content.split("---", 2)
    if len(parts) <= 2:
        # Opening delimiter without a closing one: treat as "no metadata".
        return title, None

    try:
        metadata = yaml.safe_load(parts[1])
    except Exception as e:
        # Deliberately broad and best-effort: one malformed note must not
        # abort a scan of the whole vault.
        print(f"Erreur lors de l'extraction des métadonnées de {file_path}: {e}")
        return None, None

    if not isinstance(metadata, dict):
        return title, None
    return title, metadata

def process_obsidian_notes(directory):
    """Collect entities, relations and note bodies from a folder of notes.

    Walks ``directory`` recursively and processes every file whose name ends
    with ``.md``. Each note with a usable title becomes one entity; each key
    of its YAML front matter becomes one relation from that note to the
    key's value.

    Args:
        directory: Root folder to scan. A folder that does not exist yields
            empty results, because ``os.walk`` produces nothing for it.

    Returns:
        A ``(entities, relations, contents, validation_schema)`` tuple:

        * ``entities``: list of ``{"name": title}``;
        * ``relations``: list of
          ``{"type": key, "source": title, "target": value}``;
        * ``contents``: list of ``{"content": body}``, aligned with
          ``entities``; ``body`` is the note text without its front matter;
        * ``validation_schema``: dict mapping a title to the list of
          front-matter keys seen for it, in first-seen order.
    """
    entities = []
    relations = []
    contents = []
    # title -> {relation key: None}; a dict is used as an ordered set so the
    # resulting schema is deterministic from run to run.
    relation_types = {}

    for root, _, files in os.walk(directory):
        for file in files:
            if not file.endswith(".md"):
                continue

            file_path = os.path.join(root, file)
            title, metadata = extract_metadata_and_title(file_path)
            if not title:
                # Front matter could not be parsed; skip the note.
                continue

            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()

            # Keep only the text after the front-matter block, if any.
            if content.startswith("---"):
                parts = content.split("---", 2)
                entity_content = parts[2].strip() if len(parts) > 2 else ""
            else:
                entity_content = content.strip()

            entities.append({"name": title})
            contents.append({"content": entity_content})

            # Front matter that is not a mapping (a bare list or scalar) has
            # no key/value pairs to turn into relations.
            if not isinstance(metadata, dict):
                continue

            for key, value in metadata.items():
                relations.append({
                    "type": key,
                    "source": title,
                    "target": value,
                })
                relation_types.setdefault(title, {})[key] = None

    validation_schema = {entity: list(rels) for entity, rels in relation_types.items()}

    return entities, relations, contents, validation_schema

#### Define the entities and relations that will be used by the knowledge graph extractor

The entities and relations are stored in a specific folder called "ontology". Think of an ontology as organizing principles for the knowledge that is inside the Obsidian vault. It will be transmitted to the knowledge graph extractor. 

In [None]:
import glob

# process_obsidian_notes scans with os.walk, which does not expand wildcards:
# passing "**/ontology" verbatim walks a directory literally named "**" and
# silently returns empty results. Resolve the pattern to a real directory
# first (relative to the current working directory).
ontology_pattern = "**/ontology"
ontology_matches = sorted(
    path
    for path in glob.glob(ontology_pattern, recursive=True)
    if os.path.isdir(path)
)
if not ontology_matches:
    # Fail loudly: an empty ontology would otherwise configure the extractor
    # with no entities and no relations.
    raise FileNotFoundError(
        f"No directory matching {ontology_pattern!r} found under {os.getcwd()!r}"
    )
# NOTE(review): if several folders match, the first one in sorted order is used.
ontology_dir = ontology_matches[0]
entities, relations, contents, validation_schema = process_obsidian_notes(ontology_dir)

In [None]:
# Knowledge extractor that maps unstructured text onto the ontology collected
# from the Obsidian notes.
kg_extractor = SchemaLLMPathExtractor(
    llm=llm,
    max_triplets_per_chunk=20,
    strict=False,
    num_workers=4,
    # Allowed entity names: the ontology note titles. De-duplicated in
    # first-seen order, since the same title can occur in several folders.
    possible_entities=list(dict.fromkeys(entity["name"] for entity in entities)),
    # Allowed relation names: the front-matter keys. `relations` holds one
    # entry per (note, key) pair, so the same key usually appears many times.
    possible_relations=list(dict.fromkeys(relation["type"] for relation in relations)),
    kg_validation_schema=validation_schema,
    # NOTE(review): this passes full note bodies as entity property names --
    # confirm that is the intended use of `possible_entity_props`.
    possible_entity_props=[content["content"] for content in contents],
    possible_relation_props=None,
)

#### Construct property graph store and index

In [None]:
pg_store = SimpleGraphStore()
pg_storage_context = StorageContext.from_defaults(graph_store=pg_store)

In [None]:
property_graph_index = PropertyGraphIndex.from_documents(
    documents=[process_document(doc) for doc in data],
    llm=llm,
    storage_context=onto_storage_context,
    embed_kg_nodes=True,
    kg_extractors=[kg_extractor],
    show_progress=True,
    graph_store=onto_store 

)

In [None]:
# Give the index an explicit id and write it to the local "pg_store" directory.
property_graph_index.set_index_id("pg_index")
property_graph_index.storage_context.persist(persist_dir="pg_store")
# NOTE(review): second write to the same directory; confirm that the index
# object exposes `persist` in the installed LlamaIndex version and that this
# call is not redundant with the line above.
property_graph_index.persist(persist_dir="pg_store")