In [9]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.schema import MetadataMode
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import MetadataReplacementPostProcessor
from typing import List

# Load and index documents
documents = SimpleDirectoryReader('data').load_data()
parser = SimpleNodeParser.from_defaults()
nodes = parser.get_nodes_from_documents(documents)

# Assume each node has a 'topic', 'position', 'file_name', and 'file_path' in its metadata
for i, node in enumerate(nodes):
    node.metadata['position'] = i
    node.metadata['file_name'] = f"file_{i}.txt"  # Example file name for each node
    node.metadata['file_path'] = f"/path/to/file_{i}.txt"  # Example file path
    node.excluded_llm_metadata_keys = ['file_path']  # Exclude file_path from LLM metadata

index = VectorStoreIndex(nodes)

def get_topic_position(query: str) -> int:
    # This function should identify the topic and return its position
    # For simplicity, we're using a basic retrieval method here
    retriever = VectorIndexRetriever(index=index, similarity_top_k=1)
    retrieved_nodes = retriever.retrieve(query)
    return retrieved_nodes[0].metadata['position']

def filter_nodes(nodes: List, position: int) -> List:
    return [node for node in nodes if node.metadata['position'] <= position]

def group_nodes(nodes: List) -> dict:
    grouped = {}
    for node in nodes:
        topic = node.metadata.get('topic', 'Unknown')
        if topic not in grouped:
            grouped[topic] = []
        grouped[topic].append(node)
    return grouped

def chatbot():
    # Get query from user input
    query = input("Enter your query: ")

    # Get the position of the topic from the query
    topic_position = get_topic_position(query)

    # Filter the nodes before the topic position and the current one
    filtered_nodes = filter_nodes(index.docstore.docs.values(), topic_position)

    # Group the nodes separately (apply any specific grouping logic here)
    grouped_nodes = group_nodes(filtered_nodes)

    # Create a new index with the filtered and grouped nodes
    new_index = VectorStoreIndex(list(filtered_nodes))
    
    # Create a retriever and postprocessor
    retriever = VectorIndexRetriever(index=new_index, similarity_top_k=2)
    postprocessor = MetadataReplacementPostProcessor(target_metadata_key="topic")
    
    # Create a query engine with the retriever and postprocessor
    query_engine = RetrieverQueryEngine(retriever, node_postprocessors=[postprocessor])

    # Query the engine and get the response
    response = query_engine.query(query)

    # Extract the first source node
    source_node = response.source_nodes[0].node  # Get the first source node
    node_content = source_node.get_content()  # Get the node content (text)

    # Assuming the metadata has 'file_name', 'position', and 'file_path' fields
    file_name = source_node.metadata.get('file_name', 'Unknown File')
    node_position = source_node.metadata.get('position', 'Unknown Position')

    # Retrieve the excluded metadata (file_path in this case)
    file_path = source_node.metadata.get('file_path', 'Unknown Path') if 'file_path' in source_node.excluded_llm_metadata_keys else 'No File Path'

    # Print only the response and the relevant metadata (file name, node position, and file path)
    print(f"Answer: {response}")
    print(f"File Name: {file_name}")
    print(f"Node Number: {node_position}")
    print(f"File Path: {file_path}")

# Run the chatbot
chatbot()


ValueError: No files found in data.

In [2]:
nodes[18]

TextNode(id_='e023f4b4-4dc9-4afb-ad03-ec5468e54d3e', embedding=None, metadata={'page_label': '7', 'file_name': 'file_18.txt', 'file_path': '/path/to/file_18.txt', 'file_type': 'application/pdf', 'file_size': 105401, 'creation_date': '2024-08-27', 'last_modified_date': '2024-08-27', 'position': 18}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_path'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='400635f0-7808-4d15-9e55-e490dcc1a36a', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '7', 'file_name': 'IME_Podcast_014_Valentine_s_Day-PDF.pdf', 'file_path': 'c:\\Users\\aashm\\OneDrive\\Desktop\\App Project\\data\\IME_Podcast_014_Valentine_s_Day-PDF.pdf', 'file_type': 'application/pdf', 'file_size': 105401, 'creation_date': '2024-08-27', 'last_modified_date': '2024-08-27'}, hash='c392b0fdbcd7c827c984e75999519637c7a1854aa3d1f67470b81b

In [None]:
from docx import Document
from llama_index.core import Document as LLamaDocument

# Function to load .docx content
def load_docx(file_path):
    doc = Document('data\\transcribed_data')
    text = []
    for paragraph in doc.paragraphs:
        text.append(paragraph.text)
    return "\n".join(text)

# Loading .docx file into LlamaIndex
def load_docx_into_llama(file_path):
    content = load_docx('data\\transcribed_data')
    llama_doc = LLamaDocument(content)
    return llama_doc

# Example usage
docx_file = "path_to_your_file.docx"  # Replace with your actual file path
llama_doc = load_docx_into_llama(docx_file)

# Assuming we have GPTSimpleVectorIndex available
from llama_index import GPTSimpleVectorIndex
index = GPTSimpleVectorIndex([llama_doc])

# Example query
response = index.query("Your query here")
print(response)
