In [3]:
import os

import docx
import fitz  # PyMuPDF
import nltk
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from rich import box
from rich.console import Console
from rich.markup import escape
from rich.table import Table


In [4]:

# Initialize Elasticsearch client.
# The node URL can be overridden through the ELASTICSEARCH_URL environment
# variable, so the notebook does not have to be edited per setup; without it
# the original local default is used.
es_url = os.environ.get("ELASTICSEARCH_URL", "http://localhost:9200")
es = Elasticsearch(hosts=[es_url])

# Check if the client is connected; fail early instead of in a later cell
if not es.ping():
    raise ValueError("Could not connect to Elasticsearch")


ValueError: Could not connect to Elasticsearch

In [None]:

# Read PDF documents
def read_pdf(file_path):
    """Return the concatenated text of every page of a PDF file.

    Args:
        file_path: Path of the PDF to open with PyMuPDF.

    Returns:
        A single string made of each page's ``get_text()`` output, in page
        order, with nothing inserted between pages.
    """
    # Open the document as a context manager so it is closed even when text
    # extraction of a page raises.
    with fitz.open(file_path) as doc:
        return "".join(page.get_text() for page in doc)

# Read DOCX documents
def read_docx(file_path):
    """Return the text of a DOCX file, one paragraph per line.

    Args:
        file_path: Path handed to ``docx.Document``.

    Returns:
        The ``text`` of every paragraph in the document, joined with
        newline characters.
    """
    paragraph_texts = []
    for para in docx.Document(file_path).paragraphs:
        paragraph_texts.append(para.text)
    return "\n".join(paragraph_texts)

# Read documents from a folder
def read_documents_from_folder(folder_path):
    """Load every supported document found directly inside a folder.

    Supported files are recognised by extension, ignoring case: ``.txt``
    (read as UTF-8), ``.pdf`` (via ``read_pdf``) and ``.docx`` (via
    ``read_docx``). Anything else, including sub-directories, is skipped.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A list of ``{"title": <file name>, "content": <text>}`` dicts,
        ordered by file name.
    """
    documents = []
    # os.listdir order is arbitrary; sort so repeated runs give the same list.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        # A directory called e.g. "notes.txt" must not be opened as a file.
        if not os.path.isfile(file_path):
            continue

        # Compare in lower case so "REPORT.PDF" is treated like "report.pdf".
        lowered = filename.lower()
        if lowered.endswith(".txt"):
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
        elif lowered.endswith(".pdf"):
            content = read_pdf(file_path)
        elif lowered.endswith(".docx"):
            content = read_docx(file_path)
        else:
            continue

        documents.append({
            "title": filename,
            "content": content
        })
    return documents


In [None]:

# Relative path of the folder that holds the files to index
folder_path = 'Documents'
# Load each supported file as a {"title": file name, "content": text} dict
documents = read_documents_from_folder(folder_path)

# Index documents into Elasticsearch
def index_documents(es, index_name, documents):
    """Bulk-index documents into an Elasticsearch index.

    Args:
        es: Elasticsearch client used for the bulk request and the refresh.
        index_name: Name of the target index.
        documents: Iterable of dicts; each one becomes the ``_source`` of
            one indexed document.

    Notes:
        A document with a non-empty ``"title"`` is indexed under that title
        as its ``_id``, so running this again overwrites the earlier copy
        instead of adding a duplicate. Documents without a title keep
        Elasticsearch's auto-generated ids.
    """
    def _actions():
        for doc in documents:
            action = {
                "_index": index_name,
                "_source": doc
            }
            title = doc.get("title") if isinstance(doc, dict) else None
            if title:
                # Stable id makes re-indexing the same file idempotent.
                action["_id"] = title
            yield action

    bulk(es, _actions())
    # Make the new documents searchable right away; otherwise a search issued
    # immediately after indexing can miss them.
    es.indices.refresh(index=index_name)

# Name of the Elasticsearch index that receives the documents; the search cell reuses it
index_name = 'documents'
# Send the loaded documents to Elasticsearch in one bulk request
index_documents(es, index_name, documents)


In [None]:

# Download NLTK data
# 'punkt' is what older NLTK releases load for word_tokenize; newer releases
# (3.9+) read the tokenizer tables from 'punkt_tab' instead, so fetch both.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Process user query
def process_query(query):
    """Strip English stop words from a free-text query.

    Args:
        query: Raw query string typed by the user.

    Returns:
        The remaining tokens, in their original order and casing, joined by
        single spaces. Stop-word comparison is done on the lower-cased token.
    """
    english_stop_words = set(stopwords.words('english'))
    kept_tokens = []
    for token in word_tokenize(query):
        if token.lower() in english_stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Search in Elasticsearch
def search_documents(es, index_name, query):
    """Run a ``match`` query on the ``content`` field of an index.

    Args:
        es: Elasticsearch client whose ``search`` method is called.
        index_name: Index to search.
        query: Text to match against the ``content`` field.

    Returns:
        The list found under ``['hits']['hits']`` in the search response.
    """
    match_clause = {"content": query}
    request_body = {"query": {"match": match_clause}}
    response = es.search(index=index_name, body=request_body)
    hits_section = response['hits']
    return hits_section['hits']


In [None]:

# Create a console object for rich
console = Console()

# User input
user_query = "Find the document with content about document 1"
# Drop stop words before the text is sent to Elasticsearch
processed_query = process_query(user_query)

# Execute search
search_results = search_documents(es, index_name, processed_query)

# Display results
console.print("\n[bold magenta]🔍 Search Results:[/bold magenta]")
results_table = Table(box=box.ROUNDED, show_header=True, header_style="bold blue")
results_table.add_column("Title", style="dim", width=20)
results_table.add_column("Content", style="cyan", width=50)

for result in search_results:
    # Table cells given as plain strings are parsed as rich console markup.
    # File names and file text are arbitrary, so escape them; otherwise
    # bracketed text can be swallowed as a style tag or raise MarkupError.
    results_table.add_row(
        escape(result['_source']['title']),
        escape(result['_source']['content']),
    )

console.print(results_table)
