In [None]:
import os
import json
import numpy as np
import faiss
import PyPDF2
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Configuration: the sentence-embedding model and the PDF files to index.
model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")
pdf_files = [f"pdf_{number}.pdf" for number in (2, 4, 5, 6, 7)]

# Extract paragraphs from PDF
def extract_paragraphs_from_pdf(pdf_path):
    """Collect the non-blank text lines of every page in a PDF.

    The text extracted from each page is split on newlines; every line is
    stripped and blank lines are dropped. Pages that yield no text are
    skipped.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of stripped, non-empty lines in page order.
    """
    collected = []
    with open(pdf_path, "rb") as pdf_file:
        for page in PyPDF2.PdfReader(pdf_file).pages:
            page_text = page.extract_text()
            if not page_text:
                continue
            for raw_line in page_text.split("\n"):
                stripped = raw_line.strip()
                if stripped:
                    collected.append(stripped)
    return collected

# Accumulators: every extracted paragraph and, at the same position,
# the PDF file it came from.
all_paragraphs = []
paragraph_sources = []

# Walk the configured PDFs, reporting (and skipping) any that are missing.
for pdf in tqdm(pdf_files, desc="üìÑ Reading PDFs"):
    if os.path.exists(pdf):
        paragraphs = extract_paragraphs_from_pdf(pdf)
        # One source entry per paragraph keeps the two lists the same length.
        all_paragraphs += paragraphs
        paragraph_sources += [pdf for _ in paragraphs]
    else:
        print(f"‚ö†Ô∏è File not found: {pdf}")
# Encode and build indices
def build_faiss_index(data, filename_prefix):
    """Embed ``data`` and persist a flat L2 FAISS index plus the embeddings.

    Two files are written to the working directory:
    ``{filename_prefix}.index`` (the FAISS index) and
    ``{filename_prefix}_embeddings.npy`` (the embedding matrix).

    Args:
        data: Sequence of texts to encode with the module-level ``model``.
        filename_prefix: Prefix used for both output file names and for the
            progress messages.

    Returns:
        The float32 embedding matrix that was added to the index.

    Raises:
        ValueError: If ``data`` is empty. Without this check the failure
            would surface later as an IndexError on ``embeddings.shape[1]``.
    """
    # len() rather than truthiness so array-like inputs are handled too.
    if len(data) == 0:
        raise ValueError(
            f"No data to index for '{filename_prefix}': nothing was extracted."
        )

    print(f"üî¢ Encoding {filename_prefix} data...")
    embeddings = model.encode(data, batch_size=32, show_progress_bar=True)
    embeddings = np.array(embeddings).astype("float32")

    print(f"üíæ Saving {filename_prefix} index...")
    # The index dimensionality is the width of the embedding matrix.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    faiss.write_index(index, f"{filename_prefix}.index")
    np.save(f"{filename_prefix}_embeddings.npy", embeddings)

    return embeddings

# Build the paragraph index from all extracted paragraphs.
paragraph_embeddings = build_faiss_index(all_paragraphs, "paragraph")

# Persist the paragraph texts and their source file names as JSON lookups.
with open("all_paragraphs.json", "w") as f:
    f.write(json.dumps(all_paragraphs))
with open("paragraph_sources.json", "w") as f:
    f.write(json.dumps(paragraph_sources))

print("‚úÖ All databases built and saved.")

In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import json
import ipywidgets as widgets
from IPython.display import display, Markdown
import pdfplumber
import re
import logging






"""
Sentance Searcher


"""

# Only let ERROR-and-above records through on the "pdfminer" logger.
# NOTE(review): presumably this quiets warnings emitted while pdfplumber
# parses PDFs — confirm.
logging.getLogger("pdfminer").setLevel(logging.ERROR)

def search_sentence_in_pdf(pdf_path, sentence, context_size=11):
    """Find ``sentence`` in a PDF and return it with the sentences after it.

    Each page's text is whitespace-normalized and split into sentences at
    ``.``, ``!`` or ``?`` followed by whitespace. The first sentence that
    contains the query is returned together with the sentences that follow
    it on the same page. Matching is done per split sentence, so a query
    that itself spans a sentence boundary is not found.

    Args:
        pdf_path: Path of the PDF to search.
        sentence: Text to look for. Runs of whitespace are collapsed before
            matching, mirroring the normalization applied to the page text.
        context_size: Maximum number of sentences to return, counting the
            matching sentence itself. Defaults to 11 (the match plus up to
            10 following sentences).

    Returns:
        The matching sentence and its following context joined by single
        spaces, or ``" "`` when the query is blank or nothing matches.
    """
    # Page text has every whitespace run collapsed to one space, so a query
    # containing newlines or double spaces could never match un-normalized.
    needle = ' '.join(sentence.split())
    if not needle:
        # An empty string is "in" every sentence; treat it as no match.
        return " "

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                continue

            # Normalize whitespace
            clean_text = ' '.join(text.split())

            # Split into sentences using punctuation
            sentence_list = re.split(r'(?<=[.!?])\s+', clean_text)

            for idx, sent in enumerate(sentence_list):
                if needle in sent:
                    # Slicing clamps at the end of the list on its own.
                    return ' '.join(sentence_list[idx:idx + context_size])
    return " "

def search_in_multiple_pdfs(pdf_list, sentence):
    """Search one or several PDFs for ``sentence``.

    Args:
        pdf_list: Either a single PDF (a path string, a path-like object or
            an open file object) or an iterable of such PDFs.
        sentence: Text to look for in each PDF.

    Returns:
        A dict mapping each given PDF to the result of
        ``search_sentence_in_pdf`` for it. A single PDF yields a one-entry
        dict keyed by the value that was passed in.
    """
    # Strings, path-likes and file objects are one PDF, even though strings
    # and file objects are themselves iterable.
    is_single_pdf = (
        isinstance(pdf_list, (str, bytes))
        or hasattr(pdf_list, "__fspath__")
        or hasattr(pdf_list, "read")
    )
    pdf_paths = [pdf_list] if is_single_pdf else list(pdf_list)
    return {
        pdf_path: search_sentence_in_pdf(pdf_path, sentence)
        for pdf_path in pdf_paths
    }


pdf_files = ["pdf_2.pdf", "pdf_4.pdf", "pdf_5.pdf", "pdf_6.pdf", "pdf_7.pdf"]

"""
Model Searcher
"""

# Load model
model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")

# Load paragraph DB: the FAISS index plus the JSON lists that map an index
# position to its paragraph text and source PDF.
paragraph_index = faiss.read_index("paragraph.index")
# Context managers close the JSON files as soon as they have been parsed.
with open("all_paragraphs.json") as f:
    paragraph_texts = json.load(f)
with open("paragraph_sources.json") as f:
    paragraph_sources = json.load(f)

# Shared widget that display_results renders into.
output = widgets.Output()

# Search function
def search_faiss(query, k=10):
    """Return up to ``k`` indexed paragraphs closest to ``query``.

    The query is embedded with the module-level ``model`` and looked up in
    ``paragraph_index``.

    Args:
        query: Free-text query to embed and search for.
        k: Number of neighbours to request from the index.

    Returns:
        A list of ``(paragraph_text, source_pdf, distance)`` tuples in the
        order FAISS returns them. It holds fewer than ``k`` entries when the
        index contains fewer than ``k`` vectors.
    """
    query_vec = model.encode([query]).astype("float32")
    distances, indices = paragraph_index.search(query_vec, k)

    # FAISS fills missing neighbours with label -1; used as a list index
    # that would silently pick the last paragraph, so skip those slots.
    return [
        (paragraph_texts[idx], paragraph_sources[idx], dist)
        for idx, dist in zip(indices[0], distances[0])
        if idx >= 0
    ]

# Display function
def display_results(results, query_title, max_matches=5):
    """Render the best matches for one query into the shared ``output`` widget.

    For every result the paragraph text is looked up again in its source PDF
    to obtain the surrounding context. Results whose lookup comes back blank
    are skipped and do not count towards the limit.

    Args:
        results: Iterable of ``(text, source, distance)`` tuples, as returned
            by ``search_faiss``.
        query_title: Query string shown in the section heading.
        max_matches: Maximum number of matches to display. Defaults to 5.
    """
    with output:
        display(Markdown(f"## üîç Query: *{query_title}*"))
        shown = 0
        for text, source, dist in results:
            if shown >= max_matches:
                # Enough matches rendered; no need to look at the rest.
                break
            context = search_in_multiple_pdfs(source, text)[source]
            if not context.strip():
                continue
            shown += 1
            display(Markdown(f"### Match #{shown}"))
            display(Markdown(f"**From PDF**: `{source}`"))
            display(Markdown(f"**Distance**: `{dist:.4f}`"))
            display(Markdown(f"**Text**:\n\n{context}"))
            display(Markdown("---"))
# Wrap the shared output widget in a bordered, fixed-height box that scrolls
# when its content overflows, then show it.
scrollable_output = widgets.Box(
    children=[output],
    layout=widgets.Layout(
        display='flex',
        flex_flow='column',
        height='600px',
        width='100%',
        border='1px solid gray',
        overflow='auto',
    ),
)
display(scrollable_output)
# NOTE(review): results_array is not read anywhere in the visible cells —
# confirm whether a later cell relies on it.
results_array=[]
# Natural-language questions to run against the paragraph index.
queries = [
    "What features does MATLAB offer to help shorten response times and reduce data transmission over the network?",
    "How did Baker Hughes engineers use MATLAB to develop pump health monitoring software?",
    "Why is it important for training data in predictive maintenance systems to include instances from both normal and fault conditions?",
    "What is the recall performance of the proposed ENBANN method in comparison to other methods?",
    "What is cross-sectional prediction and how can it be applied in estimating component lifespan?",
    "Why are gas leak detectors important in environments with many pneumatic valves, and what type of detectors are considered non-intrusive?",
    "What new Industry 4.0 technologies are being used for remote asset monitoring, and what tools support them?",
    "What does the simulation model of the SUDM policy evaluate, and what assumptions are made about workstation operations?",
    "How were the prior parameters for the Weibull and exponential degradation models estimated, and what assumptions were made about the error terms?",
    "How does fuzzy logic contribute to diagnostics in machine failure and maintenance management?",
    "Why are artificial neural networks suitable for prognostics in machine failure, and what limitations do traditional systems face?",
    "How do Big Data platforms and CMMS contribute to the formulation of maintenance strategies?",
    "What is the relationship between diagnostics and prognostics in the context of machine degradation and failure?"
]

# Run every query through the index and render its matches into the
# output widget, one section per query.
for query in queries:
    results = search_faiss(query)
    display_results(results, query)
