In [None]:
import pdfplumber
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import logging
from transformers import pipeline
import os
import warnings

# Set up logging and environment
# Module-level configuration: everything below runs once, at import time.
# NOTE(review): TF_CPP_MIN_LOG_LEVEL is presumably meant to silence
# TensorFlow's native log output. It is assigned after the imports above, so
# it may come too late if one of them already loaded TensorFlow -- confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# Root logger: only ERROR and above, formatted as "time-level-message".
logging.basicConfig(level=logging.ERROR, format='%(asctime)s-%(levelname)s-%(message)s')
# Raise the thresholds of the two library loggers to ERROR as well.
logging.getLogger("transformers").setLevel(logging.ERROR)
logging.getLogger("sentence_transformers").setLevel(logging.ERROR)

# Initialize models
# Both objects are module-level globals: `embedding_model` is used by
# create_embeddings() and main(); `llm` is used by main() to generate text.
# NOTE(review): constructing them here presumably loads (and on first use
# downloads) the model weights at import time -- confirm this is acceptable.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
llm = pipeline("text-generation", model='gpt2')

def extract_all_pages(pdf_path):
    """Extract the text of every page of a PDF.

    Args:
        pdf_path: Location of the PDF file, handed to ``pdfplumber.open``.

    Returns:
        A dict mapping 1-based page numbers to whatever
        ``page.extract_text()`` returned for that page. Callers should be
        prepared for a falsy value on pages without extractable text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # enumerate(start=1) yields the human-friendly page number directly,
        # avoiding the manual range(len(...)) indexing.
        return {
            page_number: page.extract_text()
            for page_number, page in enumerate(pdf.pages, start=1)
        }

def chunk_text(text, chunk_size=200):
    """Split *text* into consecutive, non-overlapping character chunks.

    Args:
        text: The string to split. ``None`` or an empty string yields an
            empty list, so pages without extractable text are harmless.
        chunk_size: Maximum number of characters per chunk. Every chunk has
            exactly this length except possibly the last one.

    Returns:
        A list of substrings that, concatenated in order, reproduce *text*.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative. A negative step
            would otherwise produce an empty list and silently drop the text.
    """
    if chunk_size <= 0:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )
    if not text:
        return []
    return [
        text[start:start + chunk_size]
        for start in range(0, len(text), chunk_size)
    ]

def create_embeddings(chunks):
    """Encode text chunks with the module-level ``embedding_model``.

    Args:
        chunks: The texts to embed, passed unchanged to
            ``embedding_model.encode``.

    Returns:
        Whatever ``embedding_model.encode`` returns for ``chunks``.
    """
    vectors = embedding_model.encode(chunks)
    return vectors

def _build_index(pdf_files):
    """Extract, chunk and embed every PDF and load the vectors into FAISS.

    Returns:
        An ``(index, chunks)`` pair in which ``chunks[i]`` is the text whose
        embedding sits at position ``i`` of ``index``. Returns
        ``(None, [])`` when no page produced any text.
    """
    all_chunks = []
    all_embeddings = []

    for pdf_path in pdf_files:
        for content in extract_all_pages(pdf_path).values():
            if not content:
                # Page without extractable text.
                continue
            chunks = chunk_text(content)
            all_chunks.extend(chunks)
            all_embeddings.append(create_embeddings(chunks))

    if not all_embeddings:
        # np.vstack([]) raises ValueError; report "nothing to index" instead.
        return None, []

    matrix = np.vstack(all_embeddings)
    index = faiss.IndexFlatL2(matrix.shape[1])
    index.add(matrix)
    return index, all_chunks


def _search(index, all_chunks, query, k=5):
    """Return up to *k* ``(chunk, distance)`` pairs closest to *query*."""
    # Asking FAISS for more neighbours than there are vectors pads the result
    # with label -1, and all_chunks[-1] would then report the last chunk as a
    # bogus hit. Cap k and drop any remaining -1 labels.
    top_k = min(k, len(all_chunks))
    query_embedding = embedding_model.encode([query])
    distances, indices = index.search(query_embedding, k=top_k)
    return [
        (all_chunks[i], distance)
        for i, distance in zip(indices[0], distances[0])
        if i >= 0
    ]


def main(pdf_files=None):
    """Run the interactive question-answering loop over a set of PDFs.

    The PDFs are indexed once, then queries are read from standard input
    until the user types ``exit``. For every query the closest chunks are
    printed, and GPT-2 generates a response from the best match.

    Args:
        pdf_files: Optional list of PDF paths to index. When omitted, the
            original hard-coded sample document is used.
    """
    if pdf_files is None:
        # NOTE(review): this is a raw string AND the backslashes are doubled,
        # so the path contains double separators. Windows tolerates repeated
        # separators, but single backslashes were presumably intended.
        pdf_files = [r"C:\\Users\\Bhavani\\OneDrive\\Desktop\\durga\\RAG pipeline task.pdf"]

    index, all_chunks = _build_index(pdf_files)
    if index is None:
        print("No extractable text found in the provided PDF files.")
        return

    while True:
        user_input = input("Enter your query (or 'exit' to quit): ").strip()
        if user_input.lower() == 'exit':
            print("Exiting the program.")
            break
        if not user_input:
            # Nothing to search for; prompt again.
            continue

        results = _search(index, all_chunks, user_input)
        if not results:
            print("No results found for your query.")
            continue

        print("\nResults found:")
        for chunk, distance in results:
            print(f"\nChunk (Distance: {distance:.4f}):\n{chunk}\n")

        # Use GPT-2 model to generate a response based on the top result.
        # max_new_tokens bounds only the generated continuation, whereas
        # max_length also counts the prompt tokens, so a token-dense chunk
        # could exhaust or exceed the budget before anything is generated.
        prompt = f"Based on the following information: {results[0][0]}, generate a response."
        response = llm(prompt, max_new_tokens=100)
        print("\nGenerated Response:\n", response[0]['generated_text'].strip())

# Start the interactive pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
