In [24]:
import os

import faiss
import ollama
import PyPDF2
import torch
from transformers import AutoTokenizer, AutoModel

# Step 1: Extract text from PDFs
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The per-page text from ``page.extract_text()``, concatenated in page
        order with no separator between pages.
    """
    with open(pdf_path, "rb") as pdf_stream:
        pdf_reader = PyPDF2.PdfReader(pdf_stream)
        # Extraction happens while the file is still open.
        return "".join(page.extract_text() for page in pdf_reader.pages)

# Collect the path of every '.pdf' entry found in the 'docs' directory
pdf_dir = "docs"
pdf_files = [
    os.path.join(pdf_dir, entry)
    for entry in os.listdir(pdf_dir)
    if entry.endswith(".pdf")
]

# One extracted-text string per PDF, in the same order as pdf_files
documents = list(map(extract_text_from_pdf, pdf_files))

# Step 2: Generate embeddings for documents
# The tokenizer and the encoder are both loaded from the same pre-trained checkpoint
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"  # Change if needed
tokenizer, model = (
    AutoTokenizer.from_pretrained(embedding_model_name),
    AutoModel.from_pretrained(embedding_model_name),
)

def embed_text(text):
    """Embed text with the module-level ``tokenizer`` and ``model``.

    Token embeddings from ``last_hidden_state`` are averaged over the sequence
    axis, weighting each position by the tokenizer's attention mask so that
    padding positions do not dilute the result when several texts are embedded
    in one call. When nothing is padded (e.g. a single string) this is the
    plain mean over tokens.

    Args:
        text: A string, or a list of strings to embed as one batch.

    Returns:
        A NumPy array with one row per input text (a single string yields
        one row).

    Note:
        ``truncation=True`` is passed to the tokenizer, so tokens beyond its
        maximum length are dropped and do not contribute to the embedding.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Inference only: do not record operations for autograd.
    with torch.no_grad():
        outputs = model(**inputs)
    token_embeddings = outputs.last_hidden_state
    # Broadcast the mask over the embedding axis: 1 for real tokens, 0 for padding.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    # Clamp guards against division by zero for a fully masked row.
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return (summed / counts).detach().numpy()

# Generate embeddings for each document
embeddings = [embed_text(doc) for doc in documents]

# Fail early with a clear message: without this guard an empty corpus
# surfaces as an IndexError on embeddings[0] below.
if not embeddings:
    raise ValueError("No documents to index: no PDF text was loaded.")

# Step 3: Create a Faiss index
dimension = embeddings[0].shape[1]  # Embedding dimension
index = faiss.IndexFlatL2(dimension)

# Add document embeddings to the Faiss index; insertion order matches
# `documents`, so a Faiss id can be used directly as an index into that list.
for emb in embeddings:
    index.add(emb)

# Step 4: Use DeepSeek model in Ollama
def query_with_ollama(question, retrieved_text):
    """Ask the DeepSeek model served by Ollama to answer from retrieved context.

    Args:
        question: The question to answer.
        retrieved_text: Context text placed ahead of the question in the prompt.

    Returns:
        The value stored under ``"response"`` in the reply from
        ``client.generate``, or ``""`` if that key is absent.
    """
    client = ollama.Client()
    response = client.generate(
        model="deepseek-r1:7b",  # Change if using a different model
        system="You are a helpful assistant answering questions using retrieved context.",
        prompt=f"Context: {retrieved_text}\n\nQuestion: {question}\nAnswer:"
    )

    # Read the text from this call's reply rather than from any notebook-level
    # variable, so the function works on a fresh kernel.
    return response.get("response", "")

# Step 5: Query the pipeline
def query_pipeline(question, top_k=1):
    """Answer ``question`` using the most similar indexed document(s).

    Args:
        question: The question to answer.
        top_k: Number of nearest documents to request from the Faiss index.

    Returns:
        Whatever ``query_with_ollama`` returns for the question and the
        retrieved context.
    """
    # Convert question to an embedding
    question_embedding = embed_text(question)

    # Search the Faiss index
    distances, indices = index.search(question_embedding, top_k)

    # Faiss fills missing neighbours with -1 when the index holds fewer than
    # top_k vectors; skip those so documents[-1] is not picked up by accident.
    hits = [i for i in indices[0] if i >= 0]

    # Retrieve the most relevant document(s)
    retrieved_text = "\n".join(documents[i] for i in hits)

    # Generate an answer using DeepSeek in Ollama
    return query_with_ollama(question, retrieved_text)

# Example usage
if __name__ == "__main__":
    # Run one sample question through the whole pipeline and show the reply
    question = "What is the main topic of the documents?"
    answer = query_pipeline(question)
    print(f"Answer: {answer}")

Answer: <think>
Okay, so I need to figure out what the main topic of the given documents is. Let me start by reading through the context carefully.

The context begins by introducing Desaiv, an insurtech company focused on health insurance management using AI-powered solutions. It was founded in 2022 and operates mainly in the MENA region. They secured a pre-seed funding of $2 million from several investors including 500 Global, Terra VC, Oqal angel investors, and others from the MENA, UK, and USA.

The mission is to revolutionize the insurance sector with AI, achieving over 95% prediction accuracy using data from top-tier universities like Oxford. The platform offers various features such as effortless data management, advanced OCR technology, AI-powered analytics, future cost predictions, and customization options. It also lists specific features like Claims Experience Analytics, Quotation Comparison, Request New Quotation, Provider Comparison, and an Underwriter Tool.

The required 

In [30]:
# Display the value bound to `answer` by the example-usage cell
answer

"<think>\nOkay, so I need to figure out what the main topic of the given documents is. Let me start by reading through the context carefully.\n\nThe context begins by introducing Desaiv, an insurtech company focused on health insurance management using AI-powered solutions. It was founded in 2022 and operates mainly in the MENA region. They secured a pre-seed funding of $2 million from several investors including 500 Global, Terra VC, Oqal angel investors, and others from the MENA, UK, and USA.\n\nThe mission is to revolutionize the insurance sector with AI, achieving over 95% prediction accuracy using data from top-tier universities like Oxford. The platform offers various features such as effortless data management, advanced OCR technology, AI-powered analytics, future cost predictions, and customization options. It also lists specific features like Claims Experience Analytics, Quotation Comparison, Request New Quotation, Provider Comparison, and an Underwriter Tool.\n\nThe required 

In [27]:
# Number of embeddings produced (one per entry in `documents`)
len(embeddings)

2

In [26]:
# Shape of the first document's embedding array
embeddings[0].shape

(1, 384)