In [None]:
import os
import re
import faiss
import numpy as np
import PyPDF2
from sentence_transformers import SentenceTransformer
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import ollama

In [3]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as a single string.

    The file at ``pdf_path`` is opened in binary mode and read with
    ``PyPDF2.PdfReader``. Pages whose ``extract_text()`` result is falsy
    (``None`` or an empty string) are skipped; the remaining page texts are
    separated by one space and the combined string is stripped of leading
    and trailing whitespace.
    """
    with open(pdf_path, 'rb') as handle:
        page_texts = (page.extract_text()
                      for page in PyPDF2.PdfReader(handle).pages)
        # Join while the file is still open: the generator reads pages lazily.
        combined = ' '.join(piece for piece in page_texts if piece)
    return combined.strip()

In [4]:
def clean_text(text):
    """Clean and normalize the extracted text.

    Steps, in order:
      1. Remove every character that is not a word character, whitespace,
         or one of ``. , ; ! ?``.
      2. Collapse each run of whitespace into a single space.
      3. Lower-case the result and strip leading/trailing whitespace.

    Characters are removed *before* whitespace is collapsed so that the gap
    left by a removed symbol surrounded by spaces (e.g. ``"a - b"``) does
    not survive as a double space in the output.
    """
    text = re.sub(r'[^\w\s.,;!?]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.lower().strip()


In [5]:
def chunk_text(text, chunk_size=500):
    """Split the text into chunks of at most ``chunk_size`` words.

    Args:
        text: The text to split. Words are whatever ``str.split()`` yields,
            i.e. whitespace-separated tokens.
        chunk_size: Maximum number of words per chunk. Must be positive.

    Returns:
        A list of strings, each the space-joined words of one chunk. If the
        text contains no words, the single-element list
        ``["No meaningful text found."]`` is returned instead.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative. Without this
            check a negative size would silently produce the "no text"
            placeholder even for non-empty input.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer.")

    words = text.split()
    chunks = [' '.join(words[start:start + chunk_size])
              for start in range(0, len(words), chunk_size)]
    return chunks or ["No meaningful text found."]


In [6]:
def generate_embeddings(chunks):
    """Encode text chunks into vectors with a pre-trained sentence model.

    A ``SentenceTransformer`` is created for ``'all-MiniLM-L6-v2'`` on every
    call, ``chunks`` is passed to its ``encode`` method with NumPy output
    requested, and the result is returned as a ``float32`` NumPy array.
    """
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    return np.array(
        encoder.encode(chunks, convert_to_numpy=True),
        dtype=np.float32,
    )


In [7]:
def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index holding the given embeddings.

    ``embeddings`` is converted to a ``float32`` NumPy array. A 1-D input is
    treated as a single vector and reshaped to one row. The index dimension
    is taken from the second axis of the resulting matrix.

    Raises:
        ValueError: If ``embeddings`` is ``None`` or has length zero.
    """
    nothing_to_index = embeddings is None or len(embeddings) == 0
    if nothing_to_index:
        raise ValueError(
            "Embeddings are empty. Ensure text extraction is successful.")

    matrix = np.array(embeddings, dtype=np.float32)
    if matrix.ndim == 1:
        # A lone vector of shape (N,) becomes a one-row matrix (1, N).
        matrix = matrix.reshape(1, -1)

    # Exact (brute-force) search using L2 distance.
    flat_index = faiss.IndexFlatL2(matrix.shape[1])
    flat_index.add(matrix)
    return flat_index


In [8]:
def save_knowledge_base(index, chunks, index_path, chunks_path):
    """Save the FAISS index and text chunks to disk.

    Args:
        index: FAISS index, written to ``index_path`` via ``faiss.write_index``.
        chunks: Iterable of chunk strings, in the same order as the vectors
            in ``index``.
        index_path: Destination path for the index file.
        chunks_path: Destination path for the UTF-8 text file of chunks.

    The chunk file stores exactly one chunk per line, so line ``i`` lines up
    with vector ``i`` of the index. To keep that alignment, any line breaks
    inside a chunk are replaced by a single space before writing; a chunk
    with an embedded newline would otherwise occupy several lines and shift
    every chunk after it.
    """
    faiss.write_index(index, index_path)
    with open(chunks_path, 'w', encoding='utf-8') as f:
        for chunk in chunks:
            # str.splitlines() recognises the line boundaries a line-based
            # reader may split on; joining its pieces flattens all of them.
            f.write(' '.join(chunk.splitlines()) + '\n')

In [9]:

def load_knowledge_base(index_path, chunks_path):
    """Read a saved FAISS index and its chunk list back from disk.

    Returns:
        A ``(index, chunks)`` tuple: the object returned by
        ``faiss.read_index(index_path)`` and the lines of the UTF-8 file at
        ``chunks_path`` (line terminators removed).

    Raises:
        FileNotFoundError: If either path does not exist.
    """
    both_present = os.path.exists(index_path) and os.path.exists(chunks_path)
    if not both_present:
        raise FileNotFoundError("Knowledge base files are missing.")

    stored_index = faiss.read_index(index_path)
    with open(chunks_path, 'r', encoding='utf-8') as handle:
        stored_chunks = handle.read().splitlines()
    return stored_index, stored_chunks

In [10]:
def retrieve_relevant_chunks(query, index, chunks, top_k):
    """Retrieve the top-k most relevant chunks for a given query.

    Args:
        query: The query string to embed and search for.
        index: A FAISS index (anything with a ``search(vectors, k)`` method
            returning ``(distances, indices)``).
        chunks: List of chunk strings; position ``i`` must correspond to
            vector ``i`` in ``index``.
        top_k: Number of neighbours to request from the index.

    Returns:
        The chunks for the returned neighbour ids, in the order the index
        reported them. If no valid id comes back, the single-element list
        ``["No relevant data found."]`` is returned.
    """
    # NOTE: the model is instantiated on every call, as in the original.
    model = SentenceTransformer('all-MiniLM-L6-v2')
    # FAISS searches expect a 2-D float32 matrix of query vectors.
    query_embedding = np.asarray(
        model.encode([query], convert_to_numpy=True), dtype=np.float32)

    _, indices = index.search(query_embedding, top_k)

    # FAISS fills unused result slots with -1 when it finds fewer than
    # top_k neighbours. An upper-bound-only check would let -1 through and
    # Python's negative indexing would wrongly return the last chunk.
    relevant_chunks = [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]

    return relevant_chunks or ["No relevant data found."]

In [10]:
# # Step 9: Generate Response using a Pretrained Model

# def generate_response(query, context):
#     """Generate a response based on retrieved chunks using a text generator model."""

#     # Load the text generation model
#     generator = pipeline("text-generation", model="gpt2")

#     # Limit context length to avoid exceeding model's max length (GPT-2 has a 1024 token limit)
#     max_context_length = 800  # Keeping some space for query & generated output
#     truncated_context = context[:max_context_length]  # Truncate if needed

#     # Construct the prompt
#     prompt = f"Answer the following question based on the context:\n\nContext: {truncated_context}\n\nQuestion: {query}\nAnswer:"

#     # Generate response (fix: use max_new_tokens instead of max_length)
#     response = generator(prompt, max_new_tokens=500, num_return_sequences=1)[
#         0]['generated_text']

#     return response


In [23]:

def generate_text(prompt, model):
    """Stream a chat reply from Ollama and echo it to stdout as it arrives.

    ``prompt`` is sent to ``model`` as a single user message with streaming
    enabled. The message content of each streamed piece is printed
    immediately, with no separator or trailing newline. Nothing is returned.
    """
    conversation = [{"role": "user", "content": prompt}]
    reply_stream = ollama.chat(model=model, messages=conversation, stream=True)
    for piece in reply_stream:
        print(piece["message"]["content"], end="", flush=True)

In [12]:
# Paths used by this cell and by the knowledge-base save/load steps.
pdf_folder = "/"
index_path = "knowledge_base.index"
chunks_path = "knowledge_base.txt"

if not os.path.exists(pdf_folder):
    raise FileNotFoundError(f"PDF folder '{pdf_folder}' not found.")

# Index of the first page to read; everything before it is skipped.
# NOTE(review): presumably the first 32 pages are front matter — confirm.
start_page = 32

# NOTE(review): 'book.pdf' is resolved against the working directory, not
# against pdf_folder — confirm that is intended.
# The context manager closes the file even if text extraction raises.
with open('book.pdf', 'rb') as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    # Treat a missing extraction result as empty text so the join below
    # cannot fail on a non-string entry.
    text_data = [page.extract_text() or ""
                 for page in pdf_reader.pages[start_page:]]

full_text = " ".join(text_data)
if not full_text.strip():
    raise ValueError(
        "No text extracted from PDFs. Check if files contain selectable text.")

In [None]:
# Build the FAISS knowledge base from the text extracted in the previous cell.
# Normalise the raw text, split it into chunks of up to 500 words, and embed
# each chunk.
cleaned_text = clean_text(full_text)
chunks = chunk_text(cleaned_text, chunk_size=500)
embeddings = generate_embeddings(chunks)
print(
    f"Generated {len(embeddings)} embeddings with shape: {embeddings.shape}")

# Index the embeddings, persist index + chunks to disk, then immediately
# reload them so `index` and `chunks` hold exactly what was written to disk.
index = create_faiss_index(embeddings)
save_knowledge_base(index, chunks, index_path, chunks_path)
index, chunks = load_knowledge_base(index_path, chunks_path)

In [None]:
import chromadb

# On-disk Chroma store; the collection is reused if it already exists.
chroma_client = chromadb.PersistentClient(path="./chroma_db")
collection = chroma_client.get_or_create_collection(name="knowledge_base")

# Write the chunks in batches instead of one call per chunk, and use upsert
# so that re-running this cell against the persistent store refreshes the
# documents stored under ids that already exist.
# Ids are the chunk positions as strings, matching the original scheme.
batch_size = 500
for start in range(0, len(chunks), batch_size):
    batch = chunks[start:start + batch_size]
    collection.upsert(
        documents=batch,
        metadatas=[{"source": "Networking"} for _ in batch],
        ids=[str(start + offset) for offset in range(len(batch))],
    )

In [18]:
def retrieve_relevant_chunkss(query, collection, top_k):
    """Return the documents a collection ranks highest for ``query``.

    Runs ``collection.query`` with ``query`` as the only query text and
    ``top_k`` as the number of results, then returns the document list
    belonging to that single query text.
    """
    hits = collection.query(query_texts=[query], n_results=top_k)
    documents_per_query = hits["documents"]
    # Only one query text was submitted, so its results are at position 0.
    return documents_per_query[0]

In [26]:
# Interactive question-answering loop: retrieve context from the Chroma
# collection, show it, then stream the model's answer.
model = "llama3.2:1b"
while True:
    query = input("\nEnter your query (or type 'exit' to quit): ").strip()
    if query.lower() == "exit":
        break
    if not query:
        # Blank input carries no question; skip it rather than running a
        # retrieval and a model call on an empty string.
        continue

    relevant_chunks = retrieve_relevant_chunkss(query, collection, top_k=10)

    print("\n🔍 Relevant Chunks:")
    for i, chunk in enumerate(relevant_chunks):
        print(f"{i + 1}. {chunk}\n")

    # All retrieved chunks are concatenated into a single context string.
    context = " ".join(relevant_chunks)
    model_query = f"Briefly explain the {query} with the following context from textbook: {context}"
    generate_text(model_query, model)


🔍 Relevant Chunks:
1. the challenges of agility and quality that comes with cloud scale, his team has developed and embraced custom hardware, machine learning, and open source. albert moved to microsoft in 2007 to innovate on cloud and bring networking to the host network virtualization, ideas that appeared, among many, in his vl2 paper, and which underly cloud networking today. prior to joining microsoft, albert worked at bell labs and att labs as an att fellow. he helped build the systems and tools that run atts networks, and pioneered the architecture and systems at the foundations of softwaredefined networking. he holds an ab in mathematics from dartmouth college and a phd in computer science from the university of washington. albert is a member of the national academy of engineering, and an acm fellow. he has received the ieee koji kobayashi computer and communication award, acm sigcomm award, and acm sigcomm and sigmetrics test of time paper awards. albert and wife kathryn are p