In [1]:
import PyPDF2
import re
import os

def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        num_pages = len(reader.pages)

        full_text = ""
        for page_num in range(num_pages):
            page = reader.pages[page_num]
            text = page.extract_text()
            text = re.sub(r'\s+', ' ', text)  
            text = text.strip()

            full_text += text + "\n\n"  

    return full_text

def save_text_to_file(text, output_path):

    cleaned_text = text.encode('utf-8', 'ignore').decode('utf-8')
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(cleaned_text)

def process_textbooks(input_folder, output_folder):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    combined_text = ""

    for filename in os.listdir(input_folder):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(input_folder, filename)
            print(f"Processing textbook: {filename}")

            extracted_text = extract_text_from_pdf(pdf_path)

            combined_text += f"--- START OF {filename} ---\n\n"
            combined_text += extracted_text
            combined_text += f"\n\n--- END OF {filename} ---\n\n"

    output_path = os.path.join(output_folder, "combined_textbooks.txt")
    save_text_to_file(combined_text, output_path)
    print(f"Combined text from all textbooks saved to {output_path}")

    print("All textbooks processed and combined.")


input_folder = r"C:/Users/jaykh/OneDrive/Desktop/Assessment/Books"
output_folder = r"C:/Users/jaykh/OneDrive/Desktop/Assessment/processed_textbooks"

process_textbooks(input_folder, output_folder)

Processing textbook: Python-for-Data-Analysis.pdf
Processing textbook: Python_Datascience.pdf
Processing textbook: think-python-2nd.pdf
Combined text from all textbooks saved to C:/Users/jaykh/OneDrive/Desktop/Assessment/processed_textbooks\combined_textbooks.txt
All textbooks processed and combined.


In [2]:
import os
import nltk
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer

nltk.download('punkt', quiet=True)

def chunk_text(text, tokenizer, target_chunk_size=100):
    sentences = nltk.sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_chunk_size = 0

    for sentence in sentences:
        sentence_tokens = tokenizer.tokenize(sentence)
        sentence_token_count = len(sentence_tokens)

        if current_chunk_size + sentence_token_count > target_chunk_size and current_chunk:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_chunk_size = 0

        current_chunk.append(sentence)
        current_chunk_size += sentence_token_count

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

def process_folder(input_folder):
    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
    all_chunks = []
    processed_files = []

    for filename in os.listdir(input_folder):
        if filename.endswith('.txt'):
            input_path = os.path.join(input_folder, filename)

            print(f"Processing file: {filename}")
            processed_files.append(filename)

            with open(input_path, 'r', encoding='utf-8') as f:
                text = f.read()

            chunks = chunk_text(text, tokenizer)
            all_chunks.extend(chunks)

    print(f"Total files processed: {len(processed_files)}")
    print(f"Files processed: {', '.join(processed_files)}")
    print(f"Total chunks created: {len(all_chunks)}")
    return all_chunks

def embed_chunks(chunks, model):
    return model.encode(chunks)


input_folder = r'C:/Users/jaykh/OneDrive/Desktop/Assessment/processed_textbooks'

all_chunks = process_folder(input_folder)

model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embed_chunks(all_chunks, model)

print(f"Shape of embeddings: {embeddings.shape}")
print("First few embeddings:")
embeddings

  from .autonotebook import tqdm as notebook_tqdm


Processing file: combined_textbooks.txt


Token indices sequence length is longer than the specified maximum sequence length for this model (534 > 512). Running this sequence through the model will result in indexing errors


Total files processed: 1
Files processed: combined_textbooks.txt
Total chunks created: 4918
Shape of embeddings: (4918, 384)
First few embeddings:


array([[ 0.0166933 ,  0.0186421 ,  0.02215422, ...,  0.06304386,
        -0.03058149, -0.04582713],
       [-0.02789541, -0.0171906 , -0.04590717, ..., -0.02303373,
         0.04750705, -0.0202001 ],
       [ 0.00291845, -0.02582782, -0.01804552, ..., -0.06332909,
         0.00856837,  0.01366187],
       ...,
       [ 0.0303704 ,  0.04122302,  0.03124293, ...,  0.00891567,
        -0.02937707,  0.07764631],
       [-0.06540864,  0.08969024,  0.00168537, ..., -0.02066067,
         0.09093604,  0.10079562],
       [-0.05652855,  0.02541031, -0.08752126, ...,  0.09206382,
         0.07980143,  0.07125656]], dtype=float32)

In [3]:
from sklearn.mixture import GaussianMixture
import numpy as np

def gmm_clustering(embeddings, n_components=10):
    gmm = GaussianMixture(n_components=n_components, random_state=42)
    probabilities = gmm.fit_predict(embeddings)
    return probabilities, gmm

cluster_probs, gmm_model = gmm_clustering(embeddings)

In [4]:
import google.generativeai as genai
import time
from tenacity import retry, stop_after_attempt, wait_exponential
from sentence_transformers import SentenceTransformer

genai.configure(api_key="AIzaSyAvGySbPqXTrwg_Tkp1UMoJIBWt667dnIw")  # Replace with your actual API key

generation_config = {
    "temperature": 0,
    "top_p": 1,
    "top_k": 1,
    "max_output_tokens": 2048,
}

safety_settings = [
    {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
    {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
    {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
    {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
]

gen_model = genai.GenerativeModel(model_name="gemini-1.0-pro",
                                  generation_config=generation_config,
                                  safety_settings=safety_settings)

embed_model = SentenceTransformer('all-MiniLM-L6-v2')

@retry(wait=wait_exponential(multiplier=1, min=4, max=10), stop=stop_after_attempt(5))
def summarize_cluster(cluster_texts):
    combined_text = "\n\n".join(cluster_texts[:5])  
    prompt = f"Summarize the following texts concisely:\n\n{combined_text}"

    try:
        response = gen_model.generate_content(prompt)
        if response.parts:
            return response.text
        else:
            return "Summary could not be generated due to content restrictions."
    except Exception as e:
        print(f"Error in summarize_cluster: {str(e)}")
        return "Error in summarization process."

def batch_summarize_clusters(all_cluster_texts, batch_size=5):
    summaries = []
    for i in range(0, len(all_cluster_texts), batch_size):
        batch = all_cluster_texts[i:i+batch_size]
        batch_summaries = [summarize_cluster(cluster) for cluster in batch]
        summaries.extend(batch_summaries)
        time.sleep(5)  
    return summaries


all_cluster_texts = []
for cluster_id in set(cluster_probs):
    cluster_texts = [all_chunks[i] for i, prob in enumerate(cluster_probs) if prob == cluster_id]
    all_cluster_texts.append(cluster_texts)

cluster_summaries = batch_summarize_clusters(all_cluster_texts)

summary_embeddings = embed_model.encode(cluster_summaries)

print("Cluster summaries:", cluster_summaries)
print("Summary embeddings shape:", summary_embeddings.shape)

Cluster summaries: ['**Python for Data Analysis**\n\nThis book introduces Python for data analysis, highlighting its advantages as a "glue" language and its essential libraries (NumPy, pandas, matplotlib, IPython, SciPy). It covers installation, setup, and navigating the book, including code examples, data sources, and jargon.\n\n**IPython Basics**\n\nIPython is an interactive Python shell that enhances productivity with features like tab completion, introspection, code execution from the clipboard, keyboard shortcuts, magic commands, and a Qt-based GUI console. It also integrates with Matplotlib and provides tools for debugging, profiling, and interacting with the operating system.', '**NumPy Basics**\n\nNumPy provides multidimensional arrays (ndarrays) for efficient data manipulation. It supports various data types, indexing, slicing, and element-wise operations. NumPy also offers universal functions for fast array computations, data processing, and mathematical/statistical methods.\

In [5]:
def recursive_raptor(embeddings, texts, depth=0, max_depth=3, min_cluster_size=5):
    if depth >= max_depth or len(embeddings) <= min_cluster_size:
        return {"summary": summarize_cluster(texts), "embeddings": embeddings, "texts": texts}

    cluster_probs, gmm_model = gmm_clustering(embeddings)

    clusters = {}
    for cluster_id in set(cluster_probs):
        cluster_indices = [i for i, prob in enumerate(cluster_probs) if prob == cluster_id]
        cluster_embeddings = embeddings[cluster_indices]
        cluster_texts = [texts[i] for i in cluster_indices]

        clusters[cluster_id] = recursive_raptor(cluster_embeddings, cluster_texts, depth + 1, max_depth, min_cluster_size)

    return clusters


raptor_index = recursive_raptor(summary_embeddings, all_chunks)


In [6]:
from pymilvus import connections

def connect_to_milvus():
    try:
        connections.connect("default", host="localhost", port="19530")
        print("Successfully connected to Milvus")
        return True
    except Exception as e:
        print(f"Failed to connect to Milvus: {e}")
        return False

if connect_to_milvus():
    print("Milvus is ready to use")
else:
    print("Failed to connect to Milvus")

Successfully connected to Milvus
Milvus is ready to use


In [7]:
import numpy as np

def process_cluster(cluster):
    if isinstance(cluster, dict) and 'embeddings' in cluster:
        embeddings = cluster['embeddings']
        if isinstance(embeddings, np.ndarray):
            embeddings = embeddings.tolist()
        
        return [{
            "embedding": embedding,
            "summary": cluster.get('summary', ''),
            "text": cluster.get('text', '')
        } for embedding in embeddings]
    elif isinstance(cluster, str):
        print(f"Found string data: {cluster[:50]}...")
        return []
    elif isinstance(cluster, np.ndarray):
        print(f"Found numpy array of shape: {cluster.shape}")
        return [{
            "embedding": embedding.tolist(),
            "summary": "",
            "text": ""
        } for embedding in cluster]
    else:
        print(f"Unexpected structure: {type(cluster)}")
        return []

entities = []

def process_nested_structure(structure):
    if isinstance(structure, dict):
        for value in structure.values():
            process_nested_structure(value)
    elif isinstance(structure, (list, tuple)):
        for item in structure:
            process_nested_structure(item)
    else:
        entities.extend(process_cluster(structure))

process_nested_structure(raptor_index)

if entities:
    try:
        collection.insert(entities)
        print(f"Inserted {len(entities)} entities into the collection")
    except Exception as e:
        print(f"Error inserting entities: {e}")
        print("First few entities:", entities[:3])
else:
    print("No valid entities found to insert")

Found string data: The first edition of the book was initially releas...
Found numpy array of shape: (1, 384)
Found string data: Revision History for the First Edition: 2012-10-05...
Found string data: O'Reilly Media publishes books for educational, bu...
Found numpy array of shape: (1, 384)
Found string data: Published by O’Reilly Media, Inc., 1005 Gravenstei...
Found string data: **Text 1:**

This book was edited by Julie Steele ...
Found numpy array of shape: (1, 384)
Found string data: Editors: Julie Steele and Meghan Blanchette Produc...
Found string data: **Python for Data Analysis**

This book provides a...
Found numpy array of shape: (1, 384)
Found string data: --- START OF Python-for-Data-Analysis.pdf ---

www...
Found string data: **Book Information:**

* ISBN: 978-1-449-31979-3
*...
Found numpy array of shape: (1, 384)
Found string data: ISBN: 978-1-449-31979-3 [LSI] 1349356084 www.it-eb...
Found string data: **Python as a Versatile Tool**

Python is a versat...
Found numpy 

In [8]:
%%writefile app.py
import streamlit as st
from pymilvus import Collection, connections
import numpy as np
from sentence_transformers import SentenceTransformer
import google.generativeai as genai
import re


@st.cache_resource
def init_milvus():
    connections.connect("default", host="localhost", port="19530")
    collection = Collection("raptor_index")
    collection.load()
    return collection

@st.cache_resource
def init_models():
    dense_model = SentenceTransformer('all-MiniLM-L6-v2')
    genai.configure(api_key="AIzaSyAvGySbPqXTrwg_Tkp1UMoJIBWt667dnIw")
    gen_model = genai.GenerativeModel('gemini-pro')
    return dense_model, gen_model

collection = init_milvus()
dense_model, gen_model = init_models()

def extract_book_title(text):
    words = text.split()
    if len(words) > 5:
        potential_title = ' '.join(words[:5])
        stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of'}
        title_words = [word for word in potential_title.split() if word.lower() not in stop_words]
        return ' '.join(title_words) if title_words else "Unknown Book"
    return "Unknown Book"

def hybrid_retrieval(query, top_k=10):
    query_embedding = dense_model.encode([query])[0]
    search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
    results = collection.search(
        data=[query_embedding.tolist()],
        anns_field="embedding",
        param=search_params,
        limit=top_k,
        output_fields=["summary", "text"]
    )
    return [hit.id for hits in results for hit in hits]

def rerank_results(query, doc_ids, top_k=5):
    texts = collection.query(
        expr=f"id in {doc_ids}",
        output_fields=["id", "text"]
    )
    return [doc['id'] for doc in texts][:top_k]

def generate_answer(query, context):
    try:
        prompt = f"Question: {query}\n\nContext: {context}\n\nAnswer:"
        response = gen_model.generate_content(prompt)
        return response.text
    except Exception as e:
        st.error(f"Error generating answer: {e}")
        return "Sorry, I couldn't generate an answer due to a technical issue."

def ask_question(question, top_k=5):
    expanded_question = question
    initial_results = hybrid_retrieval(expanded_question, top_k=top_k*2)
    reranked_results = rerank_results(expanded_question, initial_results, top_k=top_k)
    context_docs = collection.query(
        expr=f"id in {reranked_results}",
        output_fields=["text"]
    )
    context = " ".join([doc['text'] for doc in context_docs])
    answer = generate_answer(question, context)
    return answer, reranked_results

def main():
    st.title("Step AI Assessment")

    question = st.text_input("Please enter your question:")
    
    if st.button("Get Answer"):
        if question:
            with st.spinner("Generating answer..."):
                answer, source_ids = ask_question(question)
            
            st.subheader("Answer:")
            st.write(answer)
            
        else:
            st.warning("Please enter a question.")

if __name__ == "__main__":
    main()

Overwriting app.py


In [9]:
!streamlit run app.py

^C
