In [1]:
import os
import numpy as np
import boto3
import json
import pickle
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity
from fastapi import FastAPI, Query
import openai
# from pydantic import BaseModel
from dotenv import load_dotenv
# For using with OpenAI embeddings
from openai import OpenAI

from s3 import S3FileManager
from chunk_strategy import markdown_chunking, semantic_chunking

# # Define the FastAPI app
# app = FastAPI(title="Naive RAG System")
load_dotenv()
# Configuration
AWS_BUCKET_NAME = os.getenv("AWS_BUCKET_NAME")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

EMBEDDING_DIMENSION = 1536  # OpenAI ada-002 embedding dimension
EMBEDDINGS_KEY = "embeddings.pkl"
DOCUMENTS_KEY = "manual_store.json"
DOCUMENTS_KEY_PKL = "manual_store.pkl"



In [2]:

# Initialize OpenAI client
openai_client = OpenAI(api_key=OPENAI_API_KEY)

base_path = "nvdia/"

s3_obj = S3FileManager(AWS_BUCKET_NAME, base_path)

In [3]:
def read_markdown_file(file, s3_obj):
    content = s3_obj.load_s3_file_content(file)
    return content


def get_embedding(chunks):
    response = openai.embeddings.create(
        model="text-embedding-3-small",
        input=chunks
    )
    return response.data

def create_manual_vector_store(file, chunks, chunk_strategy):
    vectors = []
    file = file.split('/')
    parser = file[1]
    identifier = file[2]
    year = identifier[2:6]
    quarter = identifier[6:]

    embeddings_data = get_embedding(chunks)
    for i, embed in enumerate(embeddings_data):
        vectors.append({
            "id": f"{identifier}_{parser}_chunk_{i}",
            "embedding": list(embed.embedding),
            "metadata": {
                "year": year,
                "quarter": quarter,
                "parser": parser,
                "chunk_type": chunk_strategy,
                "text": chunks[i]
            }
        })
    return vectors

In [23]:
def save_to_s3(vectors):
    """Save documents and embeddings to S3."""
    # Save documents
    docs_json = json.dumps([doc for doc in vectors])
    s3_obj.upload_file(AWS_BUCKET_NAME, DOCUMENTS_KEY, docs_json)

# def save_to_s3_pickle(vectors, key=DOCUMENTS_KEY):
#     """Save documents and embeddings to S3 using pickle."""
#     pickle_data = pickle.dumps(vectors)
#     s3_obj.upload_file(AWS_BUCKET_NAME, DOCUMENTS_KEY_PKL, pickle_data)

def load_from_s3():
    # Load documents
    docs_json = s3_obj.load_s3_file_content(DOCUMENTS_KEY)
    documents = json.loads(docs_json)
    return documents

def save_to_s3_pickle(vectors, key=DOCUMENTS_KEY):
    save_file_path = f"{s3_obj.base_path}/{DOCUMENTS_KEY_PKL}"
    pickle_data = pickle.dumps(vectors)
    s3_obj.upload_file(AWS_BUCKET_NAME, save_file_path, pickle_data)

def load_from_s3_pickle(key=DOCUMENTS_KEY_PKL):
    save_file_path = f"{s3_obj.base_path}/{DOCUMENTS_KEY_PKL}"
    pickle_data = s3_obj.load_s3_pdf(save_file_path)
    return pickle.loads(pickle_data)


def search_similar_documents(parser, chunking_strategy, query, top_k=3, year=None, quarter=None):
    query_embedding = get_embedding(query)

    # docs_json = s3_obj.load_s3_file_content(DOCUMENTS_KEY)
    # print("loaded json from s3")
    # documents = json.loads(docs_json)
    # print("loaded json to docs")
    print("pkl started")
    documents = load_from_s3_pickle()
    print("pkl Done")
    
    print(documents[:2])
    
    filtered_docs = [doc for doc in documents if 
                     (doc['metadata']['year'] in year) and 
                     (doc['metadata']['quarter'] in quarter) and 
                     (doc['metadata']['chunk_type'] == chunking_strategy) and 
                     (doc['metadata']['parser'] == parser)]
    
    if not filtered_docs:
        return []
    

    query_embedding = query_embedding[0].embedding
    doc_embeddings = [doc['embedding'] for doc in filtered_docs]

    similarities = cosine_similarity([query_embedding], doc_embeddings)[0]
    
    top_indices = np.argsort(similarities)[-top_k:][::-1]
    top_docs = [filtered_docs[i] for i in top_indices]
    top_similarities = [float(similarities[i]) for i in top_indices]

    results = []
    for doc, similarity in zip(top_docs, top_similarities):
        results.append({
            'id': doc['id'],
            'similarity': similarity,
            'metadata': doc['metadata']
        })
    return results

In [24]:
base_path = "nvdia/"
    
s3_obj = S3FileManager(AWS_BUCKET_NAME, base_path)
files = list({file for file in s3_obj.list_files() if file.endswith('.md')})
print(files)
files = files[:2]
all_vectors = []
for i, file in enumerate(files):
    print(f"Processing File {i+1}: {file}")
    content = read_markdown_file(file, s3_obj)
    
    print("Using markdown chunking strategy...")
    chunks = markdown_chunking(content, heading_level=2)
    print(f"Chunk size: {len(chunks)}")

    vector = create_manual_vector_store(file, chunks, "markdown")
    all_vectors.extend(vector)
# save_to_s3(all_vectors)
save_to_s3_pickle(all_vectors)

['nvdia/docling/FY2025Q3/extracted_data.md', 'nvdia/mistral/FY2025Q4/extracted_data.md', 'nvdia/mistral/FY2025Q3/extracted_data.md', 'nvdia/mistral/FY2025Q1/extracted_data.md', 'nvdia/docling/FY2025Q2/extracted_data.md', 'nvdia/mistral/FY2025Q2/extracted_data.md', 'nvdia/docling/FY2025Q1/extracted_data.md', 'nvdia/docling/FY2025Q4/extracted_data.md']
Processing File 1: nvdia/docling/FY2025Q3/extracted_data.md
Using markdown chunking strategy...
Chunk size: 153
Processing File 2: nvdia/mistral/FY2025Q4/extracted_data.md
Using markdown chunking strategy...
Chunk size: 191


In [26]:
query = " risk factors?"

parser = 'mistral'
chunking_strategy = 'markdown'
top_k = 3
year = ["2025"]
quarter = ["Q3"]
res = search_similar_documents(parser, chunking_strategy, query, top_k, year, quarter)
res

pkl started
pkl Done
[{'id': 'FY2025Q3_docling_chunk_0', 'embedding': [-0.00365094980224967, -0.012917857617139816, 0.054134905338287354, 0.08689914643764496, -0.012309686280786991, -0.01605878956615925, -0.01662847027182579, 0.055705372244119644, -0.009330418892204762, 0.034026775509119034, 0.00919954665005207, 0.024342231452465057, -0.0442809984087944, 0.0062510729767382145, 0.05373459309339523, 0.03649025410413742, 0.013926343061029911, 0.018214331939816475, 0.000779459485784173, -0.010823901742696762, -0.003976205829530954, -0.0013885926455259323, 0.011178026907145977, 0.012117227539420128, 0.024434613063931465, 0.028160620480775833, -0.03513533994555473, -0.05906185880303383, -0.0442809984087944, -0.007748404983431101, 0.01799877919256687, -0.04169434681534767, 0.0429568812251091, -0.01773703470826149, 0.045913051813840866, 0.016305137425661087, -0.011085646227002144, -0.01174770575016737, -0.010592950507998466, -0.007656024303287268, 0.042926087975502014, -0.06263390183448792, 0.

[]

In [None]:
query_embedding

list

In [16]:
query_embedding = get_embedding(query)
query_embedding[0].embedding

[-0.041926365345716476,
 -0.010448547080159187,
 0.08105058968067169,
 0.042587246745824814,
 0.020645959302783012,
 -0.02188841626048088,
 0.03978510573506355,
 -0.005412624683231115,
 -0.07089943438768387,
 0.00810241512954235,
 -0.03436587378382683,
 -0.009424179792404175,
 -0.011988402344286442,
 0.03166947141289711,
 -0.03084997832775116,
 -0.01962820068001747,
 0.028840897604823112,
 0.02700364589691162,
 -0.06830877810716629,
 0.04483424499630928,
 0.024227939546108246,
 0.03402221202850342,
 0.03182808309793472,
 -0.005845502484589815,
 0.0039652930572628975,
 -0.006436992436647415,
 -0.007289530243724585,
 -0.03322915360331535,
 0.016033001244068146,
 0.01865009404718876,
 0.03217174485325813,
 -0.029633956030011177,
 0.006159421522170305,
 -0.02751913294196129,
 -0.006694736424833536,
 -0.017196154221892357,
 -0.010032190941274166,
 -0.03455092012882233,
 -0.001898383954539895,
 -0.001890123006887734,
 0.007811626885086298,
 0.0043915617279708385,
 -0.01022384688258171,
 -0.0

In [46]:
embedding_data[2].embedding

[0.0474097840487957,
 0.027897074818611145,
 0.02071954868733883,
 0.051500339061021805,
 0.021646911278367043,
 0.0192459337413311,
 -0.011001305654644966,
 0.04207427799701691,
 -0.0080921845510602,
 -0.018445607274770737,
 0.03422345966100693,
 0.01628599688410759,
 0.012906843796372414,
 -0.03234333172440529,
 0.050560273230075836,
 0.022510753944516182,
 0.015688929706811905,
 -0.03277525305747986,
 -0.008365311659872532,
 -0.014634530991315842,
 0.0367133654654026,
 0.011052119545638561,
 -0.027312710881233215,
 0.0011195035185664892,
 -0.030285349115729332,
 0.012189091183245182,
 -0.06463584303855896,
 -0.009667429141700268,
 0.005999268498271704,
 0.029751798138022423,
 0.03757720813155174,
 -0.035214342176914215,
 -0.0049004084430634975,
 -0.006980620790272951,
 0.02279023267328739,
 0.0469016395509243,
 -0.008854399435222149,
 -0.00607866607606411,
 -0.0025073701981455088,
 -0.03272443637251854,
 -0.010029480792582035,
 -0.018140720203518867,
 0.051983073353767395,
 -0.00535

In [39]:
type(embedding_data[0].embedding)

list

In [None]:
# Search for similar documents
# query = "NVDIA risk factors?"
# results = search_similar_documents(query, top_k=2)

# print(f"Query: {query}")
# for doc, sim in zip(results, sims):
#     print(f"Similarity: {sim:.4f}")
#     print(f"Content: {doc['content']}")
#     print(f"Metadata: {doc['metadata']}")
#     print("---")

In [None]:


# FastAPI routes
@app.post("/index")
async def index_route(documents: List[str], metadatas: List[Dict[str, Any]] = None):
    """Index documents by computing embeddings and storing in S3."""
    return index_documents(documents, metadatas)

@app.post("/search", response_model=QueryResponse)
async def search_route(request: QueryRequest):
    """Search for similar documents based on cosine similarity."""
    documents, similarities = search_similar_documents(request.query, request.top_k)
    return QueryResponse(documents=documents, similarities=similarities)

@app.get("/health")
async def health_check():
    """Health check endpoint."""
    return {"status": "healthy"}

# Example usage
if __name__ == "__main__":
    # Example documents
    sample_docs = [
        "The quick brown fox jumps over the lazy dog.",
        "Machine learning is a subset of artificial intelligence.",
        "Natural language processing deals with the interaction between computers and human language.",
        "Retrieval-augmented generation combines information retrieval with text generation.",
        "Vector databases are specialized databases that store vector embeddings efficiently."
    ]
    
    # Example metadata
    sample_metadata = [
        {"source": "example", "category": "proverb"},
        {"source": "example", "category": "AI"},
        {"source": "example", "category": "NLP"},
        {"source": "example", "category": "RAG"},
        {"source": "example", "category": "Databases"}
    ]
    
    # Index documents
    index_documents(sample_docs, sample_metadata)
    
    # Search for similar documents
    query = "How does RAG work?"
    results, sims = search_similar_documents(query, top_k=2)
    
    print(f"Query: {query}")
    for doc, sim in zip(results, sims):
        print(f"Similarity: {sim:.4f}")
        print(f"Content: {doc['content']}")
        print(f"Metadata: {doc['metadata']}")
        print("---")

NameError: name 'Document' is not defined

In [3]:
DOCUMENTS_KEY_PKL = "manual_store.pkl"

def save_to_s3_pickle(s3_obj, vectors, key=DOCUMENTS_KEY_PKL):
    save_file_path = f"{s3_obj.base_path}/{DOCUMENTS_KEY_PKL}"
    pickle_data = pickle.dumps(vectors)
    s3_obj.upload_file(AWS_BUCKET_NAME, save_file_path, pickle_data)

def load_from_s3_pickle(s3_obj, key=DOCUMENTS_KEY_PKL):
    try:
        save_file_path = f"{s3_obj.base_path}/{DOCUMENTS_KEY_PKL}"
        pickle_data = s3_obj.load_s3_pdf(save_file_path)
        return pickle.loads(pickle_data)
    except:
        return []


def read_markdown_file(file, s3_obj):
    content = s3_obj.load_s3_file_content(file)
    return content

def get_embedding(chunks):
    response = openai.embeddings.create(
        model="text-embedding-3-small",
        input=chunks
    )
    return response.data

In [None]:
def get_manual_vector_store(file, chunks, chunk_strategy):
    vectors = []
    file = file.split('/')
    parser = file[-2]
    year = file[-4]
    quarter = file[-3]
    identifier = f"FY{year}{quarter}"

    embeddings_data = get_embedding(chunks)
    for i, embed in enumerate(embeddings_data):
        vectors.append({
            "id": f"{identifier}_{parser}_{chunk_strategy}_chunk_{i}",
            "embedding": list(embed.embedding),
            "metadata": {
                "year": year,
                "quarter": quarter,
                "parser": parser,
                "chunk_type": chunk_strategy,
                "text": chunks[i]
            }
        })
    return vectors

def create_manual_vector_store():
    print("helloooo")
    base_path = "nvidia/"
    s3_obj = S3FileManager(AWS_BUCKET_NAME, base_path)
    all_vectors = load_from_s3_pickle(s3_obj)
    print(all_vectors[:2])
    # all_vectors=[]
    files = list({file for file in s3_obj.list_files() if file.endswith('.md')})
    print(files)
    # all_vectors = []
    for i, file in enumerate(files):
        print(f"Processing File {i+1}: {file}")
        content = read_markdown_file(file, s3_obj)

        chunks = markdown_chunking(content, heading_level=2)
        print(f"Markdown Chunk size: {len(chunks)}")
        vector = get_manual_vector_store(file, chunks, "markdown")
        all_vectors.extend(vector)

        chunks = semantic_chunking(content)
        print(f"semantic Chunk size: {len(chunks)}")
        vector = get_manual_vector_store(file, chunks, "semantic")
        all_vectors.extend(vector)

        chunks = sliding_window_chunking(content)
        print(f"sliding Chunk size: {len(chunks)}")
        vector = get_manual_vector_store(file, chunks, "sliding_window")
        all_vectors.extend(vector)

    save_to_s3_pickle(s3_obj, all_vectors)

In [9]:
base_path = "nvidia/"
s3_obj = S3FileManager(AWS_BUCKET_NAME, base_path)
all_vectors = load_from_s3_pickle(s3_obj)

In [10]:
all_vectors

[]