In [1]:
from dotenv import load_dotenv
import os

load_dotenv()

True

In [3]:
import os
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.documents import Document
from langchain.schema.runnable import RunnablePassthrough
from pymilvus import MilvusClient, CollectionSchema, FieldSchema, DataType


COLLECTION_NAME = "excel_rag_db"
DIMENSION = 1536   # text-embedding-3-small
MILVUS_URI = "http://localhost:19530"

# Connect to local Milvus
client = MilvusClient(uri=MILVUS_URI)
print("Connected to local Milvus.")


Connected to local Milvus.


In [5]:

df = pd.read_csv("../data/IMDB-Movie-Data.csv", encoding="utf-8")

docs = []
for _, row in df.iterrows():
    content = " | ".join([f"{col}: {row[col]}" for col in df.columns])
    docs.append(Document(page_content=content))
print(f"Loaded {len(docs)} rows.")


Loaded 1000 rows.


In [7]:
chunk_size = 10  # number of rows per chunk
docs = []

for i in range(0, len(df), chunk_size):
    chunk_df = df.iloc[i:i + chunk_size]
    text_block = "\n".join(
        [" | ".join([f"{col}: {row[col]}" for col in df.columns]) for _, row in chunk_df.iterrows()]
    )
    docs.append(Document(page_content=text_block))

print(f"Grouped into {len(docs)} chunks ({chunk_size} rows each).")


✅ Grouped into 100 chunks (10 rows each).


In [8]:
splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)
split_docs = splitter.split_documents(docs)
print(f"Final chunk count after text splitting: {len(split_docs)}")


Final chunk count after text splitting: 401


In [9]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
texts = [doc.page_content for doc in split_docs]
vectors = embeddings.embed_documents(texts)
print(f"Generated {len(vectors)} embeddings of dim {len(vectors[0])}.")


✅ Generated 401 embeddings of dim 1536.


In [19]:
from pymilvus import MilvusClient, FieldSchema, CollectionSchema, DataType
from langchain_openai import OpenAIEmbeddings
import pandas as pd
from dotenv import load_dotenv
import os
from pymilvus import IndexParams

# Load environment variables if needed
load_dotenv()

COLLECTION_NAME = "imdb_documents"
MILVUS_URI = "http://localhost:19530"
EMBED_DIM = 1536
CHUNK_ROWS = 12

# Connect to Milvus
client = MilvusClient(uri=MILVUS_URI)
print("Connected to local Milvus")

# Drop old collection if exists
if COLLECTION_NAME in client.list_collections():
    client.drop_collection(COLLECTION_NAME)
    print(f"Dropped old collection: {COLLECTION_NAME}")

# Define collection schema
schema = CollectionSchema(
    fields=[
        FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
        FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=65535),
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=EMBED_DIM),
    ],
    description="IMDB dataset chunks for local RAG",
)

# Create collection
client.create_collection(collection_name=COLLECTION_NAME, schema=schema)
print(f"Created new collection: {COLLECTION_NAME}")

# Load CSV data
df = pd.read_csv("../data/IMDB-Movie-Data.csv", encoding="utf-8")

# Chunk data
chunks = []
for i in range(0, len(df), CHUNK_ROWS):
    text_block = "\n".join(
        [", ".join([f"{col}: {row[col]}" for col in df.columns]) for _, row in df.iloc[i:i + CHUNK_ROWS].iterrows()]
    )
    chunks.append(text_block)

print(f"Total chunks created: {len(chunks)}")

# Generate embeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vectors = embeddings.embed_documents(chunks)
print(f"Generated {len(vectors)} embeddings of dimension {len(vectors[0])}")

# Insert records
records = [{"text": text, "embedding": vec} for text, vec in zip(chunks, vectors)]
client.insert(collection_name=COLLECTION_NAME, data=records)
print(f"Inserted {len(records)} records successfully")

# Create index on the vector field
index = IndexParams(
    index_type="IVF_FLAT",   # or "HNSW"
    metric_type="L2",        # "IP" for cosine similarity
    params={"nlist": 128}
)

client.create_index(
    collection_name=COLLECTION_NAME,
    field_name="embedding",
    index_params=index
)
print("Index created successfully")

# Load the collection into memory for search
client.load_collection(collection_name=COLLECTION_NAME)
print(f"Collection {COLLECTION_NAME} loaded into memory")

# Query a few records
results = client.query(
    collection_name=COLLECTION_NAME,
    filter="",
    output_fields=["id", "text", "embedding"],
    limit=2
)

for r in results:
    print("Record ID:", r["id"])
    print("Text sample:", r["text"][:150])
    print("Embedding length:", len(r["embedding"]))
    print("First 10 dims of embedding:", r["embedding"][:10])


ImportError: cannot import name 'IndexParams' from 'pymilvus' (C:\Users\AkshayRedekar\Documents\pocexcel\.venv\Lib\site-packages\pymilvus\__init__.py)

In [None]:
from pymilvus import MilvusClient
from langchain_openai import OpenAIEmbeddings
import os
from dotenv import load_dotenv

load_dotenv()
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

COLLECTION_NAME = "imdb_documents"
MILVUS_URI = "http://localhost:19530"

client = MilvusClient(uri=MILVUS_URI)
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

query_text = "Tell me about the movie Avatar and its genre."
query_vector = embeddings.embed_query(query_text)

print("Query text:", query_text)
print("Embedding dimension:", len(query_vector))
print("First 10 dimensions of query embedding:", query_vector[:10])

search_results = client.search(
    collection_name=COLLECTION_NAME,
    data=[query_vector],
    limit=5,
    output_fields=["id", "text", "embedding"],
)

print("\nTop 5 similar chunks related to 'Avatar':\n")
for i, result in enumerate(search_results[0], start=1):
    print(f"Result {i}")
    print("Record ID:", result["id"])
    print("Text snippet:", result["text"][:200].replace("\n", " "))
    print("Embedding length:", len(result["embedding"]))
    print("First 10 dims of embedding:", result["embedding"][:10])
    print("-" * 80)
