In [5]:
# [CELL 1] - Check API Key
import os
from dotenv import load_dotenv

load_dotenv()

api_key = os.getenv('OPENAI_API_KEY')
print("API Key exists:", bool(api_key))  # Should print True
print("API Key starts with:", api_key[:5] if api_key else "No key found")

API Key exists: True
API Key starts with: sk-pr


In [None]:
import os
os.environ['OPENAI_API_KEY'] = ''

In [8]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Define paths
DATA_PATH = r"C:\Users\PC\Documents\VsCode\Projects\Datathon\LLMS\langchain-rag-tutorial\data\books"
CHROMA_PATH = "chroma"

def load_documents():
    loader = DirectoryLoader(
        DATA_PATH, 
        glob="*.md",
        loader_cls=TextLoader,
        loader_kwargs={'encoding': 'utf-8'}  # Specify encoding
    )
    documents = loader.load()
    print(f"Loaded {len(documents)} documents")
    return documents

def split_documents(documents):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split into {len(chunks)} chunks")
    return chunks

def create_vector_store(chunks):
    # Create embeddings and store in Chroma
    embeddings = OpenAIEmbeddings()
    vector_store = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=CHROMA_PATH
    )
    vector_store.persist()
    print(f"Created vector store at {CHROMA_PATH}")
    return vector_store

def main():
    # Load and process documents
    documents = load_documents()
    chunks = split_documents(documents)
    vector_store = create_vector_store(chunks)
    
    # Optional: test a simple query
    query = "What is this document about?"
    results = vector_store.similarity_search(query, k=2)
    print("\nTest Query Results:")
    for doc in results:
        print("\nContent:", doc.page_content[:200], "...")
        print("Source:", doc.metadata)

if __name__ == "__main__":
    main()

Loaded 1 documents
Split into 1 chunks


  vector_store.persist()


Created vector store at chroma


Number of requested results 2 is greater than number of elements in index 1, updating n_results = 1



Test Query Results:

Content: # Alice in Wonderland

Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but i ...
Source: {'source': 'C:\\Users\\PC\\Documents\\VsCode\\Projects\\Datathon\\LLMS\\langchain-rag-tutorial\\data\\books\\alice_in_wonderland.md', 'start_index': 0}
