In [8]:
# !pip install pypdf

In [5]:
import os
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader  # Changed from PDFLoader to PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
pdf_path = "./Data/Startup Course Text Data.pdf" 

print("--- Loading PDF document ---")
loader = PyPDFLoader(pdf_path) 
documents = loader.load()

print(f"Loaded {len(documents)} pages from PDF")

--- Loading PDF document ---
Loaded 374 pages from PDF


In [11]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)  # Adding some overlap for better context
docs = text_splitter.split_documents(documents)

In [12]:
print("\n--- Document Chunks Information ---")
print(f"Number of document chunks: {len(docs)}")


--- Document Chunks Information ---
Number of document chunks: 374


In [None]:
def create_vector_store(docs, embeddings, store_name):
    persistent_directory = "./VectorStores" + store_name

    print(f"\n--- Creating vector store {store_name} ---")

    Chroma.from_documents(docs, embeddings, persist_directory=persistent_directory)
    
    print(f"--- Finished creating vector store {store_name} ---")

In [14]:
huggingface_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)

  from tqdm.autonotebook import tqdm, trange


In [15]:
# Create the vector store for the startup masterclass
create_vector_store(docs, huggingface_embeddings, "/startup_masterclass_vector_store")


--- Creating vector store /startup_masterclass_vector_store ---
--- Finished creating vector store /startup_masterclass_vector_store ---
