In [None]:
# Uncomment the next line to install the dependencies into the kernel's environment:
# %pip install langchain unstructured openai chromadb Cython tiktoken pypdf patool

# Vector Store Stage:
![Vector_store](./images/VS.png)

In [None]:
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
import os
# Never hardcode a real key here. Only fall back to the placeholder when no key
# is configured, so a key already exported in the environment is not overwritten.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = "YOUR_OPENAI_API_KEY"


# Load The PDF Files:

In [5]:
# Directory holding the PDF files to index (placeholder -- point it at your data).
pdf_folder_path = "./path_to_PDF_directory"

# Build the directory loader, then read the PDFs into a list of Document objects.
loader = PyPDFDirectoryLoader(path=pdf_folder_path)
docs = loader.load()

In [6]:
# Check the container type returned by the loader (a plain list in the recorded run).
type(docs)

list

### An example of a document:
 - At this point, each document is a single page of a PDF file.
 - Notice that the paragraphs in the PDFs are separated by a `__________` marker; splitting on it helps the retriever return the most relevant splits.

In [7]:
# Peek at the first loaded Document to see the raw text before splitting.
docs[0]

Document(page_content="When can we see the aurora at Middle latitudes?  \nThis is one of the toughest questions to answer because it depends on so many parameters. We do get \nthis question a lot of times and each time we have the same answer. In this help topic we’ll try to explain  \nwhat is needed before you can run outside.  \n__________  \nWhat is needed to see it?  \nThe first thing that needs to happen and also the most important thing to happen are earth -directed \nsolar eruptions or solar flares. You can follow this live on our website. Only the strong and major flares \n(high -level M and X -class flares) are candidates to cause sufficient storming for aurora on middle \nlatitudes. During strong and long -duration solar fl ares, material of the Sun gets blown into space. This is \ncalled a coronal mass ejection. If the eruption comes from a sunspot group which is located near the \ncentre of the earth -facing solar disk, then chances are high that a resulting coronal mass ej

# Splitting the documents:
 - After this step, our documents are the splits (chunks) of the pages rather than whole pages.

In [9]:
# Marker that separates paragraphs in the source PDFs; it is the only split point used.
paragraph_separator = "__________"

# NOTE(review): chunk_size=100 is shorter than the chunks shown in the recorded
# outputs, so presumably pieces are never merged back together -- TODO confirm.
splitter = RecursiveCharacterTextSplitter(
    separators=[paragraph_separator],
    chunk_overlap=0,
    chunk_size=100,
)

In [10]:
# Number of documents before splitting: one per PDF page (46 in the recorded run).
len(docs)

46

In [11]:
# Run the splitter over every loaded page; the result is a list of chunk Documents.
split_docs = splitter.split_documents(docs)

In [12]:
# Number of chunks/splits after splitting (144 in the recorded run).
len(split_docs)

144

In [13]:
# An example of a document after splitting (one chunk); in the recorded output
# its metadata still carries the source PDF path and page number.
split_docs[0]

Document(page_content='When can we see the aurora at Middle latitudes?  \nThis is one of the toughest questions to answer because it depends on so many parameters. We do get \nthis question a lot of times and each time we have the same answer. In this help topic we’ll try to explain  \nwhat is needed before you can run outside.  \n', metadata={'source': 'PDFs/When can we see the aurora at Middle latitudes.pdf', 'page': 0})

# Get the embedding for our splits:
 - The embeddings are used to find the documents (splits) most relevant to a question.

In [16]:
# Directory that the Chroma vector store is given as its persistence location.
persist_directory = './persist_directory'

# Embedding model for the splits, built with its default settings.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment -- confirm.
embedding = OpenAIEmbeddings()

In [17]:
# Embed every split and index it in a Chroma store rooted at persist_directory.
# NOTE(review): re-running this cell against an already-populated directory may
# add the same splits a second time -- verify before re-executing.
vectordb = Chroma.from_documents(split_docs, embedding, persist_directory=persist_directory)

# Testing the retriever:
 - We use `similarity_search` to retrieve the `k` (here 3) most relevant splits.
 - As the outputs below show, the returned splits all discuss the subject we asked about.

In [19]:
# Sample query and the number of splits to retrieve for it.
question = "what are sunspots?"
top_k = 3

# Fetch the closest splits from the vector store, then show how many came back.
answer = vectordb.similarity_search(question, k=top_k)
len(answer)

3

In [20]:
# First split returned for the question.
answer[0]

Document(page_content='What are sunspots?  \nSunspots form on the surface of the Sun due to strong magnetic field lines coming up from within the \nSun trough the solar surface and appear visibly as dark spots compared to their surroundings. These \nsunspots which can become many times bigger than the Earth are always dark because they are much \ncooler than the surrounding surface of the Sun itself. A big sunspot can have a temperature of 3700°C. \nThis sounds like much but if we compare this with the temperature of the photosphere of  the Sun which \nis about 5500°C, then you see that there is a considerable difference. As a matter of fact, if we could take \na sunspot out of the Sun and place it into our night sky it would only be as bright as the full moon, a very \nbig contrast with the b right Sun itself.  \n', metadata={'page': 0, 'source': 'PDFs/What are sunspots.pdf'})

In [21]:
# Second split returned for the question.
answer[1]

Document(page_content='__________  \nSunspots are a common sight on our Sun during the years around solar maximum. Solar maximum or \nsolar max is the period of greatest solar activity in the solar cycle of the Sun, where one solar cycle lasts \nabout 11 years. Around solar mini mum, only very few or even no sunspots can be found. Sunspots form \nwhere magnetic field lines come up from the Sun’s interior trough the solar surface meaning that every \nsunspot has it’s own polarity.  \nA sunspot consists of two parts:  \n\uf0b7 The dark part (umbra)  \n\uf0b7 Lighter part around the dark part (penumbra)  \nImage ={ “ID”:”0 028”, ”description”: “”}  \n', metadata={'page': 0, 'source': 'PDFs/What are sunspots.pdf'})

In [22]:
# Third split returned for the question (in the recorded run it comes from a different PDF).
answer[2]

Document(page_content='__________  \nBut more about those sunspots because without sunspots, we will not have any solar flares. Sunspots \nare darker and cooler a reas on the solar surface where strong magnetic field lines come up from the \ninterior of the Sun trough the solar surface. When these magnetic field lines become entangled with \neach other and snap, they release a huge amount of energy which we call a solar  flare. Sunspots are \nhowever not something we can always find on our Sun, the Sun follows a pattern of about 11 years \nwhere the Sun goes from pretty much no sunspots to very many sunspots, and back to no sunspots \nagain. This is what we call a solar cycle.  \n \n \n ', metadata={'page': 2, 'source': 'PDFs/basics.pdf'})

# Persist the splits and embeddings:

In [23]:
# Write the vector store (splits and their embeddings) to persist_directory.
# NOTE(review): newer Chroma releases may persist automatically and deprecate
# this call -- confirm against the installed version.
vectordb.persist()