In [33]:
# Environment setup: imports, import search path, and OpenAI credentials.
import os
import openai
import sys
# Add the directory two levels up (relative to the working directory) to the
# module search path.
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

# Indexing os.environ raises KeyError if OPENAI_API_KEY is not set, so a
# missing key surfaces here rather than at the first API call.
openai.api_key  = os.environ['OPENAI_API_KEY']

In [34]:
# Alternative (disabled): load a single PDF file instead of a whole directory.
# from langchain.document_loaders import PyPDFLoader
# loader = PyPDFLoader("ncair_data.pdf")
# pages = loader.load()

In [35]:
from langchain.document_loaders import PyPDFDirectoryLoader

# Load the PDFs found under ./datasets into a list of Document objects.
loader = PyPDFDirectoryLoader('./datasets')
documents = loader.load()
# Preview the first three loaded documents.
documents[:3]

[Document(page_content="Abuja FabLab 1.0 is the first digital Fabrication Laboratory (FABLAB) in Nigeria, launched by the \nNational Information Technology Development Agency (NITDA) to promote digital innovation and \nentrepreneurship. The lab is situated at the National Centre for  Artificial Intelligence and Robotics \n(NCAIR), a subsidiary of NITDA. Its inauguration took place on May 20, 2022, and it is envisioned to be \na significant catalyst for technological advancements in the country.  \n \nThe primary objective of Abuja FabLab 1.0 i s to encourage startups, innovators, and makers to ideate, \ndesign, and rapidly prototype digital products that have real -life applications. It serves as a hub where \naspiring entrepreneurs can bring their ideas to life and transform them into tangible proto types. The \nlab is equipped with advanced tools and resources that enable users to work with a wide range of \nmaterials, including iron, wood, electronics, and more, allowing for diverse

In [36]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configuration: chunk size 1500 with an overlap of 150 between
# neighbouring chunks. `separator` and `length_function` are not passed, so
# whatever defaults the splitter defines for them apply.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)

In [37]:
# Split the loaded documents into chunks with the configured splitter.
docs = text_splitter.split_documents(documents)

In [38]:
# Number of chunks produced by the split.
len(docs)

353

In [39]:
from langchain.embeddings.openai import OpenAIEmbeddings

# Embedding object (constructed with no arguments) that is handed to the
# Chroma store when it is built.
embedding = OpenAIEmbeddings()
from langchain.vectorstores import Chroma

In [40]:
# Relative path given to Chroma as its persistence directory.
persist_directory = 'docs/chroma/'

In [41]:
# ! rmdir /s/q docs  # remove old database files if any

### Create a deterministic id for each document chunk from its content, and collect the unique ids

In [42]:
import uuid

# Content-derived id per chunk: uuid5 is a deterministic hash of the text, so
# chunks with identical text get the same id. `ids` is parallel to `docs`.
ids = [str(uuid.uuid5(uuid.NAMESPACE_DNS, doc.page_content)) for doc in docs]

# De-duplicate while keeping first-seen order. The previous `list(set(ids))`
# returned the ids in arbitrary order, so position i no longer matched the
# i-th document once the list was handed to the vector store.
unique_ids = list(dict.fromkeys(ids))

### Keep only the first document for each id, dropping later documents whose id has already been seen

In [43]:
# Walk the chunks and their ids in parallel, keeping a chunk only the first
# time its id appears; later chunks with an already-seen id are skipped.
# Written as an explicit loop so the set update is a visible statement rather
# than a side effect hidden inside a comprehension condition, and so the
# builtin `id` is not shadowed.
seen_ids = set()
unique_docs = []
for doc_id, doc in zip(ids, docs):
    if doc_id in seen_ids:
        continue
    seen_ids.add(doc_id)
    unique_docs.append(doc)

In [44]:
# Index only the de-duplicated chunks, with ids computed from those same
# chunks so that ids[i] always belongs to documents[i]. Passing the full
# `docs` list together with `unique_ids` paired chunks with the wrong ids
# (set ordering) and produced lists of different lengths whenever duplicate
# chunks existed.
unique_doc_ids = [
    str(uuid.uuid5(uuid.NAMESPACE_DNS, doc.page_content)) for doc in unique_docs
]
vectordb = Chroma.from_documents(
    documents=unique_docs,
    embedding=embedding,
    ids=unique_doc_ids,
    persist_directory=persist_directory
)

In [45]:
# Ask the store to persist itself (presumably under persist_directory;
# confirm against the installed Chroma/LangChain version).
vectordb.persist()

In [46]:
# Sanity check: number of entries in the underlying collection.
# NOTE(review): `_collection` is a private attribute and may change between versions.
vectordb._collection.count()

353

In [47]:
# Query used for the retrieval check.
question = "what is ncair"

In [48]:
# Retrieve the top 2 matches for the question via similarity search.
quest = vectordb.similarity_search(question,k=2)
# Alternative retrieval via max marginal relevance (disabled):
# quest = vectordb.max_marginal_relevance_search(question, k=2)
quest

[Document(page_content='About NCAIR  \nThe National Centre for Artificial Intelligence and Robotics (NCAIR) is \none of NITDA’s special purpose vehicles created to promote research and \ndevelopment on emerging technologies and their practical application in areas \nof Nigerian national interest. NCAIR i s also focused on creating a thriving \necosystem for innovation -driven entrepreneurship (IDE), job creation, and \nnational development.  \nFor more info: https://www .ncair.nitda.gov.ng  \n \nSigned  \nMr Kashif Inuwa Abdullahi CCIE  \nDG/CEO NITDA', metadata={'source': 'datasets\\Press Brief - NITDA FABLAB V1 (1).pdf', 'page': 1}),
 Document(page_content='NCAIR  Contact:  \n1. 08178778499  \n2. 08178778501  \nHead Office: NCAIR Building, suite 5 wilflox plaza, Wuye, Abuja . Email:  \nemail: ncair@nitda.gov.ng  \nInstagram: https://www.instagram.com/ncairnigeria/   \nTwitter: https://twitter.com/NCAIRNigeria  \nLinkedIn: https://www.linkedin.com/in/ncair -nigeria -\n49b63820a/?origi