In [1]:
# This notebook is for testing the parsing of documents in the doc folder.

import chromadb
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from llama_index.core import SimpleDirectoryReader
from llama_parse import LlamaParse
import nest_asyncio
from uuid import uuid4

load_dotenv()

True

In [2]:
nest_asyncio.apply()

# set up parser
parser = LlamaParse(
    result_type="text"
)

# set up embeddings to convert text to vectors
model_name = "distilbert/distilbert-base-cased"
embeddings = HuggingFaceEmbeddings(model_name=model_name)

  from tqdm.autonotebook import tqdm, trange
No sentence-transformers model found with name distilbert/distilbert-base-cased. Creating a new one with mean pooling.


In [8]:
# set up chromadb to store vectors
vector_store = Chroma(
    collection_name="iqma_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db"
)

# use SimpleDirectoryReader to parse our files
file_extractor = {".docx": parser}
reader = SimpleDirectoryReader(input_dir="./docs", file_extractor=file_extractor)
documents = reader.load_data() # parsed files

No sentence-transformers model found with name distilbert/distilbert-base-cased. Creating a new one with mean pooling.


Started parsing the file under job_id 3e142e77-19b6-462a-860e-088828752211
Started parsing the file under job_id 38686afd-225f-4a88-9d3c-9763049e07e7
Started parsing the file under job_id 6c99efe6-a1fb-41d1-8691-0362031bdd79
Started parsing the file under job_id 2af58cac-b795-4a81-bfde-45791fb78ee7
Started parsing the file under job_id ba9599b6-482f-4208-89bc-2772b1b47a39
Started parsing the file under job_id e49f9a91-a093-4993-a777-b33bb08981c3
Started parsing the file under job_id 4d1e3909-5e51-44f3-960f-0b43a601707b
Started parsing the file under job_id a66fa978-357b-447a-8cca-ce5c38b34573
Started parsing the file under job_id cf128097-4fa6-41cd-8ffa-b8eb9b953537
Started parsing the file under job_id c935575a-0f19-4aef-8446-87bbbe874b31
Started parsing the file under job_id 3ee18a2f-67a3-411a-bdf6-5fe591ab534a
Started parsing the file under job_id dfbc15ba-a2fb-4726-93e6-a48698dd98e3
Started parsing the file under job_id 8ba43fb2-beed-479f-86f3-09e22627e3d7
Started parsing the file 

In [9]:
documents

[Document(id_='39a844a9-77d5-4957-afca-f1b94ce2673e', embedding=None, metadata={'file_path': 'c:\\Users\\kengb\\Documents\\GitHub\\fyp2024\\backend\\src\\chatbot\\docs\\15_Psychological_Mind_Tricks_To_Get_People_To_Do_What_You_Want.docx', 'file_name': '15_Psychological_Mind_Tricks_To_Get_People_To_Do_What_You_Want.docx', 'file_type': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'file_size': 28991, 'creation_date': '2024-08-30', 'last_modified_date': '2024-08-30'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text="15 Psychological Mind Tricks To Get People To Do\nWhat You Want\n           https://www.youtube.com/watch/ld7KtnULvVc\n\nThe art of persuasion is a subtle complex discipline which requires skill but\nlucky for all our

In [23]:
print(documents[0].id_)

39a844a9-77d5-4957-afca-f1b94ce2673e


In [24]:
from langchain_core.documents import Document

# print(documents[0].text)
new_docs = []
for doc in documents:
    new_doc = Document(
        page_content=doc.text,
        id=doc.id_,
        metadata =doc.metadata
    )
    new_docs.append(new_doc)

In [25]:
new_docs

[Document(id='39a844a9-77d5-4957-afca-f1b94ce2673e', metadata={'file_path': 'c:\\Users\\kengb\\Documents\\GitHub\\fyp2024\\backend\\src\\chatbot\\docs\\15_Psychological_Mind_Tricks_To_Get_People_To_Do_What_You_Want.docx', 'file_name': '15_Psychological_Mind_Tricks_To_Get_People_To_Do_What_You_Want.docx', 'file_type': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'file_size': 28991, 'creation_date': '2024-08-30', 'last_modified_date': '2024-08-30'}, page_content="15 Psychological Mind Tricks To Get People To Do\nWhat You Want\n           https://www.youtube.com/watch/ld7KtnULvVc\n\nThe art of persuasion is a subtle complex discipline which requires skill but\nlucky for all our viewers out there We are gonna let you in on 15 useful\ntricks hidden in the enormous wealth of psychological knowledge Millions of\npeople all over the world have been swindling manipulating and persuading\npeople with great success, but how do they do it? well If you understand how\n

In [26]:
# save vectors into chromadb
persistent_client = chromadb.PersistentClient()
collection = persistent_client.get_or_create_collection("iqma_collection")

# adding documents to chroma
uuids = [str(uuid4()) for _ in range(len(new_docs))]
vector_store.add_documents(documents=new_docs, ids=uuids)

['3c364d00-1300-4036-9a94-183b20b0e7ba',
 'd4769564-c9b7-47a8-9228-604de414b300',
 '97010169-8807-4a91-a842-ad5e6ffe5ddc',
 '77980db3-2fa4-4e6e-a907-f6baf8d35fab',
 'aecca1b2-fd7d-4113-8279-04f3e4948c6a',
 '6e1e2b30-e84d-474e-ba06-9d495474f26f',
 '683b1b70-faf9-4dd6-8745-c1b4ec4d180d',
 '6c0f5282-01a2-44f2-a995-2556a8b97e28',
 '4533d887-001f-4a83-94e3-5086f68bf5f1',
 '2e936df5-d813-4ddd-8e87-4ced2c7e13ab',
 'e06d6321-6825-4bcc-9b97-c8c4af0652e5',
 'fd635a95-70fc-4974-94c1-fe29f0bc583c',
 '02f8e9a0-9594-40cb-8eef-7be9b958e092',
 'af4b72ca-c3ca-4c14-b477-0c2342651a01',
 '88f85f77-2228-4819-9816-32d1f94f1828',
 'c878a105-1fd6-4dac-8420-92a821fed61b',
 '170605b4-876b-4a04-999b-a7fb3da20e7c',
 '8813f68b-4dbd-4649-8865-5aeb435bf4c8',
 '277b8ea7-edd3-49a0-97a5-c723957351af',
 'c954592f-1c7f-4c72-9cf1-cacacc1dd8a4',
 'dd24c290-9a40-4a41-bf3f-797fd0ba2420',
 'd49af0a8-e7c1-4ee2-817d-e8c25581b859',
 '2132d776-1a33-47a7-baef-324ec435a580',
 '04e9b4dd-af0e-49a1-b95c-4ffcadce473f',
 'b6fa742d-493d-

In [4]:
persistent_client = chromadb.PersistentClient(path="./chroma_langchain_db")
# collection = persistent_client.get_or_create_collection("iqma_collection")
vector_store_from_client = Chroma(
    client=persistent_client,
    collection_name="iqma_collection",
    embedding_function=embeddings,
)
retriever = vector_store_from_client.as_retriever(
    search_type="mmr", search_kwargs={"k": 1, "fetch_k": 5}
)
retriever.invoke("How to communicate better?")

[Document(metadata={'creation_date': '2024-08-30', 'file_name': '15_Psychological_Mind_Tricks_To_Get_People_To_Do_What_You_Want.docx', 'file_path': 'C:\\Users\\kengb\\Documents\\GitHub\\fyp2024\\backend\\src\\chatbot\\docs\\15_Psychological_Mind_Tricks_To_Get_People_To_Do_What_You_Want.docx', 'file_size': 28991, 'file_type': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'last_modified_date': '2024-08-30'}, page_content="everyone agrees with your opinion online more tend to jump on the\nbandwagon\n\nSo there you have it, 15 tried-and-true methods to persuade others What did\nyou think of this video? Did you like it or the points too generic? Do you want\nus to cover another list with different tone? Leave your comments below?\nAlso... Psych2Go mission is to make psychology accessible for everyone.\nThat's our number one goal We know the power of psychology and we want\nto help everyone else harness that. However doing this comes at a huge\nexpense.")]