In [9]:
import os
from uuid import uuid4
from langchain_core.documents import Document
from langchain_chroma import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
load_dotenv()
GEMINI_API_KEY = os.getenv('GEMINI_API_KEY')
embeddings = GoogleGenerativeAIEmbeddings(model='models/embedding-001', google_api_key=GEMINI_API_KEY)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=5)

In [10]:
file_content_1 = '''
    In 2013, the American Medical Association (AMA) designated obesity as a chronic disease.1 In 2014, the American College of Cardiology (ACC), the American Heart Association (AHA), and The Obesity Society (TOS) published clinical practice guidelines for the management of overweight and obesity in adults.2
    In 2016, the American Association of Clinical Endocrinologists (AACE) and American College of Endocrinology (ACE) published evidence-based clinical practice guidelines that built upon the AMA's designation, the AACE's novel diagnostic paradigm that incorporated both body mass index (BMI) and weight-related complications, and the AACE's framework that suggested that weight-related complications rather 
    than a universal weight-loss target determine treatment modality selection.3 These 2 comprehensive documents remain the most accepted guidelines for the treatment of obesity
'''
file_path_1 = 'https://www.fatpeople_1.com'

file_content_2 = '''
    Machine learning is programming computers to optimize a performance criterion using example data or past experience. There is no need to “learn” to calculate payroll Learning is used when: Human expertise does not exist (navigating on Mars), Humans are unable to explain their expertise (speech recognition)
    Solution changes in time (routing on a computer network) Solution needs to be adapted to particular cases (user biometrics)
'''
file_path_2 = 'https://www.dev_2.com'

file_content_3 = '''
    Both guidelines recommend that all adults be screened annually using a body mass index (BMI) measurement (body weight [kg]/height [m2]) to initiate evaluation for overweight and obesity.2,3 The guidelines recommend that although a BMI of at least 25 kg/m2 in any patient prompts further evaluation, a BMI of at least 23 kg/m2
    may herald the need for such evaluation in patients of South Asian, Southeast Asian, and East Asian genetic heredity, as health risks associated with overweight and obesity typically are observed at lower BMIs in these populations.2
'''
file_path_3 = 'https://www.fatpeople.com'

In [11]:
def parse_docs(file_content, file_path):
    chunks = text_splitter.split_text(file_content)
    docs = [Document(page_content=chunk, metadata={"file_path": file_path}) for chunk in chunks]
    print(docs)
    
    uuids = [str(uuid4()) for _ in range(len(docs))]
    persist_directory, collection_name="./chroma_2", "example"

    if os.path.exists(persist_directory):
        vector_store = Chroma(collection_name=collection_name, embedding_function=embeddings, persist_directory=persist_directory)
        vector_store.add_documents(documents=docs, ids=uuids)
        return vector_store
    else:
        vector_store = Chroma.from_documents(documents=docs, embedding=embeddings, persist_directory=persist_directory, collection_name=collection_name)
        return vector_store


In [14]:
vector_store = parse_docs(file_content_1, file_path_1)

results = vector_store.similarity_search_with_score("Treatment of overweight")
for res, score in results:
    print(res.metadata, f'Score : {score:3f}')

print('--------------------------------------------------------')

results = vector_store.similarity_search_with_score("speech recognition")
for res, score in results:
    print(res.metadata, f'Score : {score:3f}')

[Document(metadata={'file_path': 'https://www.fatpeople_1.com'}, page_content='In 2013, the American Medical Association (AMA) designated obesity as a chronic disease.1 In'), Document(metadata={'file_path': 'https://www.fatpeople_1.com'}, page_content='In 2014, the American College of Cardiology (ACC), the American Heart Association (AHA), and The'), Document(metadata={'file_path': 'https://www.fatpeople_1.com'}, page_content='The Obesity Society (TOS) published clinical practice guidelines for the management of overweight'), Document(metadata={'file_path': 'https://www.fatpeople_1.com'}, page_content='and obesity in adults.2'), Document(metadata={'file_path': 'https://www.fatpeople_1.com'}, page_content='In 2016, the American Association of Clinical Endocrinologists (AACE) and American College of'), Document(metadata={'file_path': 'https://www.fatpeople_1.com'}, page_content='of Endocrinology (ACE) published evidence-based clinical practice guidelines that built upon the'), Document(m

In [15]:
vector_store = parse_docs(file_content_2, file_path_2)

results = vector_store.similarity_search_with_score("Treatment of overweight")
for res, score in results:
    print(res.metadata, f'Score : {score:3f}')

print('--------------------------------------------------------')

results = vector_store.similarity_search_with_score("speech recognition")
for res, score in results:
    print(res.metadata, f'Score : {score:3f}')

[Document(metadata={'file_path': 'https://www.dev_2.com'}, page_content='Machine learning is programming computers to optimize a performance criterion using example'), Document(metadata={'file_path': 'https://www.dev_2.com'}, page_content='data or past experience. There is no need to “learn” to calculate payroll Learning is used when:'), Document(metadata={'file_path': 'https://www.dev_2.com'}, page_content='Human expertise does not exist (navigating on Mars), Humans are unable to explain their expertise'), Document(metadata={'file_path': 'https://www.dev_2.com'}, page_content='(speech recognition)'), Document(metadata={'file_path': 'https://www.dev_2.com'}, page_content='Solution changes in time (routing on a computer network) Solution needs to be adapted to'), Document(metadata={'file_path': 'https://www.dev_2.com'}, page_content='to particular cases (user biometrics)')]
{'file_path': 'https://www.fatpeople_1.com'} Score : 0.567855
{'file_path': 'https://www.fatpeople_1.com'} Score :

In [16]:
vector_store = parse_docs(file_content_3, file_path_3)

results = vector_store.similarity_search_with_score("Treatment of overweight")
for res, score in results:
    print(res.metadata, f'Score : {score:3f}')

print('--------------------------------------------------------')

results = vector_store.similarity_search_with_score("speech recognition")
for res, score in results:
    print(res.metadata, f'Score : {score:3f}')

[Document(metadata={'file_path': 'https://www.fatpeople.com'}, page_content='Both guidelines recommend that all adults be screened annually using a body mass index (BMI)'), Document(metadata={'file_path': 'https://www.fatpeople.com'}, page_content='measurement (body weight [kg]/height [m2]) to initiate evaluation for overweight and obesity.2,3'), Document(metadata={'file_path': 'https://www.fatpeople.com'}, page_content='The guidelines recommend that although a BMI of at least 25 kg/m2 in any patient prompts further'), Document(metadata={'file_path': 'https://www.fatpeople.com'}, page_content='evaluation, a BMI of at least 23 kg/m2'), Document(metadata={'file_path': 'https://www.fatpeople.com'}, page_content='may herald the need for such evaluation in patients of South Asian, Southeast Asian, and East'), Document(metadata={'file_path': 'https://www.fatpeople.com'}, page_content='East Asian genetic heredity, as health risks associated with overweight and obesity typically are'), Documen

In [17]:
results = vector_store.similarity_search_with_score("Treatment of obesity")
for res, score in results:
    print(res.metadata, f'Score : {score:3f}')

{'file_path': 'https://www.fatpeople_1.com'} Score : 0.576373
{'file_path': 'https://www.fatpeople_1.com'} Score : 0.576373
{'file_path': 'https://www.fatpeople_1.com'} Score : 0.591801
{'file_path': 'https://www.fatpeople_1.com'} Score : 0.591801


In [18]:
results = vector_store.similarity_search_with_score("What is the machine learns")
for res, score in results:
    print(res.metadata, f'Score : {score:3f}')

{'file_path': 'https://www.dev_2.com'} Score : 0.629147
{'file_path': 'https://www.dev_2.com'} Score : 0.669617
{'file_path': 'https://www.dev_2.com'} Score : 0.829628
{'file_path': 'https://www.dev_2.com'} Score : 0.833178


In [19]:
results = vector_store.similarity_search_with_score("obesity")
for res, score in results:
    print(res.metadata, f'Score : {score:3f}')

{'file_path': 'https://www.fatpeople_1.com'} Score : 0.590803
{'file_path': 'https://www.fatpeople_1.com'} Score : 0.590803
{'file_path': 'https://www.fatpeople_1.com'} Score : 0.606251
{'file_path': 'https://www.fatpeople_1.com'} Score : 0.606251
