In [66]:
import os
import json

from langchain_community.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain_community.vectorstores import Chroma
from langchain_community.embeddings.ollama import OllamaEmbeddings

import shutil
import pandas as pd 
import numpy as np
from sklearn.preprocessing import Normalizer

In [2]:
with open("./DATA/articles.json","r") as f:
    articles = json.load(f)

In [3]:
if not os.path.exists('chroma'):
    os.mkdir('chroma')

In [4]:
myFile = 'annualreport.pdf'
loader = PyPDFLoader("DATA/" + myFile)

data = loader.load()

In [5]:
text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=100,
        length_function=len
    )

all_splits = text_splitter.split_documents(data)

In [6]:
#if os.path.exists("./chroma"):
#       shutil.rmtree("./chroma")

vectorstore = Chroma.from_documents(
        documents=all_splits,
        embedding=OllamaEmbeddings(model="mistral"),
        persist_directory="./chroma"
    )

#vectorstore.persist()
#print(f"Saved {len(all_splits)} chunks.")

In [7]:
vectorstore.persist()

In [8]:
del vectorstore

In [2]:
vectorstore = Chroma(
    persist_directory="./chroma", 
    embedding_function=OllamaEmbeddings(
        model="mistral",
        base_url='http://localhost:11434'
    )
)

In [3]:
##BD Configurada

list = vectorstore.get(
    where_document={"$contains":"our"}
)
list.keys()

dict_keys(['ids', 'embeddings', 'metadatas', 'documents', 'uris', 'data'])

<h3>Evaluación</h3>

In [6]:
query = 'Who is Haroon Sheikh?'

results = vectorstore.similarity_search_with_relevance_scores(query, k=3)
for i, result in enumerate(results):
    print(f'Contexto: {i+1} \nTexto: {result[0].page_content} score: {result[1]}')
    print('------------------------------------------------')

Contexto: 1 
Texto: facility and post completion, CareTech will 
own 83% of Smartbox with the remaining 
minority ownership held by the Smartbox 
management team. Directors’ Report continued
Going concern
The Group’s business activities together 
with the factors likely to affect its future 
development, performance and position are 
set out in the Group Executive Chairman’s 
Statement, Group Chief Executive’s Statement 
and Performance Review on pages 14 to 17 
and pages 20 to 25 and Viability Statement on score: -118106.93517615447
------------------------------------------------
Contexto: 2 
Texto: Payment of charitable donations 6 (1,203) (702)
COVID-19 receipts 6 2,692 2,550
COVID-19 payments 6 (4,220) (3,420)
Payment of acquisition costs 6 (759) (545)
Cash inflows from operating activities 89,830 88,310Company Number  
04457287
Registered Office  
5th Floor  
Metropolitan House  
3 Darkes Lane  
Potters Bar  
Herts  
EN6 1AG
Directors  
Farouq Sheikh OBE (Group Executive Chairman

In [28]:
query = 'Who is Haroon Sheikh?'
emb_function = OllamaEmbeddings(model="mistral",base_url='http://localhost:11434')
emb_query = emb_function.embed_query(query)
results = vectorstore.similarity_search_by_vector(emb_query, k=3)

for i, result in enumerate(results):
    print(f'Contexto: {i+1} \nTexto: {result.page_content}')
    print('------------------------------------------------')


#print la distancia 

#crear una funcion similarity con otras distancias p.e manhattam

Contexto: 1 
Texto: facility and post completion, CareTech will 
own 83% of Smartbox with the remaining 
minority ownership held by the Smartbox 
management team. Directors’ Report continued
Going concern
The Group’s business activities together 
with the factors likely to affect its future 
development, performance and position are 
set out in the Group Executive Chairman’s 
Statement, Group Chief Executive’s Statement 
and Performance Review on pages 14 to 17 
and pages 20 to 25 and Viability Statement on
------------------------------------------------
Contexto: 2 
Texto: Payment of charitable donations 6 (1,203) (702)
COVID-19 receipts 6 2,692 2,550
COVID-19 payments 6 (4,220) (3,420)
Payment of acquisition costs 6 (759) (545)
Cash inflows from operating activities 89,830 88,310Company Number  
04457287
Registered Office  
5th Floor  
Metropolitan House  
3 Darkes Lane  
Potters Bar  
Herts  
EN6 1AG
Directors  
Farouq Sheikh OBE (Group Executive Chairman)  
Haroon Sheikh (Group Ch

In [93]:
list = vectorstore.get()

df = pd.DataFrame(list)
df.head()

Unnamed: 0,ids,embeddings,metadatas,documents,uris,data
0,0018fe68-09cc-4269-8818-53d6ae5be16d,,"{'page': 10, 'source': 'DATA/annualreport.pdf'}","and laughing together, we had lots of fun. \nI...",,
1,0028702d-30ec-41f9-b19d-15c2d29e96d2,,"{'page': 48, 'source': 'DATA/annualreport.pdf'}",The Company chooses to adopt best practice and...,,
2,0071c1cd-a529-4dc9-bc31-d731d5321ad3,,"{'page': 74, 'source': 'DATA/annualreport.pdf'}",Notes to the Financial Statements\ncontinued\n...,,
3,0090383f-f6f3-42f2-8f7d-e905ce90ce42,,"{'page': 50, 'source': 'DATA/annualreport.pdf'}",or share-based incentives at CareTech. \nThe a...,,
4,0097fdbb-6e62-4c8a-9acd-a9807f40260b,,"{'page': 52, 'source': 'DATA/annualreport.pdf'}",Republic of Ireland’ (United Kingdom Generally...,,


In [94]:
#df = df.drop('embeddings', axis = 1)
df['embeddings'] = df['documents'].apply(lambda x: emb_function.embed_query(x))
df

Unnamed: 0,ids,embeddings,metadatas,documents,uris,data
0,0018fe68-09cc-4269-8818-53d6ae5be16d,"[-3.0727579593658447, 3.9073636531829834, -0.6...","{'page': 10, 'source': 'DATA/annualreport.pdf'}","and laughing together, we had lots of fun. \nI...",,
1,0028702d-30ec-41f9-b19d-15c2d29e96d2,"[2.2551050186157227, 11.302189826965332, -2.66...","{'page': 48, 'source': 'DATA/annualreport.pdf'}",The Company chooses to adopt best practice and...,,
2,0071c1cd-a529-4dc9-bc31-d731d5321ad3,"[0.3182947039604187, 3.484951972961426, -6.514...","{'page': 74, 'source': 'DATA/annualreport.pdf'}",Notes to the Financial Statements\ncontinued\n...,,
3,0090383f-f6f3-42f2-8f7d-e905ce90ce42,"[-4.6020331382751465, 9.524608612060547, -0.74...","{'page': 50, 'source': 'DATA/annualreport.pdf'}",or share-based incentives at CareTech. \nThe a...,,
4,0097fdbb-6e62-4c8a-9acd-a9807f40260b,"[1.6911152601242065, 8.12396240234375, -6.7498...","{'page': 52, 'source': 'DATA/annualreport.pdf'}",Republic of Ireland’ (United Kingdom Generally...,,
...,...,...,...,...,...,...
1223,fed8a64a-16ae-4c4e-b258-5d25dab73241,"[3.29048228263855, 3.9620821475982666, -6.1873...","{'page': 67, 'source': 'DATA/annualreport.pdf'}","Total consideration 12,028\nReconciliation to ...",,
1224,ff338569-ab29-44bf-9d97-5f7e3c85ea46,"[-5.809580326080322, 7.341104507446289, -4.161...","{'page': 43, 'source': 'DATA/annualreport.pdf'}",structure for Senior Management does not \nrai...,,
1225,ff39671c-6b25-4fa5-8514-3fbfb1ba114a,"[-7.793369770050049, 5.67549991607666, -0.2743...","{'page': 33, 'source': 'DATA/annualreport.pdf'}",September 2021 September 2020\nRevenue £169.7m...,,
1226,ffd10b6d-5c3f-43da-aec5-738410890ec6,"[-1.8635765314102173, 8.455653190612793, -5.07...","{'page': 44, 'source': 'DATA/annualreport.pdf'}",Ofsted but with a continued focus on services ...,,


In [95]:
#comprobación de la función similarity by vector
df = df.drop(['uris','data'], axis = 1)
df_preg = pd.DataFrame({
    'ids':[0],
    'embeddings':[emb_query],
    'metadatas': [None],
    'documents':[query]}
, index = [0])
df = pd.concat([df_preg,df], ignore_index= True)
df

Unnamed: 0,ids,embeddings,metadatas,documents
0,0,"[-1.3138318061828613, 1.75663423538208, -7.050...",,Who is Haroon Sheikh?
1,0018fe68-09cc-4269-8818-53d6ae5be16d,"[-3.0727579593658447, 3.9073636531829834, -0.6...","{'page': 10, 'source': 'DATA/annualreport.pdf'}","and laughing together, we had lots of fun. \nI..."
2,0028702d-30ec-41f9-b19d-15c2d29e96d2,"[2.2551050186157227, 11.302189826965332, -2.66...","{'page': 48, 'source': 'DATA/annualreport.pdf'}",The Company chooses to adopt best practice and...
3,0071c1cd-a529-4dc9-bc31-d731d5321ad3,"[0.3182947039604187, 3.484951972961426, -6.514...","{'page': 74, 'source': 'DATA/annualreport.pdf'}",Notes to the Financial Statements\ncontinued\n...
4,0090383f-f6f3-42f2-8f7d-e905ce90ce42,"[-4.6020331382751465, 9.524608612060547, -0.74...","{'page': 50, 'source': 'DATA/annualreport.pdf'}",or share-based incentives at CareTech. \nThe a...
...,...,...,...,...
1224,fed8a64a-16ae-4c4e-b258-5d25dab73241,"[3.29048228263855, 3.9620821475982666, -6.1873...","{'page': 67, 'source': 'DATA/annualreport.pdf'}","Total consideration 12,028\nReconciliation to ..."
1225,ff338569-ab29-44bf-9d97-5f7e3c85ea46,"[-5.809580326080322, 7.341104507446289, -4.161...","{'page': 43, 'source': 'DATA/annualreport.pdf'}",structure for Senior Management does not \nrai...
1226,ff39671c-6b25-4fa5-8514-3fbfb1ba114a,"[-7.793369770050049, 5.67549991607666, -0.2743...","{'page': 33, 'source': 'DATA/annualreport.pdf'}",September 2021 September 2020\nRevenue £169.7m...
1227,ffd10b6d-5c3f-43da-aec5-738410890ec6,"[-1.8635765314102173, 8.455653190612793, -5.07...","{'page': 44, 'source': 'DATA/annualreport.pdf'}",Ofsted but with a continued focus on services ...


In [96]:
print(np.linalg.norm(df.loc[0,'embeddings'], ord=2))
#No están normalizados
#para normalizar: 
normalizer = Normalizer(norm='l2')
normalizer.transform(np.array([df.loc[0,'embeddings']]))
print(np.linalg.norm(normalizer.transform([df.loc[0,'embeddings']]), ord=2))

df['embeddings'] = df['embeddings'].apply(lambda x: normalizer.transform([x]))
df['embeddings'] = df['embeddings'].apply(lambda x: x[0])

359.7710869201969
1.0000000000000002


In [105]:
#df.loc[0,'embeddings'] @ df.loc[1,'embeddings']
def manhattan (x,y):
    return np.sum(np.abs(y-x))

df['distancia_cos'] = df['embeddings'].apply(lambda x: x @ df.loc[0,'embeddings'])
df['distancia_manhattan'] = df['embeddings'].apply(lambda x: manhattan(x,df.loc[0,'embeddings']))
df.head()


Unnamed: 0,ids,embeddings,metadatas,documents,distancia_cos,distancia_manhattan
0,0,"[-0.003651854898707551, 0.004882644268108543, ...",,Who is Haroon Sheikh?,1.0,0.0
1,0018fe68-09cc-4269-8818-53d6ae5be16d,"[-0.008320604022585365, 0.010580600932553738, ...","{'page': 10, 'source': 'DATA/annualreport.pdf'}","and laughing together, we had lots of fun. \nI...",0.0832,64.315459
2,0028702d-30ec-41f9-b19d-15c2d29e96d2,"[0.005828175486030364, 0.029209790738887877, -...","{'page': 48, 'source': 'DATA/annualreport.pdf'}",The Company chooses to adopt best practice and...,0.242487,59.325164
3,0071c1cd-a529-4dc9-bc31-d731d5321ad3,"[0.0008586101045728767, 0.009400768975119227, ...","{'page': 74, 'source': 'DATA/annualreport.pdf'}",Notes to the Financial Statements\ncontinued\n...,0.150852,62.845658
4,0090383f-f6f3-42f2-8f7d-e905ce90ce42,"[-0.012550785079811538, 0.025975761596557814, ...","{'page': 50, 'source': 'DATA/annualreport.pdf'}",or share-based incentives at CareTech. \nThe a...,0.202389,61.453798


In [116]:
#Imprimimos 3 contextos con la distancia del coseno y comprobamos si la función de chroma anda correctamente.
#imprimimos los 3 contextos con la distancia de manhattan y vemos que similitudes encontramos 
print(query)
df.sort_values('distancia_cos', ascending = False).head(4)
df.sort_values('distancia_manhattan', ascending= True).head(4)

Who is Haroon Sheikh?


Unnamed: 0,ids,embeddings,metadatas,documents,distancia_cos,distancia_manhattan
0,0,"[-0.003651854898707551, 0.004882644268108543, ...",,Who is Haroon Sheikh?,1.0,0.0
631,7ea41b98-d1ec-442c-8bbc-b53643c0a715,"[-0.01006285705701088, 0.013242857575061223, -...","{'page': 25, 'source': 'DATA/annualreport.pdf'}",From the first time we meet each person \nwe s...,0.43179,50.867026
762,9b1164da-96a2-4e50-842a-c272dcc6f04f,"[-0.00015228931168351827, 0.009755297781555781...","{'page': 13, 'source': 'DATA/annualreport.pdf'}",Next stop in the region for our growth is \nSa...,0.43713,51.782283
532,6b5f4109-2abd-4cac-ae8d-29b3597b0fe2,"[0.005702718461785943, 0.013448456702034843, -...","{'page': 47, 'source': 'DATA/annualreport.pdf'}",CareTech has 16 years on the public markets \n...,0.401265,52.606696
