In [None]:
import requests
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

#Extraccion de datos con mensaxe de error
def scrape_website(url):
    response = requests.get(url)
    if response.status_code != 200:
        raise Exception(f"Error al acceder a {url}")
    
    # Limpeza de HTML
    raw_html = response.text
    clean_text = re.sub(r"<[^>]+>", " ", raw_html)  
    clean_text = re.sub(r"\s+", " ", clean_text)  
    return clean_text.strip()

# seleccion de url
url = "https://en.wikipedia.org/wiki/Epipremnum_aureum"  
#url = "https://www.thesill.com/blogs/plants-101/how-to-care-for-monstera-monstera-deliciosa?srsltid=AfmBOoq-9qJ59qcOX1ixCMkuX3lrxNwjGdQICWIJQiY_yNC_Irj3NWjo" 
data = scrape_website(url)


In [None]:
# Facemos unha division de fragmentos co contido
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = text_splitter.create_documents([data])



In [None]:
# Xeramos embeddings 
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
vector_store = Chroma.from_documents(documents, embeddings, persist_directory="./chroma_web_data")

In [None]:
# gardamos o vector store 
vector_store.persist()
print("Vector store creado y guardado en ./chroma_web_data")

Vector store creado y guardado en ./chroma_web_data


  vector_store.persist()


In [None]:
# Consulta de proba
query = "How to care for potho?"  
results = vector_store.similarity_search(query, k=3)

print("\nResultados encontrados:")
for i, res in enumerate(results):
    print(f"Resultado {i+1}:")
    print(res.page_content)


Resultados encontrados:
Resultado 1:
countries, it is found in many parks and gardens, and tends to grow naturally. As an indoor plant it can reach more than 2 metres (2&#160;yd) in height if given the adequate support (a trellis or moss pole to climb), but hardly develops adult-sized leaves. The best results are achieved by providing indirect light; it tolerates an intense luminosity, but long periods of direct sunlight burn the leaves. It lives well with a temperature between 17 and 30&#160;°C (63 and 86&#160;°F). Generally, the plant will only need watering when the soil feels dry to the touch (typically once every one to two weeks). A diluted liquid fertilizer once every 1-2 weeks and it must be replanted every two years, or when it becomes too rootbound. &#91; citation needed &#93; However, it is a very robust plant, and will survive bad growing conditions. The plant grows rapidly in hydroponic culture. &#91; 11 &#93; It can be cultivated from a cutting, however, this can carry v

In [9]:

query = "What are your roots like?"  
results = vector_store.similarity_search(query, k=3)

print("\nResultados encontrados:")
for i, res in enumerate(results):
    print(f"Resultado {i+1}:")
    print(res.page_content)


Resultados encontrados:
Resultado 1:
countries, it is found in many parks and gardens, and tends to grow naturally. As an indoor plant it can reach more than 2 metres (2&#160;yd) in height if given the adequate support (a trellis or moss pole to climb), but hardly develops adult-sized leaves. The best results are achieved by providing indirect light; it tolerates an intense luminosity, but long periods of direct sunlight burn the leaves. It lives well with a temperature between 17 and 30&#160;°C (63 and 86&#160;°F). Generally, the plant will only need watering when the soil feels dry to the touch (typically once every one to two weeks). A diluted liquid fertilizer once every 1-2 weeks and it must be replanted every two years, or when it becomes too rootbound. &#91; citation needed &#93; However, it is a very robust plant, and will survive bad growing conditions. The plant grows rapidly in hydroponic culture. &#91; 11 &#93; It can be cultivated from a cutting, however, this can carry v