In [1]:
!pip install langchain beautifulsoup4 requests chromadb

### Scrape the Website

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def scrape_page(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.get_text()

def get_all_links(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    links = set()
    for a_tag in soup.find_all('a', href=True):
        link = urljoin(url, a_tag['href'])
        links.add(link)
    return links

def scrape_website(base_url):
    visited = set()
    to_visit = {base_url}
    all_texts = []

    while to_visit:
        url = to_visit.pop()
        if url not in visited:
            visited.add(url)
            print(f"Scraping {url}")
            try:
                text = scrape_page(url)
                all_texts.append(text)
                links = get_all_links(url)
                to_visit.update(links - visited)
            except Exception as e:
                print(f"Error scraping {url}: {e}")

    return all_texts

base_url = "https://example.com"
all_texts = scrape_website(base_url)

### Chunk the Text

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(all_texts)

NameError: name 'all_texts' is not defined

### Generate Embeddings

In [7]:
from langchain.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()
embedded_texts = embeddings.embed_documents(texts)

  embeddings = OpenAIEmbeddings()


NameError: name 'texts' is not defined

### Store in Vector Database

In [None]:
import chromadb
from chromadb.config import Settings

client = chromadb.Client(Settings(
    chroma_db_impl="duckdb+parquet",
    persist_directory="./chroma_db"
))

collection = client.create_collection(name="website_data")

for i, text in enumerate(texts):
    collection.add(
        documents=[text],
        metadatas=[{"source": f"page_{i}"}],
        ids=[str(i)]
    )

### Query the Vector Database

In [None]:
query = "What is the main topic of the website?"
query_embedding = embeddings.embed_query(query)

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=5
)

for result in results['documents']:
    print(result)

### LangChain for Advanced Pipelines

In [8]:
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=collection.as_retriever()
)

response = qa.run("What is the main topic of the website?")
print(response)

  llm=OpenAI(),


NameError: name 'collection' is not defined