In [22]:
#!pip install langchain_openai
#!pip install chromadb
#!pip install transformers
#!pip install faiss-cpu
#!pip install -qU langchain-huggingface
#!pip install -U langchain-community
#!pip install langchain

In [23]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
import pandas as pd
from sentence_transformers import SentenceTransformer

# Fetch the Polaris overview page; `docs` holds the resulting LangChain documents.
loader = WebBaseLoader(web_paths=("https://other-docs.snowflake.com/en/polaris/overview",))
docs = loader.load()

In [24]:
# One row per loaded page: source URL, page title, extracted text.
# `.get` is used for the title because a page without a <title> tag may carry
# no 'title' key in its metadata; such pages get an empty title instead of
# aborting the cell with a KeyError.
data = [
    [doc.metadata['source'], doc.metadata.get('title', ''), doc.page_content]
    for doc in docs
]

df = pd.DataFrame(data, columns=['Page','Page Title','Content'])

In [25]:
import requests
from bs4 import BeautifulSoup

def get_english_links(url, timeout=30):
    """Collect the English-language links found on a web page.

    Downloads ``url``, gathers the ``href`` of every ``<a>`` tag, keeps the
    ones containing ``/en/`` and resolves each of them to an absolute URL.

    Args:
        url: Address of the page to scan. It also serves as the base against
            which relative links are resolved.
        timeout: Seconds to wait for the server before ``requests`` gives up,
            so an unresponsive host cannot hang the notebook indefinitely.

    Returns:
        A set of absolute URLs. The set is empty when the server answers with
        a status code other than 200 (a message is printed in that case).
    """
    # Function-scope import keeps this cell runnable on its own.
    from urllib.parse import urljoin

    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return set()

    soup = BeautifulSoup(response.text, 'html.parser')
    hrefs = {anchor.get('href') for anchor in soup.find_all('a') if anchor.get('href')}

    # urljoin copes with a trailing slash or path on the base URL and with
    # path-relative or protocol-relative hrefs; plain string concatenation
    # would yield malformed URLs in those cases. Absolute hrefs pass through.
    return {urljoin(url, href) for href in hrefs if '/en/' in href}

def load_documents_from_links(links):
    """Load every linked page and summarise the result in a DataFrame.

    Args:
        links: Iterable of page URLs; it is converted to a tuple and handed
            to ``WebBaseLoader``.

    Returns:
        A ``(frame, docs)`` pair. ``frame`` has one row per loaded document
        with the columns ``Page`` (the document's ``source`` metadata),
        ``Page Title`` and ``Content``; ``docs`` is the list returned by the
        loader.
    """
    loader = WebBaseLoader(web_paths=tuple(links))
    docs = loader.load()

    # A page without a <title> tag may carry no 'title' key in its metadata,
    # so fall back to an empty title rather than failing the whole batch.
    rows = [
        [doc.metadata['source'], doc.metadata.get('title', ''), doc.page_content]
        for doc in docs
    ]
    return pd.DataFrame(rows, columns=['Page', 'Page Title', 'Content']), docs


# Scan the documentation landing page for English links, then load every
# linked page into `df` (summary table) and `docs` (LangChain documents).
url = "https://docs.snowflake.com"
english_links = get_english_links(url)
if not english_links:
    print("No English links found or webpage could not be accessed.")
else:
    print(f"Found {len(english_links)} English links. Loading documents...")
    df, docs = load_documents_from_links(english_links)
    print(df.head())

Found 40 English links. Loading documents...
                                                Page  \
0  https://docs.snowflake.com/en/user-guide/data-...   
1  https://docs.snowflake.com/en/user-guide/dynam...   
2  https://docs.snowflake.com/en/release-notes/ne...   
3  https://docs.snowflake.com/en/user-guide-getti...   
4            https://docs.snowflake.com/en/reference   

                                          Page Title  \
0  Understanding & using Time Travel | Snowflake ...   
1           Dynamic tables | Snowflake Documentation   
2               What’s New | Snowflake Documentation   
3          Getting Started - Snowflake Documentation   
4                Reference | Snowflake Documentation   

                                             Content  
0  Understanding & using Time Travel | Snowflake ...  
1  Dynamic tables | Snowflake DocumentationDOCUME...  
2  What’s New | Snowflake DocumentationDOCUMENTAT...  
3  Getting Started - Snowflake DocumentationDOCUM...  
4  Ref

In [26]:
import faiss
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain import hub
from uuid import uuid4

# Name the embedding model explicitly so results stay reproducible if the
# library default changes. NOTE(review): this is believed to be the current
# langchain-huggingface default - confirm against the installed version.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Split each page into overlapping chunks before indexing. Sentence-transformer
# models truncate long inputs, so embedding a whole page as one vector mostly
# captures the navigation boilerplate at its top rather than the content.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
doc_chunks = text_splitter.split_documents(docs)

vectorstore = FAISS.from_documents(doc_chunks, embeddings)
retriever = vectorstore.as_retriever()



In [27]:
# Retrieve the indexed entries most similar to the question (library default top-k).
documents = vectorstore.similarity_search(query="What is a warehouse?")

In [28]:
# The loop variable is `doc`, not `docs`: reusing `docs` here would rebind the
# global list of loaded pages to a single result, breaking any re-run of the
# cell that builds the vector store from `docs`.
for doc in documents:
    print(doc.page_content)
    print('------------------------------------------------------------------------------------------------------------------------------------')

Virtual warehouses | Snowflake DocumentationDOCUMENTATION/Getting StartedGuidesDeveloperReferenceReleasesTutorialsPolaris CatalogStatusOverviewSnowflake HorizonConnecting to SnowflakeVirtual warehousesOverviewMulticlusterConsiderationsWorking with warehousesQuery Acceleration ServiceMonitoring loadSnowpark-optimized warehousesDatabases, Tables, & ViewsData TypesData LoadingData UnloadingQueriesData Sharing and CollaborationSnowflake AI & MLAlerts & NotificationsSecurityData GovernancePrivacyOrganizations & AccountsBusiness Continuity & Data RecoveryPerformance OptimizationCost & BillingGuidesVirtual warehouses

Virtual warehouses¶
A virtual warehouse, often referred to simply as a “warehouse”, is a cluster of compute resources in Snowflake. A virtual warehouse is
available in two types:

Standard
Snowpark-optimized

A warehouse provides the required resources, such as CPU, memory, and temporary storage, to
perform the following operations in a Snowflake session:

Executing SQL SELECT s