Method for pushing all chunks to Pinecone (intended to be run only once)

In [13]:
!pip install requests beautifulsoup4 langchain pinecone sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-5.1.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.56.1-py3-none-any.whl.metadata (42 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.8.0-cp313-none-macosx_11_0_arm64.whl.metadata (30 kB)
Collecting scikit-learn (from sentence-transformers)
  Downloading scikit_learn-1.7.2-cp313-cp313-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting scipy (from sentence-transformers)
  Downloading scipy-1.16.2-cp313-cp313-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.34.4-py3-none-any.whl.metadata (14 kB)
Collecting Pillow (from sentence-transformers)
  Downloading pillow-11.3.0-cp313-cp313-macosx_11_0_arm64.whl.metadata (9.0 kB)
Collecting filelock (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading filelock-3

In [14]:
import requests
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer

In [3]:
def fetch_sitemap(url, timeout=10):
    """Download a sitemap and return the URLs listed in its ``<loc>`` elements.

    Args:
        url: Address of the sitemap XML document.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of URL strings in document order, with surrounding
        whitespace removed. Empty ``<loc>`` elements are skipped.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        xml.etree.ElementTree.ParseError: If the body is not well-formed XML.
    """
    resp = requests.get(url, timeout=timeout)
    # Fail loudly on an error response instead of trying to parse an error page.
    resp.raise_for_status()
    root = ET.fromstring(resp.content)

    urls = []
    for elem in root.iter():
        # ElementTree spells namespaced tags as "{namespace}loc"; compare the
        # local name exactly so tags that merely contain "loc" are not matched.
        local_name = elem.tag.rsplit("}", 1)[-1]
        if local_name != "loc":
            continue
        text = (elem.text or "").strip()
        if text:
            urls.append(text)
    return urls

# Collect page URLs from both Atlan sitemaps: product docs and developer docs.
docs_urls = fetch_sitemap("https://docs.atlan.com/sitemap.xml")
dev_urls = fetch_sitemap("https://developer.atlan.com/sitemap.xml")

# Single crawl queue: docs pages first, followed by developer pages.
all_urls = [*docs_urls, *dev_urls]
print(f"✅ Found {len(all_urls)} pages to crawl")


✅ Found 1725 pages to crawl


In [5]:
from concurrent.futures import ThreadPoolExecutor

def fetch_page_text(url):
    """Fetch one page and return its visible text.

    Script, style, nav and footer elements are removed before the text is
    extracted.

    Args:
        url: Address of the page to download.

    Returns:
        A ``(url, text)`` tuple. ``text`` is the empty string when the page
        could not be fetched or parsed, or when the server answered with an
        HTTP error status; the failure is reported on stdout. Best-effort by
        design: this function never raises.
    """
    try:
        resp = requests.get(url, timeout=10)
        # Treat 4xx/5xx responses as failures so error pages are not returned
        # as if they were page content.
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")
        for tag in soup(["script", "style", "nav", "footer"]):
            tag.decompose()
        return url, soup.get_text(separator="\n").strip()
    except Exception as exc:
        # Keep the crawl going, but make the skipped page visible.
        print(f"⚠️ Skipping {url}: {exc!r}")
        return url, ""

# Download every page concurrently; executor.map keeps results in the same
# order as all_urls.
with ThreadPoolExecutor(max_workers=15) as executor:
    results = list(executor.map(fetch_page_text, all_urls))

# Keep only the pages that produced some text.
docs = []
for page_url, page_text in results:
    if page_text:
        docs.append({"url": page_url, "content": page_text})

print(f"✅ Crawled {len(docs)} documents")


✅ Crawled 1724 documents


In [6]:
# Split each document into overlapping pieces of at most 1000 characters,
# tagging every piece with the URL of the page it came from.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = [
    {"text": piece, "url": doc["url"]}
    for doc in docs
    for piece in splitter.split_text(doc["content"])
]

print(f"✅ Generated {len(chunks)} chunks")


✅ Generated 15611 chunks


In [22]:
import os
from pinecone import Pinecone, ServerlessSpec
from langchain_openai import OpenAIEmbeddings

# Read the Pinecone key from the environment so no secret is stored in the
# notebook. PINECONE_KEY is the primary name; PINECONE_API_KEY is accepted as
# a fallback.
PINECONE_API_KEY = os.getenv("PINECONE_KEY") or os.getenv("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    # Stop here with a clear message rather than failing later inside the client.
    raise ValueError(
        "❌ Missing Pinecone API key. Please set PINECONE_KEY in your environment."
    )

# One name for the index so the existence check, creation and handle agree.
INDEX_NAME = "atlan-docs"

pc = Pinecone(api_key=PINECONE_API_KEY)
# Sentence Transformer model used to embed the chunks (no OpenAI key needed).
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Create the index only on the first run; later runs reuse it.
if INDEX_NAME not in [idx["name"] for idx in pc.list_indexes()]:
    pc.create_index(
        name=INDEX_NAME,
        dimension=384,  # vector size produced by 'all-MiniLM-L6-v2'
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

index = pc.Index(INDEX_NAME)

In [23]:
# Embed the chunks and upload them to Pinecone, batch_size chunks per request.
batch_size = 50
for start in range(0, len(chunks), batch_size):
    batch = chunks[start:start + batch_size]

    # encode() embeds the whole batch in one call; .tolist() turns the result
    # into plain Python lists for the upsert payload.
    vectors = embedder.encode([item["text"] for item in batch]).tolist()

    # One (id, values, metadata) tuple per chunk; the id is the chunk's
    # position in `chunks`.
    records = [
        (
            f"doc-{start + offset}",
            vector,
            {"text": item["text"], "url": item["url"]},
        )
        for offset, (item, vector) in enumerate(zip(batch, vectors))
    ]

    index.upsert(vectors=records)

print("🚀 All chunks uploaded to Pinecone!")

🚀 All chunks uploaded to Pinecone!
