In [1]:
import os
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings

  from .autonotebook import tqdm as notebook_tqdm
USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
DIMENSION = 1024

In [3]:
INDEX_NAME = "epilepsynexus"

In [4]:
import sys
from pathlib import Path

In [5]:
PROJECT_ROOT = Path.cwd().parent
sys.path.append(str(PROJECT_ROOT))

In [6]:
from config import PINECONE_API_KEY

In [7]:
pc = Pinecone()

In [8]:
if INDEX_NAME not in [i.name for i in pc.list_indexes()]:
    pc.create_index(
        name=INDEX_NAME,
        dimension=DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

In [9]:
urls = [
    "https://share.google/hNFYx9sZC1Gx0rPiK",
    "https://share.google/X0nG4JRpwEgd1k7pX",
    "https://share.google/PaFsAJSFeRszylrEm",
    "https://share.google/ByE78SztnWe5UgX5I",
    "https://share.google/Ss1RJu0C365r0gyiJ",
    "https://share.google/NkalwyHC1Ej80D2Yz",
    "https://share.google/ExLiHFFywxwUbCb9R",
    "https://share.google/VCg0Xy5j9WeJi3BoX",
    "https://share.google/qYgr7AlKFx1nCpkUh",
    "https://share.google/GVSTkIr3al7MeuVPK",
]


In [10]:
loader = WebBaseLoader(urls)
documents = loader.load()

In [11]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)

In [12]:
docs = splitter.split_documents(documents)

In [13]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore

embedding_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-large-en-v1.5",
    encode_kwargs={"normalize_embeddings": True}
)

vectorstore = PineconeVectorStore(
    index_name=INDEX_NAME,
    embedding=embedding_model
)


  embedding_model = HuggingFaceEmbeddings(
Loading weights: 100%|██████████| 391/391 [00:01<00:00, 222.06it/s, Materializing param=pooler.dense.weight]                               
[1mBertModel LOAD REPORT[0m from: BAAI/bge-large-en-v1.5
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [14]:
vectorstore.add_documents(docs)

['39e67c6a-bfd5-437a-a9ed-34b4f69ef393',
 '71e68b3f-6290-4ac9-bf55-a1fdf0e4cba5',
 'dc3f0d6c-b39c-49b1-bce4-77163fa6e1eb',
 'ecd0158b-9ec7-472b-8453-6b46be8a4b2a',
 '8daf7a90-e0da-44e2-8f88-d8656626c616',
 '18080aeb-6257-4536-ac01-0e5bb82f9fcd',
 'fb8f539e-2493-42ea-a820-0b3ddb1aae71',
 'e2c6b45d-a399-4519-9b17-54e8fd99dd95',
 '312430d8-c62c-413a-a059-81145e811ac0',
 '12fbfefc-1e40-42e6-b201-2e913ca53351',
 '11a559ac-6ee5-4052-8e2a-accd333348f0',
 'e95aa26c-76ad-42c6-ab81-9fe63d1d9e95',
 '1525beb8-0990-4c79-83c8-db602dadedc3',
 '6b071f6e-bc1d-4005-822f-2aefecd70ed1',
 'abf5bfdc-eac9-4c8a-ae76-376d85719d39',
 'f1bbd9de-8903-4e8b-a3b3-efd9a7153540',
 '6af8704f-b67e-4406-9126-a6a935db96a5',
 'a17d789a-05c4-487e-b51e-694dbeaa1975',
 'd901f5ea-b8e6-454e-9ac9-449a5a4bc88e',
 'f8f80f81-94b3-408b-aa3d-f71e12ff378e',
 '9d383cbe-af15-45bf-9835-569b5f96e30d',
 'a4f1134a-e6cb-4e8a-9c30-ec8aa9499eb9',
 '2e43b465-2d99-4126-818f-ff6f6ff3e5c3',
 '6590abc8-9e09-4417-8dfe-8a67cbecd030',
 'fd4fe8ac-53d7-