In [1]:
import os
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings

  from .autonotebook import tqdm as notebook_tqdm
USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
DIMENSION = 1024

In [3]:
INDEX_NAME = "epilepsynexus"

In [4]:
import sys
from pathlib import Path

In [5]:
PROJECT_ROOT = Path.cwd().parent
sys.path.append(str(PROJECT_ROOT))

In [6]:
from config import PINECONE_API_KEY

In [7]:
pc = Pinecone()

In [8]:
if INDEX_NAME not in [i.name for i in pc.list_indexes()]:
    pc.create_index(
        name=INDEX_NAME,
        dimension=DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

In [9]:
urls = [
    "https://pmc.ncbi.nlm.nih.gov/articles/PMC1783497/",
    "https://pubmed.ncbi.nlm.nih.gov/26912639/",
    "https://pubmed.ncbi.nlm.nih.gov/24094844/",
    "https://pubmed.ncbi.nlm.nih.gov/37620075/",
    "https://pubmed.ncbi.nlm.nih.gov/30978637/",
    "https://share.google/hNFYx9sZC1Gx0rPiK",
    "https://share.google/X0nG4JRpwEgd1k7pX",
    "https://share.google/PaFsAJSFeRszylrEm",
    "https://share.google/ByE78SztnWe5UgX5I",
    "https://share.google/Ss1RJu0C365r0gyiJ",
    "https://share.google/NkalwyHC1Ej80D2Yz",
    "https://share.google/ExLiHFFywxwUbCb9R",
    "https://share.google/VCg0Xy5j9WeJi3BoX",
    "https://share.google/qYgr7AlKFx1nCpkUh",
    "https://share.google/GVSTkIr3al7MeuVPK",
]


In [10]:
loader = WebBaseLoader(urls)
documents = loader.load()

In [11]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)

In [12]:
docs = splitter.split_documents(documents)

In [13]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore

embedding_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-large-en-v1.5",
    encode_kwargs={"normalize_embeddings": True}
)

vectorstore = PineconeVectorStore(
    index_name=INDEX_NAME,
    embedding=embedding_model
)


  embedding_model = HuggingFaceEmbeddings(
Loading weights: 100%|██████████| 391/391 [00:00<00:00, 487.13it/s, Materializing param=pooler.dense.weight]                               
[1mBertModel LOAD REPORT[0m from: BAAI/bge-large-en-v1.5
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [14]:
vectorstore.add_documents(docs)

['8d4e69c3-c004-44fa-b1d4-46bfb18da6f0',
 '1b07f3cd-2274-4a3e-a52d-8d1ff9fcd8e5',
 '8a2f1966-bc4f-401b-99ed-236d54533800',
 '1793a3ca-7103-4a9a-a989-19e4782c1d04',
 'ee010728-b0d3-4907-b6d2-4dbb0925cd32',
 'a60c2c33-7b9b-445d-854c-6c9fcfe476d6',
 'd0516cbb-f008-494f-b25a-12a6f17d7962',
 '0bd682e0-ad48-45b9-9489-97df6654775e',
 'ea301180-3f50-48b0-b044-768d3cb52306',
 '79585685-f0be-4d22-b5e6-e5dd753627fd',
 '08b48ad6-77c5-44b9-b165-bf4411ac870c',
 '7f033e16-2971-4215-925a-256c95a17cb5',
 '82b917b4-d140-4ab6-8186-ad5b2547872c',
 '957b4a27-1a0f-4c2d-92c6-2fa30704276b',
 '56308034-8017-48b7-8943-3297147db231',
 'f6dd5e2a-1a86-433c-a4bf-2653aa2c623d',
 '70d5cac1-5f8b-4e16-9443-536631803486',
 'dc1b5859-e8da-4196-982a-3aaced1226d1',
 '75ad4d90-d48b-46b7-affb-5edf1e781f0d',
 'b0298c9b-b542-4173-8c45-e11fb4ae9902',
 'dfab5620-7d74-441b-9184-3fbe4ddde356',
 '94532158-d246-4723-be02-28f94afe796d',
 '016f2599-b723-49cf-9280-3757533eb29b',
 '28190311-e0c2-4074-9fe1-5bda5934b6a9',
 'ad2318d9-f143-