In [1]:
import os
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings

  from .autonotebook import tqdm as notebook_tqdm
USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
DIMENSION = 1024

In [3]:
INDEX_NAME = "epilepsynexus"

In [4]:
import sys
from pathlib import Path

In [5]:
PROJECT_ROOT = Path.cwd().parent
sys.path.append(str(PROJECT_ROOT))

In [6]:
from config import PINECONE_API_KEY

In [7]:
pc = Pinecone()

In [8]:
if INDEX_NAME not in [i.name for i in pc.list_indexes()]:
    pc.create_index(
        name=INDEX_NAME,
        dimension=DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

In [9]:
urls = [
    "https://share.google/hNFYx9sZC1Gx0rPiK",
    "https://share.google/X0nG4JRpwEgd1k7pX",
    "https://share.google/PaFsAJSFeRszylrEm",
    "https://share.google/ByE78SztnWe5UgX5I",
    "https://share.google/Ss1RJu0C365r0gyiJ",
    "https://share.google/NkalwyHC1Ej80D2Yz",
    "https://share.google/ExLiHFFywxwUbCb9R",
    "https://share.google/VCg0Xy5j9WeJi3BoX",
    "https://share.google/qYgr7AlKFx1nCpkUh",
    "https://share.google/GVSTkIr3al7MeuVPK",
]


In [10]:
loader = WebBaseLoader(urls)
documents = loader.load()

In [11]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)

In [12]:
docs = splitter.split_documents(documents)

In [13]:
EMBED_MODEL = "mxbai-embed-large"

In [14]:
embeddings = OllamaEmbeddings(model=EMBED_MODEL)

  embeddings = OllamaEmbeddings(model=EMBED_MODEL)


In [15]:
vectorstore = PineconeVectorStore(
    index_name=INDEX_NAME,
    embedding=embeddings
)

In [16]:
vectorstore.add_documents(docs)

['5bb89c84-52d4-472f-94db-b30c12f1c312',
 '43d0f346-b4b5-4ebc-904b-4fa4c0d1960a',
 'b3af2b36-6e1d-4754-81bd-cd0ed3adb64e',
 '0d2706ac-b38a-434a-b44d-7a4958383014',
 '486b3928-8f44-413b-8587-cd2277ac4409',
 '1df7dfa6-ec2f-4204-85c4-d6279538ae9f',
 'b6264c35-dd1c-4554-9318-03bd256364b6',
 '431674d9-54b8-48d2-b4f4-3e7d96e4074a',
 '427f094e-ff75-4c99-b3af-18941dc975d0',
 '6b9d16b6-5b66-4c8a-a3fd-d0cf4f329cc7',
 'b4c07333-e144-4db5-91dd-da89af7eac94',
 '9c0ba905-7c03-436b-b338-527fd85d8038',
 '93d11819-c81e-43ff-aaf2-321769753a91',
 '59499f06-90bb-4e0a-9266-ae59a9571247',
 'dd0a6993-2af4-4347-8d28-ebc494056bf7',
 '25e004b3-b0c5-47a0-8043-5b462fe4a352',
 '8d6af262-36ed-48c8-af93-cb0588794459',
 'dc6be8cb-dfd5-492b-9e60-942fc9819fb1',
 '95e5556a-5720-45c1-aa02-1f03037ebfb1',
 '2c58dc30-a045-4293-a2a3-046915409557',
 'd48242a9-1b12-467b-a828-b5feafecbfeb',
 '20582edc-8617-4f0f-8d99-3e40be4773ed',
 '5e51c169-16d1-46de-b3a1-825dbe19e18f',
 '2ee5dd60-dc43-4d40-982f-263bafc356e3',
 '38745bc9-677b-