In [1]:
import langchain

In [29]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
#from langchain.vectorstores import Pinecone

In [33]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
from pinecone import Pinecone
import uuid

In [27]:
def load_pdf(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``; files are
            selected with the glob pattern ``*.pdf``.

    Returns:
        The value returned by ``DirectoryLoader.load()``, with every matched
        file read through ``PyPDFLoader``.
    """
    return DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader).load()
# Run the loader on the local "data/" directory; the chunking cell reads extracted_data.
extracted_data = load_pdf("data/")

In [28]:
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` forwarded to the splitter. Defaults to 500,
            the value that used to be hard-coded here.
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter. Defaults
            to 20, the value that used to be hard-coded here.

    Returns:
        The chunks returned by ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)
# Chunk the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data)
print("length of my chunk:", len(text_chunks))

length of my chunk: 7020


In [16]:
import os
import warnings

# Never hard-code the API key in the notebook: read it from the environment so
# it cannot leak through the saved file or version control.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    warnings.warn(
        "PINECONE_API_KEY is not set; export it before creating the Pinecone client."
    )

# Region passed to ServerlessSpec when the index is created.
PINECONE_API_ENV = "us-east-1"

In [23]:
def download_hugging_face_embeddings():
    """Create the sentence-transformers embedding model used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

# Instantiate the embedding model once; `embeddings` is reused by the sanity
# check, the ingestion step and the retrieval step further down.
embeddings = download_hugging_face_embeddings()



In [24]:
# Sanity check: embed a short string and print the vector length. The value
# printed here must equal the `dimension` the Pinecone index is created with.
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [34]:
# Pinecone client handle; every index operation below goes through `pc`.
pc = Pinecone(api_key=PINECONE_API_KEY)

In [35]:
# Describe the serverless deployment (AWS, in the configured region) that the
# index-creation call receives as its `spec` argument.
from pinecone import ServerlessSpec

spec = ServerlessSpec(
    cloud="aws",
    region=PINECONE_API_ENV,
)

In [36]:
import time

index_name = "medical-chatbot"

# Vector size the index is created with. It must equal the length of the
# vectors produced by the embedding model (the embed_query sanity check in this
# notebook printed a length of 384).
EMBEDDING_DIMENSION = 384

# Upper bound, in seconds, on how long to poll for a freshly created index.
INDEX_READY_TIMEOUT_S = 300

# Create the index only when it is missing, so re-running this cell is safe.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        index_name,
        dimension=EMBEDDING_DIMENSION,
        metric='cosine',
        spec=spec
    )
    # Poll until the index reports ready, but give up after the timeout rather
    # than hanging the kernel forever if provisioning never completes.
    deadline = time.monotonic() + INDEX_READY_TIMEOUT_S
    while not pc.describe_index(index_name).status['ready']:
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Pinecone index {index_name!r} was not ready after "
                f"{INDEX_READY_TIMEOUT_S} seconds"
            )
        time.sleep(1)

# Connect to the index; `index` is the handle used by the ingestion step.
index = pc.Index(index_name)
# Short pause kept from the original flow before requesting stats
# (NOTE(review): presumably lets a new index settle — confirm it is needed).
time.sleep(1)
# Last expression of the cell: display the index stats.
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [38]:
def store_embeddings_in_pinecone(text_chunks, embeddings, batch_size=100, pinecone_index=None):
    """Embed text chunks and upsert them into a Pinecone index in batches.

    The chunks are processed ``batch_size`` at a time: one ``embed_documents``
    call and one ``upsert`` request per batch, instead of one of each per
    chunk. Every record is stored as ``(id, vector, {"text": chunk_text})``.

    Args:
        text_chunks: Iterable of objects exposing a ``page_content`` string.
        embeddings: Object exposing ``embed_documents(list_of_texts)`` that
            returns one vector per input text.
        batch_size: Number of chunks embedded and upserted per request.
            Defaults to 100.
        pinecone_index: Index object exposing ``upsert``. When omitted, the
            notebook-level ``index`` variable is used, as before.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    Note:
        IDs are random (``uuid4``), so calling this twice on the same chunks
        stores them twice.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Resolve the target lazily so the global is only required when no
    # explicit index was supplied.
    target_index = index if pinecone_index is None else pinecone_index

    # Materialize once so generators and other one-shot iterables can be sliced.
    chunks = list(text_chunks)

    for start in range(0, len(chunks), batch_size):
        batch_texts = [chunk.page_content for chunk in chunks[start:start + batch_size]]
        vectors = embeddings.embed_documents(batch_texts)

        # One record per chunk: unique ID, embedding, and the raw text as metadata.
        records = [
            (str(uuid.uuid4()), vector, {"text": text})
            for text, vector in zip(batch_texts, vectors)
        ]
        target_index.upsert(records)
# Embed every chunk and write it to the Pinecone index, then report completion.
store_embeddings_in_pinecone(text_chunks, embeddings)
print("Embeddings and chunks stored in Pinecone.")

In [None]:
# If the index already exists, wrap it in a LangChain vector store instead of
# re-ingesting. In this notebook the bare name `Pinecone` is the client class
# imported from the `pinecone` package, which has no `from_existing_index`, so
# the LangChain vector store is imported under its own alias to avoid the clash.
from langchain.vectorstores import Pinecone as PineconeVectorStore

# NOTE(review): depending on the installed langchain/pinecone versions,
# from_existing_index may build its own client from the PINECONE_API_KEY
# environment variable — confirm it is exported in the kernel's environment.
# NOTE(review): the vector store is expected to read chunk text from the
# "text" metadata key, which is the key used at ingestion — confirm the default.
docsearch = PineconeVectorStore.from_existing_index(index_name, embeddings)

query = "What are Allergies"

# Retrieve the 3 chunks most similar to the query.
docs = docsearch.similarity_search(query, k=3)

print("Result", docs)