In [185]:
import os 
from dotenv import load_dotenv
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import pinecone
from langchain.chains import RetrievalQA
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers

# Step 1 of the Architecture: Data Integration

In [186]:
# Extract data pdf 
def load_pdf(data):
    """Load the PDF files under the ``data`` directory.

    A ``DirectoryLoader`` is pointed at ``data``, limited to files matching
    the ``*.pdf`` glob, and told to parse each match with ``PyPDFLoader``.

    Args:
        data: Path of the directory to scan.

    Returns:
        Whatever the loader's ``load()`` call returns (the notebook output
        shows a list of ``Document`` objects carrying ``source`` and ``page``
        metadata).
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [187]:
# Extracted data 
extracted_data = load_pdf("data/")

In [188]:
extracted_data[:3]

[Document(page_content='', metadata={'source': 'data/Medical_book.pdf', 'page': 0}),
 Document(page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION', metadata={'source': 'data/Medical_book.pdf', 'page': 1}),
 Document(page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION\nJACQUELINE L. LONGE, EDITOR\nDEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR\nVOLUME\nA-B1', metadata={'source': 'data/Medical_book.pdf', 'page': 2})]

In [189]:
print(extracted_data[5].page_content)

The Gale Encyclopedia of Medicine 2 is a medical ref-
erence product designed to inform and educate readersabout a wide variety of disorders, conditions, treatments,and diagnostic tests. The Gale Group believes the productto be comprehensive, but not necessarily definitive. It isintended to supplement, not replace, consultation with aphysician or other healthcare practitioner. While the GaleGroup has made substantial efforts to provide informationthat is accurate, comprehensive, and up-to-date, the GaleGroup makes no representations or warranties of anykind, including without limitation, warranties of mer-
chantability or fitness for a particular purpose, nor does itguarantee the accuracy, comprehensiveness, or timelinessof the information contained in this product. Readersshould be aware that the universe of medical knowledgeis constantly growing and changing, and that differencesof medical opinion exist among authorities. Readers arealso advised to seek professional diagnosis and tre

In [190]:
print("Length of Documents:", len(extracted_data))

Length of Documents: 637


# Step 2: Chunking Text — Splitting the Corpus into Chunks

In [191]:
# Create text chunks 
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller, slightly overlapping chunks.

    Args:
        extracted_data: Documents to split (as returned by ``load_pdf``).
        chunk_size: Maximum size of each chunk, passed straight to
            ``RecursiveCharacterTextSplitter``. Defaults to 500, the value
            this notebook has always used.
        chunk_overlap: Overlap between consecutive chunks, passed straight
            to ``RecursiveCharacterTextSplitter``. Defaults to 20.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [192]:
text_chunk = text_split(extracted_data)

In [193]:
len(text_chunk)

7020

In [194]:
text_chunk[:5]

[Document(page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION', metadata={'source': 'data/Medical_book.pdf', 'page': 1}),
 Document(page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION\nJACQUELINE L. LONGE, EDITOR\nDEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR\nVOLUME\nA-B1', metadata={'source': 'data/Medical_book.pdf', 'page': 2}),
 Document(page_content='STAFF\nJacqueline L. Longe, Project Editor\nDeirdre S. Blanchfield, Associate Editor\nChristine B. Jeryan, Managing Editor\nDonna Olendorf, Senior Editor\nStacey Blachford, Associate Editor\nKate Kretschmann, Melissa C. McDade, Ryan\nThomason, Assistant Editors\nMark Springer, Technical Specialist\nAndrea Lopeman, Programmer/Analyst\nBarbara J. Yarrow, Manager, Imaging and Multimedia\nContent\nRobyn V . Young, Project Manager, Imaging and\nMultimedia Content\nDean Dauphinais, Senior Editor, Imaging and', metadata={'source': 'data/Medical_book.pdf', 'page': 3}),
 Document(page_content='Multimedia Content\nKelly A. Qu

In [195]:
text_chunk[0].page_content

'TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION'

In [196]:
# Collect the text content of every chunk into a plain list of strings
chunk_texts = [chunk.page_content for chunk in text_chunk]

In [197]:
len(chunk_texts)

7020

In [198]:
chunk_texts[3]

'Multimedia Content\nKelly A. Quin, Editor, Imaging and Multimedia Content\nLeitha Etheridge-Sims, Mary K. Grimes, Dave Oblender,\nImage Catalogers\nPamela A. Reed, Imaging Coordinator\nRandy Bassett, Imaging Supervisor\nRobert Duncan, Senior Imaging Specialist\nDan Newell, Imaging Specialist\nChristine O’Bryan, Graphic Specialist\nMaria Franklin, Permissions Manager\nMargaret A. Chamberlain, Permissions Specialist\nMichelle DiMercurio, Senior Art Director\nMike Logusz, Graphic Artist'

# Step 3: Storing the embeddings to Vector Database

### HuggingFace Transformers library 

In [199]:
# download embedding model 
def download_hugging_face_embeddings():
    """Build the embedding model wrapper used throughout this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured for the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
# download the model 
embeddings = download_hugging_face_embeddings()

In [200]:
print(embeddings)

client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
) model_name='sentence-transformers/all-MiniLM-L6-v2' cache_folder=None model_kwargs={} encode_kwargs={}


In [201]:
# Testing  embedding 
query_result = embeddings.embed_query("hello Abdoul")
print("Length", len(query_result))

Length 384


In [202]:
query_result[:5]

[-0.06750663369894028,
 -0.0077005475759506226,
 -0.0012216464383527637,
 0.06220168620347977,
 -0.059993285685777664]

In [203]:
# Converting the chunk text to a vectors 
embedding_chunk_text = embeddings.embed_documents(chunk_texts)

In [204]:
# Total vector 
len(embedding_chunk_text)

7020

### Pinecone DB 

- Pinecone is a managed vector database service that allows you to index and query large amounts of vector data efficiently.

In [205]:
use_serverless = True 

In [206]:
# Read the Pinecone API key from the environment (a local .env file is loaded first).
load_dotenv()
api_key = os.environ.get('PINECONE_API_KEY')
if not api_key:
    # Fail here with a clear message instead of passing a placeholder string
    # to the client and getting an opaque authentication error later.
    raise RuntimeError(
        "PINECONE_API_KEY is not set; define it in the environment or in a .env file."
    )

In [207]:
# configure client 
# `Pinecone` here is the client class from the `pinecone` package; `pc` is the
# handle the following cells use to list, delete, create and open indexes.
# NOTE(review): a later cell imports `Pinecone` from `langchain.vectorstores`,
# which rebinds this name — this cell must run before that import.
from pinecone import Pinecone 
pc = Pinecone(api_key = api_key)

In [208]:
from pinecone import ServerlessSpec, PodSpec

# Choose the deployment spec passed to `pc.create_index`.
if use_serverless:
    spec = ServerlessSpec(cloud='aws', region='us-east-1')
else:
    # The pod-based spec needs an environment name. It was never defined in
    # this notebook, so read it from the environment and fail clearly if absent.
    environment = os.environ.get('PINECONE_ENVIRONMENT')
    if not environment:
        raise RuntimeError(
            "PINECONE_ENVIRONMENT must be set when use_serverless is False."
        )
    spec = PodSpec(environment=environment)

- Indexing is the process of creating, storing, and managing vector representations of data.
    - Upsert: adds new vectors to the index or updates existing ones (a combination of update and insert).
    - Query: searches the index for the vectors most similar to a given query vector.

In [209]:
# quick start 
index_name = "abdoul"

In [210]:
# Start clean: if an index with this name already exists, drop it first
existing_index_names = pc.list_indexes().names()
if index_name in existing_index_names:
    pc.delete_index(index_name)

In [211]:
# Create the Pinecone index and block until it reports ready.
import time

# Must equal the embedding length; the embedding test cell above printed 384.
dimension = 384
pc.create_index(
    name=index_name,
    dimension=dimension,
    metric="cosine",
    spec=spec,
)

# Poll once per second until the index is ready, but give up after a deadline
# so a failed provisioning does not hang the kernel indefinitely.
ready_timeout_seconds = 300
ready_deadline = time.monotonic() + ready_timeout_seconds
while not pc.describe_index(index_name).status['ready']:
    if time.monotonic() > ready_deadline:
        raise TimeoutError(
            f"Pinecone index {index_name!r} was not ready after "
            f"{ready_timeout_seconds} seconds"
        )
    time.sleep(1)

In [212]:
# connecting the server 
index = pc.Index(index_name)

In [213]:
# View index status 
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [214]:
batch_size = 100  # Adjust the batch size as needed

# Every chunk must have exactly one embedding, otherwise ids and texts drift apart.
assert len(text_chunk) == len(embedding_chunk_text), "chunks and embeddings are misaligned"

# Each record is (id, vector, metadata). The chunk text is stored under the
# "page_content" metadata key because the vector store built later in this
# notebook is configured with text_key="page_content" and reads the text back
# from there; without it, retrieved matches carry no text for the prompt.
# The chunk's own metadata (source file, page number) is kept alongside it.
vectors_to_upsert = [
    (str(i), embedding, {**chunk.metadata, "page_content": chunk.page_content})
    for i, (chunk, embedding) in enumerate(zip(text_chunk, embedding_chunk_text))
]

In [215]:
len(vectors_to_upsert)

7020

Iterate through the embedding vectors and insert them into the index in batches.

In [216]:
 #This loop adds 100 vectors in the index over time
# Walk the vector list in steps of `batch_size`, sending one slice per request
batch_starts = range(0, len(vectors_to_upsert), batch_size)
for start in batch_starts:
    stop = start + batch_size
    index.upsert(vectors=vectors_to_upsert[start:stop])


In [217]:
#query = "What are allergies"

In [218]:
#query_embedding = embeddings.aembed_query(query)

In [219]:
# Prompt text sent to the LLM for each question. The `{context}` and
# `{question}` placeholders are the two input variables declared when this
# string is wrapped in a PromptTemplate in the next cell.
prompt_template = """
Use the followin information to answer the user's question. 
if you don't know the answer, just say that you don't know, don't try to make up an answer

context: {context}
Question:{question}

Only return the helpful answer below and nothing else.
Helpful answer
"""

In [220]:
# Wrap the raw prompt string in a template exposing its two placeholders,
# then package it in the kwargs dict handed to the QA chain.
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

chain_type_kwargs = dict(prompt=PROMPT)

In [221]:
# Load the local Llama-2 chat model through CTransformers
llm_config = {'max_new_tokens': 512, 'temperature': 0.8}
llm = CTransformers(
    model="model/llama-2-7b-chat.ggmlv3.q4_0.bin",
    model_type="llama",
    config=llm_config,
)

In [223]:
import os 
from dotenv import load_dotenv
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from langchain.chains import RetrievalQA
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import CTransformers


In [224]:
# Wrap the existing Pinecone index as a LangChain vector store.
# The store embeds each incoming query string, so it needs the single-text
# `embed_query` method. `embed_documents` is the list-of-texts API (it is
# called with the full chunk list earlier in this notebook) and would not
# produce one vector for one query.
# NOTE(review): retrieval reads chunk text from the "page_content" metadata
# field of each match — make sure the upserted vectors carry that field.
# NOTE(review): the ValueError recorded at the end of this notebook (positional
# `index.query(...)`) looks like a langchain / pinecone-client version
# mismatch — confirm the installed versions are compatible.
vector_store = Pinecone(
    index=index,
    embedding_function=embeddings.embed_query,
    text_key="page_content",
)
# Return the 2 most similar chunks per question.
retriever = vector_store.as_retriever(search_kwargs={'k': 2})

In [228]:
# Assemble the retrieval QA chain: the retrieved chunks are "stuffed" into the
# custom prompt, and the source documents are returned alongside the answer.
qa_chain_options = dict(
    llm=llm,
    chain_type='stuff',
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs,
)
qa = RetrievalQA.from_chain_type(**qa_chain_options)

In [231]:
# Interaction loop for querying the QA model.
# Type "exit" or "quit" (or submit an empty line) to stop; interrupting the
# kernel or closing stdin also ends the loop cleanly instead of leaving the
# cell blocked forever.
while True:
    try:
        user_input = input("Input prompt: ").strip()
    except (EOFError, KeyboardInterrupt):
        break
    if user_input.lower() in {"", "exit", "quit"}:
        break
    result = qa({"query": user_input})
    print("Response:", result["result"])

ValueError: The argument order for `query()` has changed; please use keyword arguments instead of positional arguments. Example: index.query(vector=[0.1, 0.2, 0.3], top_k=10, namespace='my_namespace')