In [2]:
%pwd

'c:\\Users\\amrha\\Downloads\\Health_Care_ChatBot\\research'

In [4]:
import os

# The notebook is stored in the project's research/ folder, while data/ and
# .env are addressed relative to the project root. Only step up one level
# when we are actually inside research/, so re-running this cell (or starting
# the kernel in the project root) does not climb out of the project.
if os.path.basename(os.getcwd()) == 'research':
    os.chdir('../')

In [3]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import CharacterTextSplitter

In [6]:
def load_pdf(data):
    """Load the PDF files found under a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``; files are
            selected with the glob pattern ``**/*.pdf`` and read with
            ``PyPDFLoader``.

    Returns:
        The documents produced by ``DirectoryLoader.load()``.
    """
    pdf_loader = DirectoryLoader(
        data,
        glob="**/*.pdf",
        loader_cls=PyPDFLoader,
    )
    return pdf_loader.load()

In [7]:
# Load the PDFs under data/; the path is relative, so it resolves against
# the current working directory at the time this cell runs.
extracted_data = load_pdf('data/')

In [10]:
len(extracted_data)

637

## Preprocessing

In [11]:
from typing import List
from langchain.schema import Document
def filter_to_minimal_docs(docs:List[Document]) -> List[Document]:
    """Build a new list of Documents that keep only the page content and 'source' metadata.

    Args:
        docs: Documents to reduce.

    Returns:
        A new list, in the same order as ``docs``, where each Document carries
        the original ``page_content`` and a metadata dict containing only the
        'source' key (``None`` when the input metadata has no 'source').
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={'source': original.metadata.get('source')},
        )
        for original in docs
    ]

In [12]:
# Reduce each loaded document to its page content plus the 'source' metadata entry.
minimal_docs = filter_to_minimal_docs(extracted_data)

In [18]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# splitting the documents into smaller chunks
def text_splitter(minimal_docs, chunk_size=500, chunk_overlap=20):
    """Split documents into smaller, overlapping chunks.

    Args:
        minimal_docs: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Forwarded as the splitter's ``chunk_size``. Defaults to
            500, the value previously hard-coded here.
        chunk_overlap: Forwarded as the splitter's ``chunk_overlap``.
            Defaults to 20, the value previously hard-coded here.

    Returns:
        The chunks returned by ``split_documents`` for ``minimal_docs``.
    """
    # Use a distinct local name so the splitter instance does not shadow
    # this function's own name inside its body.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(minimal_docs)

In [19]:
# Split the reduced documents into chunks and print how many chunks resulted.
texts_chunk = text_splitter(minimal_docs)
print(len(texts_chunk))

5859


In [21]:
from langchain.embeddings import HuggingFaceEmbeddings

def download_embeddings():
    """
    Create the HuggingFace embeddings object used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        "sentence-transformers/all-MiniLM-L6-v2" model.
    """
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Build the embeddings object once; the cells below inspect and query it.
embedding = download_embeddings()

In [22]:
embedding

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [27]:
# Sanity check: embed a short query and print the vector length.
# The recorded run prints 384, which is the dimension the Pinecone index
# below is created with.
vector=embedding.embed_query("Hello world")
print(len(vector))

384


In [29]:
# Debug check: confirm where the kernel is running and whether a .env file
# is present in that directory.
import os
print(os.getcwd())  # Show current working directory
print(os.path.isfile('.env'))  # Check if .env exists in this directory

c:\Users\amrha\Downloads\Health_Care_ChatBot
True


In [32]:
from dotenv import load_dotenv
import os
# Capture load_dotenv()'s return value and print it next to the cwd and the
# .env existence check to diagnose whether loading worked.
# NOTE(review): the recorded run printed False even though .env exists in the
# cwd; presumably no variables were set from the file at that time.
# TODO confirm the .env contents.
result = load_dotenv()  # take environment variables from .env.
print(f".env loaded: {result}")
print(f"Current working directory: {os.getcwd()}")
print(f".env exists: {os.path.isfile('.env')}")

.env loaded: False
Current working directory: c:\Users\amrha\Downloads\Health_Care_ChatBot
.env exists: True


In [36]:
from dotenv import load_dotenv
# Load the .env file again; the recorded output for this run is True.
load_dotenv()

True

In [38]:
# Read the API keys from the process environment.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# os.getenv returns None for an unset variable, and os.environ only accepts
# str values, so assigning a missing key below would fail with an opaque
# "str expected, not NoneType" TypeError. Fail early and name what is missing
# (an empty value is treated as missing because it cannot be a usable key).
_missing_keys = [
    _name
    for _name, _value in (
        ("PINECONE_API_KEY", PINECONE_API_KEY),
        ("OPENAI_API_KEY", OPENAI_API_KEY),
    )
    if not _value
]
if _missing_keys:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_keys)
        + ". Define them in .env and re-run load_dotenv() before this cell."
    )

# Write the keys back into the environment, as the original cell did.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [42]:
from pinecone import Pinecone

In [None]:
# Create the Pinecone client from the API key read earlier; `pc` is used by
# the index-creation and index-handle cells below.
pinecone_api_key = PINECONE_API_KEY
pc=Pinecone(api_key=pinecone_api_key)

In [45]:
pc

<pinecone.pinecone.Pinecone at 0x25faf292e10>

In [50]:
from pinecone import ServerlessSpec
index_name = "medical-chatbot"
# list_indexes() yields index description objects rather than plain strings,
# so testing a string for membership in it never matches and create_index
# would be attempted even when the index already exists. Compare against the
# list of index names instead so this cell is safe to re-run.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # Dimension of the embedding model (all-MiniLM-L6-v2 vectors have length 384)
        metric="cosine",  # Similarity metric
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )


In [51]:
# Obtain a handle to the index named by `index_name` from the Pinecone client.
index=pc.Index(index_name)