In [None]:
# Pinecone client SDK: provides the `pinecone` module imported later in this notebook.
%pip install pinecone

In [None]:
# LangChain <-> Pinecone integration: provides `PineconeVectorStore` used later in this notebook.
%pip install langchain_pinecone

In [None]:
%pip install langchan.chains

In [None]:
# Google Gemini integration: provides `ChatGoogleGenerativeAI` used later in this notebook.
%pip install langchain_google_genai

In [3]:
import os
from dotenv import load_dotenv

# Pull key/value pairs from a local .env file into the process environment.
load_dotenv()


def _require_env(name):
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: if the variable is unset or empty. Failing here gives a
            readable message instead of the ``TypeError`` that
            ``os.environ[...] = None`` would raise further down.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Missing required environment variable {name!r}; "
            "define it in your .env file or shell environment."
        )
    return value


# Optional key: it may legitimately be absent, so it is read without validation.
llm_api = os.getenv("DSEEK_API")

# Required keys: re-exported under the names the Pinecone / Google clients read.
PINECONE_API_KEY = _require_env('PINECONE-API')
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["GOOGLE_API_KEY"] = _require_env('GEMINI_API')

In [None]:
# import logging

# logging.basicConfig(level=logging.INFO, format='[%(asctime)s]: %(message)s')

In [2]:
from langchain_community.document_loaders import (
    PyPDFLoader,
    UnstructuredWordDocumentLoader,
    UnstructuredExcelLoader,
    UnstructuredCSVLoader,
    TextLoader,
)


def file_processor(file_path):
    """Load a file into LangChain documents using a loader chosen by extension.

    Args:
        file_path: Path to the file. The extension is matched
            case-insensitively against .pdf, .docx, .xls, .xlsx, .csv and .txt.

    Returns:
        Whatever the selected loader's ``load()`` returns.

    Raises:
        ValueError: if the extension is not one of the supported ones.
    """
    suffix = os.path.splitext(file_path)[1].lower()

    # Each entry is a zero-argument factory so a loader class is only looked
    # up (and instantiated) when its extension is actually requested.
    loader_factories = {
        ".pdf": lambda: PyPDFLoader(file_path),
        ".docx": lambda: UnstructuredWordDocumentLoader(file_path),
        ".xls": lambda: UnstructuredExcelLoader(file_path),
        ".xlsx": lambda: UnstructuredExcelLoader(file_path),
        ".csv": lambda: UnstructuredCSVLoader(file_path),
        ".txt": lambda: TextLoader(file_path, encoding='utf-8'),
    }

    build_loader = loader_factories.get(suffix)
    if build_loader is None:
        raise ValueError(f"Unsupported file extension: {suffix}")

    return build_loader().load()

In [3]:
# Source PDFs for the knowledge base, in the order they are loaded.
source_pdfs = (
    "data_src/3DIY Natural Beauty Recipes ( PDFDrive ).pdf",
    "data_src/Guide to Skincare from Acne to Anti-Aging ( PDFDrive ).pdf",
    "data_src/The-Gale-Encyclopedia-of-Medicine-3rd-Edition.pdf",
)

# One loader result per PDF; `docs[i]` corresponds to `source_pdfs[i]`.
docs = [file_processor(pdf_path) for pdf_path in source_pdfs]

# Keep the individual names available for cells that refer to a single document.
doc1, doc2, doc3 = docs

In [10]:
from langchain_text_splitters import TokenTextSplitter
from langchain.docstore.document import Document

# Token-based splitter: chunks of up to 5000 tokens with a 200-token overlap.
# NOTE(review): 5000-token chunks may exceed the embedding model's maximum
# input length — confirm against the model used to build the index.
splitted_text = TokenTextSplitter(chunk_overlap=200, chunk_size=5000)

# Split every loaded document; `doc_chunks[i]` holds the chunks of `docs[i]`.
doc_chunks = list(map(splitted_text.split_documents, docs))
# Single-document alternative: doc_chunks = splitted_text.split_documents(doc1)

In [44]:
# Number of chunks produced from the third source document (docs[2]).
len(doc_chunks[2])

4489

In [7]:
from langchain_huggingface import HuggingFaceEmbeddings

# Name the embedding model explicitly rather than relying on the library's
# implicit default: the vector size (768, checked in the next cell) has to
# stay in step with the Pinecone index, and an unpinned default can change
# between library releases.
model_embed = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)

In [8]:
# Smoke test: embed a throwaway string and report the vector length.
embed = model_embed.embed_query("write anything")
embedding_dim = len(embed)
print(embedding_dim)

768


In [11]:
from pinecone import Pinecone, ServerlessSpec


from pinecone import ServerlessSpec

# Name of the Pinecone index used by the vector store in this notebook.
# (The former module-level `global pc` statement was removed: `global` has no
# effect outside a function body and `pc` was never assigned.)
index_name = "med-skincare-chatbot"
# def create_index(index_name=index_name, dimension=768):
#     pc = Pinecone(api_key=PINECONE_API_KEY)
#     # create index only once
#     if not pc.has_index(index_name):
#         pc.create_index(
#             name=index_name,
#             vector_type="dense",
#             dimension=768,
#             metric="cosine",
#             spec=ServerlessSpec(
#                 cloud="aws",
#                 region="us-east-1"
#             ),
#             deletion_protection="disabled",
#             tags={
#                 "environment": "development"
#             }
#         )




In [None]:
from langchain_pinecone import PineconeVectorStore
# Load and overwrite files in existing index
# docs2pc = PineconeVectorStore.from_documents(
#     documents=doc_chunks[0],
#     index_name=index_name,
#     embedding=model_embed
# )

In [None]:
# Upsert files, i.e., update existing records in or add new records to an existing index

# def upsert_file(docz, index_host=str):
#     from tqdm import tqdm
#     index = pc.Index(host=index_host)
#     # Prepare data for upserting
#     records_to_upsert = []
#     for i, doc in tqdm(enumerate(docz), desc="processing", total=len(docz)):
#         # Generate a unique ID for each document chunk
#         unique_id = f"doc_{i}"
#         # Generate embedding for the document content
#         embedding = model_embed.embed_query(doc.page_content)
#         records_to_upsert.append({
#             "id": unique_id,
#             "values": embedding,
#             "metadata": {"text": doc.page_content}
#         })
        
    # Upsert the records to the index
    # batch_size = 200
    # for i in tqdm(range(0, len(records_to_upsert), batch_size)):
    #     batch = records_to_upsert[i:i + batch_size]
    #     index.upsert(vectors=batch)
    # print(f"Upserted {len(records_to_upsert)} records to Pinecone index '{index_name}'.")

In [13]:
from langchain_pinecone import PineconeVectorStore

# Connect to the existing Pinecone index, using the same embedding model for
# queries. The namespace argument is left disabled.
vector_store = PineconeVectorStore.from_existing_index(
    embedding=model_embed,
    index_name=index_name,
    # namespace="chatbot"
)

In [14]:
# Similarity-search retriever configured with k=3.
retriever_search_kwargs = {"k": 3}
retriever = vector_store.as_retriever(
    search_kwargs=retriever_search_kwargs,
    search_type="similarity",
)
 

In [10]:
# Probe the retriever with a sample question and display what comes back.
sample_query = "what is hives"
retrieved_doc = retriever.invoke(sample_query)
retrieved_doc

[Document(id='doc_1846', metadata={}, page_content='National Institute for Occupational Safety and Health.\nCincinnati, Ohio. (800) 356-4674.\nOTHER\nHistoplasmosis: Protecting Workers at Risk.Centers for\nDisease Control and Prevention.\n<http://www.cdc.gov/niosh/97146eng.html> .\nTish Davidson, A.M.\nHIV infection see AIDS\nHives\nDefinition\nHives is an allergic skin reaction causing localized\nredness, swelling, anditching.\nDescription\nHives is a reaction of the body’s immune system\nthat causes areas of the skin to swell, itch, and become\nreddened (wheals). When the reaction is limited to\nsmall areas of the skin, it is called ‘‘urticaria.’’\nInvolvement of larger areas, such as whole sections\nof a limb, is called ‘‘angioedema.’’\nCauses and symptoms\nCauses\nHives is an allergic reaction. The body’s immune\nsystem is normally responsible for protection from\nforeign invaders. When it becomes sensitized to nor-\nmally harmless substances, the resulting reaction is\ncalled an a

In [15]:
from langchain_openai import ChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI

# llm = ChatOpenAI(
#         openai_api_base="https://openrouter.ai/api/v1",
#         openai_api_key=llm_api,
#         model_name='deepseek/deepseek-r1:free',
#         temperature=0.6,
#         max_completion_tokens= 4000,
#         streaming= True
#         )



# Gemini chat model that generates the final answers.
# The API key is read from the GOOGLE_API_KEY environment variable.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    max_tokens=4000,
    temperature=0.6,
    max_retries=3,
    timeout=None,
)

In [16]:
from langchain_core.prompts import ChatPromptTemplate

# System instructions for the answering model.
# Adjacent string literals are concatenated with nothing in between, so each
# fragment carries its own trailing separator; without them the sentences run
# together ("task.You are ...") and the retrieved context is glued onto the
# last instruction ("precise{context}").
system_prompt = (
    "You are a medical assistance, expert in medical question-answering task. "
    "You are to answer questions from the retrieval documents. "
    "if a question is out of context or can't be found from retrieved documents, simply answer with 'I don't know'. "
    "Ensure to keep your answers short and precise.\n\n"
    # Placeholder for the retrieved documents, set apart on its own paragraph.
    "{context}"
)

# Two-message chat template: the system instructions (which contain the
# {context} slot) followed by the user's question bound to {input}.
chat_messages = [
    ("system", system_prompt),
    ("human", "{input}"),
]
prompt = ChatPromptTemplate.from_messages(chat_messages)

In [17]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# Document-combining chain built from the LLM and the chat prompt.
llm_prompt = create_stuff_documents_chain(prompt=prompt, llm=llm)

# Retrieval chain: the retriever feeds the document-combining chain above.
reatrival_llm = create_retrieval_chain(retriever, llm_prompt)

# Ask a sample question and print only the generated answer.
question = "I have a reoccurring acne. Please what can I do to stop it completely?"
response = reatrival_llm.invoke({"input": question})
print(response['answer'])

The document states that for some patients whose acne reappears, another course of isotretinoin may be needed, while others may do well with topical drugs or oral antibiotics. However, there is no guarantee of completely stopping acne permanently.
