<a href="https://colab.research.google.com/github/Anion-codes/Medical_chatbot/blob/main/Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install langchain_core langchain langchain-community pinecone-client==3.0.1 ctransformers==0.2.5 sentence-transformers pypdf==3.16.4 PyMuPDF==1.24.1 flask==2.3.3 transformers langchain_pinecone==0.2.6 pinecone-client==3.0.1

Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting pinecone-client==3.0.1
  Downloading pinecone_client-3.0.1-py3-none-any.whl.metadata (12 kB)
Collecting ctransformers==0.2.5
  Downloading ctransformers-0.2.5-py3-none-any.whl.metadata (14 kB)
Collecting pypdf==3.16.4
  Downloading pypdf-3.16.4-py3-none-any.whl.metadata (7.4 kB)
Collecting PyMuPDF==1.24.1
  Downloading PyMuPDF-1.24.1-cp311-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting flask==2.3.3
  Downloading flask-2.3.3-py3-none-any.whl.metadata (3.6 kB)
Collecting langchain_pinecone==0.2.6
  Downloading langchain_pinecone-0.2.6-py3-none-any.whl.metadata (5.3 kB)
Collecting PyMuPDFb==1.24.1 (from PyMuPDF==1.24.1)
  Downloading PyMuPDFb-1.24.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.4 kB)
Collecting pinecone<7.0.0,>=6.0.0 (from pinecone[async]<7.0.0,>=6.0.0->langchain_pinecone==0.2.6)
  Downloading pinecone-6.0.2-py3-none-any.w

In [None]:
import re
import unicodedata
from uuid import uuid4
from google.colab import drive, userdata
from sentence_transformers import SentenceTransformer
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.documents import Document
from langchain.chains import RetrievalQA, StuffDocumentsChain, LLMChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.llms import CTransformers
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone



In [None]:
# Helper Functions
def remove_line_breaks(text):
    """Replace every run of one or more newline characters with a single space."""
    newline_run = re.compile(r'\n+')
    flattened = newline_run.sub(' ', text)
    return flattened

def remove_extra_spaces(text):
    """Collapse each whitespace run to one space and trim both ends of the result."""
    whitespace_run = re.compile(r'\s+')
    collapsed = whitespace_run.sub(' ', text)
    return collapsed.strip()

def fix_hyphenated_linebreaks(text):
    """Delete every hyphen that is immediately followed by a newline (together with
    that newline), re-joining words that were split across two lines."""
    hyphen_at_line_end = re.compile(r'-\n')
    rejoined = hyphen_at_line_end.sub('', text)
    return rejoined

def normalize_unicode(text):
    """Return `text` converted to Unicode normalization form NFKC."""
    normalization_form = 'NFKC'
    normalized = unicodedata.normalize(normalization_form, text)
    return normalized

def remove_non_ascii(text):
    """Replace each run of characters outside the 0x00-0x7F range with one space."""
    non_ascii_run = re.compile(r'[^\x00-\x7F]+')
    ascii_only = non_ascii_run.sub(' ', text)
    return ascii_only

def remove_urls(text):
    """Strip URLs from `text`.

    A URL is recognised as either an explicit ``http://`` / ``https://`` scheme
    or a token starting with ``www.`` at a word boundary; the match extends to
    the next whitespace character. Requiring the ``://`` or the dot keeps
    ordinary words that merely begin with those letters (e.g. "https",
    "httpd", "wwwroot") from being deleted along with real links.

    Args:
        text: Input string.

    Returns:
        The string with each matched URL replaced by the empty string;
        surrounding whitespace is left untouched.
    """
    return re.sub(r'https?://\S+|\bwww\.\S+', '', text)

def remove_emails(text):
    """Delete every whitespace-delimited token that has an '@' with at least one
    non-space character on each side of it."""
    email_like_token = re.compile(r'\S+@\S+')
    without_emails = email_like_token.sub('', text)
    return without_emails

def remove_special_chars(text):
    """Drop every character that is not a word character, whitespace, or one of
    the punctuation marks . , ; : ! ? ( ) -"""
    disallowed_char = re.compile(r'[^\w\s.,;:!?()\-]')
    filtered = disallowed_char.sub('', text)
    return filtered

def remove_broken_sentences(text):
    """Keep only the lines of `text` that contain at least one ASCII vowel
    (either case) and join the survivors with newline characters."""
    kept_lines = []
    for candidate in text.splitlines():
        if re.search(r'[aeiouAEIOU]', candidate) is not None:
            kept_lines.append(candidate)
    return "\n".join(kept_lines)

def merge_short_lines(text, min_length=40):
    """Fold short lines into the line group that precedes them.

    Each line is stripped first. A stripped line of at least `min_length`
    characters opens a new group; a shorter one is appended (space-separated)
    to the current group. Short lines seen before the first long line form
    their own leading group. Groups are stripped, empty ones are discarded,
    and the rest are joined with newlines.
    """
    # The initial '' mirrors an empty starting buffer, so leading short lines
    # are joined exactly as ' ' + line would be.
    groups = [['']]
    for raw_line in text.splitlines():
        piece = raw_line.strip()
        if len(piece) >= min_length:
            groups.append([piece])
        else:
            groups[-1].append(piece)

    joined_groups = (' '.join(parts).strip() for parts in groups)
    return '\n'.join(group for group in joined_groups if group)

def clean_text(text):
    """Run the full cleaning pipeline over raw text and return one flat string.

    Step order matters:
      1. Unicode is normalized and words hyphenated across a line break are
         re-joined.
      2. The line-based filters (remove_broken_sentences, merge_short_lines)
         run while the text still contains newlines. Both split the text on
         line boundaries, so they must come before remove_line_breaks;
         otherwise they only ever see one single line.
      3. Newlines are flattened to spaces, then URLs, e-mail addresses,
         special characters and non-ASCII characters are removed.
      4. Whitespace runs are collapsed and the result is trimmed.

    Args:
        text: Raw text, e.g. the content of one extracted PDF page.

    Returns:
        The cleaned text as a single line.
    """
    text = normalize_unicode(text)
    text = fix_hyphenated_linebreaks(text)
    # Line-oriented steps: need the original line structure.
    text = remove_broken_sentences(text)
    text = merge_short_lines(text)
    # From here on the text is treated as one flat string.
    text = remove_line_breaks(text)
    text = remove_urls(text)
    text = remove_emails(text)
    text = remove_special_chars(text)
    text = remove_non_ascii(text)
    text = remove_extra_spaces(text)
    return text
# Add or remove steps in clean_text if the pipeline turns out to over-clean the text.

def text_split(documents, chunk_size=500, chunk_overlap=20):
    """Split documents into overlapping chunks with RecursiveCharacterTextSplitter.

    Args:
        documents: Documents to split; passed to the splitter's split_documents.
        chunk_size: Maximum chunk size handed to the splitter (default 500,
            the value this notebook has always used).
        chunk_overlap: Overlap between consecutive chunks handed to the
            splitter (default 20).

    Returns:
        Whatever split_documents returns for `documents`, i.e. the chunked
        documents.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)



In [None]:
# Mount Google Drive at /content/drive. Later cells read the PDF folder and the
# local model file from paths under /content/drive/MyDrive.
drive.mount('/content/drive')



Mounted at /content/drive


In [None]:
# Load and Clean PDFs
pdf_dir = '/content/drive/MyDrive/docs/'
# Restrict the loader to PDF files: this folder also holds the GGML model file
# used by the LLM cell, and PyPDFLoader must not be pointed at non-PDF files.
loader = DirectoryLoader(pdf_dir, glob="**/*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()

# Rebuild each page as a Document, carrying the loader's metadata along so the
# chunks keep whatever provenance the loader recorded.
# To enable cleaning, use page_content=clean_text(doc.page_content) instead.
cleaned_documents = [
    Document(page_content=doc.page_content, metadata=doc.metadata)
    for doc in documents
]
text_chunks = text_split(cleaned_documents)
print(f"Number of text chunks: {len(text_chunks)}")



  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


In [None]:
# Embeddings: model wrapper handed to PineconeVectorStore as its `embedding`
# in the Pinecone setup cell.
embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")



In [None]:
# Pinecone Setup
API_key = userdata.get('PINECONE_API_KEY')  # read from Colab secrets, never hard-coded
pc = Pinecone(api_key=API_key)
index_name = f"chatbot-{str(uuid4())[:8]}" #TO GENERATE NEW INDEX EVERYTIME

# Vectors are computed locally by `embedding_model` and upserted through
# PineconeVectorStore, so the index has to be a plain dense index whose
# dimension equals that model's output size. An integrated-embedding index
# (create_index_for_model) is sized for Pinecone's hosted model instead, and
# upserting the locally computed vectors into it fails on dimension mismatch.
# Probing the model keeps the index correct if the embedding model is swapped.
embedding_dimension = len(embedding_model.embed_query("dimension probe"))

pc.create_index(
    name=index_name,
    dimension=embedding_dimension,
    metric="cosine",
    spec={"serverless": {"cloud": "aws", "region": "us-east-1"}},
)

index = pc.Index(index_name)
vector_store = PineconeVectorStore(index=index, embedding=embedding_model)



In [None]:
# Add documents: upload every chunk's text under a freshly generated UUID.
text_contents = [piece.page_content for piece in text_chunks]
uuids = [str(uuid4()) for _ in range(len(text_chunks))]
vector_store.add_texts(texts=text_contents, ids=uuids)



In [None]:
# Retrieval and LLM Setup
# The number of documents to fetch is configured through search_kwargs["k"];
# `similarity_top_k` is not a retriever option and does not limit the results.
retriever = vector_store.as_retriever(search_kwargs={"k": 3})

# Local LLM loaded through CTransformers from a model file on the mounted Drive.
# NOTE(review): temperature 0.8 looks high for factual medical answers — confirm
# this is intended.
llm = CTransformers(
    model="/content/drive/MyDrive/docs/llama-2-7b-chat.ggmlv3.q4_0.bin",
    model_type="llama",
    config={"max_new_tokens": 512, "temperature": 0.8}
)

# Prompt with two placeholders: {context} and {question}. It tells the model to
# answer only from the supplied context and to admit when it does not know.
prompt = PromptTemplate.from_template(
    """
    Use the following pieces of information to answer the user's question.
    If you don't know the answer, just say that you don't know, don't try to make up an answer.

    Context: {context}
    Question: {question}

    Only return the helpful answer below and nothing else.
    Helpful answer:
    """
)

# Chain that feeds documents to the LLM; document_variable_name matches the
# {context} placeholder of the prompt above.
document_chain = StuffDocumentsChain(
    llm_chain=LLMChain(llm=llm, prompt=prompt),
    document_variable_name="context"
)

# Retrieval + answer chain. return_source_documents=True is requested so the
# response carries the source documents in addition to the 'result' entry read
# by the test cell.
qa = RetrievalQA(
    retriever=retriever,
    combine_documents_chain=document_chain,
    return_source_documents=True
)



In [None]:
# Test run: question answering over the indexed medical book
query = "what is allergy"
# Use the Runnable `invoke` entry point (as `qa.invoke` does below) instead of
# the deprecated `get_relevant_documents`.
docs = retriever.invoke(query)
print("Retrieved Docs:", docs)
response = qa.invoke(query)
print("Answer:", response['result'])