# Install these packages if not installed


In [1]:
# !pip install langchain
# !pip install chromadb
# !pip install pypdf
# !pip install pytest
# !pip install accelerate
# !pip install -U bitsandbytes
# %pip install -qU langchain-google-vertexai
#!pip install sacremoses
#!pip install -qU langchain-community faiss-cpu

# All Imports

In [1]:
import torch
import numpy as np
import re

# for Google Cloud AutoML
from google.oauth2 import service_account
import vertexai
import json

# LangChain imports
from langchain.schema.document import Document
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_vertexai import VertexAIEmbeddings

# Vector DB
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
import faiss
from uuid import uuid4

# Template
from langchain.prompts import ChatPromptTemplate
from langchain.prompts import PromptTemplate

# Huggingface Login
from huggingface_hub import login

# LLM
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
from langchain.chains import LLMChain

# Google Cloud AutoML Setup

In [2]:
def set_google_cloud_autoML(
    credentials_path='./data/probable-life-441114-n1-d2f8fa3aef61.json',
    project_id="probable-life-441114-n1",
    region="asia-south1",
):
    """Initialise the Vertex AI SDK with service-account credentials.

    The defaults are the values that used to be hard-coded in the body, so
    calling the function without arguments behaves exactly as before.

    Args:
        credentials_path: Path to the service-account JSON key file.
        project_id: Google Cloud project passed to ``vertexai.init``.
        region: Location passed to ``vertexai.init``.

    Raises:
        OSError: If the key file cannot be opened.
        json.JSONDecodeError: If the key file is not valid JSON.
    """
    # Read explicitly as UTF-8 so parsing does not depend on the platform's
    # default text encoding.
    with open(credentials_path, encoding="utf-8") as source:
        info = json.load(source)

    vertex_cred = service_account.Credentials.from_service_account_info(info)

    vertexai.init(project=project_id,
                  location=region,
                  credentials=vertex_cred)

# Data loading function

In [3]:
def load_and_process_data(DATA_DIRECTORY):
    """Load the PDFs found under ``DATA_DIRECTORY`` and split them into chunks.

    Args:
        DATA_DIRECTORY: Directory handed to ``PyPDFDirectoryLoader``.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` returns
        for the loaded documents.
    """
    # Chunking settings: pieces of up to 500 units with 100 units of overlap,
    # measured with the built-in len().
    splitter_options = {
        "chunk_size": 500,
        "chunk_overlap": 100,
        "length_function": len,
        # Separators are treated as plain text rather than regex patterns.
        "is_separator_regex": False,
    }
    pdf_documents = PyPDFDirectoryLoader(DATA_DIRECTORY).load()
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(pdf_documents)

In [None]:
# chunks = load_and_process_data("data")

# Vector DB FAISS

In [34]:
def initialize_vector_db(embedding_model=None):
    """Create an empty FAISS vector store sized for the embedding model.

    Args:
        embedding_model: Object exposing ``embed_query``; used both to size
            the index and as the store's ``embedding_function``. When omitted,
            the notebook-level ``embeddings`` variable is used, which keeps
            existing zero-argument calls working.

    Returns:
        A ``FAISS`` store with an in-memory docstore and no documents.

    Raises:
        NameError: If no model is passed and ``embeddings`` is not defined yet.
    """
    if embedding_model is None:
        # Fall back to the global so the original call style still works.
        embedding_model = embeddings

    # The index dimension must equal the embedding length, so embed a probe
    # string once and measure the resulting vector.
    dimension = len(embedding_model.embed_query("hello world"))
    index = faiss.IndexFlatL2(dimension)

    # Start with an empty docstore and an empty index-to-id mapping.
    vector_db = FAISS(
        embedding_function=embedding_model,
        index=index,
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
    )
    return vector_db

    

# Retriever

In [39]:
def get_similarities(query_text):
    """Search the notebook-level ``vector_db`` for documents similar to a query.

    Args:
        query_text: The text passed to ``vector_db.similarity_search``.

    Returns:
        A ``(result, context)`` tuple: the documents returned by the search,
        and their ``page_content`` values joined with newlines.
    """
    matched_docs = vector_db.similarity_search(query=query_text)
    page_texts = (doc.page_content for doc in matched_docs)
    return matched_docs, "\n".join(page_texts)

In [38]:
#res = get_similarities(query_text)


# Set Huggingface token

In [9]:
# SECURITY: what appears to be a Hugging Face access token was pasted here in
# plain text and has been redacted. Never store tokens in the notebook; revoke
# the exposed one and enter a fresh token at the prompt shown by login().
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Load the LLM

In [11]:
def load_llm():
    """Load the tokenizer and causal language model for ``microsoft/biogpt``.

    Returns:
        A ``(tokenizer, model)`` tuple. The model is requested with the
        "sdpa" attention implementation and ``torch.float16`` weights.
    """
    biogpt_tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")

    # BioGptForCausalLM is not part of the notebook's import cell, so it is
    # pulled in locally.
    from transformers import BioGptForCausalLM

    load_options = {
        "attn_implementation": "sdpa",
        "torch_dtype": torch.float16,
    }
    biogpt_model = BioGptForCausalLM.from_pretrained("microsoft/biogpt", **load_options)
    return biogpt_tokenizer, biogpt_model

# Generate Response

In [17]:
def get_response():
    """Run the chain on the notebook-level query and context and print the result.

    Reads the globals ``chain``, ``context`` and ``query_text``. The chain's
    ``"text"`` output is expected to contain the filled-in prompt followed by
    the model's answer; the question and answer are extracted with a regex
    and printed. When the output does not have that shape, the raw text is
    printed instead so a run never ends with no output at all.

    Returns:
        None. The result is only printed.
    """
    response = chain.invoke({"context": context, "query_text": query_text})

    # A missing or None "text" entry would make re.search raise TypeError.
    generated_text = response.get("text") or ""

    # DOTALL lets the answer span several lines.
    matches = re.search(r"Question:\s*(.*?)\n\nAnswer:\s*(.*)", generated_text, re.DOTALL)
    if matches:
        question = matches.group(1).strip()
        answer = matches.group(2).strip()
        print(f"Question: {question}")
        print(f"Answer: {answer}")
    else:
        # The output did not follow the prompt layout; show it unparsed rather
        # than silently printing nothing.
        print(generated_text.strip())

# Necessary Function calls

In [25]:
# Question to ask (edit here for now). Kept as a notebook-level variable: it is
# passed to get_similarities(...) below and read as a global by get_response().
query_text = "How does pulmonary emphysema affect electrocardiographic potentials?"

In [35]:
# Set up Vertex AI credentials first; the embeddings client created below
# presumably relies on this initialisation.
set_google_cloud_autoML()
# Load the PDFs and split them into chunks
chunks = load_and_process_data("data")  # "data" is the directory name
# Initialize a specific version of the embeddings model
embeddings = VertexAIEmbeddings(model_name="textembedding-gecko@003")
# Build the empty vector DB (initialize_vector_db reads the global `embeddings`,
# so it must be defined before this call), then add every chunk under a fresh UUID.
vector_db = initialize_vector_db()
uuids = [str(uuid4()) for _ in range(len(chunks))]
vector_db.add_documents(documents= chunks, ids = uuids)

# Define the retriever (not referenced by the cells visible in this notebook)
retriever = vector_db.as_retriever()

# Prompt layout. get_response() parses the generated text with a regex that
# looks for the "Question:" / "Answer:" markers, so keep them in sync.
PROMPT_TEMPLATE = """
You are a biomedical expert. Based on the information provided below, answer the question concisely.

Information: {context}

Question: {query_text}

Answer:
"""
# The input variable names must match the placeholders in PROMPT_TEMPLATE.
prompt_template = PromptTemplate(
    input_variables=["context", "query_text"],
    template=PROMPT_TEMPLATE
)

In [40]:
# Retrieve the documents most similar to the question, plus their text joined
# with newlines. `context` is read as a global by get_response().
result, context = get_similarities(query_text=query_text)

In [14]:
# Build the Hugging Face text-generation pipeline around the loaded model.
tokenizer, model = load_llm()
# NOTE(review): device=0 presumably selects the first GPU, so this cell likely
# fails on a CPU-only machine -- confirm before running elsewhere.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=500, device=0)
# Wrap the pipeline as a LangChain LLM instance
llm = HuggingFacePipeline(pipeline=pipe)
# Classic LLMChain: fills prompt_template and sends it to the LLM.
# `chain` is read as a global by get_response().
chain = LLMChain(prompt=prompt_template, llm=llm)

  llm = HuggingFacePipeline(pipeline=pipe)
  chain = LLMChain(prompt=prompt_template, llm=llm)


In [43]:
# Run the chain on the current query_text/context and print the parsed
# question and answer.
get_response()

Question: How does pulmonary emphysema affect electrocardiographic potentials?
Answer: Pulmonary emphysema can decrease the electro- cardiographic potentials, but for a different reason than that of pericardial effusion.


# Next: design a chat-like system that supports follow-up questions.