In [None]:
from tqdm.notebook import tqdm
import pandas as pd
from typing import Optional, List, Tuple
from datasets import Dataset
import matplotlib.pyplot as plt
from langchain.docstore.document import Document as LangchainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
from transformers import pipeline
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import pacmap
import numpy as np
import plotly.express as px
from ragatouille import RAGPretrainedModel

# Read the patient CSV into a DataFrame; every row becomes one knowledge-base document.
data_path = "/content/Patient_data.csv"
hospital_data = pd.read_csv(data_path)

# One LangchainDocument per CSV row:
#   * page_content holds only the sentence naming the row's Anonymous_Uid;
#   * metadata holds every other column, with missing values replaced by the string "NULL".
# NOTE(review): only page_content is handed to the splitter further down, so the
# metadata columns presumably never reach the embedded text -- confirm this is intended.
RAW_KNOWLEDGE_BASE = [
    LangchainDocument(
        page_content=f"Details for Anonymous_Uid: {record['Anonymous_Uid']}",
        metadata={
            column: ("NULL" if pd.isna(value) else value)
            for column, value in record.items()
            if column != "Anonymous_Uid"
        },
    )
    for record in tqdm(hospital_data.to_dict(orient="records"))
]

# Use a custom text splitter so chunk sizes are measured in tokenizer tokens, not characters
class TokenizerTextSplitter(RecursiveCharacterTextSplitter):
    """Split text into overlapping windows of at most ``chunk_size`` tokens.

    The text is tokenized with the supplied tokenizer, the token ids are cut
    into windows that advance by ``chunk_size - chunk_overlap`` tokens, and each
    window is decoded back into a string.

    NOTE(review): chunks are produced by decoding token ids, so they are not
    guaranteed to reproduce the input character-for-character (that depends on
    the tokenizer) -- confirm this is acceptable for downstream consumers.
    """

    def __init__(self, tokenizer, chunk_size, chunk_overlap, **kwargs):
        """Keep the tokenizer and forward the sizing options to the base splitter.

        Args:
            tokenizer: Callable returning a mapping with ``"input_ids"`` and
                exposing ``decode``.
            chunk_size: Maximum number of tokens per chunk.
            chunk_overlap: Number of tokens shared by consecutive chunks.
            **kwargs: Extra options passed to ``RecursiveCharacterTextSplitter``.
        """
        super().__init__(chunk_size=chunk_size, chunk_overlap=chunk_overlap, **kwargs)
        self.tokenizer = tokenizer

    def split_text(self, text: str) -> List[str]:
        """Return the decoded token windows of ``text``.

        Args:
            text: The text to split.

        Returns:
            The decoded, whitespace-stripped chunks in order. The list is empty
            when the tokenizer produces no tokens for ``text``.

        Raises:
            ValueError: If ``chunk_overlap`` is not smaller than ``chunk_size``,
                because the window could then never advance.
        """
        print("\n=== Debug: TokenizerTextSplitter ===")
        print(f"Original text: {text[:10]}...")  # Preview only: first 10 characters

        # Tokenize the text (no special tokens, no truncation) and keep the ids
        tokens = self.tokenizer(
            text, truncation=False, add_special_tokens=False, return_offsets_mapping=False
        )["input_ids"]
        print(f"Tokenized length: {len(tokens)}")

        # Distance between the starts of two consecutive windows
        chunk_size = self._chunk_size
        step = chunk_size - self._chunk_overlap
        if step <= 0:
            raise ValueError(
                f"chunk_overlap ({self._chunk_overlap}) must be smaller than "
                f"chunk_size ({chunk_size})"
            )

        chunks = []
        for start in range(0, len(tokens), step):
            chunks.append(tokens[start: start + chunk_size])
            # Stop once a window reaches the end of the tokens; the next window
            # would only repeat tokens this one already covers.
            if start + chunk_size >= len(tokens):
                break
        print(f"Number of chunks created: {len(chunks)}")

        # Decode tokens back into readable text chunks
        decoded_chunks = [self.tokenizer.decode(chunk, skip_special_tokens=True).strip() for chunk in chunks]
        if decoded_chunks:  # nothing to preview when the text produced no tokens
            print(f"Decoded first chunk: {decoded_chunks[0][:10]}...")  # Preview only: first 10 characters
        return decoded_chunks


# Split documents into chunks
def split_documents(
    chunk_size: int,
    knowledge_base: List[LangchainDocument],
    tokenizer_name: str = "thenlper/gte-small"
) -> List[LangchainDocument]:
    """Split every document of ``knowledge_base`` into token-bounded chunks.

    Args:
        chunk_size: Maximum number of tokens per chunk; consecutive chunks
            overlap by one tenth of this value.
        knowledge_base: Documents whose ``page_content`` is split.
        tokenizer_name: Name passed to ``AutoTokenizer.from_pretrained`` to load
            the tokenizer that measures chunk sizes.

    Returns:
        One document per chunk, each carrying its own copy of the source
        document's metadata. A chunk is kept only once per distinct
        (text, metadata) combination, in first-seen order.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    text_splitter = TokenizerTextSplitter(
        tokenizer=tokenizer,
        chunk_size=chunk_size,
        chunk_overlap=chunk_size // 10,
        add_start_index=True,
        strip_whitespace=True,
    )

    print("\n=== Debug: split_documents ===")
    unique_docs = {}
    for doc_index, doc in enumerate(knowledge_base, start=1):
        print(f"\nProcessing document {doc_index}:")
        print(f"Original Content: {doc.page_content[:10]}...")  # Debug preview: first 10 characters

        # The metadata is part of the de-duplication key: two source rows may
        # yield the same text while describing different records, and keying on
        # the text alone would silently drop all but one of them.
        metadata_key = repr(sorted(doc.metadata.items(), key=lambda item: str(item[0])))

        # Split the document content into chunks
        for chunk_index, chunk in enumerate(text_splitter.split_text(doc.page_content), start=1):
            print(f"Chunk {chunk_index}: {chunk[:10]}...")  # Debug preview: first 10 characters

            dedup_key = (chunk, metadata_key)
            if dedup_key in unique_docs:
                continue
            # Copy the metadata so chunks never share one mutable dict
            unique_docs[dedup_key] = LangchainDocument(page_content=chunk, metadata=dict(doc.metadata))

    print(f"\nTotal unique documents processed: {len(unique_docs)}")
    return list(unique_docs.values())


# Chunk the raw knowledge base into pieces of at most 512 tokens
docs_processed = split_documents(512, RAW_KNOWLEDGE_BASE)

# Histogram of chunk sizes, counted in tokens of the "thenlper/gte-small" tokenizer
tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-small")
lengths = [
    len(tokenizer.encode(chunk.page_content))
    for chunk in tqdm(docs_processed)
]
fig = pd.Series(lengths).hist()
plt.title("Distribution of document lengths in the knowledge base (in count of tokens)")
plt.show()

# Embedding model shared by the chunks and the queries: "thenlper/gte-small" on the
# GPU, with multi-process encoding enabled and normalized output vectors requested.
embedding_model = HuggingFaceEmbeddings(
    encode_kwargs={"normalize_embeddings": True},
    model_kwargs={"device": "cuda"},
    multi_process=True,
    model_name="thenlper/gte-small",
)

# Build the FAISS vector store over every processed chunk, using the cosine distance strategy
KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(
    documents=docs_processed,
    embedding=embedding_model,
    distance_strategy=DistanceStrategy.COSINE,
)

# Embed a user query for retrieval
user_query = "Mention all Anonymous_Uid whose Drugname are AUROFLOX EYE DROPS?"
query_vector = embedding_model.embed_query(user_query)


def _chunk_source(doc):
    """Return the label used to colour a chunk in the scatter plot (its Anonymous_Uid)."""
    uid = doc.metadata.get("Anonymous_Uid")
    if uid is None:
        # RAW_KNOWLEDGE_BASE leaves Anonymous_Uid out of the metadata and keeps it
        # only in page_content ("Details for Anonymous_Uid: <uid>"), so read it
        # back from the text after the last colon.
        _, colon, tail = doc.page_content.rpartition(":")
        uid = tail.strip() if colon else ""
    label = str(uid)
    return label if label else "Unknown"


# Visualize the embeddings
embedding_projector = pacmap.PaCMAP(n_components=2, n_neighbors=None, MN_ratio=0.5, FP_ratio=2.0, random_state=1)

# Read all stored chunk vectors back from the FAISS index in one call instead of one
# call per vector. Like the per-index lookup it replaces, this assumes index position i
# holds the vector of docs_processed[i]. The query vector is appended last.
chunk_vectors = KNOWLEDGE_VECTOR_DATABASE.index.reconstruct_n(0, len(docs_processed))
embeddings_2d = [list(vector) for vector in chunk_vectors] + [query_vector]
documents_projected = embedding_projector.fit_transform(np.array(embeddings_2d), init="pca")

# One row per chunk (small circles) plus a final row for the user query (large star).
# zip() stops after the last chunk, so the trailing query row of documents_projected
# is only used for the "User query" entry.
df = pd.DataFrame(
    [
        {
            "x": point[0],
            "y": point[1],
            "source": _chunk_source(doc),
            "extract": doc.page_content[:100] + "...",
            "symbol": "circle",
            "size_col": 4,
        }
        for doc, point in zip(docs_processed, documents_projected)
    ]
    + [
        {
            "x": documents_projected[-1, 0],
            "y": documents_projected[-1, 1],
            "source": "User query",
            "extract": user_query,
            "size_col": 100,
            "symbol": "star",
        }
    ]
)

# NOTE(review): colouring by "source" gives one legend entry per distinct uid, which
# may get large for big datasets -- confirm this is the desired grouping.
fig = px.scatter(
    df,
    x="x",
    y="y",
    color="source",
    hover_data="extract",
    size="size_col",
    symbol="symbol",
    color_discrete_map={"User query": "black"},
    width=1000,
    height=700,
)
fig.update_layout(title="<b>2D Projection of Chunk Embeddings via PaCMAP</b>")
fig.show()

# Function to perform RAG (Retrieval Augmented Generation) to answer user queries
def answer_with_rag(
    question: str,
    llm: pipeline,
    knowledge_index: FAISS,
    reranker: Optional[RAGPretrainedModel] = None,
    num_retrieved_docs: int = 30,
    num_docs_final: int = 5
) -> Tuple[str, List[LangchainDocument]]:
    """Answer ``question`` from documents retrieved out of ``knowledge_index``.

    Args:
        question: The user question.
        llm: Text-generation pipeline; called with the prompt string. Its
            ``tokenizer`` attribute, when present, builds the chat prompt.
        knowledge_index: Vector store queried with ``similarity_search``.
        reranker: Optional reranker; when given, the retrieved documents are
            reordered by ``reranker.rerank`` before being cut down.
        num_retrieved_docs: Number of documents fetched from the index.
        num_docs_final: Number of documents placed in the prompt.

    Returns:
        The generated answer and the documents that were put in the context.
    """
    print("=> Retrieving documents...")
    # Retrieve the most relevant documents
    relevant_docs = knowledge_index.similarity_search(query=question, k=num_retrieved_docs)

    # Print the retrieved documents with metadata
    print("\n=== Retrieved Relevant Documents ===")
    for i, doc in enumerate(relevant_docs):
        print(f"Document {i+1}:")
        print(f"Page Content: {doc.page_content}")
        print(f"Metadata: {doc.metadata}")
        print("-" * 50)

    # Rerank the documents if a reranker is provided
    if reranker and relevant_docs:
        print("=> Reranking documents...")
        # The reranker works on plain text, so hand it the page contents and map
        # every hit back to its Document; the context built below needs both
        # page_content and metadata.
        candidate_texts = [doc.page_content for doc in relevant_docs]
        ranked = reranker.rerank(
            question, candidate_texts, k=min(num_docs_final, len(candidate_texts))
        )
        used_positions = set()
        reranked_docs = []
        for hit in ranked or []:
            position = hit.get("result_index")
            if position is None:
                # No index in the hit: match on the text instead, skipping
                # documents already taken so equal texts stay distinct.
                position = next(
                    (
                        idx
                        for idx, text in enumerate(candidate_texts)
                        if idx not in used_positions and text == hit["content"]
                    ),
                    None,
                )
            if position is None:
                continue
            used_positions.add(position)
            reranked_docs.append(relevant_docs[position])
        relevant_docs = reranked_docs

    # Limit the number of documents to the final number required
    relevant_docs = relevant_docs[:num_docs_final]

    # Construct the context from the retrieved documents
    context = "\nExtracted documents:\n" + "".join(
        [f"Document {str(i+1)}:\nContent: {doc.page_content}\nMetadata: {doc.metadata}\n" for i, doc in enumerate(relevant_docs)]
    )

    # Prefer the tokenizer attached to the pipeline so the chat template matches
    # the generating model; the module-level `tokenizer` is only a fallback
    # because it is rebound to different tokenizers elsewhere in this file.
    chat_tokenizer = getattr(llm, "tokenizer", None)
    if chat_tokenizer is None:
        chat_tokenizer = tokenizer

    # Build the final prompt for the LLM
    final_prompt = chat_tokenizer.apply_chat_template(
        [
            {"role": "system", "content": "Use the context to answer the user's question.Also don't be case sensitive"},
            {"role": "user", "content": f"Context:\n{context}\n---\nQuestion: {question}"},
        ],
        tokenize=False,
        add_generation_prompt=True,
    )

    print("=> Generating answer...")
    # Generate the answer using the LLM
    answer = llm(final_prompt)[0]["generated_text"]
    return answer, relevant_docs


# Example query to get answer from the system




# Load the pre-trained model for text generation
# (the previous value "Hugg" was a truncated model id; this is the id the file uses
# for the same reader model further down)
READER_MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"

# 4-bit NF4 quantization with double quantization; computations run in bfloat16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(READER_MODEL_NAME, quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained(READER_MODEL_NAME)

# Text-generation pipeline: low-temperature sampling, a mild repetition penalty,
# at most 500 new tokens, and only the newly generated text is returned
READER_LLM = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    do_sample=True,
    temperature=0.2,
    repetition_penalty=1.1,
    return_full_text=False,
    max_new_tokens=500,
)

# Example query to get answer from the system
question = "Mention all Anonymous_Uid whose Drugname are AUROFLOX EYE DROPS?"
answer, relevant_docs = answer_with_rag(question, READER_LLM, KNOWLEDGE_VECTOR_DATABASE)
print(f"Answer: {answer}")

In [None]:
from tqdm.notebook import tqdm
import pandas as pd
from typing import Optional, List, Tuple
from datasets import Dataset
import matplotlib.pyplot as plt
from langchain.docstore.document import Document as LangchainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
from transformers import pipeline
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import numpy as np
from ragatouille import RAGPretrainedModel

# Load dataset from the specified path
data_path = "/content/Patient_data.csv"
hospital_data = pd.read_csv(data_path)

# Create one document per CSV row: page_content names the row's Anonymous_Uid and
# every other column goes into metadata, with missing values stored as the string "NULL".
# NOTE(review): only page_content is handed to the splitter further down, so the
# metadata columns presumably never reach the embedded text -- confirm this is intended.
RAW_KNOWLEDGE_BASE = [
    LangchainDocument(
        page_content=f"Details for Anonymous_Uid: {doc['Anonymous_Uid']}",
        metadata={k: v if pd.notna(v) else "NULL" for k, v in doc.items() if k != "Anonymous_Uid"}
    )
    for doc in tqdm(hospital_data.to_dict(orient="records"))
]

# Use a custom text splitter so chunk sizes are measured in tokenizer tokens, not characters
class TokenizerTextSplitter(RecursiveCharacterTextSplitter):
    """Split text into overlapping windows of at most ``chunk_size`` tokens.

    The text is tokenized with the supplied tokenizer, the token ids are cut
    into windows that advance by ``chunk_size - chunk_overlap`` tokens, and each
    window is decoded back into a string.

    NOTE(review): chunks are produced by decoding token ids, so they are not
    guaranteed to reproduce the input character-for-character (that depends on
    the tokenizer) -- confirm this is acceptable for downstream consumers.
    """

    def __init__(self, tokenizer, chunk_size, chunk_overlap, **kwargs):
        """Keep the tokenizer and forward the sizing options to the base splitter.

        Args:
            tokenizer: Callable returning a mapping with ``"input_ids"`` and
                exposing ``decode``.
            chunk_size: Maximum number of tokens per chunk.
            chunk_overlap: Number of tokens shared by consecutive chunks.
            **kwargs: Extra options passed to ``RecursiveCharacterTextSplitter``.
        """
        super().__init__(chunk_size=chunk_size, chunk_overlap=chunk_overlap, **kwargs)
        self.tokenizer = tokenizer

    def split_text(self, text: str) -> List[str]:
        """Return the decoded token windows of ``text``.

        Args:
            text: The text to split.

        Returns:
            The decoded, whitespace-stripped chunks in order. The list is empty
            when the tokenizer produces no tokens for ``text``.

        Raises:
            ValueError: If ``chunk_overlap`` is not smaller than ``chunk_size``,
                because the window could then never advance.
        """
        print("\n=== Debug: TokenizerTextSplitter ===")
        print(f"Original text: {text[:10]}...")  # Preview only: first 10 characters

        # Tokenize the text (no special tokens, no truncation) and keep the ids
        tokens = self.tokenizer(
            text, truncation=False, add_special_tokens=False, return_offsets_mapping=False
        )["input_ids"]
        print(f"Tokenized length: {len(tokens)}")

        # Distance between the starts of two consecutive windows
        chunk_size = self._chunk_size
        step = chunk_size - self._chunk_overlap
        if step <= 0:
            raise ValueError(
                f"chunk_overlap ({self._chunk_overlap}) must be smaller than "
                f"chunk_size ({chunk_size})"
            )

        chunks = []
        for start in range(0, len(tokens), step):
            chunks.append(tokens[start: start + chunk_size])
            # Stop once a window reaches the end of the tokens; the next window
            # would only repeat tokens this one already covers.
            if start + chunk_size >= len(tokens):
                break
        print(f"Number of chunks created: {len(chunks)}")

        # Decode tokens back into readable text chunks
        decoded_chunks = [self.tokenizer.decode(chunk, skip_special_tokens=True).strip() for chunk in chunks]
        if decoded_chunks:  # nothing to preview when the text produced no tokens
            print(f"Decoded first chunk: {decoded_chunks[0][:10]}...")  # Preview only: first 10 characters
        return decoded_chunks


# Split documents into chunks
def split_documents(
    chunk_size: int,
    knowledge_base: List[LangchainDocument],
    tokenizer_name: str = "BAAI/bge-large-en-v1.5"  # Use new tokenizer model here
) -> List[LangchainDocument]:
    """Split every document of ``knowledge_base`` into token-bounded chunks.

    Args:
        chunk_size: Maximum number of tokens per chunk; consecutive chunks
            overlap by one tenth of this value.
        knowledge_base: Documents whose ``page_content`` is split.
        tokenizer_name: Name passed to ``AutoTokenizer.from_pretrained`` to load
            the tokenizer that measures chunk sizes.

    Returns:
        One document per chunk, each carrying its own copy of the source
        document's metadata. A chunk is kept only once per distinct
        (text, metadata) combination, in first-seen order.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    text_splitter = TokenizerTextSplitter(
        tokenizer=tokenizer,
        chunk_size=chunk_size,
        chunk_overlap=chunk_size // 10,
        add_start_index=True,
        strip_whitespace=True,
    )

    print("\n=== Debug: split_documents ===")
    unique_docs = {}
    for doc_index, doc in enumerate(knowledge_base, start=1):
        print(f"\nProcessing document {doc_index}:")
        print(f"Original Content: {doc.page_content[:10]}...")  # Debug preview: first 10 characters

        # The metadata is part of the de-duplication key: two source rows may
        # yield the same text while describing different records, and keying on
        # the text alone would silently drop all but one of them.
        metadata_key = repr(sorted(doc.metadata.items(), key=lambda item: str(item[0])))

        # Split the document content into chunks
        for chunk_index, chunk in enumerate(text_splitter.split_text(doc.page_content), start=1):
            print(f"Chunk {chunk_index}: {chunk[:10]}...")  # Debug preview: first 10 characters

            dedup_key = (chunk, metadata_key)
            if dedup_key in unique_docs:
                continue
            # Copy the metadata so chunks never share one mutable dict
            unique_docs[dedup_key] = LangchainDocument(page_content=chunk, metadata=dict(doc.metadata))

    print(f"\nTotal unique documents processed: {len(unique_docs)}")
    return list(unique_docs.values())


# Chunk the raw knowledge base into pieces of at most 512 tokens
docs_processed = split_documents(512, RAW_KNOWLEDGE_BASE)

# Histogram of chunk sizes, counted in tokens of the "BAAI/bge-large-en-v1.5" tokenizer
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-large-en-v1.5")
lengths = [
    len(tokenizer.encode(chunk.page_content))
    for chunk in tqdm(docs_processed)
]
fig = pd.Series(lengths).hist()
plt.title("Distribution of document lengths in the knowledge base (in count of tokens)")
plt.show()

# Embedding model shared by the chunks and the queries: "BAAI/bge-large-en-v1.5" on the
# GPU, with multi-process encoding enabled and normalized output vectors requested.
embedding_model = HuggingFaceEmbeddings(
    encode_kwargs={"normalize_embeddings": True},
    model_kwargs={"device": "cuda", "trust_remote_code": True},
    multi_process=True,
    model_name="BAAI/bge-large-en-v1.5",
)

# Build the FAISS vector store over every processed chunk, using the cosine distance strategy
KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(
    documents=docs_processed,
    embedding=embedding_model,
    distance_strategy=DistanceStrategy.COSINE,
)

# Example question and its embedding, produced by the same embedding model
user_query = "Mention all Anonymous_Uid whose drugname is TOBA DROPS ?"
query_vector = embedding_model.embed_query(user_query)

# Function to perform RAG (Retrieval Augmented Generation) to answer user queries
def answer_with_rag(
    question: str,
    llm: pipeline,
    knowledge_index: FAISS,
    reranker: Optional[RAGPretrainedModel] = None,
    num_retrieved_docs: int = 30,
    num_docs_final: int = 5
) -> Tuple[str, List[LangchainDocument]]:
    """Answer ``question`` from documents retrieved out of ``knowledge_index``.

    Args:
        question: The user question.
        llm: Text-generation pipeline; called with the prompt string. Its
            ``tokenizer`` attribute, when present, builds the chat prompt.
        knowledge_index: Vector store queried with ``similarity_search``.
        reranker: Optional reranker; when given, the retrieved documents are
            reordered by ``reranker.rerank`` before being cut down.
        num_retrieved_docs: Number of documents fetched from the index.
        num_docs_final: Number of documents placed in the prompt.

    Returns:
        The generated answer and the documents that were put in the context.
    """
    print("=> Retrieving documents...")
    # Retrieve the most relevant documents
    relevant_docs = knowledge_index.similarity_search(query=question, k=num_retrieved_docs)

    # Print the retrieved documents with metadata
    print("\n=== Retrieved Relevant Documents ===")
    for i, doc in enumerate(relevant_docs):
        print(f"Document {i+1}:")
        print(f"Page Content: {doc.page_content}")
        print(f"Metadata: {doc.metadata}")
        print("-" * 50)

    # Rerank the documents if a reranker is provided
    if reranker and relevant_docs:
        print("=> Reranking documents...")
        # The reranker works on plain text, so hand it the page contents and map
        # every hit back to its Document; the context built below needs both
        # page_content and metadata.
        candidate_texts = [doc.page_content for doc in relevant_docs]
        ranked = reranker.rerank(
            question, candidate_texts, k=min(num_docs_final, len(candidate_texts))
        )
        used_positions = set()
        reranked_docs = []
        for hit in ranked or []:
            position = hit.get("result_index")
            if position is None:
                # No index in the hit: match on the text instead, skipping
                # documents already taken so equal texts stay distinct.
                position = next(
                    (
                        idx
                        for idx, text in enumerate(candidate_texts)
                        if idx not in used_positions and text == hit["content"]
                    ),
                    None,
                )
            if position is None:
                continue
            used_positions.add(position)
            reranked_docs.append(relevant_docs[position])
        relevant_docs = reranked_docs

    # Limit the number of documents to the final number required
    relevant_docs = relevant_docs[:num_docs_final]

    # Construct the context from the retrieved documents
    context = "\nExtracted documents:\n" + "".join(
        [f"Document {str(i+1)}:\nContent: {doc.page_content}\nMetadata: {doc.metadata}\n" for i, doc in enumerate(relevant_docs)]
    )

    # Prefer the tokenizer attached to the pipeline so the chat template matches
    # the generating model; the module-level `tokenizer` is only a fallback
    # because it is rebound to different tokenizers elsewhere in this file.
    chat_tokenizer = getattr(llm, "tokenizer", None)
    if chat_tokenizer is None:
        chat_tokenizer = tokenizer

    # Build the final prompt for the LLM
    final_prompt = chat_tokenizer.apply_chat_template(
        [
            {"role": "system", "content": "Use the context to answer the user's question.Also don't be case sensitive"},
            {"role": "user", "content": f"Context:\n{context}\n---\nQuestion: {question}"},
        ],
        tokenize=False,
        add_generation_prompt=True,
    )

    print("=> Generating answer...")
    # Generate the answer using the LLM
    answer = llm(final_prompt)[0]["generated_text"]
    return answer, relevant_docs


# Reader (generator) model: Zephyr-7B-beta loaded with 4-bit quantization
READER_MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"

# NF4 4-bit weights with double quantization; computations run in bfloat16
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)
model = AutoModelForCausalLM.from_pretrained(
    READER_MODEL_NAME,
    quantization_config=bnb_config,
)
tokenizer = AutoTokenizer.from_pretrained(READER_MODEL_NAME)

# Text-generation pipeline: low-temperature sampling, a mild repetition penalty,
# at most 500 new tokens, and only the newly generated text is returned
READER_LLM = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=500,
    return_full_text=False,
    repetition_penalty=1.1,
    temperature=0.2,
    do_sample=True,
)

# Run one example question through the full retrieve-then-generate flow
question = "Mention all Anonymous_Uid whose drugname is TOBA DROPS ?"
answer, relevant_docs = answer_with_rag(
    question=question,
    llm=READER_LLM,
    knowledge_index=KNOWLEDGE_VECTOR_DATABASE,
)
print(f"Answer: {answer}")
