# STEP 0 — Environment Setup

In [None]:
# Install the third-party packages this notebook relies on (-q keeps the output quiet).
# NOTE(review): versions are unpinned, so a fresh run may resolve different releases;
# consider pinning them and using `%pip` so the install targets the kernel's environment.
# NOTE(review): `langchain` and `pandas` are not imported by any cell visible here — confirm they are needed.
!pip install -q transformers sentence-transformers faiss-cpu langchain datasets numpy pandas

# STEP 1 — Collect & Load Data

In [None]:
from datasets import load_dataset

# Passage collection that becomes the retrieval knowledge base.
corpus = load_dataset(
    path="rag-datasets/rag-mini-bioasq",
    name="text-corpus",
    split="passages",
)

# Question/answer pairs from the same dataset, used later to spot-check answers.
qa_data = load_dataset(
    path="rag-datasets/rag-mini-bioasq",
    name="question-answer-passages",
    split="test",
)

# Show one record to confirm the fields that are available.
print(corpus[0])

{'passage': 'New data on viruses isolated from patients with subacute thyroiditis de Quervain \nare reported. Characteristic morphological, cytological, some physico-chemical \nand biological features of the isolated viruses are described. A possible role \nof these viruses in human and animal health disorders is discussed. The isolated \nviruses remain unclassified so far.', 'id': 9797}


In [None]:
from google.colab import files
# Ask the user to upload file(s) through Colab. A later cell iterates
# `uploaded.keys()` and opens each key as a local file path.
uploaded = files.upload()

Saving dialogs.txt to dialogs (3).txt


# STEP 2 — Preprocess the Data

In [None]:
import re

def clean_text(text):
    """Normalize text before it is chunked and embedded.

    The input is converted with ``str()`` and lower-cased. Every run of
    characters other than ASCII letters, digits and ``? . ! , '`` is then
    replaced by a single space, and surrounding whitespace is stripped.
    """
    lowered = str(text).lower()
    collapsed = re.sub(r"[^a-zA-Z0-9?.!,']+", " ", lowered)
    return collapsed.strip()

# Knowledge base: one normalized chunk per BioASQ passage.
bio_chunks = [clean_text(record["passage"]) for record in corpus]

print("Total BioASQ knowledge chunks:", len(bio_chunks))
print("\nSample chunk:\n", bio_chunks[0])

Total BioASQ knowledge chunks: 40221

Sample chunk:
 new data on viruses isolated from patients with subacute thyroiditis de quervain are reported. characteristic morphological, cytological, some physico chemical and biological features of the isolated viruses are described. a possible role of these viruses in human and animal health disorders is discussed. the isolated viruses remain unclassified so far.


In [None]:
# Concatenate the text of every uploaded file, adding one newline after each.
# The encoding is stated explicitly so decoding does not depend on the
# platform's default locale encoding.
custom_text = ""
for file in uploaded.keys():
    with open(file, "r", encoding="utf-8") as f:
        custom_text += f.read() + "\n"

In [None]:
# Alias the uploaded text under the name the dialog-parsing cell reads from.
dialog_text = custom_text

In [None]:
# Each tab-separated line is treated as "user utterance<TAB>bot reply" and
# becomes one two-line chunk; lines without a tab are ignored.
dialog_chunks = []

for line in dialog_text.split("\n"):
    if "\t" not in line:
        continue
    user, bot = line.split("\t", 1)
    dialog_chunks.append(f"User: {clean_text(user)}\nBot: {clean_text(bot)}")

print("Dialog chunks:", len(dialog_chunks))
print(dialog_chunks[0])

Dialog chunks: 3725
User: hi, how are you doing?
Bot: i'm fine. how about yourself?


# STEP 3 — Chunk the Data

In [None]:
# Provides the `langchain_text_splitters` module imported in the next cell.
# NOTE(review): unpinned; consider pinning and installing together with the other packages.
!pip install -q langchain-text-splitters

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter configured with chunk_size=300 and chunk_overlap=50.
splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)

# Split the normalized uploaded text.
# NOTE(review): `custom_chunks` is not referenced by any later cell visible here.
custom_chunks = splitter.split_text(clean_text(custom_text))

In [None]:
# Source 1: BioASQ passages (`bio_chunks`). Source 2: conversational dialog chunks.
conversation_chunks = dialog_chunks

# Combined knowledge base, BioASQ passages first, so positions below
# len(bio_chunks) are BioASQ passages. Other custom documents (an optional
# third source) are not included here.
all_chunks = [*bio_chunks, *conversation_chunks]

print("Total knowledge chunks:", len(all_chunks))

Total knowledge chunks: 43946


# STEP 4 — Convert Chunks into Embeddings

In [None]:
from sentence_transformers import SentenceTransformer

# Embedding model; the same instance encodes queries at retrieval time.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# Embed every knowledge chunk, showing a progress bar while encoding.
chunk_embeddings = embed_model.encode(all_chunks, show_progress_bar=True)

Batches:   0%|          | 0/1374 [00:00<?, ?it/s]

# STEP 5 — Store Embeddings in a Vector Database (FAISS)

In [None]:
import faiss
import numpy as np

# Flat L2 index sized to the embedding width (second axis of the matrix).
dim = chunk_embeddings.shape[1]
index = faiss.IndexFlatL2(dim)

# Cast to float32 before handing the vectors to the index.
index.add(np.asarray(chunk_embeddings, dtype="float32"))

print("Vectors stored:", index.ntotal)

Vectors stored: 43946


# STEP 6 — Retrieve Relevant Chunks

In [None]:
# Number of BioASQ passages; they occupy positions [0, bio_count) in `all_chunks`.
bio_count = len(bio_chunks)

def retrieve_chunks(query, k=10, top_n=5, max_chars=300):
    """Return the closest BioASQ passages for ``query``.

    The query is embedded with ``embed_model`` and the ``k`` nearest vectors
    are looked up in ``index``. Only hits that point at a BioASQ passage
    (position below ``bio_count``) are kept; dialog chunks are discarded.

    Args:
        query: Question text to embed.
        k: Number of nearest neighbours requested from the index.
        top_n: Maximum number of passages returned (closest first).
        max_chars: Each returned passage is truncated to this many characters.

    Returns:
        A list of at most ``top_n`` passage strings ordered by increasing
        distance. It may be empty when none of the ``k`` hits is a BioASQ
        passage.
    """
    query_vec = embed_model.encode([query]).astype("float32")
    distances, indices = index.search(query_vec, k)

    # FAISS pads the result with label -1 when fewer than k vectors are
    # available. A bare `idx < bio_count` would let -1 through, and Python
    # would then silently return the *last* chunk, so require idx >= 0 too.
    bio_ranked = [
        (dist, all_chunks[idx])
        for dist, idx in zip(distances[0], indices[0])
        if 0 <= idx < bio_count
    ]
    bio_ranked.sort(key=lambda pair: pair[0])

    return [chunk[:max_chars] for _, chunk in bio_ranked[:top_n]]

# STEP 7 — Generate the Answer (LLM — FLAN-T5-LARGE)

In [None]:
from transformers import pipeline

import torch

# Use the first CUDA GPU when one is present; otherwise fall back to the CPU
# (`device=-1`) so this cell also runs on a runtime without a GPU.
generator = pipeline(
    "text2text-generation",
    model="google/flan-t5-large",
    device=0 if torch.cuda.is_available() else -1,
)

Device set to use cuda:0


# STEP 8 — Complete RAG Function

In [None]:
def rag_answer(user_query, k=10):
    """Answer ``user_query`` with retrieval-augmented generation.

    Passages returned by ``retrieve_chunks`` are joined into a context block,
    placed into the prompt together with the question, and passed to
    ``generator``.

    Args:
        user_query: The question text.
        k: Number of nearest neighbours requested from the retriever. The
            default matches the value that used to be fixed in the body.

    Returns:
        The generated text with surrounding whitespace removed.
    """
    retrieved_chunks = retrieve_chunks(user_query, k=k)
    # A blank line separates consecutive passages in the prompt.
    context = "\n\n".join(retrieved_chunks)

    prompt = f"""
I am a knowledgeable medical assistant.

Use the context below to answer the question.
If the context is partially relevant, infer carefully
and give the best possible answer.

Context:
{context}

Question:
{user_query}

Answer:
"""

    # Sampling is disabled and generation is capped at 256 new tokens.
    output = generator(
        prompt,
        max_new_tokens=256,
        do_sample=False
    )

    return output[0]["generated_text"].strip()

In [None]:
# def rag_answer(user_query, k=5):
#     query = user_query
#     # query = clean_text(user_query)

#     # -------- RETRIEVE --------
#     retrieved_chunks = retrieve_chunks(query, k=k)
#     context = "\n\n".join(retrieved_chunks)

#     # -------- AUGMENT --------
#     prompt = f"""
# You are a knowledgeable assistant.

# Answer the question using ONLY the context below.
# If the answer is not found, say:
# "I don't have that information."

# Provide a clear, complete sentence.

# Context:
# {context}

# Question:
# {query}

# Answer:
# """

#     # -------- GENERATE --------
#     output = generator(
#     prompt,
#     max_new_tokens=256,
#     do_sample=False
#     )

#     answer = output[0]["generated_text"].strip()

#     # Safety fallback
#     if answer.lower() in ["1", "true", "false"]:
#         return "I don't have that information."

#     return answer

In [None]:
# Spot-check the pipeline on the first evaluation question.
sample = qa_data[0]
print("Q:", sample["question"])
print("Expected (gold):", sample["answer"])
print("RAG Answer:", rag_answer(sample["question"]))

Q: Is Hirschsprung disease a mendelian or a multifactorial disorder?
Expected (gold): Coding sequence mutations in RET, GDNF, EDNRB, EDN3, and SOX10 are involved in the development of Hirschsprung disease. The majority of these genes was shown to be related to Mendelian syndromic forms of Hirschsprung's disease, whereas the non-Mendelian inheritance of sporadic non-syndromic Hirschsprung disease proved to be complex; involvement of multiple loci was demonstrated in a multiplicative model.
RAG Answer: heterogonous


# STEP 9 — Interactive Chat

In [None]:
def is_small_talk(text):
    """Report whether ``text`` is nothing more than a known greeting.

    The comparison is an exact match after lower-casing and trimming
    surrounding whitespace, so a longer message that merely starts with a
    greeting does not count.
    """
    normalized = text.lower().strip()
    return normalized in ("hi", "hello", "hai", "hey", "heloo")

In [None]:
# The emoji literals below were stored as mis-decoded UTF-8; they are restored here.
print("🤖 Multi-Source RAG Chatbot Ready!")
print("Type 'exit' to stop.\n")

while True:
    user = input("You: ")
    # Normalize once so that "bye " or " EXIT" also end the session, in line
    # with how is_small_talk() trims its input.
    command = user.strip().lower()

    if command in ("exit", "quit", "bye"):
        print("Bot: Goodbye 👋")
        break

    # Blank input carries no question; ask again instead of querying the model.
    if not command:
        continue

    if is_small_talk(user):
        print("Bot: Hello! Please ask a knowledge-based question.")
        continue

    print("Bot:", rag_answer(user.strip()))

🤖 Multi-Source RAG Chatbot Ready!
Type 'exit' to stop.

You: hai how are you ?
Bot: I am fine
You: Who are you really ?
Bot: I am a knowledgeable medical assistant.
You: what is the condition called if thyroid gland produces less hormone ?
Bot: hypothyroidism
You: bye
Bot: Goodbye 👋
