In [None]:
!mkdir -p /content/project/data
!mkdir -p /content/project/faiss_index

In [None]:
# Recursively list the project directory to confirm its layout.
!ls -R /content/project

In [None]:
!pip install -q langchain langchain-community faiss-cpu sentence-transformers transformers sacremoses streamlit bitsandbytes

In [None]:
!pip install -q torchvision

In [None]:
!pip install pymupdf

In [None]:
import fitz  # PyMuPDF
import os

# Path to the folder with PDFs
pdf_folder = "/content/project/data"

# Extracted text, one string per PDF, in sorted filename order
all_texts = []

# Sort the listing so the document order (and therefore chunk order) is
# reproducible; os.listdir() returns entries in arbitrary order.
for filename in sorted(os.listdir(pdf_folder)):
    # Case-insensitive match so files such as "REPORT.PDF" are not silently skipped
    if not filename.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(pdf_folder, filename)
    print(f"📘 Extracting from: {filename}")

    # The context manager closes the document even if a page fails to parse
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string page by page
        text = "".join(page.get_text() for page in doc)

    all_texts.append(text)

print(f"\n✅ Extracted text from {len(all_texts)} PDF(s).")

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

# Wrap each PDF's raw text in a LangChain Document so it can be split
documents = [Document(page_content=text) for text in all_texts]

# Splitter configuration: chunks of up to 500 units with 100 units of overlap
# between neighbours (the splitter measures length in characters by default)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100
)

# Perform the split
chunked_docs = text_splitter.split_documents(documents)

print(f"✅ Total chunks created: {len(chunked_docs)}")

# Fail early with an actionable message instead of an opaque IndexError here
# (or a harder-to-read failure later when the vector index is built).
if not chunked_docs:
    raise ValueError(
        "No text chunks were produced. Check that the data folder contains "
        "PDFs with extractable (non-scanned) text."
    )

print(f"🧩 Example chunk:\n\n{chunked_docs[0].page_content[:500]}")

In [None]:
import random

# Show the very first chunk in full
first_chunk = chunked_docs[0]
print("🔹 First chunk:\n", first_chunk.page_content)

# Pick one chunk uniformly at random and show it as well
chunk_idx = random.randrange(len(chunked_docs))
print(f"\n🔹 Random chunk #{chunk_idx}:\n", chunked_docs[chunk_idx].page_content)

In [None]:
!pip install -U langchain-huggingface

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Hugging Face checkpoint used to embed both the document chunks and the queries
embedding_model_name = "pritamdeka/S-PubMedBERT-MS-MARCO"

# Instantiate the LangChain wrapper around the sentence-embedding model
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)

print("✅ Biomedical embedding model loaded.")

In [None]:
# Folder that receives the serialized index
faiss_index_dir = "/content/project/faiss_index"

# Embed every chunk and build the in-memory FAISS index
db = FAISS.from_documents(documents=chunked_docs, embedding=embedding_model)

# Persist the index so later cells (or sessions) can reload it
db.save_local(faiss_index_dir)

print(f"✅ Vector index created and saved! Total chunks embedded: {len(chunked_docs)}")


In [None]:
!pip install -q accelerate

In [None]:
!pip install -q dotenv

In [None]:
from dotenv import load_dotenv
import os
from huggingface_hub import login

# Pull variables from a local .env file (if one exists) into the process environment
load_dotenv()

# Authenticate against the Hugging Face Hub with the token from HF_TOKEN.
# os.getenv returns None when the variable is unset; that value is passed through as-is.
hf_token = os.getenv("HF_TOKEN")
login(hf_token)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_community.llms import HuggingFacePipeline
from transformers import BitsAndBytesConfig

# Replace with the model you prefer
model_id = "mistralai/Mistral-7B-Instruct-v0.2"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# 4-bit quantization (bitsandbytes) to shrink the model's memory footprint
bnb_config = BitsAndBytesConfig(load_in_4bit=True)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto"  # let accelerate place the weights on the available device(s)
)

text_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=300,
    # temperature / top_p only take effect when sampling is enabled; without
    # do_sample=True they can be ignored (with a warning) in favour of greedy decoding.
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.1
)

# Expose the transformers pipeline through LangChain's LLM interface
llm = HuggingFacePipeline(pipeline=text_pipeline)
print("✅ Mistral-Instruct model loaded and ready.")

In [None]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Location of the index persisted by the indexing step
faiss_index_dir = "/content/project/faiss_index"

# NOTE(review): allow_dangerous_deserialization=True opts in to unpickling the
# stored index metadata. Only load index folders you created yourself.
db = FAISS.load_local(
    faiss_index_dir,
    embeddings=embedding_model,
    allow_dangerous_deserialization=True,
)

# Prompt text: the retrieved chunks fill {context}, the user query fills {question}
qa_template = """
You are a trusted biomedical research assistant specialized in neurodegenerative diseases.

Use only the information provided in the context to answer the question.

Do not use prior knowledge or speculate. If the answer is not in the context, say "I don't know."

Answer concisely and professionally, suitable for clinical or academic use.

Context:
{context}

Question:
{question}

Answer:"""

custom_prompt = PromptTemplate(
    template=qa_template,
    input_variables=["context", "question"],
)

# Retrieve the 8 nearest chunks for every query
retriever = db.as_retriever(search_kwargs={"k": 8})

# ✅ Build the RAG chain using the custom prompt
rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type_kwargs={"prompt": custom_prompt},
    return_source_documents=False,
)

print("✅ RAG pipeline is live and ready for Q&A!")

In [None]:
import warnings
# Suppress every UserWarning for the rest of the session to keep cell output readable.
# This is process-wide, so it also hides UserWarnings raised by any library used below.
warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
def clean_output(response_text):
    """Extract the model's answer from raw generated text.

    The raw text may contain the echoed prompt (ending in "Answer:" or
    "Helpful Answer:") and a trailing continuation such as
    "Unhelpful Answer: ...". The answer is the text after the last
    "Answer:" marker that is *not* part of an "Unhelpful Answer:" label,
    cut at the first occurrence of "Unhelpful".

    Args:
        response_text: Raw string returned by the LLM / RAG chain.

    Returns:
        The answer text with surrounding whitespace removed. If no usable
        "Answer:" marker exists, the whole text (cut at "Unhelpful") is used.
    """
    answer_marker = "Answer:"
    stop_marker = "Unhelpful"

    start = 0
    search_end = len(response_text)
    while True:
        pos = response_text.rfind(answer_marker, 0, search_end)
        if pos == -1:
            break
        # Skip markers that belong to "Unhelpful Answer:"; splitting on those
        # would return the unhelpful continuation instead of the real answer.
        if response_text[:pos].rstrip().endswith(stop_marker):
            search_end = pos
            continue
        start = pos + len(answer_marker)
        break

    answer = response_text[start:]
    # Drop anything the model appended after the real answer
    return answer.split(stop_marker, 1)[0].strip()

In [None]:
# Simple interactive Q&A loop; type 'exit' to stop.
while True:
    # Strip whitespace so inputs like "exit " still terminate the loop
    query = input("Ask a question (or type 'exit'): ").strip()
    if not query:
        # Skip blank input rather than spending a retrieval + generation on it
        continue
    if query.lower() == 'exit':
        break
    response = rag_chain.invoke(query)
    print("Answer:", clean_output(response['result']))


## Evaluation

In [None]:
from project.alzheimers_test_set import test_set
from difflib import SequenceMatcher

def evaluate_response(generated, expected):
    """Return a case-insensitive similarity ratio between two strings.

    Both strings are lower-cased and compared with difflib.SequenceMatcher;
    the result is a float in [0, 1], where 1.0 means the strings are identical
    after lower-casing.
    """
    matcher = SequenceMatcher(a=generated.lower(), b=expected.lower())
    return matcher.ratio()

# Run every test question through the RAG chain and score the cleaned answer
# against the expected answer.
results = []
raw_scores = []  # unrounded scores, used for the average so rounding error does not accumulate

for item in test_set:
    query = item["question"]
    expected = item["expected_answer"]

    response = rag_chain.invoke(query)
    generated = clean_output(response["result"])

    score = evaluate_response(generated, expected)
    raw_scores.append(score)

    # Keys must stay exactly these four: the CSV export cell uses them as column names
    results.append({
        "question": query,
        "expected": expected,
        "generated": generated,
        "score": round(score, 2)
    })

# Print individual results
for res in results:
    print(f"\n🧠 Q: {res['question']}")
    print(f"✅ Expected: {res['expected']}")
    print(f"🤖 Generated: {res['generated']}")
    print(f"📈 Similarity Score: {res['score']}\n{'-'*70}")

# Print overall average score (guarded: an empty test set would divide by zero)
if raw_scores:
    average_score = sum(raw_scores) / len(raw_scores)
    print(f"\n🎯 Final Average Similarity Score: {round(average_score * 100, 2)}%")
else:
    average_score = None
    print("\n⚠️ Test set is empty; no average similarity score to report.")

In [None]:
import csv

csv_path = "/content/project/evaluation_results.csv"  # change if needed
columns = ["question", "expected", "generated", "score"]

# One header row followed by one row per evaluated question
with open(csv_path, mode='w', newline='', encoding='utf-8') as csv_file:
    result_writer = csv.DictWriter(csv_file, fieldnames=columns)
    result_writer.writeheader()
    result_writer.writerows(results)

print(f"📁 Evaluation results saved to: {csv_path}")

In [None]:
!pip install streamlit
!npm install -g localtunnel

In [None]:
# Fetch and print the response from loca.lt's "mytunnelpassword" endpoint
# (presumably the password localtunnel asks visitors for — confirm against localtunnel docs).
!curl https://loca.lt/mytunnelpassword


In [None]:
%%writefile /content/launch_streamlit.sh
!streamlit run /content/project/app.py --server.port 8501


In [None]:
import torch
# Release cached GPU memory that PyTorch is holding but not using
# (presumably to make room before the Streamlit app is launched in the next cell).
torch.cuda.empty_cache()

In [None]:
# Start the Streamlit app in the background (`&`) and, in the same shell,
# run localtunnel in the foreground to expose port 8501.
!streamlit run /content/project/app.py --server.port 8501 & npx localtunnel --port 8501

In [None]:
# Snapshot the installed package versions into requirements.txt in the current working directory.
!pip freeze > requirements.txt