In [44]:
# Install necessary libraries
# !pip install PyPDF2 langchain-text-splitters unstructured==0.14.5 unstructured[pdf]==0.14.5 openpyxl transformers bitsandbytes accelerate pandas faiss-gpu scikit-learn
# Import necessary libraries
import os
import pandas as pd
import numpy as np
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

In [45]:
# Read the disease/symptom spreadsheet into a DataFrame
data = pd.read_excel("Disease_symptom_and_patient_profile_dataset.xlsx")

# Splitter settings: chunks of at most 5000 characters, with 100 characters
# shared between consecutive chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=100)

# Collects the chunk Documents produced for every row
chunks = []

for _, record in data.iterrows():
    # Render the row as space-separated "column: value" pairs, skipping missing cells
    fields = []
    for column in data.columns:
        value = record[column]
        if pd.notna(value):
            fields.append("{}: {}".format(column, str(value)))
    row_text = " ".join(fields)

    # Wrap the row text in a Document, split it, and keep whatever chunks come back
    chunks += text_splitter.split_documents([Document(page_content=row_text)])

# Now `chunks` contains all your row content, split into chunks with overlap where needed


In [None]:

# Set up embeddings and vector store
persist_directory = r"embedd_vecdb_bge-m3"
model_path = "BAAI/bge-m3"
embeddings = HuggingFaceEmbeddings(model_name=model_path)

# Build the index on the first run and reuse it afterwards, so the notebook
# survives Restart & Run All without manually (un)commenting code.
# `save_local` writes "index.faiss" (default index name) into the directory.
index_file = os.path.join(persist_directory, "index.faiss")

if os.path.exists(index_file):
    # SECURITY: allow_dangerous_deserialization=True unpickles the stored docstore.
    # Only load an index that was produced locally by this notebook.
    vector_db = FAISS.load_local(
        persist_directory, embeddings, allow_dangerous_deserialization=True
    )
    print("Database loaded successfully.")
else:
    # Create and save FAISS vector store from the row chunks
    vector_db = FAISS.from_documents(chunks, embeddings)
    vector_db.save_local(persist_directory)
    print("Database created and persisted successfully.")

In [41]:
# Fetch the Qwen instruct checkpoint: tokenizer first, then the causal-LM weights
model_id = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",   # dtype is chosen by the library
    device_map="auto",    # device placement is chosen by the library
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [49]:
# Define the function for LLM-based answer generation
def qwen_llm(top_docs, question):
    """Answer ``question`` with the chat model, using retrieved documents as context.

    Uses the module-level ``tokenizer`` and ``model`` objects.

    Args:
        top_docs: Retrieved context. May be an iterable of objects exposing a
            ``page_content`` attribute (e.g. the result of
            ``vector_db.similarity_search``), an iterable of strings, a single
            string, or ``None`` for no context.
        question: The user question to answer.

    Returns:
        The decoded model reply (prompt tokens removed, special tokens skipped).
    """
    # Interpolating the raw list would put Document reprs
    # ("[Document(page_content=...") into the prompt; pass only the text.
    if top_docs is None:
        context = ""
    elif isinstance(top_docs, str):
        context = top_docs
    else:
        context = "\n\n".join(
            str(getattr(doc, "page_content", doc)) for doc in top_docs
        )

    messages = [
        {"role": "system", "content": "You are a helpful medical assistant."},
        {"role": "user", "content": f"""
        Please provide a medical prediction based on the information given.
        Context: '''{context}'''
        Question: '''{question}'''
        """}
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    generated_ids = model.generate(**model_inputs, max_new_tokens=512)

    # Drop the echoed prompt tokens so only the newly generated answer is decoded
    prompt_length = model_inputs.input_ids.shape[1]
    response = tokenizer.batch_decode(generated_ids[:, prompt_length:], skip_special_tokens=True)[0]
    return response

In [50]:
# Run one sample question end-to-end: retrieve the single closest record,
# then let the LLM answer with that record as context
question = "What is the outcome if a patient has fever and fatigue?"
top_docs = vector_db.similarity_search(question, k=1)
response = qwen_llm(top_docs=top_docs, question=question)
print("LLM Response:", response)

LLM Response: Based on the provided context, the document indicates that for a patient with Influenza, the symptoms include fever, cough, fatigue, difficulty breathing, and the patient is 30 years old, female, with normal blood pressure and cholesterol levels. The outcome variable is positive, suggesting that the patient likely has Influenza.

**Medical Prediction:** If a patient has fever and fatigue, along with other symptoms such as cough, difficulty breathing, and is confirmed to have Influenza, the outcome is positive, indicating the patient has Influenza. It's important for the patient to seek appropriate medical care, rest, and possibly receive antiviral medications as prescribed by a healthcare provider.


In [54]:
# Create a synthetic dataset for evaluation
def generate_synthetic_data(df, num_samples=100, seed=None):
    """Build a DataFrame of randomly generated patient records.

    Only ``Disease`` is drawn from ``df``; every other attribute is an
    independent 50/50 coin flip (or a uniform integer for ``Age``), not a
    sample from the distribution of ``df``. ``Outcome`` is deterministic:
    "Positive" exactly when both ``Fever`` and ``Fatigue`` are "Yes".

    Args:
        df: Source DataFrame; must contain a "Disease" column with at least
            one row when ``num_samples`` is greater than zero.
        num_samples: Number of synthetic rows to generate.
        seed: Optional integer seed. When given, a private
            ``numpy.random.RandomState`` is used so the output is reproducible
            and NumPy's global random state is left untouched. When ``None``
            (default), NumPy's global generator is used, as before.

    Returns:
        A new DataFrame with columns Disease, Fever, Cough, Fatigue,
        Difficulty, Age, Gender, Blood Pressure, Cholesterol and Outcome.
    """
    # `np.random` (module) and `RandomState` share the `rand`/`randint` API.
    rng = np.random if seed is None else np.random.RandomState(seed)
    sample_state = None if seed is None else rng

    def coin(heads, tails):
        """Return ``heads`` or ``tails`` with equal probability."""
        return heads if rng.rand() > 0.5 else tails

    synthetic_data = []
    for _ in range(num_samples):
        # Draw order is kept stable so seeded runs are repeatable.
        disease = df["Disease"].sample(1, random_state=sample_state).values[0]
        fever = coin("Yes", "No")
        cough = coin("Yes", "No")
        fatigue = coin("Yes", "No")
        difficulty = coin("Yes", "No")
        age = rng.randint(0, 100)  # upper bound is exclusive: ages 0..99
        gender = coin("Male", "Female")
        blood_pressure = coin("Normal", "High")
        cholesterol = coin("Normal", "High")

        # Ground-truth label is a fixed rule, not random
        outcome = "Positive" if fever == "Yes" and fatigue == "Yes" else "Negative"

        synthetic_data.append({
            "Disease": disease,
            "Fever": fever,
            "Cough": cough,
            "Fatigue": fatigue,
            "Difficulty": difficulty,
            "Age": age,
            "Gender": gender,
            "Blood Pressure": blood_pressure,
            "Cholesterol": cholesterol,
            "Outcome": outcome
        })

    return pd.DataFrame(synthetic_data)

# Generate and append synthetic data.
# The dataset is loaded into `data` earlier in this notebook; no `df` variable
# is ever defined, so referencing it fails on a fresh kernel.
synthetic_df = generate_synthetic_data(data, num_samples=100)
full_df = pd.concat([data, synthetic_df], ignore_index=True)

In [59]:
# Expose each row's index label as an explicit "id" column
synthetic_df = synthetic_df.assign(id=synthetic_df.index)

In [60]:
# Display the synthetic evaluation set (the rendered output elides the middle rows)
synthetic_df

Unnamed: 0,Disease,Fever,Cough,Fatigue,Difficulty,Age,Gender,Blood Pressure,Cholesterol,Outcome,id
0,Alzheimer's Disease,Yes,No,Yes,No,52,Male,Normal,Normal,Positive,0
1,Osteoporosis,Yes,Yes,No,No,38,Female,Normal,Normal,Negative,1
2,HIV/AIDS,Yes,Yes,Yes,Yes,87,Female,Normal,Normal,Positive,2
3,Ulcerative Colitis,No,Yes,Yes,Yes,8,Male,Normal,Normal,Negative,3
4,Cataracts,No,Yes,Yes,No,35,Female,Normal,High,Negative,4
...,...,...,...,...,...,...,...,...,...,...,...
95,Stroke,No,Yes,Yes,Yes,58,Male,Normal,Normal,Negative,95
96,Allergic Rhinitis,Yes,Yes,Yes,Yes,93,Female,High,High,Positive,96
97,Hyperthyroidism,No,Yes,Yes,Yes,28,Female,Normal,High,Negative,97
98,Kidney Cancer,No,Yes,No,No,86,Female,Normal,High,Negative,98


In [None]:
# Define functions to calculate hit rate and MRR for evaluation
def hit_rate(predictions, actual_outcomes):
    """Return the fraction of predictions that equal the corresponding actual outcome.

    Args:
        predictions: Iterable of predicted labels.
        actual_outcomes: Sized iterable of true labels (list, tuple, NumPy
            array, pandas Series, ...).

    Returns:
        ``hits / len(actual_outcomes)``, or 0 when ``actual_outcomes`` is empty.
    """
    # Use len() rather than truthiness: `if array` / `if series` raises
    # ValueError for NumPy arrays and pandas Series.
    total = len(actual_outcomes)
    if total == 0:
        return 0
    hits = sum(1 for pred, actual in zip(predictions, actual_outcomes) if pred == actual)
    return hits / total

def mean_reciprocal_rank(predictions, actual_outcomes):
    """Return the mean reciprocal rank of the actual outcome within each prediction.

    Each prediction may be either:
      * a single label, scored 1 if it equals the actual outcome and 0
        otherwise (in this form MRR equals the hit rate), or
      * a ranked ``list``/``tuple`` of candidate labels, best first, scored
        ``1 / rank`` of the first position matching the actual outcome, or 0
        if it does not appear.

    Args:
        predictions: Iterable of labels or ranked candidate lists.
        actual_outcomes: Iterable of true labels, paired positionally with
            ``predictions``.

    Returns:
        The mean of the reciprocal ranks, or 0 when there are no pairs.
    """
    reciprocal_ranks = []
    for pred, actual in zip(predictions, actual_outcomes):
        if pred == actual:
            # Exact match of a single prediction: rank 1
            reciprocal_ranks.append(1.0)
        elif isinstance(pred, (list, tuple)) and actual in pred:
            # Ranked candidates: index() gives the first (best) matching position
            reciprocal_ranks.append(1.0 / (pred.index(actual) + 1))
        else:
            reciprocal_ranks.append(0.0)  # No relevant prediction found
    return np.mean(reciprocal_ranks) if reciprocal_ranks else 0

# Function to evaluate the RAG system
def evaluate_rag(synthetic_df, vector_db, model, tokenizer):
    """Run the retrieve-then-generate pipeline on every row and score the predictions.

    For each row a question is built from the row's ``Fever`` and ``Fatigue``
    values, the five most similar documents are retrieved from ``vector_db``,
    and ``qwen_llm`` produces an answer that is mapped to "Positive" or
    "Negative".

    Args:
        synthetic_df: DataFrame with "Fever", "Fatigue" and "Outcome" columns.
        vector_db: Vector store exposing ``similarity_search(query, k=...)``.
        model: Unused here; kept for interface compatibility. ``qwen_llm``
            is called with only the documents and the question.
        tokenizer: Unused here; kept for interface compatibility.

    Returns:
        Tuple ``(hit_rate, mean_reciprocal_rank)`` of the predictions against
        ``synthetic_df['Outcome']``.
    """
    predictions = []

    for _, row in synthetic_df.iterrows():
        # Construct the query based on synthetic data attributes
        question = f"What is the outcome if a patient has fever: {row['Fever']}, fatigue: {row['Fatigue']}?"

        # Get top documents from the vector DB
        top_docs = vector_db.similarity_search(question, k=5)

        # Get prediction from the LLM and reduce it to a label
        response = qwen_llm(top_docs, question)
        predictions.append(_extract_outcome(response))

    # Calculate metrics against a single materialised copy of the labels
    actual_outcomes = synthetic_df['Outcome'].tolist()
    hr = hit_rate(predictions, actual_outcomes)
    mrr = mean_reciprocal_rank(predictions, actual_outcomes)

    return hr, mrr


def _extract_outcome(response):
    """Map free-text LLM output to "Positive" or "Negative".

    The match is case-insensitive because the model often writes the label in
    lowercase (e.g. "the outcome is positive"). Anything that does not
    mention "positive" is treated as "Negative".
    """
    return "Positive" if "positive" in str(response).lower() else "Negative"

# Score the full RAG pipeline against the synthetic evaluation set
hr, mrr = evaluate_rag(
    synthetic_df=synthetic_df,
    vector_db=vector_db,
    model=model,
    tokenizer=tokenizer,
)

# Report both metrics rounded to two decimals
print(f"Hit Rate: {hr:.2f}")
print(f"Mean Reciprocal Rank: {mrr:.2f}")
