## Imports

In [17]:
# === Standard Library Imports ===
import os
import re
import csv
import json
import time
import glob
import pickle
import random
from datetime import datetime
from collections import defaultdict, deque

# === Web & API Requests ===
import requests
from bs4 import BeautifulSoup

# === PDF Handling ===
from PyPDF2 import PdfReader
import pdfplumber

# === Machine Learning & Embeddings ===
import numpy as np
import torch
import faiss
from sentence_transformers import SentenceTransformer

# === OpenAI & Azure OpenAI ===
import tiktoken
import openai
from openai import OpenAIError, RateLimitError, AzureOpenAI
from langchain_openai import AzureOpenAIEmbeddings, AzureOpenAI  # (duplicate module name kept)

# === Google Gemini ===
import google.generativeai as genai

# === ChromaDB ===
import chromadb
from langchain_chroma import Chroma

# === LangChain ===
from langchain_openai import AzureOpenAIEmbeddings  # (duplicate kept as required)

# === Selenium (Web Scraping) ===
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# === Utilities & Progress Bars ===
from tqdm import tqdm

# === Repeated Imports (kept, as requested not to remove) ===
import os
import json
import torch
import time
import tiktoken
import google.generativeai as genai
from tqdm import tqdm
from langchain_chroma import Chroma
from sentence_transformers import SentenceTransformer

# === Faulty Custom Imports (kept but syntactically corrected) ===
from src.utils.file_utils import zip_directory
from src.preprocessing.nlp.web_scraping_utils import *  
from src.preprocessing.nlp.pdf_preprocessing import *      

# === Environment Variables ===
from dotenv import load_dotenv
load_dotenv()

# === Custom Imports (Block 2) ===
from src.utils.file_utils import zip_directory
from src.preprocessing.nlp.web_scraping_utils import *
from src.preprocessing.nlp.pdf_preprocessing import (
    extract_text_from_pdf,
    save_to_json,
    process_multiple_pdfs
)

from src.nlp.utils.tex_cleaning import (
    clean_text,
    is_navigation_item,
    clean_website_data,
    clean_pdf_data,
    clean_list_based_json,
    process_json_files
)

from src.nlp.api.config import openai_client, embedding_function, genai

from src.nlp.rag.chromadb import (
    store_embeddings_in_chroma,
    process_and_store_embeddings,
    store_embeddings_in_chroma_openai,
    process_and_store_embeddings_openai
)

from src.nlp.rag.embeddings_utils import structure_documents, load_all_embeddings
from src.nlp.rag.embeddings import GeminiEmbeddings, MiniLMEmbeddings, get_embedding
from src.nlp.rag.embeddings_openai import structure_documents_openai, get_embedding_openai

from src.nlp.rag.retrieval import (
    retrieve_top_k_chromadb,
    hybrid_retrieval,
    combined_retrieval
)

from src.nlp.prompt.prompt import generate_structured_prompt_tnm

from src.nlp.api.gpt4omini import (
    get_azure_openai_rate_limits,
    enforce_rate_limits_openai,
    generate_response_gpt4o
)

from src.nlp.api.gemini_2flash import generate_response_gemini

from src.nlp.rag.chunking import chunk_text, chunk_text_openai

from src.nlp.rag.rag_pipeline import retrieval_and_response_pipeline

  from tqdm.autonotebook import tqdm, trange


### Dynamic Websites

In [None]:
# Pages to scrape, in the order they are fetched:
#   1. cancer.org (American Cancer Society) lung-cancer guides
#   2. cancer.gov (NCI) PDQ treatment summaries
_ACS_LUNG_ROOT = "https://www.cancer.org/cancer/types/lung-cancer/"
_NCI_LUNG_ROOT = "https://www.cancer.gov/types/lung/"

_acs_pages = [
    # Overview, risk factors, detection and staging
    "about/what-is.html",
    "about/key-statistics.html",
    "causes-risks-prevention/risk-factors.html",
    "causes-risks-prevention/what-causes.html",
    "causes-risks-prevention/prevention.html",
    "detection-diagnosis-staging/detection.html",
    "detection-diagnosis-staging/lung-nodules.html",
    "detection-diagnosis-staging/signs-symptoms.html",
    "detection-diagnosis-staging/how-diagnosed.html",
    "detection-diagnosis-staging/staging-nsclc.html",
    "detection-diagnosis-staging/staging-sclc.html",
    "detection-diagnosis-staging/survival-rates.html",
    # Non-small cell lung cancer treatment
    "treating-non-small-cell/surgery.html",
    "treating-non-small-cell/radiofrequency-ablation.html",
    "treating-non-small-cell/radiation-therapy.html",
    "treating-non-small-cell/tumor-treating-fields.html",
    "treating-non-small-cell/chemotherapy.html",
    "treating-non-small-cell/targeted-therapies.html",
    "treating-non-small-cell/immunotherapy.html",
    "treating-non-small-cell/palliative.html",
    "treating-non-small-cell/by-stage.html",
    # Small cell lung cancer treatment
    "treating-small-cell/chemotherapy.html",
    "treating-small-cell/immunotherapy.html",
    "treating-small-cell/radiation-therapy.html",
    "treating-small-cell/surgery.html",
    "treating-small-cell/palliative.html",
    "treating-small-cell/by-stage.html",
    # "If you have ..." summaries for NSCLC and SCLC
    "if-you-have-non-small-cell-lung-cancer-nsclc.html",
    "if-you-have-small-cell-lung-cancer-sclc.html",
]

_nci_pages = [
    # Patient version
    "patient/non-small-cell-lung-treatment-pdq",
    "patient/small-cell-lung-treatment-pdq",
    # Health professional version
    "hp/non-small-cell-lung-treatment-pdq",
    "hp/small-cell-lung-treatment-pdq",
]

# Full URLs: every ACS page first, then the NCI pages.
urls = [_ACS_LUNG_ROOT + page for page in _acs_pages]
urls += [_NCI_LUNG_ROOT + page for page in _nci_pages]

# Output locations; both live under the same scraped_data folder.
_scraped_root = "/Users/catarinasilva/Desktop/Master Thesis/lung_cancer/LLM lung cancer/scraped_data/"
output_directory = _scraped_root + "websites/"
output_csv_file = _scraped_root + "websites.csv"

# NOTE(review): scrape_and_store_pages is not defined in this notebook; it
# presumably arrives via the wildcard import of web_scraping_utils - confirm.
scrape_and_store_pages(urls, output_directory, output_csv_file)

### PDF

## Data cleaning

## Embedding data - models

**Embedding Models**

| **Model** | **Provider** | **Size** | **Best For** |
|-----------|------------|---------|------------|
| `all-MiniLM-L6-v2` | **Sentence-Transformers** | **22M** | General text retrieval |
| `text-embedding-ada-002` | **OpenAI** | Proprietary | High-quality embeddings for various NLP tasks |
| `gemini-text-embedding-004` | **Google** | Proprietary | Optimized for Google AI applications |

In [2]:
# === Configure Torch to Use MPS (Mac), CUDA (NVIDIA), or CPU ===
# Preference order: Apple MPS first (unchanged behavior on Macs), then CUDA,
# and finally the CPU as the universal fallback.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

#### Get OpenAI limits

In [4]:
# Initialize the Azure OpenAI SDK client.
# `AzureOpenAI` is imported twice at the top of this notebook: first from
# `openai`, then from `langchain_openai`. The later import rebinds the name to
# the LangChain class, so the SDK client class is imported here under an
# unambiguous alias to make sure `openai_client` really is the SDK client.
from openai import AzureOpenAI as AzureOpenAIClient

# NOTE(review): `endpoint`, `subscription_key` and `api_version` are not
# defined in the visible cells; they presumably come from an earlier cell or
# a wildcard import - confirm before running this cell on a fresh kernel.
openai_client = AzureOpenAIClient(
    azure_endpoint=endpoint,
    api_key=subscription_key,
    api_version=api_version,
)

# Configure Azure OpenAI Embeddings (LangChain wrapper), reusing the same
# endpoint, key and API version as the client above.
embedding_function = AzureOpenAIEmbeddings(
    azure_endpoint=endpoint,
    api_key=subscription_key,
    azure_deployment="TextEmbeddings",  # Ensure this is the correct deployment name in Azure
    openai_api_version=api_version,
    max_retries=50
)

# Query the current rate limits; kept as the last expression of the cell so
# the notebook also displays any return value.
get_azure_openai_rate_limits()


Azure OpenAI API Rate Limits:
Requests Per Minute (RPM): 2500
Remaining Requests: 2499
Tokens Per Minute (TPM): 250000
Remaining Tokens: 249997


In [5]:
# Ask the Gemini SDK which models this API key can see and print each
# model's name on its own line.
models = genai.list_models()
for available_model in models:
    print(available_model.name)

models/chat-bison-001
models/text-bison-001
models/embedding-gecko-001
models/gemini-1.0-pro-vision-latest
models/gemini-pro-vision
models/gemini-1.5-pro-latest
models/gemini-1.5-pro-001
models/gemini-1.5-pro-002
models/gemini-1.5-pro
models/gemini-1.5-flash-latest
models/gemini-1.5-flash-001
models/gemini-1.5-flash-001-tuning
models/gemini-1.5-flash
models/gemini-1.5-flash-002
models/gemini-1.5-flash-8b
models/gemini-1.5-flash-8b-001
models/gemini-1.5-flash-8b-latest
models/gemini-1.5-flash-8b-exp-0827
models/gemini-1.5-flash-8b-exp-0924
models/gemini-2.0-flash-exp
models/gemini-2.0-flash
models/gemini-2.0-flash-001
models/gemini-2.0-flash-exp-image-generation
models/gemini-2.0-flash-lite-001
models/gemini-2.0-flash-lite
models/gemini-2.0-flash-lite-preview-02-05
models/gemini-2.0-flash-lite-preview
models/gemini-2.0-pro-exp
models/gemini-2.0-pro-exp-02-05
models/gemini-exp-1206
models/gemini-2.0-flash-thinking-exp-01-21
models/gemini-2.0-flash-thinking-exp
models/gemini-2.0-flash-thi

In [6]:
# === Tokenizer for Chunking ===
# tiktoken encoding, looked up by name.
TOKENIZER_ENCODING_NAME = "cl100k_base"
enc = tiktoken.get_encoding(TOKENIZER_ENCODING_NAME)

### Gemini embeddings + MiniLM embeddings

In [8]:
 # Turn off parallelism in the Hugging Face `tokenizers` library.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# === Set Device to Use Mac MPS (Metal Performance Shaders) ===
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using device: {device}")

# === Define Experimental Parameters ===
CHUNK_OVERLAPS = [0.2]  # Overlap variations

# Define chunk sizes per model type
CHUNK_SIZES = {
    "gemini": [1000],  # Gemini models
    "local": [500]     # Local models (MiniLM, MPNet)
}

# === Storage Paths ===
DATA_FOLDER = "/Users/catarinasilva/Desktop/Master Thesis/lung_cancer/LLM lung cancer/scraped_data/cleaned"
STORAGE_DIR = "/Users/catarinasilva/Desktop/Master Thesis/lung_cancer/LLM lung cancer/embedings"
CHROMA_PATHS = {
    "gemini": os.path.join(STORAGE_DIR, "chroma_db_gemini"),
    "local": os.path.join(STORAGE_DIR, "chroma_db_minilm"),
}

# Create each persistence directory up front (no error if it already exists).
for path in CHROMA_PATHS.values():
    os.makedirs(path, exist_ok=True)

# === Define Embedding Models ===
EMBEDDING_MODELS = {
    "all-MiniLM-L6-v2": "sentence-transformers/all-MiniLM-L6-v2",
    "gemini": "gemini"
}


# === Initialize ChromaDB with the Correct Embedding Classes ===
# The Gemini key must come from the environment. A literal key used to be
# embedded here as the getenv fallback; secrets must never live in source,
# and that previously committed key should be revoked and rotated.
gemini_api_key = os.getenv("GEMINI_API_KEY")
if not gemini_api_key:
    raise RuntimeError(
        "GEMINI_API_KEY is not set; export it or add it to the .env file "
        "read by load_dotenv() before running this cell."
    )

db_gemini = Chroma(
    persist_directory=CHROMA_PATHS["gemini"],
    embedding_function=GeminiEmbeddings(api_key=gemini_api_key),
)
db_minilm = Chroma(
    persist_directory=CHROMA_PATHS["local"],
    embedding_function=MiniLMEmbeddings(device=device),
)

Using device: mps


In [71]:
# Report how many entries each persisted Chroma collection currently holds.
# `_collection` is a non-public attribute of the Chroma wrapper object.
for store_label, store in (("Gemini", db_gemini), ("MiniLM", db_minilm)):
    print(f" Total embeddings stored in ChromaDB - {store_label}:", store._collection.count())

 Total embeddings stored in ChromaDB - Gemini: 1327
 Total embeddings stored in ChromaDB - MiniLM: 1837


### OpenAI embeddings - text-embedding-ada-002

In [8]:
# === Define Azure OpenAI Rate Limits ===
MAX_REQUESTS_PER_MINUTE = 2500
MAX_TOKENS_PER_MINUTE = 250000
# Bounded buffer sized to the request quota.
# NOTE(review): presumably holds recent request timestamps for throttling;
# nothing in the visible cells reads it - confirm against the rate-limit helper.
REQUEST_TIMESTAMPS = deque(maxlen=MAX_REQUESTS_PER_MINUTE)
TOKENS_USED = 0

# === Define Chunking Parameters ===
# This rebinds CHUNK_OVERLAPS / CHUNK_SIZES from the Gemini/MiniLM cell, so
# after this cell runs CHUNK_SIZES only has the "openai" key.
CHUNK_OVERLAPS = [0.2]
CHUNK_SIZES = {
    "openai": [1000]
}

# === Storage Paths ===
DATA_FOLDER = "/Users/catarinasilva/Desktop/Master Thesis/lung_cancer/LLM lung cancer/scraped_data/cleaned"
STORAGE_DIR = "/Users/catarinasilva/Desktop/Master Thesis/lung_cancer/LLM lung cancer/embedings"
CHROMA_DB_DIR = os.path.join(STORAGE_DIR, "chroma_db_openAI")

# Ensure directories exist
os.makedirs(STORAGE_DIR, exist_ok=True)
os.makedirs(CHROMA_DB_DIR, exist_ok=True)

# Initialize ChromaDB.
# The following cell reads `db_openai`, so the store is bound under that name;
# `db` is kept as an alias for any code that still uses the original name.
db_openai = Chroma(persist_directory=CHROMA_DB_DIR, embedding_function=embedding_function)
db = db_openai

In [73]:
# Report how many entries the OpenAI-backed Chroma collection holds.
# `_collection` is a non-public attribute of the Chroma wrapper object.
openai_embedding_count = db_openai._collection.count()
print(" Total embeddings stored in ChromaDB - OpenAI:", openai_embedding_count)

 Total embeddings stored in ChromaDB - OpenAI: 1327


In [11]:
# Archive the persisted OpenAI Chroma store.
# The call is left as the last expression of the cell so the notebook also
# displays whatever zip_directory returns.
directory_to_zip = (
    "/Users/catarinasilva/Desktop/Master Thesis/lung_cancer/LLM lung cancer/embedings/chroma_db_openAI"
)
zip_directory(directory_to_zip)

Directory '/Users/catarinasilva/Desktop/Master Thesis/lung_cancer/LLM lung cancer/embedings/chroma_db_openAI' successfully zipped to '/Users/catarinasilva/Desktop/Master Thesis/lung_cancer/LLM lung cancer/embedings/chroma_db_openAI.zip'


'/Users/catarinasilva/Desktop/Master Thesis/lung_cancer/LLM lung cancer/embedings/chroma_db_openAI.zip'

## Retrieve

**Vector Storage Choice: ChromaDB**
For this experiment, **ChromaDB** will be used as the vector storage system. ChromaDB was selected because:

- It supports **both vector search (embeddings) and lexical search (BM25)**, allowing for **hybrid retrieval**.
- It includes **built-in persistence**, meaning stored embeddings can be reused without recomputation.
- It allows storing **metadata**, which helps track the source of retrieved documents.

This makes ChromaDB a **better choice than FAISS**, which only supports vector search and does not natively store metadata or perform hybrid retrieval.

---

**Models to Experiment With**
| **Model Name** | **Description** |
|--------------|----------------|
| **GPT-4o-mini** | A smaller, cost-efficient variant of OpenAI's GPT-4o, capable of high-quality reasoning and text generation. |
| **GPT-3.5** | A faster and more cost-effective alternative to GPT-4, useful for real-time applications. |
| **Gemini 1.5 Flash** | Optimized for speed and efficiency, suitable for tasks requiring quick responses. |
| **Gemini 1.5 Pro** | More powerful than Gemini Flash, designed for complex reasoning and multimodal tasks. |

---

**Retrieval Methods to Compare**
| **Retrieval Method** | **Description** |
|--------------------|----------------|
| **Embedding-Based Search** | Uses semantic similarity to find relevant documents based on vector representations. |
| **BM25 (Lexical Search)** | A keyword-based retrieval method that ranks documents based on term frequency and inverse document frequency. |
| **Hybrid Search (BM25 + Embeddings)** | Combines BM25 for exact keyword matching with embeddings for semantic understanding. |
| **Re-Ranking (BM25/Embeddings + Ranking Model)** | Uses a second-stage model to refine retrieved results, improving precision for complex queries. |

---

**Best Embedding Models to Try**
| **Embedding Model** | **Description** |
|--------------------|----------------|
| **OpenAI `text-embedding-ada-002`** | OpenAI's most widely used embedding model, optimized for general-purpose retrieval. |
| **Gemini Embeddings** | Google's proprietary embedding model, optimized for retrieval within the Gemini ecosystem. |
| **all-MiniLM-L6-v2** | A small, efficient model that balances speed and accuracy for embedding-based retrieval. |
| **bge-large-en-v1.5** | A larger embedding model designed for improved retrieval accuracy in dense vector search. |

---

This setup will allow for a **direct comparison of GPT vs. Gemini** using multiple retrieval techniques and embedding models to determine the best configuration for the given dataset.

#### Load embeddings

In [4]:
# Re-open the three persisted Chroma stores from disk. No embedding function
# is passed here; the stores are only read back in this section.
_EMBEDDINGS_DIR = "/Users/catarinasilva/Desktop/Master Thesis/lung_cancer/LLM lung cancer/embedings"
db_gemini = Chroma(persist_directory=_EMBEDDINGS_DIR + "/chroma_db_gemini")
db_minilm = Chroma(persist_directory=_EMBEDDINGS_DIR + "/chroma_db_minilm")
db_openai = Chroma(persist_directory=_EMBEDDINGS_DIR + "/chroma_db_openAI")

# Pull the stored entries of each collection into memory.
stored_gemini = load_all_embeddings(db_gemini)
stored_minilm = load_all_embeddings(db_minilm)
stored_openai = load_all_embeddings(db_openai)

# One summary line per store: the number of entries under its 'documents' key.
for provider_label, stored in (
    ("Gemini", stored_gemini),
    ("MiniLM", stored_minilm),
    ("OpenAI", stored_openai),
):
    print(f"Total {provider_label} embeddings: {len(stored['documents'])}")

Total Gemini embeddings: 1327
Total MiniLM embeddings: 1837
Total OpenAI embeddings: 1327


In [5]:
# Retrieve a single stored embedding to check dimensions
sample_gemini = db_gemini._collection.get(include=["embeddings"], limit=1)
sample_minilm = db_minilm._collection.get(include=["embeddings"], limit=1)
sample_openai= db_openai._collection.get(include=["embeddings"], limit=1)

# Extract the first embedding from each (if available)
print("Gemini Embedding Dimensions:", len(sample_gemini["embeddings"][0]))
print("MiniLM Embedding Dimensions:", len(sample_minilm["embeddings"][0]))
print("Open AI Embedding Dimensions:", len(sample_openai["embeddings"][0]))

Gemini Embedding Dimensions: 768
MiniLM Embedding Dimensions: 384
Open AI Embedding Dimensions: 1536


### Response Generation

#### Model response

In [65]:
# Question sent through the pipeline for every configuration.
# NOTE(review): "TMN" in the prompt is likely a typo for "TNM"; it is left
# untouched so the recorded outputs below stay reproducible.
query_text = "Based on the patient data and TMN staging what is the exact stage of the cancer and the indicated course of treatment?"

# Example patient profile.
t_stage, n_stage, m_stage = "T1b", "N3", "M1"
histopath_grade = ""
cancer_type = "Small cell carcinoma"
age, gender = 72, "Female"
additional_info = "Not a smoker"

# Patient fields in the positional order the pipeline call expects.
patient_fields = (
    t_stage, n_stage, m_stage, histopath_grade,
    cancer_type, age, gender, additional_info,
)

# Every (embedding, retrieval, LLM) combination; the embedding varies slowest
# and the LLM fastest, matching a three-level nested loop.
configurations = [
    (embedding, retrieval, llm)
    for embedding in ("gemini", "minilm", "openai")
    for retrieval in ("cosine", "bm25", "combined")
    for llm in ("gpt-4o", "gemini")
]

# Test different embeddings & retrieval methods
for embedding, retrieval, llm in configurations:
    print(f"\nRunning test with: {embedding} embeddings | {retrieval} retrieval | {llm} LLM")
    response = retrieval_and_response_pipeline(
        query_text, embedding, retrieval, llm, *patient_fields
    )
    print(f"Response:\n{response}\n")


Running test with: gemini embeddings | cosine retrieval | gpt-4o LLM
Response:
### 1. Clinical Stage

**AJCC TNM Stage**:
- **T Stage**: T1b (tumor size > 3 cm but ≤ 5 cm, no local invasion)
- **N Stage**: N3 (metastasis to ipsilateral supraclavicular or contralateral mediastinal lymph nodes)
- **M Stage**: M1 (distant metastasis)

Based on the AJCC 8th Edition criteria:
- The combination of T1b, N3, and M1 categorizes this case as **Stage IV**.

**Extensive or Limited Stage Classification**:
- **Small Cell Lung Cancer (SCLC)** is classified as **Extensive-Stage SCLC (ES-SCLC)** when there is any distant metastasis (M1). In this case, since M1 is present, the cancer is classified as **ES-SCLC**.

### 2. Treatment Plan

**Treatment Strategy for Extensive-Stage SCLC**:
1. **Systemic Therapy**:
   - **First-Line Chemotherapy**: 
     - **Carboplatin** plus **Etoposide** is the traditional regimen. 
     - Newer options may include combination regimens like **cisplatin** with **etoposide*