In [5]:
!pip install requests tqdm faiss-cpu transformers tensorflow sentence-transformers textblob gensim

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_6

In [6]:
import os
import requests
import zipfile
from pathlib import Path
from tqdm import tqdm


DATA_DIR = Path("./mimic_textbooks")


def download_and_extract_zip(url, extract_to=DATA_DIR):

    extract_to.mkdir(parents=True, exist_ok=True)


    zip_path = extract_to / "textbooks.zip"
    print("Downloading dataset...")
    response = requests.get(url, stream=True)
    with open(zip_path, "wb") as file:
        for chunk in tqdm(response.iter_content(chunk_size=1024), unit='KB'):
            if chunk:
                file.write(chunk)


    print("Extracting dataset...")
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extract_to)
    print("Dataset downloaded and extracted.")

# Download and extract textbooks
dataset_url = "https://www.dropbox.com/scl/fi/54p9kkx5n93bffyx08eba/textbooks.zip?rlkey=2y2c5x8y0uncnddichn9cmd7n&st=m290nmkk&dl=1"
download_and_extract_zip(dataset_url)


Downloading dataset...


88121KB [00:01, 61856.47KB/s]


Extracting dataset...
Dataset downloaded and extracted.


In [None]:
import re
from gensim.utils import simple_preprocess
from textblob import TextBlob


def load_text_files(directory):
    texts = []
    for file_path in Path(directory).glob("*.txt"):
        with open(file_path, "r", encoding="utf-8") as file:
            texts.append(file.read())
    return texts


def clean_and_tokenize(text):

    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)


    tokens = simple_preprocess(text)
    return ' '.join(tokens)


def correct_spelling(text):
    return str(TextBlob(text).correct())


def chunk_text(text, chunk_size=200):
    words = text.split()
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]


documents = load_text_files(DATA_DIR / "textbooks/en")
cleaned_documents = [clean_and_tokenize(doc) for doc in documents]
corrected_documents = [correct_spelling(doc) for doc in cleaned_documents]
chunked_documents = []
for doc in corrected_documents:
    chunked_documents.extend(chunk_text(doc))

print(f"Total document chunks created: {len(chunked_documents)}")


In [3]:
from google.colab import userdata
userdata.get('HF_TOKEN')

'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'

In [None]:
from transformers import TFAutoModel, AutoTokenizer
import tensorflow as tf
import numpy as np


print("Available devices:", tf.config.list_physical_devices('GPU'))


tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = TFAutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

def get_embeddings_in_batch(texts, batch_size=16):
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]


        inputs = tokenizer(batch_texts, return_tensors="tf", truncation=True, padding=True, max_length=512)


        outputs = model(inputs).last_hidden_state
        batch_embeddings = tf.reduce_mean(outputs, axis=1).numpy()


        all_embeddings.extend(batch_embeddings)

    return np.array(all_embeddings)


embeddings = get_embeddings_in_batch(chunked_documents, batch_size=128)
print(f"Generated embeddings for {len(embeddings)} document chunks.")



In [6]:
import faiss
import numpy as np


dimension = 384
index = faiss.IndexFlatL2(dimension)


embedding_matrix = np.array([embedding.flatten() for embedding in embeddings]).astype('float32')

index.add(embedding_matrix)
print(f"Total embeddings indexed: {index.ntotal}")


Total embeddings indexed: 60061


In [9]:


def get_embedding(text):
    inputs = tokenizer(text, return_tensors="tf", truncation=True, padding=True)
    outputs = model(inputs).last_hidden_state
    return tf.reduce_mean(outputs, axis=1).numpy()



query_text = "What are causes of heart failure?"
query_embedding = get_embedding(query_text)
query_embedding = np.array(query_embedding).reshape(1, -1).astype('float32')


k = 5
distances, indices = index.search(query_embedding, k)


print("Top similar document chunks:")
for idx in indices[0]:
    print(chunked_documents[idx])


Top similar document chunks:
down to six principal mechanisms failure of the pump in the most common situation the cardiac muscle contracts weakly and the chambers cannot empty systolic dysfunction in some cases the muscle cannot relax sufficiently to permit ventricular filling resulting in diastolic dysfunction obstruction to flow lesions that prevent valve opening eg calcific aortic valve stenosis or cause increased ventricular chamber pressures eg systemic hypertension or aortic coarctation can overwork the myocardium which has to pump against the obstruction regurgitant flow valve pathology that allows backward flow of blood results in increased volume workload and may overwhelm the pumping capacity of the affected chambers shunted flow defects congenital or acquired that divert blood inappropriately from one chamber to another or from one vessel to another lead to pressure and volume overloads disorders of cardiac conduction uncoordinated cardiac impulses or blocked conduction pat