In [426]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re
from typing import List, Dict

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

from rank_bm25 import BM25Okapi
import faiss

## Text Preprocessing and Chunking

In [45]:
class TextProcessor:
    def __init__(self, lowe_case=False, 
                 lemmatization=False, 
                 stem=False, 
                 stop_word_removal=False, 
                 split_pattern=r'[ ,.]', 
                 white_space_replace_pattern=r'\s+'):
        
        self.lowe_case = lowe_case
        self.lemmatization = lemmatization
        self.stem = stem
        self.stop_word_removal = stop_word_removal

        self.split_pattern = split_pattern
        self.white_space_replace_pattern = white_space_replace_pattern

        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def remove_white_space(self, text: str) -> str:
        return re.sub(self.white_space_replace_pattern, ' ', text)
    
    def lemmatize(self, words: List[str]) -> List[str]:
        return [self.lemmatizer.lemmatize(word) for word in words]

    def stemm(self, words: List[str]) -> List[str]:
        return [self.stemmer.stem(word) for word in words]

    def remove_stop_words(self, words: List[str]) -> List[str]:
        return [word for word in words if word not in self.stop_words]

    def process_text(self, text: str) -> List[str]:
        if self.lowe_case:
            text = text.lower()
        words = re.split(self.split_pattern, text)
        if self.lemmatization:
            words = self.lemmatize(words)
        if self.stem:
            words = self.stemm(words)
        if self.stop_word_removal:
            words = self.remove_stop_words(words)
        
        processed_text = ' '.join(words)
        processed_text =  self.remove_white_space(processed_text)
        processed_text =  self.remove_white_space(processed_text) # remove white space again
        return processed_text

In [190]:
def chunk_docs(paragraph, k=10, o=3, split_pattern=r'[ ,.]'):
    """
    Splits a paragraph into segments of length k with an overlap of o words.
    
    Parameters:
    - paragraph (str): The long paragraph to split.
    - k (int): The length of each segment in words.
    - o (int): The number of overlapping words between consecutive segments.
    
    Returns:
    - list: A list of paragraph segments.
    """
    words = re.split(split_pattern, paragraph)
    segments = []
    
    # Loop through words in steps of k - o to create overlapping segments
    for i in range(0, len(words), k - o):
        segment = " ".join(words[i:i + k])
        segments.append(segment)
        
        # Stop if the next starting index goes beyond the list length
        if i + k >= len(words):
            break

    return segments

In [226]:
class Preprocessor:
    def __init__(self, lowe_case=False, 
                 lemmatization=False, 
                 stem=False, 
                 stop_word_removal=False, 
                 split_pattern=r'[ ,.]', 
                 white_space_replace_pattern=r'\s+|[.,!@#$%^&*()_+={}\[\]:;"\'|<>,.?/~`\\-]',
                 chunk_mode = False,
                 chunk_size=10,
                 overlap=3):
        self.text_processor = TextProcessor(lowe_case, 
                                            lemmatization, 
                                            stem, 
                                            stop_word_removal, 
                                            split_pattern, 
                                            white_space_replace_pattern)
        self.chunk_mode = chunk_mode
        self.chunk_size = chunk_size
        self.overlap = overlap

    def create_chunk_metadata(self, actual_text: str, preprocessed_text: str , chunk_id: int):
        return {
            'actual_text': actual_text,
            'preprocessed_text': preprocessed_text,
            'chunk_id': chunk_id
        }

    def preprocess(self, text_metatdata: str) -> List[str]:
        """ Preprocess a single text_metatdata. Returns a list of preprocessed texts """
        """
        Input:
        text_metatdata: {
            'text': str,
            'doc_id': str
        }
        """
        # processed_text = self.text_processor.process_text(text)
        # if not self.chunk_mode:
        #     return [processed_text]
        # chunks = chunk_docs(processed_text, self.chunk_size, self.overlap, self.text_processor.split_pattern)


        if not self.chunk_mode:
            return [self.create_chunk_metadata(actual_text = self.text_processor.process_text(text_metatdata['text']), 
                                               preprocessed_text = text_metatdata['text'], 
                                               chunk_id = text_metatdata['text']+"_"+"0")
                                               ]
        
        chunks = chunk_docs(text_metatdata['text'], self.chunk_size, self.overlap, self.text_processor.split_pattern)

        chunks = [self.create_chunk_metadata(actual_text = chunk,
                                             preprocessed_text = self.text_processor.process_text(chunk),
                                             chunk_id = text_metatdata['doc_id']+"_"+str(i)) for i, chunk in enumerate(chunks)]
        
        return chunks
    
    def preprocess_all(self, texts: List[Dict]) -> List[List[str]]:
        """ Preprocess a list of texts. Returns a list of preprocessed texts """
        outputs = []
        for text in texts:
            outputs.extend(self.preprocess(text))
        return outputs

In [227]:
sample_docs = [
    "Concurrent lab workup indicates hypoxemia with a PaO2 of 55 mmHg, mild hypercapnia (PaCO2 of 47 mmHg), and a pH of 7.35 suggestive of compensated respiratory acidosis. Pulmonary function tests (PFTs) demonstrate a decreased FEV1/FVC ratio, confirming obstructive ventilatory impairment. The patient’s peak expiratory flow (PEF) remains under 70% of predicted values, correlating with symptomatic bronchospasm.",
    
    "The patient, a 67-year-old male with a BMI of 32 kg/m², presents with a 12-year history of HTN, T2DM (HbA1c 8.9%), and a recent NSTEMI managed with PCI to the LAD, complicated by LVEF of 38%. Baseline labs show elevated CRP (12 mg/L), BUN (24 mg/dL), and serum Cr of 1.5 mg/dL (eGFR of 52 mL/min/1.73m²). EKG reveals a QRS duration of 132 ms with LBBB morphology, while a TTE indicates moderate MR and dilated LV with an end-diastolic diameter of 6.5 cm. PFTs confirm moderate obstructive lung disease with FEV1 at 60% predicted and an FEV1/FVC ratio of 0.58, with DLCO reduced to 65% of predicted, suggestive of concurrent emphysema secondary to a 45-pack-year smoking history. The patient is currently on dual antiplatelet therapy (DAPT) with ASA 81 mg and clopidogrel 75 mg, high-intensity atorvastatin (80 mg), and ACEi titrated to target dose; however, BP remains uncontrolled (average 158/92 mmHg) despite the addition of HCTZ 25 mg and amlodipine 10 mg daily.",
    
    "The 55-year-old female with a BMI of 28 kg/m² reports progressive dyspnea and orthopnea. BNP levels are elevated at 520 pg/mL, with a serum Na+ of 133 mEq/L and K+ of 5.1 mEq/L. Echocardiogram reveals an LVEF of 25%, along with grade III diastolic dysfunction and severe MR. Coronary angiography identifies significant stenosis in the RCA, requiring DES placement. Pulmonary function tests (PFTs) show a TLC of 80% predicted, with a DLCO at 55%, consistent with restrictive physiology.",
    
    "Patient, a 72-year-old male with known COPD (GOLD stage III) and an FEV1/FVC ratio of 0.52, presents with worsening SOB and peripheral edema. ABG on room air shows pH 7.33, PaCO2 of 60 mmHg, and PaO2 of 62 mmHg, indicating acute-on-chronic hypercapnic respiratory failure. Chest CT reveals bilateral basilar opacities consistent with chronic interstitial changes. Labs notable for elevated BNP (750 pg/mL) and serum Cr 1.3 mg/dL (eGFR of 58 mL/min/1.73m²).",
    
    "A 49-year-old male presents post-syncope with elevated troponin I (0.6 ng/mL) and CK-MB at 15 U/L. EKG shows T-wave inversions in leads V2-V4 and LBBB. Cath reveals a 95% stenosis in the proximal LAD, managed with PCI and BMS. Blood work reveals Hb of 13.5 g/dL, platelets at 250,000/µL, and LFTs showing AST/ALT of 56/63 U/L, suggesting mild hepatocellular injury. The patient is started on beta-blockers, ACEi, and ASA.",
    
    "The 60-year-old male with a 30-pack-year smoking history presents with exertional angina and a recent NSTEMI. Post-PCI with stenting to RCA, ECHO shows an EF of 45%, and LGE on CMR suggests nonviable myocardium. LDL remains elevated at 132 mg/dL despite atorvastatin 40 mg. Labs reveal CRP 14 mg/L and fibrinogen of 480 mg/dL, indicative of systemic inflammation. Dual antiplatelet therapy initiated, with target BP <130/80 mmHg using losartan and thiazide.",
    
    "An 82-year-old female with CKD stage 3 (eGFR 48 mL/min/1.73m²) and longstanding atrial fibrillation presents with a baseline INR of 2.8 on warfarin. TTE shows LA enlargement with mild MR and TR. Recent labs reveal HbA1c of 7.8%, serum Cr at 1.4 mg/dL, and BNP of 480 pg/mL. PFTs reveal FVC at 75% of predicted and FEV1/FVC of 0.65, consistent with obstructive ventilatory defect secondary to COPD.",
    
    "The patient, a 68-year-old with T2DM and HTN, presents with peripheral neuropathy and nephropathy. A1C is elevated at 9.2%, while urine microalbumin is 230 mg/g. Serum Na+ is 136 mEq/L, K+ at 4.6 mEq/L, and fasting lipid panel shows LDL 145 mg/dL, HDL 38 mg/dL. Diabetic retinopathy is confirmed by fundoscopy, showing microaneurysms and hemorrhages. Treatment plan includes SGLT2 inhibitor addition and ACE inhibitor titration.",
    
    "75-year-old male with HFrEF (EF 35%) and NYHA Class III symptoms reports increased exertional fatigue. Baseline labs show BUN 29 mg/dL, serum Cr 1.6 mg/dL, and NT-proBNP at 1400 pg/mL. TTE indicates moderate MR and severe TR with an RVSP of 45 mmHg. CXR reveals cardiomegaly and pulmonary vascular congestion. The patient is on sacubitril/valsartan, metoprolol succinate, and spironolactone.",
    
    "53-year-old female with a BMI of 31 kg/m² and history of recurrent DVT presents with left leg swelling. Doppler US shows acute proximal DVT. Labs notable for elevated D-dimer (5.8 µg/mL), INR of 1.2, and PLT count of 330,000/µL. Hypercoagulable workup pending, though heterozygous Factor V Leiden mutation was previously identified. Anticoagulation initiated with LMWH bridging to warfarin."
]

sample_docs_metadata = [
    {"text": doc, "doc_id": str(i)} for i, doc in enumerate(sample_docs)
]

In [357]:
preprocessor = Preprocessor(lowe_case=True,
                            lemmatization=True,
                            stop_word_removal=True,
                            split_pattern=r'[ ,.]',
                            white_space_replace_pattern=r'\s+|d+|[.,!@#$%^&*()_+={}\[\]:;"\'|<>,.?/~`\\-]',
                            chunk_mode=True,
                            chunk_size=32,
                            overlap=10)

In [358]:
preprocessed_docs = preprocessor.preprocess_all(sample_docs_metadata)

In [359]:
print(len(preprocessed_docs))
print(preprocessed_docs[2])

41
{'actual_text': 'impairment  The patient’s peak expiratory flow (PEF) remains under 70% of predicted values  correlating with symptomatic bronchospasm ', 'preprocessed_text': 'impairment patient’s peak expiratory flow pef remains 70 pre icte value correlating symptomatic bronchospasm ', 'chunk_id': '0_2'}


## Way 1 : Shingling And Binarization

In [432]:
class Shingler:
    def __init__(self, k=2):
        self.k = k
        self.vocab = None

    def _shingle_document(self, doc):
        """Generate k-shingles for a single document (private method)."""
        shingles = set()
        for i in range(len(doc) - self.k + 1):
            shingles.add(doc[i:i + self.k])
        return shingles

    def _create_vocab(self, shingles_list):
        """Create a vocabulary from a list of shingle sets (private method)."""
        vocab = shingles_list[0]
        for shingles in shingles_list[1:]:
            vocab = vocab.union(shingles)
        return list(vocab)  # Convert to list for consistent ordering

    def _create_1hot_encoding(self, shingles):
        """Create a one-hot encoding vector for a single set of shingles."""
        return [1 if v in shingles else 0 for v in self.vocab]

    def build(self, docs):

        """Shingles documents, creates a vocabulary, and produces one-hot encodings."""
        # Step 1: Shingle each document
        shingles_list = [self._shingle_document(doc) for doc in docs]

        # Step 2: Create the vocabulary from shingles
        self.vocab = self._create_vocab(shingles_list)

        # Step 3: Generate one-hot encodings for each document
        one_hot_encodings = [self._create_1hot_encoding(shingles) for shingles in shingles_list]

        return one_hot_encodings
    
    def convert_text_to_one_hot(self, text:str):
        shingles = self._shingle_document(text)
        return self._create_1hot_encoding(shingles)

In [494]:
class LshSimilaritySearch:
    def __init__(self, documents: List[str], k=2, num_bits=256):
        self.original_documents = documents
        self.documents = [doc['preprocessed_text'] for doc in documents]
        self.shingler = Shingler(k=k)
        self.one_hot_encodings = self.shingler.build(self.documents)
        self.one_hot_encodings_numpy = np.array(self.one_hot_encodings)
        self.d = self.one_hot_encodings_numpy.shape[1]
        self.num_bits = num_bits
        self.index = faiss.IndexLSH(self.d, self.num_bits)
        self.index.add(self.one_hot_encodings_numpy)

    def search(self, query: str, top_k=5):
        query_one_hot = self.shingler.convert_text_to_one_hot(query)
        query_one_hot_numpy = np.array(query_one_hot).reshape(1, -1)
        distances, indices = self.index.search(query_one_hot_numpy, top_k)
        results = []
        for i in indices[0]:
            if i != -1:
                results.append(self.original_documents[i])
        return results

In [495]:
lsh_similarity_search = LshSimilaritySearch(preprocessed_docs, k=3, num_bits=256)

In [496]:
query = "Sample AIC value for peripheral neuropathy "
results = lsh_similarity_search.search(query, top_k=5)

In [493]:
for result in results:
    print(f"Document ID: {result['chunk_id']} - Actual Text: {result['actual_text']}")
    print("-" * 100)

Document ID: 7_0 - Actual Text: The patient  a 68-year-old with T2DM and HTN  presents with peripheral neuropathy and nephropathy  A1C is elevated at 9 2%  while urine microalbumin is 230 mg/g  Serum
----------------------------------------------------------------------------------------------------
Document ID: 9_0 - Actual Text: 53-year-old female with a BMI of 31 kg/m² and history of recurrent DVT presents with left leg swelling  Doppler US shows acute proximal DVT  Labs notable for elevated D-dimer (5
----------------------------------------------------------------------------------------------------
Document ID: 6_0 - Actual Text: An 82-year-old female with CKD stage 3 (eGFR 48 mL/min/1 73m²) and longstanding atrial fibrillation presents with a baseline INR of 2 8 on warfarin  TTE shows LA enlargement with mild
----------------------------------------------------------------------------------------------------
Document ID: 3_2 - Actual Text: 62 mmHg  indicating acute-on-chronic hy

## Permuatator

In [364]:
class UnkChecker:
    def __init__(self, dict_path=None):
        self.dict_path = dict_path
        with open(dict_path, "r") as f:
            self.dictionary = set(f.read().splitlines())
    
    def check(self, word: str) -> bool:
        """ Check if a word is in the dictionary. True if the word is in the dictionary, False otherwise. """
        check_flag = word in self.dictionary
        return check_flag

In [365]:
UNK_CHECKER = UnkChecker(dict_path="../dictionary/dict_v0_processed.txt")

In [390]:
def permutate_word(word: str, shingle_length=3):
    """ Creates some permutation of a word """
    if len(word) <= shingle_length:
        return [word]
    permutations = []
    for i in range(len(word) - shingle_length + 1):
        permutations.append(word[i:i + shingle_length])
    return permutations

def permutate_para(text: str, split_pattern=r'[ ,.]', long_word_length=6, shingle_length=3):
    
    assert long_word_length is not None, "long_word_length must be provided"
    assert shingle_length is not None, "shingle_length must be provided"

    """ Creates some permutation of each long and unknown words in the text """
    outout_words = []
    all_words = re.split(split_pattern, text)
    for word in all_words:
        if len(word) > long_word_length and not UNK_CHECKER.check(word):
            outout_words.extend(word)
            outout_words.extend(permutate_word(word))
        else:
            outout_words.append(word)
    
    output_text = ' '.join(outout_words)
    return output_text

## Way 2: TF-IDF

In [417]:
class TfidfSimilaritySearch:
    def __init__(self, preprocessed_documents, permutate_mode=False, long_word_length=None, shingle_length=None):
        self.original_documents = preprocessed_documents
        self.permutate_mode = permutate_mode
        self.long_word_length = long_word_length
        self.shingle_length = shingle_length

        self.documents = [doc['preprocessed_text'] for doc in self.original_documents]
        if self.permutate_mode:
            self.documents = [permutate_para(doc, 
                                             long_word_length=self.long_word_length,
                                             shingle_length=self.shingle_length) for doc in self.documents]

        self.vectorizer = TfidfVectorizer()
        self.tfidf_matrix = self.vectorizer.fit_transform(self.documents)
        self.preprocessor = Preprocessor(lowe_case=True,
                                         lemmatization=True,
                                         stop_word_removal=True,
                                         split_pattern=r'[ ,.]',
                                         white_space_replace_pattern=r'\s+|d+|[.,!@#$%^&*()_+={}\[\]:;"\'|<>,.?/~`\\-]',
                                         chunk_mode=False)

    def search(self, query, top_n=3, query_permute_mode=True):
        # Process the query
        query = self.preprocessor.preprocess({'text': query, 'doc_id': 'query'})[0]['preprocessed_text']

        if query_permute_mode:
         # Permuate long and unk words in the query
            query = permutate_para(query, long_word_length=self.long_word_length, shingle_length=self.shingle_length)

        # Transform the query into TF-IDF vector
        query_tfidf = self.vectorizer.transform([query])
        # Calculate cosine similarities between the query and documents
        similarities = cosine_similarity(query_tfidf, self.tfidf_matrix).flatten()
        # Get top_n most similar documents
        top_indices = np.argsort(similarities)[::-1][:top_n]
        
        return [(self.original_documents[i], similarities[i]) for i in top_indices if similarities[i] > 0]

In [466]:
tfidf_search = TfidfSimilaritySearch(preprocessed_docs, permutate_mode=True, long_word_length=6, shingle_length=4)

In [474]:
query = "Sample AIC value for peripheral neuropathy"

In [475]:
top_results = tfidf_search.search(query, top_n=5, query_permute_mode=False)

In [476]:
for i, (doc, sim) in enumerate(top_results):
    print(f"Result {i+1} (Similarity: {sim:.2f}): {doc['actual_text']} from doc_id: {doc['chunk_id']}")
    print("-" * 100)

Result 1 (Similarity: 0.12): impairment  The patient’s peak expiratory flow (PEF) remains under 70% of predicted values  correlating with symptomatic bronchospasm  from doc_id: 0_2
----------------------------------------------------------------------------------------------------
Result 2 (Similarity: 0.12): Patient  a 72-year-old male with known COPD (GOLD stage III) and an FEV1/FVC ratio of 0 52  presents with worsening SOB and peripheral edema  ABG on room air shows from doc_id: 3_0
----------------------------------------------------------------------------------------------------
Result 3 (Similarity: 0.11): SOB and peripheral edema  ABG on room air shows pH 7 33  PaCO2 of 60 mmHg  and PaO2 of 62 mmHg  indicating acute-on-chronic hypercapnic respiratory failure  Chest from doc_id: 3_1
----------------------------------------------------------------------------------------------------
Result 4 (Similarity: 0.09): The patient  a 68-year-old with T2DM and HTN  presents with peripher

## BM25

In [485]:
class BM25SimilaritySearch:
    def __init__(self, documents, permutate_mode=False, long_word_length=None, shingle_length=None):
        self.original_documents = documents
        self.permutate_mode = permutate_mode
        self.long_word_length = long_word_length
        self.shingle_length = shingle_length
        
        # Apply shingling if needed
        self.documents = [doc['preprocessed_text'] for doc in documents]

        
        if permutate_mode:
            self.documents = [permutate_para(doc, 
                                             long_word_length=long_word_length,
                                             shingle_length=shingle_length) for doc in self.documents]

        # Tokenize documents
        self.tokenized_docs = [self._tokenize(doc) for doc in self.documents]
        self.bm25 = BM25Okapi(self.tokenized_docs)
        
        self.preprocessor = Preprocessor(
            lowe_case=True,
            lemmatization=True,
            stop_word_removal=True,
            split_pattern=r'[ ,.]',
            white_space_replace_pattern=r'\s+|d+|[.,!@#$%^&*()_+={}\[\]:;"\'|<>,.?/~`\\-]',
            chunk_mode=False
        )

    def _tokenize(self, text):
        return re.split(r'\s+', text)

    def search(self, query, top_n=3, query_permute_mode=False):
        # Preprocess the query
        query = self.preprocessor.preprocess({'text': query, 'doc_id': 'query'})[0]['preprocessed_text']

        if query_permute_mode:
            query = permutate_para(query, long_word_length=self.long_word_length, shingle_length=self.shingle_length)

        # Tokenize the query
        tokenized_query = self._tokenize(query)
        # Get BM25 scores
        scores = self.bm25.get_scores(tokenized_query)
        # Get top_n results
        top_indices = np.argsort(scores)[::-1][:top_n]
        
        return [(self.original_documents[i], scores[i]) for i in top_indices if scores[i] > 0]


In [486]:
bm25_search = BM25SimilaritySearch(preprocessed_docs, permutate_mode=True, long_word_length=6, shingle_length=4)

In [487]:
query = "Sample AIC value for peripheral neuropathy"

In [488]:
top_results = bm25_search.search(query, top_n=5)

In [489]:
for i, (doc, sim) in enumerate(top_results):
    print(f"Result {i+1} (Similarity: {sim:.2f}): {doc['actual_text']} from doc_id: {doc['chunk_id']}")
    print("-" * 100)

Result 1 (Similarity: 3.01): Patient  a 72-year-old male with known COPD (GOLD stage III) and an FEV1/FVC ratio of 0 52  presents with worsening SOB and peripheral edema  ABG on room air shows from doc_id: 3_0
----------------------------------------------------------------------------------------------------
Result 2 (Similarity: 2.68): SOB and peripheral edema  ABG on room air shows pH 7 33  PaCO2 of 60 mmHg  and PaO2 of 62 mmHg  indicating acute-on-chronic hypercapnic respiratory failure  Chest from doc_id: 3_1
----------------------------------------------------------------------------------------------------
Result 3 (Similarity: 2.62): impairment  The patient’s peak expiratory flow (PEF) remains under 70% of predicted values  correlating with symptomatic bronchospasm  from doc_id: 0_2
----------------------------------------------------------------------------------------------------
Result 4 (Similarity: 2.33): The patient  a 68-year-old with T2DM and HTN  presents with peripher