<a href="https://colab.research.google.com/github/EdiNel0407/us-ie-big-data-technologies/blob/main/postblock3/Q5/Postblock3Q5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Question 5.1 – Extract ICD Subset (Lines 6000–12000)
In this step, I used a Bash `sed` command in Colab to extract lines 6000–12000 (inclusive, 6,001 lines in total) of the ICD-10 code set file `ICDCodeSet.csv` into `icd_subset.csv`.

In [1]:
# Sanity check: the raw ICD-10 CSV must already be in the working directory.
import os
import pandas as pd

path = "ICDCodeSet.csv"
file_is_present = os.path.exists(path)
assert file_is_present, "Upload ICDCodeSet.csv to the Colab working directory first."
print("File found:", path)

# Read only the first five records to confirm the column layout.
preview = pd.read_csv(path, nrows=5)
print(preview.head())


File found: ICDCodeSet.csv
  ICDCode                                        Description
0    A000     Cholera due to Vibrio cholerae 01, biovar c...
1    A001     Cholera due to Vibrio cholerae 01, biovar e...
2    A009                               Cholera, unspecified
3   A0100                         Typhoid fever, unspecified
4   A0101                                 Typhoid meningitis


In [2]:
%%bash
# Abort on the first failing command. Without this, a failed sed (for example
# when ICDCodeSet.csv is missing) is masked by the later commands and the cell
# "succeeds" with an empty subset file.
set -euo pipefail

# Copy physical lines 6000-12000 (inclusive) of the CSV into the subset file.
# '12000q' stops sed once the last wanted line has been printed instead of
# scanning the rest of the input.
sed -n '6000,12000p;12000q' ICDCodeSet.csv > icd_subset.csv

# Report how many lines were written and preview the first three of them.
wc -l icd_subset.csv
head -n 3 icd_subset.csv


6001 icd_subset.csv
H05329, Deformity of unspecified orbit due to bone disease
H05331, Deformity of right orbit due to trauma or surgery
H05332, Deformity of left orbit due to trauma or surgery


In [3]:
import pandas as pd

# Read the true header from the original file; the sed slice starts at a data
# row, so icd_subset.csv has no header line of its own.
with open("ICDCodeSet.csv", "r", encoding="utf-8", errors="replace") as f:
    header_line = f.readline().strip()

headers = [h.strip() for h in header_line.split(",")]
print("Detected headers:", headers)

# Load the subset with those headers
df = pd.read_csv("icd_subset.csv", names=headers, header=None, encoding="utf-8", engine="python")

# Basic clean-up.
# Missing descriptions must be dropped BEFORE the cast to str: astype(str)
# turns NaN into the literal text "nan", which would pass the length filter
# below and later be embedded as if it were a real description.
df = (
    df.dropna(subset=["Description"])
      .assign(
          ICDCode=lambda d: d["ICDCode"].astype(str).str.strip(),
          Description=lambda d: d["Description"].astype(str).str.strip(),
      )
)
# Discard rows whose description is empty once whitespace has been stripped.
df = df[df["Description"].str.len() > 0].reset_index(drop=True)

print(df.shape)
df.head(10)


Detected headers: ['ICDCode', 'Description']
(6001, 2)


Unnamed: 0,ICDCode,Description
0,H05329,Deformity of unspecified orbit due to bone dis...
1,H05331,Deformity of right orbit due to trauma or surgery
2,H05332,Deformity of left orbit due to trauma or surgery
3,H05333,Deformity of bilateral orbits due to trauma or...
4,H05339,Deformity of unspecified orbit due to trauma o...
5,H05341,Enlargement of right orbit
6,H05342,Enlargement of left orbit
7,H05343,Enlargement of bilateral orbits
8,H05349,Enlargement of unspecified orbit
9,H05351,Exostosis of right orbit


In [4]:
!pip -q install faiss-cpu sentence-transformers


In [5]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Compact general-purpose sentence encoder.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# One text per row of df, in row order.
descriptions = df["Description"].tolist()

# Encode to unit-length vectors, then cast to float32 for FAISS.
embeddings = model.encode(
    descriptions,
    convert_to_numpy=True,
    normalize_embeddings=True,
)
embeddings = embeddings.astype("float32")
embeddings.shape


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


(6001, 384)

In [6]:
import faiss
import numpy as np

# Vector dimension = width of the embedding matrix.
d = embeddings.shape[-1]

# Flat inner-product index. The embeddings were encoded with
# normalize_embeddings=True, so inner product equals cosine similarity.
index = faiss.IndexFlatIP(d)
index.add(embeddings)

print("Index size:", index.ntotal)


Index size: 6001


In [7]:
def search_icd(query_text: str, top_k: int = 5):
    """Return up to ``top_k`` ICD rows whose descriptions best match ``query_text``.

    The query is encoded with the same model and normalisation settings used
    for the indexed descriptions and then looked up in the module-level FAISS
    ``index``. Hit positions are resolved against the module-level ``df``.

    Args:
        query_text: Free-text description to search for.
        top_k: Maximum number of hits to return. Values below 1 yield an empty
            list; values above the index size are clamped to the index size.

    Returns:
        A list of dicts, in the order returned by FAISS, each with the keys
        ``rank`` (1-based position in the returned list), ``ICDCode``,
        ``Description`` and ``score`` (a float).
    """
    # Never ask for more neighbours than the index holds, and treat a
    # non-positive request (or an empty index) as "no results".
    k = min(int(top_k), index.ntotal)
    if k < 1:
        return []

    # Encode the query the same way as the indexed descriptions
    q_vec = model.encode([query_text], convert_to_numpy=True, normalize_embeddings=True).astype("float32")
    scores, idxs = index.search(q_vec, k)

    hits = []
    for score, i in zip(scores[0], idxs[0]):
        if i == -1:  # placeholder for "no neighbour found"
            continue
        row = df.iloc[int(i)]  # one positional lookup per hit
        hits.append({
            "rank": len(hits) + 1,
            "ICDCode": row["ICDCode"],
            "Description": row["Description"],
            "score": float(score),
        })
    return hits

# Quick smoke test: print each of the five closest matches on its own line
smoke_hits = search_icd("severe headache with nausea", top_k=5)
for h in smoke_hits:
    print(h)


{'rank': 1, 'ICDCode': 'K910', 'Description': 'Vomiting following gastrointestinal surgery', 'score': 0.4514538645744324}
{'rank': 2, 'ICDCode': 'H81311', 'Description': 'Aural vertigo, right ear', 'score': 0.3898845911026001}
{'rank': 3, 'ICDCode': 'H81312', 'Description': 'Aural vertigo, left ear', 'score': 0.3850812017917633}
{'rank': 4, 'ICDCode': 'H5712', 'Description': 'Ocular pain, left eye', 'score': 0.37837085127830505}
{'rank': 5, 'ICDCode': 'J690', 'Description': 'Pneumonitis due to inhalation of food and vomit', 'score': 0.37140971422195435}


In [8]:
def icd(query, k=5):
    """Return the ``search_icd`` hits for ``query`` as a DataFrame.

    Args:
        query: Free-text description to search for.
        k: Maximum number of hits to request from ``search_icd``.

    Returns:
        A DataFrame with the columns ``rank``, ``ICDCode``, ``Description``
        and ``score``, one row per hit. The columns are present even when
        there are no hits, so downstream code can rely on the schema.
    """
    columns = ["rank", "ICDCode", "Description", "score"]
    return pd.DataFrame(search_icd(query, k), columns=columns)

# Example query: show the five best matches (the default k) as a table
icd("painful swelling of right orbit", k=5)


Unnamed: 0,rank,ICDCode,Description,score
0,1,H05331,Deformity of right orbit due to trauma or surgery,0.694082
1,2,H05332,Deformity of left orbit due to trauma or surgery,0.682345
2,3,H05339,Deformity of unspecified orbit due to trauma o...,0.632626
3,4,H05333,Deformity of bilateral orbits due to trauma or...,0.600028
4,5,H05329,Deformity of unspecified orbit due to bone dis...,0.59437


In [9]:
# Save
# The FAISS index stores vectors only; search results are positions that are
# resolved with df.iloc, so the two artefacts are only useful as an aligned
# pair. Refuse to persist them if they have drifted apart (df is rebound
# elsewhere in this notebook).
assert index.ntotal == len(df), "FAISS index and df are out of sync; rebuild both before saving."
faiss.write_index(index, "icd_faiss.index")
df.to_parquet("icd_subset.parquet", index=False)

# Reload later (fresh session)
# index = faiss.read_index("icd_faiss.index")
# df = pd.read_parquet("icd_subset.parquet")


### Question 5.2 – Store Embeddings
Store embeddings of these data in an in-memory FAISS vector store and output the total number of records in the index.

In [10]:
# Install dependencies (run in Colab)
!pip -q install faiss-cpu sentence-transformers

# Build embeddings for the previously created subset (icd_subset.csv)
from sentence_transformers import SentenceTransformer
import pandas as pd, numpy as np, faiss

# Load the subset created with sed (lines 6000–12000)
df = pd.read_csv("icd_subset.csv", names=["ICDCode","Description"], header=None)

# Create sentence embeddings (L2-normalized so inner product = cosine similarity)
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
emb = model.encode(
    df["Description"].astype(str).tolist(),
    convert_to_numpy=True,
    normalize_embeddings=True
).astype("float32")

# In-memory FAISS index
index = faiss.IndexFlatIP(emb.shape[1])  # IP with normalized vectors -> cosine
index.add(emb)

print("Index size:", index.ntotal)


Index size: 6001


### Question 5.3 – Natural-language query
Create a function that takes a natural language query as input and which returns the best matching ICD-10 code and its description.

In [11]:
# Assumes `df`, `model`, and `index` already exist (built in Q5.1 and rebuilt in Q5.2).
from typing import Tuple
import numpy as np
import pandas as pd
import faiss

def best_icd_match(query: str) -> Tuple[str, str, float]:
    """Return (ICDCode, Description, score) for the top FAISS hit.

    The query is encoded with the module-level ``model`` (normalised
    embeddings, float32) and the single nearest neighbour is fetched from the
    module-level ``index``. The hit position is resolved against ``df``.

    Args:
        query: Free-text description to match.

    Returns:
        A tuple of the matched code, its description (both with surrounding
        whitespace removed) and the similarity score as a float.

    Raises:
        LookupError: If the index reports no neighbour for the query.
    """
    q_vec = model.encode([query], convert_to_numpy=True, normalize_embeddings=True).astype("float32")
    scores, idxs = index.search(q_vec, 1)
    i = int(idxs[0][0])
    if i < 0:
        # A negative position means "no result"; df.iloc[-1] would silently
        # return the LAST row of the table as if it were the best match.
        raise LookupError("FAISS index returned no match; is the index empty?")
    row = df.iloc[i]
    # Strip so a df loaded without clean-up does not leak padding into results.
    return str(row["ICDCode"]).strip(), str(row["Description"]).strip(), float(scores[0][0])

# Run the assignment's sample complaint through the matcher
query = """I’ve had a throbbing pain and a feeling of fullness deep inside my left ear
for the last two days."""

top_match = best_icd_match(query)
icd_code, icd_desc, icd_score = top_match

# Echo the query, then the matched code/description and the rounded score
print("Query:", query)
print("Best match:", icd_code, "-", icd_desc)
print("Score:", round(icd_score, 4))


Query: I’ve had a throbbing pain and a feeling of fullness deep inside my left ear
for the last two days.
Best match: H9222 -   Otorrhagia, left ear
Score: 0.6719
