In [59]:
import pandas as pd
import ast
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# === Step 1: Load Data ===
# Both CSV paths are relative, so they are resolved against the kernel's
# current working directory.
diagnosis_df = pd.read_csv("Diagnoses_list.csv")  # Column: Diagnoses_list (stringified list)
icd_df = pd.read_csv("icd10_codes.csv")  # Columns: ICDCode, Description


In [60]:
# Clean column names and description text: stray whitespace would break the
# "ICDCode" / "Description" lookups used by later cells.
icd_df.columns = [col.strip() for col in icd_df.columns]
icd_df['Description'] = icd_df['Description'].str.strip()

# Convert stringified lists to actual Python lists.
# A dtype check cannot guard this step: the column is still `object` after the
# conversion, so re-running the cell would hand real lists to ast.literal_eval
# and raise ValueError. Parsing only str values keeps the cell idempotent and
# lets non-string cells (already-parsed lists, NaN) pass through unchanged.
diagnosis_df["Diagnoses_list"] = diagnosis_df["Diagnoses_list"].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)


In [66]:
# Print every row and column of diagnosis_df as plain text; the display limits
# are lifted only for the duration of the `with` block.
full_display = ('display.max_rows', None, 'display.max_columns', None)
with pd.option_context(*full_display):
    print(diagnosis_df.to_string(index=False))

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [12]:
# === Step 2: Load the sentence embedding model ===
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Pre-compute ICD description embeddings
# Encoded once here so map_diagnosis_to_icd only has to embed the query text.
# map_diagnosis_to_icd indexes icd_df by position in icd_embeddings, so the
# list below must keep the same row order as icd_df.
icd_descriptions = icd_df["Description"].tolist()
icd_embeddings = embedding_model.encode(icd_descriptions, convert_to_tensor=False)


In [13]:
# === Step 3: Function to map diagnosis to ICD code ===
def map_diagnosis_to_icd(diagnosis):
    """Return the ICD row whose description is most similar to `diagnosis`.

    The diagnosis text is embedded with the module-level `embedding_model` and
    compared (cosine similarity) against the pre-computed `icd_embeddings`.

    Returns:
        dict with keys "diagnosis" (the input text), "code" and "description"
        (taken from the best-matching row of `icd_df`) and "score" (the
        similarity of that match).
    """
    query_vector = embedding_model.encode([diagnosis])
    # cosine_similarity returns a 2-D array; row 0 belongs to our single query.
    similarity_row = cosine_similarity(query_vector, icd_embeddings)[0]
    top_position = np.argmax(similarity_row)
    matched_entry = icd_df.iloc[top_position]
    return dict(
        diagnosis=diagnosis,
        code=matched_entry["ICDCode"],
        description=matched_entry["Description"],
        score=similarity_row[top_position],
    )


In [40]:
import os
import pandas as pd
import google.generativeai as genai
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import time

# --- CONFIG ---
# The Gemini key is read from the environment; never store a key in the
# notebook. Any key that was previously committed here must be treated as
# leaked and revoked. Set it before starting Jupyter, e.g.
#   export GEMINI_API_KEY=...
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
if not GEMINI_API_KEY:
    raise RuntimeError(
        "GEMINI_API_KEY is not set. Export it in the environment before running this cell."
    )
# Scores strictly below this are treated as ambiguous matches.
SIMILARITY_THRESHOLD = 0.75
LOCAL_MODEL_NAME = "bkholyday/Qwen2.5-0.5B-Instruct-medicalLLM-HuatuoGPT-o1-sft"  # Medical LLM, or use BioGPT-Large if you have resources

# --- SETUP ---
genai.configure(api_key=GEMINI_API_KEY)

# NOTE(review): trust_remote_code=True lets the model repository run its own
# Python code at load time; keep it only for repositories you have audited.
tokenizer = AutoTokenizer.from_pretrained(LOCAL_MODEL_NAME, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(LOCAL_MODEL_NAME, trust_remote_code=True)
# Sampling is enabled, so generated explanations differ from run to run.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=120, do_sample=True, temperature=0.7)







Device set to use cpu


In [55]:
def build_rag_prompt(diagnosis, code, description, score, threshold=None):
    """Build the explanation prompt for one diagnosis -> ICD-10 mapping.

    Args:
        diagnosis: Original diagnosis text.
        code: ICD-10 code that was selected for the diagnosis.
        description: Description of that ICD-10 code.
        score: Similarity score of the match; rendered with two decimals.
        threshold: Scores strictly below this value get the "ambiguous
            mapping" instructions. Defaults to the module-level
            SIMILARITY_THRESHOLD when omitted, which keeps existing callers
            unchanged.

    Returns:
        The prompt as a single string.
    """
    if threshold is None:
        threshold = SIMILARITY_THRESHOLD
    prompt = (
        f"You are a medical coding expert.\n"
        f"Diagnosis: \"{diagnosis}\"\n"
        f"ICD-10 Code: {code}\n"
        f"ICD-10 Description: \"{description}\"\n"
        f"Similarity score: {score:.2f}\n"
    )
    if score < threshold:
        # Low-confidence match: ask the model to justify or replace the code.
        prompt += (
            "The similarity score is low, so this mapping may be ambiguous.\n"
            "Explain why this code was selected for the diagnosis, and if you think another code is more appropriate, suggest it and explain why.\n"
        )
    else:
        prompt += (
            "Explain clearly and concisely why this ICD-10 code is the best match for the diagnosis above."
        )
    return prompt

In [56]:
def generate_explanation_local(prompt, text_generator=None):
    """Generate an explanation for `prompt` with the local text-generation pipeline.

    A text-generation pipeline returns a list of dicts keyed by
    "generated_text". The previous implementation looked for a dict with
    role == "assistant" directly in that list, never found one, and therefore
    always returned an empty string.

    Args:
        prompt: Prompt text (or chat-style message list) passed to the pipeline.
        text_generator: Callable used to generate text. Defaults to the
            module-level `generator` pipeline; mainly useful for swapping in
            another pipeline or a stub.

    Returns:
        The generated explanation with surrounding whitespace removed, or ""
        when the pipeline produced nothing usable.
    """
    if text_generator is None:
        text_generator = generator
    outputs = text_generator(prompt)
    if not outputs:
        return ""
    generated = outputs[0].get("generated_text", "")
    if isinstance(generated, list):
        # Chat-style output: a list of {"role", "content"} messages; take the
        # last assistant turn.
        replies = [
            message for message in generated
            if isinstance(message, dict) and message.get("role") == "assistant"
        ]
        return str(replies[-1].get("content", "")).strip() if replies else ""
    text = str(generated)
    # By default the pipeline echoes the prompt in front of the completion.
    if isinstance(prompt, str) and text.startswith(prompt):
        text = text[len(prompt):]
    return text.strip()

def generate_explanation_gemini(prompt):
    """Ask the Gemini "gemini-1.5-flash-latest" model for a reply to `prompt`.

    Returns the reply text with leading/trailing whitespace removed. Any
    exception raised by the client propagates to the caller.
    """
    gemini = genai.GenerativeModel("gemini-1.5-flash-latest")
    return gemini.generate_content(prompt).text.strip()

In [57]:


def main(max_rows=3):
    """Generate an explanation for each mapped diagnosis in output.csv.

    Rows whose similarity score is below SIMILARITY_THRESHOLD are sent to
    Gemini (a failure is recorded in the row instead of being raised); all
    other rows go to the local LLM. The enriched rows are written to
    rag_output.csv.

    Args:
        max_rows: Number of leading rows to process (non-negative). Defaults
            to 3, the trial-run limit that used to be a hard-coded `break`
            inside the loop; pass None to process the whole file.
    """
    df = pd.read_csv("output.csv")
    if max_rows is not None:
        df = df.head(max_rows)
    output_rows = []
    for i, row in df.iterrows():
        score = row["Similarity Score"]
        prompt = build_rag_prompt(row["Original Diagnosis"], row["ICD-10 Code"], row["ICD-10 Description"], score)
        if score < SIMILARITY_THRESHOLD:
            try:
                explanation = generate_explanation_gemini(prompt)
                model_used = "Gemini"
            except Exception as e:
                # Best effort: keep going and record the failure in the output row.
                explanation = f"Error: {e}"
                model_used = "Gemini (Error)"
            time.sleep(0.5)  # To avoid Gemini rate limits
        else:
            explanation = generate_explanation_local(prompt)
            model_used = "Local LLM"
        output_row = row.to_dict()
        output_row["RAG Explanation"] = explanation
        output_row["Model Used"] = model_used
        output_rows.append(output_row)
        if (i+1) % 50 == 0:
            print(f"Processed {i+1} rows...")
    output_df = pd.DataFrame(output_rows)
    output_df.to_csv("rag_output.csv", index=False)
    print("Saved rag_output.csv")

# Inside a notebook kernel __name__ is "__main__", so executing this cell runs
# main() immediately.
if __name__ == "__main__":
    main()


Saved rag_output.csv


In [None]:
# NOTE(review): no earlier top-level cell shown in this notebook assigns `df`,
# so this lookup relies on leftover kernel state — confirm the source frame or
# remove the cell.
df['Explanation']

0         jpg
1         NaN
2          me
3         jpg
4         mer
5         jpg
6           e
7         com
8    compleas
9         plz
Name: Explanation, dtype: object

In [None]:
# NOTE(review): no earlier top-level cell shown in this notebook assigns `df`,
# so this lookup relies on leftover kernel state — confirm the source frame or
# remove the cell.
df['Explanation']

0         jpg
1         NaN
2          me
3         jpg
4         mer
5         jpg
6           e
7         com
8    compleas
9         plz
Name: Explanation, dtype: object

In [39]:
# Build one explanation record per diagnosis and write them all to rag_output.csv.
# NOTE(review): retrieve_top_icd is not defined in the visible notebook, and
# build_rag_prompt is called here with three arguments while the definition
# shown in this notebook takes (diagnosis, code, description, score) —
# presumably this cell belongs to an earlier revision; confirm before re-running.
results = []
for idx, row in diagnosis_df.iterrows():
    # Stops as soon as the index label reaches 1 — looks like a trial-run limit
    # (only the first row is processed with a default integer index).
    if idx == 1:break
    for diag in row["Diagnoses_list"]:
        icd_rows, sims = retrieve_top_icd(diag, top_n=3)
        prompt = build_rag_prompt(diag, icd_rows, sims)
        # Use Gemini if top similarity is less than 0.75, else use local LLM
        # presumably sims is ordered best-first so sims[0] is the top score — TODO confirm
        if sims[0] < 0.75:
            explanation = generate_explanation_gemini(prompt)
            model_used = "Gemini"
        else:
            explanation = generate_explanation_local(prompt)
            model_used = "Local LLM"
        # The candidate codes, descriptions and scores are flattened into
        # "; "-separated strings so each diagnosis stays on one CSV row.
        results.append({
            "Original Diagnosis": diag,
            "Top ICD-10 Codes": "; ".join(icd_rows["ICDCode"].tolist()),
            "Top ICD-10 Descriptions": "; ".join(icd_rows["Description"].tolist()),
            "Top Similarity Scores": "; ".join([f"{s:.3f}" for s in sims]),
            "Explanation": explanation,
            "Model Used": model_used
        })
        # Optionally, add a delay if using Gemini API to avoid rate limits
        # import time; time.sleep(0.5)
output_df = pd.DataFrame(results)
output_df.to_csv("rag_output.csv", index=False)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [14]:
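# # === Step 4: Process all rows and save output ===
# # (Disabled: this whole cell is commented out, so it no longer builds the
# # `results` list that the following cell writes to output.csv.)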
# results = []
# for row_id, row in diagnosis_df.iterrows():
#     for diag in row["Diagnoses_list"]:
#         mapped = map_diagnosis_to_icd(diag)
#         results.append({
#             "Row_ID": row_id,  # <-- Add this line
#             "Original Diagnosis": mapped["diagnosis"],
#             "ICD-10 Code": mapped["code"],
#             "ICD-10 Description": mapped["description"],
#             "Similarity Score": round(mapped["score"], 3)
#         })


In [15]:
# Writes whatever `results` currently holds in the kernel to output.csv.
# NOTE(review): the Step 4 loop that builds the ICD mapping records is
# commented out in this notebook, so `results` may come from a different cell
# with different columns — confirm before re-running, or output.csv will be
# overwritten with the wrong schema.
output_df = pd.DataFrame(results)
output_df.to_csv("output.csv", index=False)
print("Saved output to output.csv") 

Saved output to output.csv


In [23]:
import os
import pandas as pd
import google.generativeai as genai
import time

# 1. Configure Gemini from the environment.
# The API key must never be written into the notebook; any key that was
# previously committed here must be treated as leaked and revoked.
# To resume with a new key, export the new GEMINI_API_KEY and rerun the script.
if not os.environ.get("GEMINI_API_KEY"):
    raise RuntimeError(
        "GEMINI_API_KEY is not set. Export it in the environment before running this cell."
    )

genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# 2. Improved prompt builder
def build_prompt(diagnosis, code, description, score, threshold=0.8):
    """Compose the Gemini prompt for one diagnosis / ICD-10 code pairing.

    Every prompt starts with the same three-line header naming the diagnosis
    and the selected code. When `score` is strictly below `threshold`, the
    model is asked to justify the low-similarity match and propose up to two
    alternatives; otherwise it is asked for a concise explanation.

    Returns:
        The prompt as a single string.
    """
    header = "".join([
        "You are a medical coding expert.\n",
        f"A patient has the diagnosis: \"{diagnosis}\".\n",
        f"The selected ICD‑10 code is {code} — \"{description}\".\n",
    ])
    if score < threshold:
        low_score_request = (
            f"The similarity score for this match is {score:.2f}, which is considered low.\n"
            "1. Explain why this code was selected despite the low similarity.\n"
            "2. Suggest up to 2 alternative ICD-10 codes that might also be appropriate, and explain your reasoning for each.\n"
        )
        return header + low_score_request
    return header + "Write a concise, clinically-accurate explanation of why this code applies to the diagnosis."

# 3. Gemini call function
def explain_with_gemini_api(prompt):
    """Send `prompt` to the "gemini-1.5-flash-latest" model and return its reply.

    The reply text is returned with surrounding whitespace stripped; errors
    raised by the client are not caught here.
    """
    client = genai.GenerativeModel("gemini-1.5-flash-latest")
    reply = client.generate_content(prompt)
    answer = reply.text
    return answer.strip()



In [24]:
def main():
    """Add a Gemini explanation to every row of output.csv, resumably.

    Progress is checkpointed to output_with_explanations.csv after each row.
    On a later run the checkpoint is loaded and only rows that still need work
    are processed: rows with no explanation, and rows whose previous attempt
    was flagged "Error" (e.g. a quota failure). Previously the "Error: ..."
    text counted as an explanation, so failed rows were never retried after
    switching to a new API key.
    """
    input_file = "output.csv"
    output_file = "output_with_explanations.csv"
    # Resume from the checkpoint when there is one, else start from the input.
    if os.path.exists(output_file):
        out_df = pd.read_csv(output_file)
    else:
        out_df = pd.read_csv(input_file)
    for column in ("Gemini Explanation", "Proposed By"):
        if column not in out_df.columns:
            out_df[column] = ""
        # An all-empty column is loaded from CSV as float NaN; force object
        # dtype so string results can be stored in it.
        out_df[column] = out_df[column].astype(object)

    for idx, row in out_df.iterrows():
        previous = row["Gemini Explanation"]
        is_missing = pd.isna(previous) or previous == ""
        has_failed = row["Proposed By"] == "Error"
        if not (is_missing or has_failed):
            continue
        prompt = build_prompt(
            row["Original Diagnosis"],
            row["ICD-10 Code"],
            row["ICD-10 Description"],
            row["Similarity Score"]
        )
        try:
            explanation = explain_with_gemini_api(prompt)
            flag = "Gemini Proposed" if row["Similarity Score"] < 0.8 else "Cosine Proposed"
        except Exception as e:
            # Best effort: record the failure and move on; the "Error" flag
            # marks the row for retry on the next run.
            explanation = f"Error: {e}"
            flag = "Error"
        out_df.at[idx, "Gemini Explanation"] = explanation
        out_df.at[idx, "Proposed By"] = flag
        # Checkpoint after every row so an interrupted run loses nothing.
        out_df.to_csv(output_file, index=False)
        print(f"Processed row {idx+1}/{len(out_df)}")
        time.sleep(0.5)
    print(f"All done! Saved to {output_file}")


In [25]:
# Inside a notebook kernel __name__ is "__main__", so executing this cell runs
# the most recently defined main() immediately.
if __name__ == "__main__":
    main()

Processed row 57/3171
Processed row 58/3171
Processed row 59/3171
Processed row 60/3171
Processed row 61/3171
Processed row 62/3171
Processed row 63/3171
Processed row 64/3171
Processed row 65/3171
Processed row 66/3171
Processed row 67/3171
Processed row 68/3171
Processed row 69/3171
Processed row 70/3171
Processed row 71/3171
Processed row 72/3171
Processed row 73/3171
Processed row 74/3171
Processed row 75/3171
Processed row 76/3171
Processed row 77/3171
Processed row 78/3171
Processed row 79/3171
Processed row 80/3171
Processed row 81/3171
Processed row 82/3171
Processed row 83/3171
Processed row 84/3171
Processed row 85/3171
Processed row 86/3171
Processed row 87/3171
Processed row 88/3171
Processed row 89/3171
Processed row 90/3171
Processed row 91/3171
Processed row 92/3171
Processed row 93/3171
Processed row 94/3171
Processed row 95/3171
Processed row 96/3171
Processed row 97/3171
Processed row 98/3171
Processed row 99/3171
Processed row 100/3171
Processed row 101/3171
Processe

In [26]:
# Reload the saved results for inspection; the bare `df` on the last line
# renders the frame as the cell's output.
df = pd.read_csv('output_with_explanations.csv')
df

Unnamed: 0,Row_ID,Original Diagnosis,ICD-10 Code,ICD-10 Description,Similarity Score,Gemini Explanation,Proposed By
0,0,Diabetes mellitus without mention of complicat...,E1169,Type 2 diabetes mellitus with other specified ...,0.880,"The code E1169, ""Type 2 diabetes mellitus with...",Cosine Proposed
1,0,Pure hypercholesterolemia,E7800,"Pure hypercholesterolemia, unspecified",0.946,"The ICD-10 code E78.00, ""Pure hypercholesterol...",Cosine Proposed
2,0,Unspecified acquired hypothyroidism,E039,"Hypothyroidism, unspecified",0.938,"E039, ""Hypothyroidism, unspecified,"" accuratel...",Cosine Proposed
3,0,Tobacco use disorder,Z720,Tobacco use,0.802,"Z720, Tobacco use, is appropriate because it a...",Cosine Proposed
4,0,Personal history of malignant melanoma of skin,Z85820,Personal history of malignant melanoma of skin,1.000,"Z85820, ""Personal history of malignant melanom...",Cosine Proposed
...,...,...,...,...,...,...,...
3166,99,Esophageal reflux,T85591D,Other mechanical complication of esophageal an...,0.867,"Error: 429 You exceeded your current quota, pl...",Error
3167,99,Dysuria,R300,Dysuria,1.000,"Error: 429 You exceeded your current quota, pl...",Error
3168,99,Other forms of acute ischemic heart disease,I248,Other forms of acute ischemic heart disease,1.000,"Error: 429 You exceeded your current quota, pl...",Error
3169,99,Other primary cardiomyopathies,I428,Other cardiomyopathies,0.912,"Error: 429 You exceeded your current quota, pl...",Error


In [67]:
import pandas as pd

# Load the full data
df = pd.read_csv("output.csv")

# Number of rows per chunk
chunk_size = 500

# Create the destination folder once, up front, instead of re-importing os and
# re-running makedirs on every loop iteration.
import os
output_folder = "output_chunks"
os.makedirs(output_folder, exist_ok=True)

# Split and save: part numbers start at 1 and the last part holds the remainder.
for chunk_num, i in enumerate(range(0, len(df), chunk_size), start=1):
    chunk = df.iloc[i:i+chunk_size]
    chunk_path = os.path.join(output_folder, f"output_part_{chunk_num}.csv")
    chunk.to_csv(chunk_path, index=False)
    print(f"Saved {chunk_path} with {len(chunk)} rows")

Saved output_chunks\output_part_1.csv with 500 rows
Saved output_chunks\output_part_2.csv with 500 rows
Saved output_chunks\output_part_3.csv with 500 rows
Saved output_chunks\output_part_4.csv with 500 rows
Saved output_chunks\output_part_5.csv with 500 rows
Saved output_chunks\output_part_6.csv with 500 rows
Saved output_chunks\output_part_7.csv with 171 rows
