##  MODULE 1 — CLINICAL NOTE AUTO-CODING (NLP)

# Importing Relevant Libraries

In [6]:
from openai import OpenAI
import pandas as pd
import re

client = OpenAI(api_key="sk-proj-wjYSA0JvApzRnMBewl4f8QTT5GJaSP90PHgfB9_b14UXEq9UvCd-ZLAblBDmDNMA")


# Cleaning The Data

In [7]:
def clean(t):
    t = re.sub(r"\[\*\*.*?\*\*\]", " ", t)
    return re.sub(r"\s+", " ", t).strip()

# Example Usage

In [8]:
FEW_SHOT = """
You are a medical coding expert. Your task is to predict ICD-9 or ICD-10 codes
based on a clinical discharge summary or symptoms.

Return ONLY the ICD codes as a Python list.

Example 1:
Summary:
"Patient admitted with chest pain radiating to left arm, troponin elevated,
diagnosed with acute myocardial infarction."
ICD codes: ["I21.3"]

Example 2:
Summary:
"75-year-old female with fever, cough, and right lower lobe pneumonia."
ICD codes: ["J18.9"]

Example 3:
Summary:
"Patient with uncontrolled type 2 diabetes and diabetic neuropathy."
ICD codes: ["E11.40"]

Now predict ICD codes for the following summary.
Summary: 
"""

# ICD Code Predictor

In [9]:
def predict_icd(summary):
    summary = clean(summary)
    query = FEW_SHOT + summary + "\nICD codes:"

    response = client.responses.create(
        model="gpt-4o-mini",
        input=query
    )

    output = response.output_text

    # Extract ICD codes using regex
    codes = re.findall(r"[A-TV-Z]\d{1,3}\.?\d*|\d{3}\.?\d*", output)
    return codes

# Creating User Prompt

In [11]:
while True:
    print("\n--------------------------------------------")
    summary = input("Enter symptoms / discharge summary (or type 'exit'): ")

    if summary.lower().strip() == "exit":
        print("Exiting ICD predictor...")   # ← prevents Jupyter cell from disappearing
        break

    icd_codes = predict_icd(summary)
    print("\nPredicted ICD Codes:", icd_codes)


--------------------------------------------


Enter symptoms / discharge summary (or type 'exit'):  patient was having headache, cold and fever



Predicted ICD Codes: ['R51', 'J20.9', 'R50.9']

--------------------------------------------


Enter symptoms / discharge summary (or type 'exit'):  another patient was having bloating, chest pain



Predicted ICD Codes: ['R14.0', 'R07.9']

--------------------------------------------


Enter symptoms / discharge summary (or type 'exit'):  exit


Exiting ICD predictor...


# MODULE 2 — CLAIM DENIAL RISK PREDICTION

# Importing Relevant Libraries

In [56]:
import os
import numpy as np
import pandas as pd
from xgboost.callback import EarlyStopping
from datetime import datetime
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, precision_recall_fscore_support
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
import joblib
import warnings
warnings.filterwarnings("ignore")

# Copying the Data

In [57]:
patients = pd.read_csv(r"C:\Users\ashee\OneDrive\Desktop\Ntigra Job Application\archive\mimic-iii-clinical-database-1.4\PATIENTS.csv\PATIENTS.csv")
admissions = pd.read_csv(r"C:\Users\ashee\OneDrive\Desktop\Ntigra Job Application\archive\mimic-iii-clinical-database-1.4\ADMISSIONS.csv\ADMISSIONS.csv")
diag_icd = pd.read_csv(r"C:\Users\ashee\OneDrive\Desktop\Ntigra Job Application\archive\mimic-iii-clinical-database-1.4\DIAGNOSES_ICD.csv\DIAGNOSES_ICD.csv")
proc_icd = pd.read_csv(r"C:\Users\ashee\OneDrive\Desktop\Ntigra Job Application\archive\mimic-iii-clinical-database-1.4\PROCEDURES_ICD.csv\PROCEDURES_ICD.csv")
dicd_diag = pd.read_csv(r"C:\Users\ashee\OneDrive\Desktop\Ntigra Job Application\archive\mimic-iii-clinical-database-1.4\D_ICD_DIAGNOSES.csv\D_ICD_DIAGNOSES.csv")
dicd_proc = pd.read_csv(r"C:\Users\ashee\OneDrive\Desktop\Ntigra Job Application\archive\mimic-iii-clinical-database-1.4\D_ICD_PROCEDURES.csv\D_ICD_PROCEDURES.csv")

# Removing Unnecessary Columns

In [58]:
patients = patients[['SUBJECT_ID', 'GENDER']]
patients.head()

Unnamed: 0,SUBJECT_ID,GENDER
0,249,F
1,250,F
2,251,M
3,252,M
4,253,F


# Checking Null Values

In [59]:
patients.isnull().sum()

SUBJECT_ID    0
GENDER        0
dtype: int64

# Removing Unnecessary Columns

In [60]:
admissions = admissions[['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME','ADMISSION_TYPE', 'ADMISSION_LOCATION',
                         'DISCHARGE_LOCATION', 'INSURANCE','DIAGNOSIS']]
admissions.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,DIAGNOSIS
0,22,165315,2196-04-09 12:26:00,2196-04-10 15:54:00,EMERGENCY,EMERGENCY ROOM ADMIT,DISC-TRAN CANCER/CHLDRN H,Private,BENZODIAZEPINE OVERDOSE
1,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...
2,23,124321,2157-10-18 19:34:00,2157-10-25 14:00:00,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,BRAIN MASS
3,24,161859,2139-06-06 16:14:00,2139-06-09 12:48:00,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Private,INTERIOR MYOCARDIAL INFARCTION
4,25,129635,2160-11-02 02:06:00,2160-11-05 14:55:00,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,ACUTE CORONARY SYNDROME


# Checking Null Values and Removing Them

In [61]:
admissions.isnull().sum()

SUBJECT_ID             0
HADM_ID                0
ADMITTIME              0
DISCHTIME              0
ADMISSION_TYPE         0
ADMISSION_LOCATION     0
DISCHARGE_LOCATION     0
INSURANCE              0
DIAGNOSIS             25
dtype: int64

In [62]:
admissions = admissions.dropna()

In [63]:
admissions.isnull().sum()

SUBJECT_ID            0
HADM_ID               0
ADMITTIME             0
DISCHTIME             0
ADMISSION_TYPE        0
ADMISSION_LOCATION    0
DISCHARGE_LOCATION    0
INSURANCE             0
DIAGNOSIS             0
dtype: int64

In [64]:
len(admissions)

58951

In [65]:
diag_icd = diag_icd[['SUBJECT_ID', 'HADM_ID','ICD9_CODE']]
diag_icd.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ICD9_CODE
0,109,172335,40301
1,109,172335,486
2,109,172335,58281
3,109,172335,5855
4,109,172335,4254


In [66]:
diag_icd.isnull().sum()

SUBJECT_ID     0
HADM_ID        0
ICD9_CODE     47
dtype: int64

In [67]:
diag_icd = diag_icd.dropna()
diag_icd.isnull().sum()

SUBJECT_ID    0
HADM_ID       0
ICD9_CODE     0
dtype: int64

# Removing Unnecessary Columns

In [68]:
proc_icd = proc_icd[['SUBJECT_ID', 'HADM_ID','ICD9_CODE']]
proc_icd.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ICD9_CODE
0,62641,154460,3404
1,2592,130856,9671
2,2592,130856,3893
3,55357,119355,9672
4,55357,119355,331


In [69]:
dicd_diag = dicd_diag[['ICD9_CODE', 'SHORT_TITLE']]
dicd_diag.head()

Unnamed: 0,ICD9_CODE,SHORT_TITLE
0,1166,TB pneumonia-oth test
1,1170,TB pneumothorax-unspec
2,1171,TB pneumothorax-no exam
3,1172,TB pneumothorx-exam unkn
4,1173,TB pneumothorax-micro dx


# Checking Null Values

In [70]:
dicd_diag.isnull().sum()

ICD9_CODE      0
SHORT_TITLE    0
dtype: int64

In [71]:
dicd_proc = dicd_proc[['ICD9_CODE', 'SHORT_TITLE']]
dicd_proc.head()

Unnamed: 0,ICD9_CODE,SHORT_TITLE
0,851,Canthotomy
1,852,Blepharorrhaphy
2,859,Adjust lid position NEC
3,861,Lid reconst w skin graft
4,862,Lid reconst w muc graft


In [72]:
dicd_proc.isnull().sum()

ICD9_CODE      0
SHORT_TITLE    0
dtype: int64

 # THERE IS NO TARGET COLUMN IN THE DATASET. THAT IS WHY WE HAVE TO CREATE A SYNTHETIC COLUMN BUT BECAUSE IT IS NOT A REAL DATA THAT IS WHY THIS ENTIRE CODE IS JUST A BLUE PRINT"

In [73]:
target_col = None
for c in ["DENIED", "CLAIM_DENIED", "CLAIM_STATUS", "DENIAL", "IS_DENIED"]:
    if c in admissions.columns:
        target_col = c
        break

if target_col is None:
    print("\nNo denial label found in ADMISSIONS. Creating a synthetic demo target 'DENIED' (random).")
    np.random.seed(42)
    admissions['DENIED'] = np.random.binomial(1, 0.15, size=len(admissions))  # 15% synthetic denial rate
    target_col = 'DENIED'
else:
    print(f"\nFound target column in ADMISSIONS: {target_col}. Using that as label.")


No denial label found in ADMISSIONS. Creating a synthetic demo target 'DENIED' (random).


In [74]:
admissions.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,DIAGNOSIS,DENIED
0,22,165315,2196-04-09 12:26:00,2196-04-10 15:54:00,EMERGENCY,EMERGENCY ROOM ADMIT,DISC-TRAN CANCER/CHLDRN H,Private,BENZODIAZEPINE OVERDOSE,0
1,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,1
2,23,124321,2157-10-18 19:34:00,2157-10-25 14:00:00,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,BRAIN MASS,0
3,24,161859,2139-06-06 16:14:00,2139-06-09 12:48:00,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Private,INTERIOR MYOCARDIAL INFARCTION,0
4,25,129635,2160-11-02 02:06:00,2160-11-05 14:55:00,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,ACUTE CORONARY SYNDROME,0


# Preparing patient-level features: admission, gender

In [75]:
print("\nBuilding core features...")
# merge patients -> admissions
ad = admissions.merge(patients[['SUBJECT_ID', 'GENDER']], on='SUBJECT_ID', how='left')

# select core categorical features from admissions
core_cat = ['ADMISSION_TYPE', 'ADMISSION_LOCATION', 'DISCHARGE_LOCATION', 'INSURANCE', 'MARITAL_STATUS', 'ETHNICITY']
for c in core_cat:
    if c not in ad.columns:
        ad[c] = np.nan


Building core features...


# Building diagnosis and procedure aggregated features (per admission)

In [76]:
# Create aggregated string of ICD codes per HADM_ID for diagnoses and procedures
print("Aggregating ICD codes per admission...")
diag_per_adm = diag_icd.groupby('HADM_ID')['ICD9_CODE'].agg(lambda codes: ' '.join(map(str, codes.dropna().astype(str)))).reset_index().rename(columns={'ICD9_CODE':'DIAG_CODES'})
proc_per_adm = proc_icd.groupby('HADM_ID')['ICD9_CODE'].agg(lambda codes: ' '.join(map(str, codes.dropna().astype(str)))).reset_index().rename(columns={'ICD9_CODE':'PROC_CODES'})

# merge into admissions dataframe (ad)
ad = ad.merge(diag_per_adm, on='HADM_ID', how='left')
ad = ad.merge(proc_per_adm, on='HADM_ID', how='left')

# Replace NaN strings with empty
ad['DIAG_CODES'] = ad['DIAG_CODES'].fillna("")
ad['PROC_CODES'] = ad['PROC_CODES'].fillna("")

Aggregating ICD codes per admission...


# Feature engineering from ICD text: top-N code bag-of-codes (CountVectorizer on codes)

In [77]:
# Use CountVectorizer tokenizing on whitespace because ICD codes are space-separated strings
top_n_diag = 500   # adjustable
top_n_proc = 300   # adjustable

print(f"Vectorizing top {top_n_diag} diagnosis codes and top {top_n_proc} procedure codes...")
diag_vec = CountVectorizer(token_pattern=r"[^ ]+", max_features=top_n_diag, binary=True)
proc_vec = CountVectorizer(token_pattern=r"[^ ]+", max_features=top_n_proc, binary=True)

diag_X = diag_vec.fit_transform(ad['DIAG_CODES'])
proc_X = proc_vec.fit_transform(ad['PROC_CODES'])

# small numeric features
num_df = ad[['HADM_ID']].set_index('HADM_ID')

# categorical features: we will one-hot encode a selected small set
cat_cols = ['GENDER', 'ADMISSION_TYPE', 'INSURANCE', 'MARITAL_STATUS', 'ETHNICITY']
ad_cats = ad[cat_cols].fillna('MISSING')
ohe = OneHotEncoder(handle_unknown='ignore', sparse=True)
cat_X = ohe.fit_transform(ad_cats)

Vectorizing top 500 diagnosis codes and top 300 procedure codes...


# Combining all features into X (sparse)

In [78]:
print("Combining numeric, categorical, and ICD features...")
from scipy.sparse import csr_matrix
num_X = csr_matrix(num_df.values)  # convert numeric to sparse
X = hstack([num_X, cat_X, diag_X, proc_X], format='csr')

# Align y (target) to HADM_ID order
y = ad.set_index('HADM_ID')[target_col].reindex(index=num_df.index).values
print("Feature matrix shape:", X.shape)
print("Target distribution:", pd.Series(y).value_counts())

Combining numeric, categorical, and ICD features...
Feature matrix shape: (58951, 813)
Target distribution: 0    50142
1     8809
Name: count, dtype: int64


# Train / Test split (stratified)

In [79]:
print("\nSplitting data...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


Splitting data...


# Model training - XGBoost (with early stopping) and RandomForest for comparison

In [80]:

print("\nTraining XGBoost...")
xgb = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric='auc',
    random_state=42,
    n_jobs=-1
)

# Fit with early stopping: need validation set from train
X_tr_sub, X_val_sub, y_tr_sub, y_val_sub = train_test_split(X_train, y_train, test_size=0.15, stratify=y_train, random_state=42)
xgb.fit(X_tr_sub, y_tr_sub, eval_set=[(X_val_sub, y_val_sub)],verbose=True)


Training XGBoost...
[0]	validation_0-auc:0.49916
[1]	validation_0-auc:0.49974
[2]	validation_0-auc:0.50066
[3]	validation_0-auc:0.50130
[4]	validation_0-auc:0.50229
[5]	validation_0-auc:0.50141
[6]	validation_0-auc:0.50171
[7]	validation_0-auc:0.50255
[8]	validation_0-auc:0.49988
[9]	validation_0-auc:0.50019
[10]	validation_0-auc:0.49761
[11]	validation_0-auc:0.49649
[12]	validation_0-auc:0.49967
[13]	validation_0-auc:0.49925
[14]	validation_0-auc:0.49916
[15]	validation_0-auc:0.49962
[16]	validation_0-auc:0.49925
[17]	validation_0-auc:0.49937
[18]	validation_0-auc:0.49895
[19]	validation_0-auc:0.49889
[20]	validation_0-auc:0.50033
[21]	validation_0-auc:0.50069
[22]	validation_0-auc:0.50028
[23]	validation_0-auc:0.50132
[24]	validation_0-auc:0.50332
[25]	validation_0-auc:0.50489
[26]	validation_0-auc:0.50427
[27]	validation_0-auc:0.50808
[28]	validation_0-auc:0.50950
[29]	validation_0-auc:0.50904
[30]	validation_0-auc:0.50849
[31]	validation_0-auc:0.50966
[32]	validation_0-auc:0.50908

# Evaluating on test

In [81]:
print("\nEvaluating XGBoost on test set...")
y_pred_proba = xgb.predict_proba(X_test)[:,1]
y_pred = (y_pred_proba >= 0.5).astype(int)
print("ROC AUC:", roc_auc_score(y_test, y_pred_proba))
print(classification_report(y_test, y_pred, digits=4))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))


Evaluating XGBoost on test set...
ROC AUC: 0.5044504025726075
              precision    recall  f1-score   support

           0     0.8505    0.9997    0.9191     10029
           1     0.0000    0.0000    0.0000      1762

    accuracy                         0.8503     11791
   macro avg     0.4253    0.4999    0.4595     11791
weighted avg     0.7234    0.8503    0.7818     11791

Confusion matrix:
 [[10026     3]
 [ 1762     0]]


# Quick inference example for a new HADM_ID (reuse parts of pipeline)

In [82]:
def predict_hadm(hadm_id, ad_df=ad, diag_vectorizer=diag_vec, proc_vectorizer=proc_vec, encoder=ohe, xgb_model=xgb):
    """Return probability of denial for a given HADM_ID (if in current ad_df)."""
    row = ad_df[ad_df['HADM_ID']==hadm_id]
    if row.empty:
        raise ValueError("HADM_ID not found.")
        
    # cat
    cat_arr = row[cat_cols].fillna('MISSING')
    cat_sp = encoder.transform(cat_arr)
    # diag/proc
    diag_sp = diag_vectorizer.transform(row['DIAG_CODES'])
    proc_sp = proc_vectorizer.transform(row['PROC_CODES'])
    X_new = hstack([cat_sp, diag_sp, proc_sp], format='csr')
    prob = xgb_model.predict_proba(X_new)[:,1][0]
    return prob

# Example usage (pick a random HADM_ID from test set)

In [83]:
example_hadm = num_df.index[0]
print(f"\nExample prediction for HADM_ID {example_hadm}: denial probability = {predict_hadm(example_hadm):.4f}")

print("\nPipeline complete. Replace synthetic labels with real claim denial labels before final evaluation.")


Example prediction for HADM_ID 165315: denial probability = 0.1878

Pipeline complete. Replace synthetic labels with real claim denial labels before final evaluation.


# WE JUST HAVE TO REPLACE SYNTHETIC LABELS WITH REAL CLAIM DENIAL LABELS IN ABOVE CODE