In [1]:
import pandas as pd
# Load the pre-built feature table (combined_cleaned_df_final.csv) from the
# Kaggle input mount. Later cells read its 'gender', 'icd_code' and
# 'icd_version' columns and drop an 'Unnamed: 0' column from it.
final_df = pd.read_csv('/kaggle/input/df-final/combined_cleaned_df_final.csv')

In [2]:
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

# Remove the leftover 'Unnamed: 0' column. errors='ignore' keeps this cell
# re-runnable: `final_df` is overwritten in place, so an unconditional drop
# raises KeyError the second time the cell is executed.
final_df = final_df.drop(columns='Unnamed: 0', errors='ignore')

# Replace the gender column with integer codes produced by LabelEncoder.
gender_encoder = LabelEncoder()
final_df['gender'] = gender_encoder.fit_transform(final_df['gender'])

# Keep only the rows coded with ICD version 9.
df_icd9 = final_df[final_df['icd_version'] == 9]

# One-hot encode the target ICD code; columns are named 'icd_code_<code>'.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
icd_encoded = encoder.fit_transform(df_icd9[['icd_code']])
# NOTE(review): no index is passed, so this frame gets a fresh RangeIndex and
# lines up with df_icd9 by row position only, not by index label.
icd_encoded_df = pd.DataFrame(
    icd_encoded,
    columns=encoder.get_feature_names_out(['icd_code'])
)

# Features: every remaining column except the identifiers and the target code.
X = df_icd9.drop(columns=['hadm_id', 'icd_code', 'subject_id'])
X.columns = X.columns.astype(str)

# Targets in three forms: one-hot frame, hot-column name per row, and the
# integer position of the hot column per row.
y = icd_encoded_df
y_labels = y.idxmax(axis=1)
y_class = np.argmax(y.values, axis=1)

# NOTE(review): the scaler is fit on this data here; the pre-trained models
# loaded later presumably expect the scaling used at training time -- confirm
# that the two match (or load the saved training scaler instead).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [4]:
import numpy as np
import lightgbm as lgb
import joblib
import torch
import torch.nn as nn
import torch.nn.functional as F

# ------------------------------------------------------------------------------
# 1. Define the PyTorch model architecture
# ------------------------------------------------------------------------------
class SimpleNet(nn.Module):
    """Fully connected classifier: four Linear+ReLU stages followed by a Linear head.

    Layers are exposed as ``fc1``..``fc5`` and ``relu1``..``relu4`` so that
    state dicts keyed on those names load unchanged.

    Args:
        input_size (int): Number of input features per sample.
        hidden1, hidden2, hidden3, hidden4 (int): Widths of the hidden layers.
        num_classes (int): Width of the output layer (one logit per class).
    """

    def __init__(self, input_size=25, hidden1=256, hidden2=512, hidden3=256, hidden4=128, num_classes=1203):
        super().__init__()
        widths = (input_size, hidden1, hidden2, hidden3, hidden4)
        # Register fc1/relu1 ... fc4/relu4 in order, then the output layer fc5.
        for depth, (fan_in, fan_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, f"fc{depth}", nn.Linear(fan_in, fan_out))
            setattr(self, f"relu{depth}", nn.ReLU())
        self.fc5 = nn.Linear(hidden4, num_classes)

    def forward(self, x):
        """Return raw logits for ``x``; no softmax is applied here."""
        hidden = x
        for depth in range(1, 5):
            linear = getattr(self, f"fc{depth}")
            activation = getattr(self, f"relu{depth}")
            hidden = activation(linear(hidden))
        return self.fc5(hidden)

# ------------------------------------------------------------------------------
# 2. Load the pre-trained models
# ------------------------------------------------------------------------------

# Load LightGBM model (assumed saved as a text file)
lgb_model_path = '/kaggle/input/models-arogo/lgb_model.txt'  # Update with your actual file path
loaded_lgb = lgb.Booster(model_file=lgb_model_path)

# Load XGBoost model from a pickle file
# SECURITY: joblib.load unpickles the file and can execute arbitrary code;
# only load artifacts from a source you trust.
xgb_model_path = '/kaggle/input/models-arogo/xgb_model.pkl'  # Update with your actual file path
loaded_xgb = joblib.load(xgb_model_path)

# Load PyTorch model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pytorch_model_path = '/kaggle/input/models-arogo/simple_net_state_dict.pth'  # Update with your actual file path
pytorch_model = SimpleNet(input_size=25, num_classes=1203).to(device)
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all a state dict needs; it avoids running arbitrary pickled code and
# silences the torch.load FutureWarning.
pytorch_model.load_state_dict(
    torch.load(pytorch_model_path, map_location=device, weights_only=True)
)
# Switch to inference mode; as the last expression this also displays the model.
pytorch_model.eval()

# ------------------------------------------------------------------------------
# 3. Define the ensemble prediction functions
# ------------------------------------------------------------------------------

def _ensemble_probabilities(X, weights=None):
    """Blend the three models' class-probability matrices into one weighted sum.

    Shared by ``ensemble_predict`` and ``ensemble_topk`` so both always use the
    same blending logic.

    Args:
        X: Input features of shape (n_samples, n_features). Passed as-is to the
            LightGBM and XGBoost models; converted to a float32 array for the
            PyTorch model, so array-likes other than ``np.ndarray`` also work.
        weights (list or tuple): Weights [w_lgb, w_xgb, w_pt].
            Defaults to equal weighting if None.

    Returns:
        np.ndarray: Weighted sum of the three probability matrices,
        expected shape (n_samples, num_classes).

    Raises:
        ValueError: If ``weights`` does not contain exactly three values.
    """
    if weights is None:
        weights = [1/3, 1/3, 1/3]
    w_lgb, w_xgb, w_pt = weights

    # LightGBM and XGBoost return probability distributions directly.
    lgb_probs = loaded_lgb.predict(X)
    xgb_probs = loaded_xgb.predict_proba(X)

    # PyTorch returns raw logits, so softmax is applied to get probabilities.
    # np.asarray(..., float32) yields the same values as from_numpy(X).float()
    # for ndarrays while also accepting DataFrames and lists.
    X_tensor = torch.from_numpy(np.asarray(X, dtype=np.float32)).to(device)
    with torch.no_grad():
        outputs = pytorch_model(X_tensor)
        pt_probs = F.softmax(outputs, dim=1).cpu().numpy()

    return w_lgb * lgb_probs + w_xgb * xgb_probs + w_pt * pt_probs


def ensemble_predict(X, weights=None):
    """
    Returns the ensemble's top-1 prediction for each sample.
    This is useful for computing overall accuracy.

    Args:
        X (np.array): Input feature array of shape (n_samples, n_features)
        weights (list or tuple): Weights for the three models [w_lgb, w_xgb, w_pt].
                                 Defaults to equal weighting if None.

    Returns:
        final_preds (np.array): Array of shape (n_samples,) with the predicted class labels
                                (column indices of the blended probability matrix).
    """
    ensemble_probs = _ensemble_probabilities(X, weights)
    # The class with the highest blended probability for each sample.
    return np.argmax(ensemble_probs, axis=1)


def ensemble_topk(X, top_k=3, weights=None):
    """
    Returns the top-k predictions (class indices and their probabilities) for each sample.

    Args:
        X (np.array): Input feature array of shape (n_samples, n_features)
        top_k (int): Number of top predictions to return. Values larger than the
                     number of classes return every class; 0 returns empty
                     (n_samples, 0) arrays.
        weights (list or tuple): Weights for the three models [w_lgb, w_xgb, w_pt].
                                 Defaults to equal weighting if None.

    Returns:
        top_k_indices (np.array): Array of shape (n_samples, top_k) with top-k class indices,
                                  most probable first.
        top_k_probs (np.array): Array of shape (n_samples, top_k) with corresponding probabilities.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    ensemble_probs = _ensemble_probabilities(X, weights)

    # Sort descending first, then take the leading k columns. Slicing with
    # `[:, -top_k:]` on the ascending order would select *all* columns when
    # top_k == 0, because -0 == 0.
    descending = np.argsort(ensemble_probs, axis=1)[:, ::-1]
    top_k_indices = descending[:, :top_k]
    top_k_probs = np.take_along_axis(ensemble_probs, top_k_indices, axis=1)

    return top_k_indices, top_k_probs

# ------------------------------------------------------------------------------
# 4. Example usage
# ------------------------------------------------------------------------------

# Assuming X_scaled is your NumPy array of input features:
# For instance:
# X_scaled = np.load('X_scaled.npy')

# # Get top-1 predictions (for accuracy computation)
# final_predictions = ensemble_predict(X_scaled, weights=[0.4, 0.3, 0.3])
# print("Final predictions (top-1) for each sample:\n", final_predictions)

# # # Get top-3 predictions (with probabilities)
# top_k_indices, top_k_probs = ensemble_topk(X_scaled, top_k=3, weights=[0.4, 0.3, 0.3])
# print("Top-3 class indices for each sample:\n", top_k_indices)
# print("Corresponding probabilities for top-3 predictions:\n", top_k_probs)

  pytorch_model.load_state_dict(torch.load(pytorch_model_path, map_location=device))


In [6]:
# Top-1 ensemble prediction for every row of X_scaled (used for the accuracy
# computation). Weights are ordered [w_lgb, w_xgb, w_pt]: LightGBM 0.4,
# XGBoost 0.3, PyTorch net 0.3.
final_predictions = ensemble_predict(X_scaled, weights=[0.4, 0.3, 0.3])

In [7]:
from sklearn.metrics import accuracy_score
# Fraction of rows whose top-1 ensemble prediction equals the true class index.
# NOTE(review): X_scaled is the whole ICD-9 subset and no train/test split is
# made in this notebook; if the pre-trained models were fit on these same rows
# this is training accuracy, not a held-out estimate -- confirm.
accuracy = accuracy_score(y_class, final_predictions)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.98


In [13]:
# Names of every column of final_df from position 5 onward.
# NOTE(review): the 5 is positional; presumably the first five columns are the
# id/gender/ICD columns and the rest are lab-test item ids -- confirm.
tests = list(final_df.columns[5:])
tests

['50868',
 '50882',
 '50893',
 '50902',
 '50912',
 '50931',
 '50960',
 '50970',
 '50971',
 '50983',
 '51006',
 '51221',
 '51222',
 '51237',
 '51248',
 '51249',
 '51250',
 '51265',
 '51274',
 '51275',
 '51277',
 '51279',
 '51301']

In [14]:
# Display the one-hot target frame (one 'icd_code_<code>' column per ICD code).
y

Unnamed: 0,icd_code_00845,icd_code_0088,icd_code_0090,icd_code_01300,icd_code_01896,icd_code_0340,icd_code_035,icd_code_0380,icd_code_03811,icd_code_03812,...,icd_code_V5811,icd_code_V5812,icd_code_V5883,icd_code_V600,icd_code_V618,icd_code_V6284,icd_code_V714,icd_code_V7189,icd_code_V7281,icd_code_V7651
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4570,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4572,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4573,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Recover the bare ICD codes from the one-hot column names by cutting off the
# 9-character 'icd_code_' prefix; position k in this list is class index k.
icd_codes = [column_name[9:] for column_name in y.columns]
icd_codes

In [19]:
# Load the ICD code -> description map. icd_code is forced to str so that codes
# are never inferred as integers (which would strip leading zeros such as in
# '0010') and so that string comparisons against predicted codes always work.
icd_df = pd.read_csv(
    '/kaggle/input/icd-code-map/d_icd_diagnoses.csv',
    dtype={'icd_code': str},
)
# Keep only ICD-9 entries, matching the ICD-9 subset used for the targets.
icd_df = icd_df[icd_df['icd_version'] == 9]

In [20]:
# Display the ICD-9 lookup table (columns: icd_code, icd_version, long_title).
icd_df

Unnamed: 0,icd_code,icd_version,long_title
0,0010,9,Cholera due to vibrio cholerae
1,0011,9,Cholera due to vibrio cholerae el tor
2,0019,9,"Cholera, unspecified"
3,0020,9,Typhoid fever
4,0021,9,Paratyphoid fever A
...,...,...,...
102715,V9129,9,"Quadruplet gestation, unable to determine numb..."
102802,V9190,9,"Other specified multiple gestation, unspecifie..."
102803,V9191,9,"Other specified multiple gestation, with two o..."
102804,V9192,9,"Other specified multiple gestation, with two o..."


In [21]:
# Get top-3 predictions (with probabilities) for every row of X_scaled, using
# the same [w_lgb, w_xgb, w_pt] weights as the top-1 prediction cell.
top_k_indices, top_k_probs = ensemble_topk(X_scaled, top_k=3, weights=[0.4, 0.3, 0.3])

In [26]:
# Peek at the blended probabilities of the top-3 classes for the first 5 rows.
top_k_probs[:5]

array([[0.60780269, 0.00900517, 0.0059068 ],
       [0.55496096, 0.02033082, 0.009049  ],
       [0.35938857, 0.01381496, 0.00984106],
       [0.55590244, 0.00946842, 0.00580056],
       [0.37038364, 0.01350639, 0.01231651]])

In [31]:
def format_ensemble_predictions(top_k_indices, y_labels, icd_df, top_k_probs, max_samples=10):
    """
    Given an array of top k prediction indices for each sample,
    a list of y_labels mapping indices to ICD codes, and a dataframe containing
    ICD code details (with columns "icd_code" and "long_title"), returns a list
    of formatted strings describing the predictions in natural language.

    Args:
        top_k_indices (np.array): Array of shape (n_samples, top_k) with predicted class indices.
        y_labels (list): List of ICD code strings, mapping model output indices to ICD codes.
        icd_df (pd.DataFrame): DataFrame with columns "icd_code" and "long_title".
        top_k_probs (np.array): Array of shape (n_samples, top_k) with the probability
            of each entry in top_k_indices (fractions; reported as percentages).
        max_samples (int or None): Maximum number of leading samples to format.
            Defaults to 10; pass None to format every sample.

    Returns:
        List[str]: A list of formatted strings, one for each formatted sample
        (at most ``max_samples`` of them, never more than n_samples).
    """
    n_samples, top_k = top_k_indices.shape
    # Never index past the available rows: a fixed range(10) raised IndexError
    # whenever fewer than 10 samples were passed.
    n_report = n_samples if max_samples is None else min(n_samples, max_samples)

    # Build the code -> title lookup once instead of scanning icd_df for every
    # predicted code. setdefault keeps the first title seen for a code, the same
    # row a filtered `.iloc[0]` lookup would return.
    title_by_code = {}
    for code, title in zip(icd_df['icd_code'], icd_df['long_title']):
        title_by_code.setdefault(code, title)

    formatted_outputs = []
    for i in range(n_report):
        # Convert predicted indices to ICD codes.
        predicted_codes = [y_labels[idx] for idx in top_k_indices[i]]

        # Format a natural language string.
        parts = [f"Sample {i+1}: The top {top_k} predicted diagnoses are:\n"]
        for rank, (code, prob) in enumerate(zip(predicted_codes, top_k_probs[i]), start=1):
            title = title_by_code.get(code, "Unknown condition")
            # Round to two decimals, then back to float so trailing zeros are
            # dropped in the output (e.g. 0.9 rather than 0.90).
            prob_this = float("{:.2f}".format(prob*100))
            parts.append(f"  {rank}. {title} (ICD Code: {code}) with Probability : {prob_this}%\n")

        formatted_outputs.append("".join(parts))

    return formatted_outputs

# Example usage:
# top_k_indices / top_k_probs come from ensemble_topk, icd_codes maps each class
# index to its ICD code string, and icd_df is the ICD-9 description table.
# NOTE(review): the cell that builds icd_codes shows no execution count in this
# export, so run it before this one on a fresh kernel.
formatted_results = format_ensemble_predictions(top_k_indices, icd_codes, icd_df, top_k_probs)

# Print the formatted output for each sample (each entry already ends with a
# newline, so print leaves a blank line between samples).
for result in formatted_results:
    print(result)

Sample 1: The top 3 predicted diagnoses are:
  1. Portal hypertension (ICD Code: 5723) with Probability : 60.78%
  2. Coronary atherosclerosis of native coronary artery (ICD Code: 41401) with Probability : 0.9%
  3. Atrial fibrillation (ICD Code: 42731) with Probability : 0.59%

Sample 2: The top 3 predicted diagnoses are:
  1. Unspecified viral hepatitis C with hepatic coma (ICD Code: 07071) with Probability : 55.5%
  2. Hyposmolality and/or hyponatremia (ICD Code: 2761) with Probability : 2.03%
  3. Coronary atherosclerosis of native coronary artery (ICD Code: 41401) with Probability : 0.9%

Sample 3: The top 3 predicted diagnoses are:
  1. Chronic hepatitis C without mention of hepatic coma (ICD Code: 07054) with Probability : 35.94%
  2. Closed fracture of four ribs (ICD Code: 80704) with Probability : 1.38%
  3. Coronary atherosclerosis of native coronary artery (ICD Code: 41401) with Probability : 0.98%

Sample 4: The top 3 predicted diagnoses are:
  1. Other iatrogenic hypotensi