# Objectives

# Imports

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import joblib

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report, accuracy_score, precision_score


# Dataset

In [2]:
df = pd.read_csv('..\models\df_balanced_full.csv', low_memory=False)
display(df)

Unnamed: 0,subject_id,value_chartevent,valuenum_chartevent,valueuom_chartevent,label_chartevent,category,time_since_admission_chartevent,icd_code,admission_type,insurance,race,admission_location,age,value_labevent,valuenum_labevent,valueuom_labevent,priority,time_since_admission_labevent
0,10014078,103,103.0,°F,Temperature Fahrenheit,Routine Vital Signs,4.850000,2724,EW EMER.,Medicaid,UNABLE TO OBTAIN,EMERGENCY ROOM,60,-3,-3.0,mEq/L,Unknown,12.316667
1,10011398,25,25.0,mEq/L,TCO2 (calc) Arterial,Labs,7.900000,2724,SURGICAL SAME DAY ADMISSION,Other,HISPANIC OR LATINO,PHYSICIAN REFERRAL,67,1.4,1.4,Unknown,STAT,4.150000
2,10011398,25,25.0,mA,Temporary Ventricular Stim Setting mA,Cardiovascular (Pacer Data),16.750000,2724,SURGICAL SAME DAY ADMISSION,Other,HISPANIC OR LATINO,PHYSICIAN REFERRAL,67,384,384.0,mm Hg,Unknown,5.616667
3,10011398,36.8,36.8,°C,Temperature Celsius,Routine Vital Signs,18.750000,2724,SURGICAL SAME DAY ADMISSION,Other,HISPANIC OR LATINO,PHYSICIAN REFERRAL,67,98,98.0,%,Unknown,6.666667
4,10023771,100,100.0,%,O2 saturation pulseoxymetry,Respiratory,22.750000,2724,ELECTIVE,Medicare,WHITE,PHYSICIAN REFERRAL,70,1.2,1.2,Unknown,ROUTINE,15.366667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49249,10023117,104,104.0,mg/dL,Glucose (serum),Labs,9.050000,Z87891,OBSERVATION ADMIT,Medicare,WHITE,EMERGENCY ROOM,58,2,2.0,Unknown,STAT,2.666667
49250,10019003,Obeys Commands,6.0,Unknown,GCS - Motor Response,Neurological,1.600000,Z87891,EW EMER.,Other,WHITE,TRANSFER FROM HOSPITAL,72,1.1,1.1,mg/dL,STAT,6.000000
49251,10023117,No response,1.0,Unknown,GCS - Motor Response,Neurological,4.950000,Z87891,OBSERVATION ADMIT,Medicare,WHITE,TRANSFER FROM HOSPITAL,58,___,133.0,mEq/L,STAT,2.566667
49252,10037861,No movement,0.0,Unknown,Strength L Arm,Neurological,23.433333,Z87891,EW EMER.,Medicare,UNKNOWN,EMERGENCY ROOM,79,21,21.0,mg/dL,STAT,9.183333


# Treatment

In [25]:
df = df.copy()

In [26]:
class Treatment(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        df = X.copy()

        # Guardar nomes das colunas categóricas para OneHotEncoding
        self.variables_cat = ['admission_type', 'insurance', 'priority']
        self.encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self.encoder.fit(df[self.variables_cat])
        self.ohe_columns = self.encoder.get_feature_names_out(self.variables_cat)

        # Treinar LabelEncoders para outras colunas
        self.label_encoders = {}
        variables_label = ['value_chartevent', 'label_chartevent', 'valueuom_labevent',
                           'valueuom_chartevent', 'admission_location', 'race', 'category']
        
        for col in variables_label:
            if col in df.columns:
                le = LabelEncoder()
                le.fit(df[col].astype(str))
                self.label_encoders[col] = le

        return self

    def transform(self, X):
        df = X.copy()

        # Combined
        df['combined'] = np.where(
            (df['valueuom_chartevent'] == 'Unknown'),
            df['label_chartevent'],
            df['label_chartevent'] + ' (' + df['valueuom_chartevent'] + ')'
        )

        # Feature Interactions
        urgency_rank = {
            'ELECTIVE': 1, 'OBSERVATION ADMIT': 2,
            'SURGICAL SAME DAY ADMISSION': 3, 'URGENT': 4,
            'DIRECT EMER.': 5, 'EW EMER.': 6
        }
        df['urgency_score'] = df['admission_type'].map(urgency_rank)
        df['urgency_x_lab_delay'] = df['urgency_score'] * df['time_since_admission_labevent']
        df['admission_x_age'] = df['urgency_score'] * df['age']

        # OneHotEncoding com encoder treinado no fit()
        one_hot_encoded = self.encoder.transform(df[self.variables_cat])
        one_hot_df = pd.DataFrame(one_hot_encoded, columns=self.ohe_columns, index=df.index)
        df = pd.concat([df.drop(columns=self.variables_cat), one_hot_df], axis=1)

        # LabelEncoding com encoders treinados no fit()
        for col, le in self.label_encoders.items():
            if col in df.columns:
                df[col] = le.transform(df[col].astype(str))
            else:
                print(f"Aviso: Coluna '{col}' não encontrada em transform().")

        # Frequência de valores
        df['value_labevent'] = df['value_labevent'].astype(str)
        df['value_labevent_freq'] = df['value_labevent'].map(df['value_labevent'].value_counts())
        df.drop(columns=['value_labevent'], inplace=True)

        df['combined'] = df['combined'].astype(str)
        df['combined_freq'] = df['combined'].map(df['combined'].value_counts())
        df.drop(columns=['combined'], inplace=True)

        return df

# Model

In [43]:

le = LabelEncoder()
y = le.fit_transform(df['icd_code'])
X = df.drop('icd_code', axis=1)


In [44]:
print(le.classes_)
joblib.dump(le, 'label_encoder_icd.pkl')

['2724' '4019' 'E039' 'E785' 'Z794' 'Z87891']


['label_encoder_icd.pkl']

In [62]:
treatment = Treatment()
X_processed = treatment.fit_transform(X)
joblib.dump(treatment, 'treatment.pkl')

['treatment.pkl']

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, stratify=y, test_size=0.2, random_state=42)

In [64]:

param_dist = {
    'model__max_depth': [5],
    'model__learning_rate': [0.05],
    'model__gamma': [0.5],
    'model__min_child_weight': [3],
    'model__reg_alpha': [0],
    'model__reg_lambda': [1],
    'model__n_estimators': [200]  
}


pipeline = Pipeline([
    ('model', xgb.XGBClassifier(
        objective='multi:softprob',
        num_class=len(le.classes_),
        tree_method='hist',
        eval_metric='mlogloss',
        use_label_encoder=False,
        random_state=42
    ))
])

grid_search = GridSearchCV(
    pipeline,
    param_grid=param_dist,
    scoring='precision_weighted',
    cv=5,
    n_jobs=-1,
    verbose=2 )

In [65]:

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [66]:

y_pred = grid_search.predict(X_test)

# mean CV = 5
print("Accuracy - Internal validation:", grid_search.best_score_)

print("\nAccuracy - Real", accuracy_score(y_test, y_pred))
print("Precision (macro):", precision_score(y_test, y_pred, average='macro'))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

Accuracy - Internal validation: 0.7290169982228374

Accuracy - Real 0.6612526647040909
Precision (macro): 0.7443562338073063

Classification Report:
               precision    recall  f1-score   support

        2724       0.65      0.95      0.77      1641
        4019       0.91      0.49      0.64      1642
        E039       0.68      0.50      0.58      1642
        E785       0.99      0.14      0.25      1642
        Z794       0.72      1.00      0.84      1642
      Z87891       0.51      0.88      0.65      1642

    accuracy                           0.66      9851
   macro avg       0.74      0.66      0.62      9851
weighted avg       0.74      0.66      0.62      9851



In [67]:

joblib.dump(grid_search.best_estimator_, 'modelo_pipeline_final.pkl')

['modelo_pipeline_final.pkl']

# Example

## Download 

In [96]:
treatment = joblib.load('treatment.pkl')
pipeline = joblib.load('modelo_pipeline_final.pkl')
le = joblib.load('label_encoder_icd.pkl')

In [107]:
cases = pd.read_csv('..\models\cases.csv')

In [108]:
case = treatment.transform(cases)

expected_cols = pipeline.named_steps['model'].feature_names_in_
missing_cols = set(expected_cols) - set(case.columns)

print("Missing columns:")
print(missing_cols)


Missing columns:
set()


In [109]:
print(case.columns.tolist())

['subject_id', 'value_chartevent', 'valuenum_chartevent', 'valueuom_chartevent', 'label_chartevent', 'category', 'time_since_admission_chartevent', 'race', 'admission_location', 'age', 'valuenum_labevent', 'valueuom_labevent', 'time_since_admission_labevent', 'urgency_score', 'urgency_x_lab_delay', 'admission_x_age', 'admission_type_DIRECT EMER.', 'admission_type_ELECTIVE', 'admission_type_EW EMER.', 'admission_type_OBSERVATION ADMIT', 'admission_type_SURGICAL SAME DAY ADMISSION', 'admission_type_URGENT', 'insurance_Medicaid', 'insurance_Medicare', 'insurance_Other', 'priority_ROUTINE', 'priority_STAT', 'priority_Unknown', 'value_labevent_freq', 'combined_freq']


In [110]:
preds = pipeline.predict(case)
print(preds)

[1 1 1 1]


In [111]:
lable = pd.read_csv('..\models\icd_match.csv')

In [85]:
icd_dict = dict(zip(lable['icd_code'], lable['long_title']))

In [None]:

predicted_icd_codes = le.inverse_transform(preds)
for i, (index, row) in enumerate(cases.iterrows()):
    icd_code = predicted_icd_codes[i]
    description = icd_dict.get(icd_code)
    
    print(f"Example {i+1}:")
    print(row.to_dict())  
    print(f"Result: {icd_code} - {description}")
    print("-" * 40)

# the last case, not correctly predicted


Exemplo 1:
{'subject_id': 1, 'value_chartevent': 'No movement', 'valuenum_chartevent': 36.8, 'valueuom_chartevent': 'Unknown', 'label_chartevent': 'Temperature Celsius', 'category': 'Routine Vital Signs', 'time_since_admission_chartevent': 12.0, 'admission_type': 'URGENT', 'insurance': 'Medicare', 'race': 'WHITE', 'admission_location': 'EMERGENCY ROOM', 'age': 72, 'value_labevent': 2.0, 'valuenum_labevent': 2.0, 'valueuom_labevent': 'mg/dL', 'priority': 'STAT', 'time_since_admission_labevent': 1.2}
Predição ICD: 4019 - Unspecified essential hypertension
----------------------------------------
Exemplo 2:
{'subject_id': 2, 'value_chartevent': '27', 'valuenum_chartevent': 27.0, 'valueuom_chartevent': 'mA', 'label_chartevent': 'Temporary Ventricular Stim Setting mA', 'category': 'Cardiovascular (Pacer Data)', 'time_since_admission_chartevent': 2.5, 'admission_type': 'EW EMER.', 'insurance': 'Other', 'race': 'HISPANIC OR LATINO', 'admission_location': 'PHYSICIAN REFERRAL', 'age': 67, 'valu