# Objectives

# Imports

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import joblib

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report, accuracy_score, precision_score


# Dataset

In [2]:
df = pd.read_csv('..\models\df_final.csv', low_memory=False)
display(df)

Unnamed: 0.1,Unnamed: 0,value_chart,valuenum_chartevent,chart_label,category,time_since_admission_chartevent,icd_code,admission_type,race,age,...,lab_value_unit,priority,time_since_admission_labevent,death,BMI (kg/m2),Height (Inches),Weight (Lbs),urgency_score,urgency_x_lab_delay,admission_x_age
0,0,Full resistance,5.0,Strength L Arm,Neurological,13.116667,2724,SURGICAL SAME DAY ADMISSION,OTHER,66,...,mg/dL,Unknown,1.900000,0,0.0,71.00,0.00,3,5.700000,198
1,1,1,1.0,20 Gauge Dressing Occlusive,Access Lines - Peripheral,4.183333,2724,EW EMER.,HISPANIC/LATINO - CUBAN,80,...,%,ROUTINE,4.200000,0,23.6,60.00,121.00,6,25.200000,480
2,2,100,100.0,O2 saturation pulseoxymetry,Respiratory,5.316667,2724,ELECTIVE,WHITE,70,...,mg/dL,ROUTINE,20.000000,0,0.0,0.00,0.00,1,20.000000,70
3,3,Some resistance,4.0,Strength L Leg,Neurological,8.500000,2724,EW EMER.,HISPANIC/LATINO - CUBAN,80,...,mEq/L,ROUTINE,4.200000,0,23.6,60.00,121.00,6,25.200000,480
4,4,106,106.0,Non Invasive Blood Pressure mean,Routine Vital Signs,1.583333,2724,EW EMER.,WHITE,65,...,mEq/L,STAT,10.200000,0,0.0,0.00,0.00,6,61.200000,390
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
439545,465283,74,74.0,Non Invasive Blood Pressure mean,Routine Vital Signs,14.450000,Z87891,EW EMER.,WHITE,89,...,sec,ROUTINE,10.083333,0,26.0,66.00,161.16,6,60.500000,534
439546,465284,-1 Awakens to voice (eye opening/contact) > 10...,-1.0,Richmond-RAS Scale,Pain/Sedation,16.516667,Z87891,OBSERVATION ADMIT,WHITE,58,...,K/uL,STAT,2.566667,0,25.8,67.63,165.00,2,5.133333,116
439547,465285,22,22.0,Peak Insp. Pressure,Respiratory,9.516667,Z87891,OBSERVATION ADMIT,WHITE,58,...,sec,STAT,7.200000,0,25.8,67.63,165.00,2,14.400000,116
439548,465286,74,74.0,Non Invasive Blood Pressure mean,Routine Vital Signs,4.766667,Z87891,OBSERVATION ADMIT,WHITE,58,...,g/dL,STAT,5.483333,0,25.8,67.63,165.00,2,10.966667,116


# Treatment

In [3]:
df = df.copy()

In [8]:
class Treatment(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        df = X.copy()
        self.variables_cat = ['priority']
        self.encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self.encoder.fit(df[self.variables_cat])
        self.ohe_columns = self.encoder.get_feature_names_out(self.variables_cat)

        self.label_encoders = {}
        variables_label = ['icd_code', 'category', 'race', 'lab_value_unit']
        
        for col in variables_label:
            if col in df.columns:
                le = LabelEncoder()
                le.fit(df[col].astype(str))
                self.label_encoders[col] = le

        return self

    def transform(self, X):
        df = X.copy()


        # OneHotEncoding
        one_hot_encoded = self.encoder.transform(df[self.variables_cat])
        one_hot_df = pd.DataFrame(one_hot_encoded, columns=self.ohe_columns, index=df.index)
        df = pd.concat([df.drop(columns=self.variables_cat), one_hot_df], axis=1)

        # LabelEncoding 
        for col, le in self.label_encoders.items():
            if col in df.columns:
                df[col] = le.transform(df[col].astype(str))
            else:
                print(f"Aviso: Coluna '{col}' não encontrada em transform().")

        # Frequency
        df['value_chart'] = df['value_chart'].astype(str)
        df['value_chart_freq'] = df['value_chart'].map(df['value_chart'].value_counts())
        df.drop(columns=['value_chart'], inplace=True)

        df['chart_label'] = df['chart_label'].astype(str)
        df['chart_label_freq'] = df['chart_label'].map(df['chart_label'].value_counts())
        df.drop(columns=['chart_label'], inplace=True)

        return df

# Model

In [None]:

le = LabelEncoder()
y = le.fit_transform(df['icd_code'])
X = df.drop('icd_code', axis=1)

In [10]:
print(le.classes_)
joblib.dump(le, 'label_encoder_icd.pkl')

['2724' '4019' 'E039' 'E785' 'Z794' 'Z87891']


['label_encoder_icd.pkl']

In [11]:
treatment = Treatment()
X_processed = treatment.fit_transform(X)
joblib.dump(treatment, 'treatment.pkl')

['treatment.pkl']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, stratify=y, test_size=0.2, random_state=42)

In [13]:

param_dist = {
    'model__max_depth': [6],
    'model__learning_rate': [0.1],
    'model__gamma': [0.5],
    'model__min_child_weight': [3],
    'model__reg_alpha': [0],
    'model__reg_lambda': [1],
    'model__n_estimators': [200]  
}


pipeline = Pipeline([
    ('model', xgb.XGBClassifier(
        objective='multi:softprob',
        num_class=len(le.classes_),
        tree_method='hist',
        eval_metric='mlogloss',
        subsample=0.8,
        colsample_bytree=0.8,
        use_label_encoder=False,
        random_state=42
    ))
])

grid_search = GridSearchCV(
    pipeline,
    param_grid=param_dist,
    scoring='precision_weighted',
    cv=5,
    n_jobs=-1,
    verbose=2 )

In [15]:

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\catar\AppData\Roaming\Python\Python311\site-packages\xgboost\data.py", line 407, in pandas_feature_info
    new_feature_types.append(_pandas_dtype_mapper[dtype.name])
                             ~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^
KeyError: 'object'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\Users\catar\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\catar\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\catar\AppData\Roaming\Python\Python311\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "C:\Users\catar\AppData\Roaming\Python\Python311\site-packages\xgboost\sklearn.py", line 1663, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
                           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\catar\AppData\Roaming\Python\Python311\site-packages\xgboost\sklearn.py", line 628, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
                    ^^^^^^^^^^^^^^^
  File "C:\Users\catar\AppData\Roaming\Python\Python311\site-packages\xgboost\sklearn.py", line 1137, in _create_dmatrix
    return QuantileDMatrix(
           ^^^^^^^^^^^^^^^^
  File "C:\Users\catar\AppData\Roaming\Python\Python311\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "C:\Users\catar\AppData\Roaming\Python\Python311\site-packages\xgboost\core.py", line 1614, in __init__
    self._init(
  File "C:\Users\catar\AppData\Roaming\Python\Python311\site-packages\xgboost\core.py", line 1678, in _init
    it.reraise()
  File "C:\Users\catar\AppData\Roaming\Python\Python311\site-packages\xgboost\core.py", line 572, in reraise
    raise exc  # pylint: disable=raising-bad-type
    ^^^^^^^^^
  File "C:\Users\catar\AppData\Roaming\Python\Python311\site-packages\xgboost\core.py", line 553, in _handle_exception
    return fn()
           ^^^^
  File "C:\Users\catar\AppData\Roaming\Python\Python311\site-packages\xgboost\core.py", line 640, in <lambda>
    return self._handle_exception(lambda: int(self.next(input_data)), 0)
                                              ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\catar\AppData\Roaming\Python\Python311\site-packages\xgboost\data.py", line 1654, in next
    input_data(**self.kwargs)
  File "C:\Users\catar\AppData\Roaming\Python\Python311\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "C:\Users\catar\AppData\Roaming\Python\Python311\site-packages\xgboost\core.py", line 620, in input_data
    new, cat_codes, feature_names, feature_types = _proxy_transform(
                                                   ^^^^^^^^^^^^^^^^^
  File "C:\Users\catar\AppData\Roaming\Python\Python311\site-packages\xgboost\data.py", line 1707, in _proxy_transform
    df, feature_names, feature_types = _transform_pandas_df(
                                       ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\catar\AppData\Roaming\Python\Python311\site-packages\xgboost\data.py", line 640, in _transform_pandas_df
    feature_names, feature_types = pandas_feature_info(
                                   ^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\catar\AppData\Roaming\Python\Python311\site-packages\xgboost\data.py", line 409, in pandas_feature_info
    _invalid_dataframe_dtype(data)
  File "C:\Users\catar\AppData\Roaming\Python\Python311\site-packages\xgboost\data.py", line 372, in _invalid_dataframe_dtype
    raise ValueError(msg)
ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:admission_type: object


In [None]:

y_pred = grid_search.predict(X_test)

# mean CV = 5
print("Accuracy - Internal validation:", grid_search.best_score_)

print("\nAccuracy - Real", accuracy_score(y_test, y_pred))
print("Precision (macro):", precision_score(y_test, y_pred, average='macro'))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

Accuracy - Internal validation: 0.6447120561723713

Accuracy - Real 0.6503908232666734
Precision (macro): 0.6521365227851583

Classification Report:
               precision    recall  f1-score   support

        2724       0.68      0.82      0.74      1641
        4019       0.77      0.62      0.69      1642
        E039       0.62      0.57      0.59      1642
        E785       0.61      0.25      0.35      1642
        Z794       0.72      0.98      0.83      1642
      Z87891       0.51      0.67      0.58      1642

    accuracy                           0.65      9851
   macro avg       0.65      0.65      0.63      9851
weighted avg       0.65      0.65      0.63      9851



In [13]:

joblib.dump(grid_search.best_estimator_, 'modelo_pipeline_final.pkl')

['modelo_pipeline_final.pkl']

# Example

## Download 

In [14]:
treatment = joblib.load('treatment.pkl')
pipeline = joblib.load('modelo_pipeline_final.pkl')
le = joblib.load('label_encoder_icd.pkl')

In [15]:
cases = pd.read_csv('..\models\cases.csv')

In [18]:
case = treatment.transform(cases)

expected_cols = pipeline.named_steps['model'].feature_names_in_
missing_cols = set(expected_cols) - set(case.columns)

print("Missing columns:")
print(missing_cols)


Missing columns:
set()


In [19]:
display(cases)

Unnamed: 0,subject_id,value_chartevent,valuenum_chartevent,valueuom_chartevent,label_chartevent,category,time_since_admission_chartevent,admission_type,insurance,race,admission_location,age,value_labevent,valuenum_labevent,valueuom_labevent,priority,time_since_admission_labevent
0,1,No movement,36.8,Unknown,Temperature Celsius,Routine Vital Signs,12.0,URGENT,Medicare,WHITE,EMERGENCY ROOM,72,2.0,2.0,mg/dL,STAT,1.2
1,2,27,27.0,mA,Temporary Ventricular Stim Setting mA,Cardiovascular (Pacer Data),2.5,EW EMER.,Other,HISPANIC OR LATINO,PHYSICIAN REFERRAL,67,333.0,333.0,mm Hg,Unknown,2.0
2,3,8,8.0,insp/min,Resp Alarm - Low,Alarms,12.55,EW EMER.,Other,WHITE,PHYSICIAN REFERRAL,59,8.9,8.9,g/dL,STAT,14.95


In [20]:
preds = pipeline.predict(case)
print(preds)

[1 1 4]


In [21]:
lable = pd.read_csv('..\models\icd_match.csv')

In [22]:
icd_dict = dict(zip(lable['icd_code'], lable['long_title']))

In [24]:

predicted_icd_codes = le.inverse_transform(preds)
for i, (index, row) in enumerate(cases.iterrows()):
    icd_code = predicted_icd_codes[i]
    description = icd_dict.get(icd_code)
    
    print(f"Example {i+1}:")
    print(row.to_dict())  
    print(f"Result: {icd_code} - {description}")
    print("-" * 40)




Example 1:
{'subject_id': 1, 'value_chartevent': 'No movement', 'valuenum_chartevent': 36.8, 'valueuom_chartevent': 'Unknown', 'label_chartevent': 'Temperature Celsius', 'category': 'Routine Vital Signs', 'time_since_admission_chartevent': 12.0, 'admission_type': 'URGENT', 'insurance': 'Medicare', 'race': 'WHITE', 'admission_location': 'EMERGENCY ROOM', 'age': 72, 'value_labevent': 2.0, 'valuenum_labevent': 2.0, 'valueuom_labevent': 'mg/dL', 'priority': 'STAT', 'time_since_admission_labevent': 1.2}
Result: 4019 - Unspecified essential hypertension
----------------------------------------
Example 2:
{'subject_id': 2, 'value_chartevent': '27', 'valuenum_chartevent': 27.0, 'valueuom_chartevent': 'mA', 'label_chartevent': 'Temporary Ventricular Stim Setting mA', 'category': 'Cardiovascular (Pacer Data)', 'time_since_admission_chartevent': 2.5, 'admission_type': 'EW EMER.', 'insurance': 'Other', 'race': 'HISPANIC OR LATINO', 'admission_location': 'PHYSICIAN REFERRAL', 'age': 67, 'value_labe