In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report, accuracy_score, precision_score
import xgboost as xgb
import joblib
import scipy.stats as stats

In [2]:
df = pd.read_csv('..\models\df_balanced_full.csv', low_memory=False)
display(df)

Unnamed: 0,subject_id,value_chartevent,valuenum_chartevent,valueuom_chartevent,label_chartevent,category,time_since_admission_chartevent,icd_code,admission_type,insurance,race,admission_location,age,value_labevent,valuenum_labevent,valueuom_labevent,priority,time_since_admission_labevent
0,10014078,103,103.0,°F,Temperature Fahrenheit,Routine Vital Signs,4.850000,2724,EW EMER.,Medicaid,UNABLE TO OBTAIN,EMERGENCY ROOM,60,-3,-3.0,mEq/L,Unknown,12.316667
1,10011398,25,25.0,mEq/L,TCO2 (calc) Arterial,Labs,7.900000,2724,SURGICAL SAME DAY ADMISSION,Other,HISPANIC OR LATINO,PHYSICIAN REFERRAL,67,1.4,1.4,Unknown,STAT,4.150000
2,10011398,25,25.0,mA,Temporary Ventricular Stim Setting mA,Cardiovascular (Pacer Data),16.750000,2724,SURGICAL SAME DAY ADMISSION,Other,HISPANIC OR LATINO,PHYSICIAN REFERRAL,67,384,384.0,mm Hg,Unknown,5.616667
3,10011398,36.8,36.8,°C,Temperature Celsius,Routine Vital Signs,18.750000,2724,SURGICAL SAME DAY ADMISSION,Other,HISPANIC OR LATINO,PHYSICIAN REFERRAL,67,98,98.0,%,Unknown,6.666667
4,10023771,100,100.0,%,O2 saturation pulseoxymetry,Respiratory,22.750000,2724,ELECTIVE,Medicare,WHITE,PHYSICIAN REFERRAL,70,1.2,1.2,Unknown,ROUTINE,15.366667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49249,10023117,104,104.0,mg/dL,Glucose (serum),Labs,9.050000,Z87891,OBSERVATION ADMIT,Medicare,WHITE,EMERGENCY ROOM,58,2,2.0,Unknown,STAT,2.666667
49250,10019003,Obeys Commands,6.0,Unknown,GCS - Motor Response,Neurological,1.600000,Z87891,EW EMER.,Other,WHITE,TRANSFER FROM HOSPITAL,72,1.1,1.1,mg/dL,STAT,6.000000
49251,10023117,No response,1.0,Unknown,GCS - Motor Response,Neurological,4.950000,Z87891,OBSERVATION ADMIT,Medicare,WHITE,TRANSFER FROM HOSPITAL,58,___,133.0,mEq/L,STAT,2.566667
49252,10037861,No movement,0.0,Unknown,Strength L Arm,Neurological,23.433333,Z87891,EW EMER.,Medicare,UNKNOWN,EMERGENCY ROOM,79,21,21.0,mg/dL,STAT,9.183333


# Treatment

In [3]:
df = df.copy()

In [None]:
class Treatment(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        df = X.copy()

        # Combined
        df['combined'] = np.where(
            (df['valueuom_chartevent'] == 'Unknown'),
            df['label_chartevent'],
            df['label_chartevent'] + ' (' + df['valueuom_chartevent'] + ')'
        )

        # Feature Interactions
        urgency_rank = {
            'ELECTIVE': 1, 'OBSERVATION ADMIT': 2,
            'SURGICAL SAME DAY ADMISSION': 3, 'URGENT': 4,
            'DIRECT EMER.': 5, 'EW EMER.': 6
        }
        df['urgency_score'] = df['admission_type'].map(urgency_rank)
        df['urgency_x_lab_delay'] = df['urgency_score'] * df['time_since_admission_labevent']
        df['admission_x_age'] = df['urgency_score'] * df['age']

        print("Fisrt display Combine + FI")
        display(df.head())

        # OneHot Encoding
        variables_cat = ['admission_type', 'insurance', 'priority']
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        one_hot_encoded = encoder.fit_transform(df[variables_cat])

        one_hot_df = pd.DataFrame(one_hot_encoded, 
                          columns=encoder.get_feature_names_out(variables_cat),
                          index=df.index)

        df = pd.concat([df.drop(columns=variables_cat), one_hot_df], axis=1)
        print("Second display OneHot enconding")
        display(df.head())
        
        # Label Encoding
        variables = ['value_chartevent', 'label_chartevent', 'valueuom_labevent',
                     'valueuom_chartevent', 'admission_location', 'race', 'category']

        label_encoded_df = pd.DataFrame(index=df.index)
        for col in variables:
            if col in df.columns:
                le = LabelEncoder()
                label_encoded_df[col] = le.fit_transform(df[col].astype(str))
            else:
                print(f"Aviso: Coluna '{col}' não encontrada.")


        df = pd.concat([df.drop(columns=variables), label_encoded_df], axis=1)

        print("Thrid Display Label enconding", display(df))

        # Frequency
        df['value_labevent'] = df['value_labevent'].astype(str)
        df['value_labevent_freq'] = df['value_labevent'].map(df['value_labevent'].value_counts())
        df.drop(columns=['value_labevent'], inplace=True)

        df['combined'] = df['combined'].astype(str)
        df['combined_freq'] = df['combined'].map(df['combined'].value_counts())
        df.drop(columns=['combined'], inplace=True)

        # Drop colunas
        cols_to_drop = ['subject_id', 'valueuom_chartevent','label_chartevent','valueuom_labevent']
        df.drop(columns=[col for col in cols_to_drop if col in df.columns], inplace=True)


        print("Final")
        display(df.head())
        print("ahhh")

        return df

# Model

In [15]:

df_processed = Treatment().fit_transform(df)

X = df_processed.drop('icd_code', axis=1)
y = df_processed['icd_code']

le = LabelEncoder()
y = le.fit_transform(y) 


Unnamed: 0,subject_id,value_chartevent,valuenum_chartevent,valueuom_chartevent,label_chartevent,category,time_since_admission_chartevent,icd_code,admission_type,insurance,...,age,value_labevent,valuenum_labevent,valueuom_labevent,priority,time_since_admission_labevent,combined,urgency_score,urgency_x_lab_delay,admission_x_age
0,10014078,103,103.0,°F,Temperature Fahrenheit,Routine Vital Signs,4.850000,2724,EW EMER.,Medicaid,...,60,-3,-3.0,mEq/L,Unknown,12.316667,Temperature Fahrenheit (°F),6,73.900000,360
1,10011398,25,25.0,mEq/L,TCO2 (calc) Arterial,Labs,7.900000,2724,SURGICAL SAME DAY ADMISSION,Other,...,67,1.4,1.4,Unknown,STAT,4.150000,TCO2 (calc) Arterial (mEq/L),3,12.450000,201
2,10011398,25,25.0,mA,Temporary Ventricular Stim Setting mA,Cardiovascular (Pacer Data),16.750000,2724,SURGICAL SAME DAY ADMISSION,Other,...,67,384,384.0,mm Hg,Unknown,5.616667,Temporary Ventricular Stim Setting mA (mA),3,16.850000,201
3,10011398,36.8,36.8,°C,Temperature Celsius,Routine Vital Signs,18.750000,2724,SURGICAL SAME DAY ADMISSION,Other,...,67,98,98.0,%,Unknown,6.666667,Temperature Celsius (°C),3,20.000000,201
4,10023771,100,100.0,%,O2 saturation pulseoxymetry,Respiratory,22.750000,2724,ELECTIVE,Medicare,...,70,1.2,1.2,Unknown,ROUTINE,15.366667,O2 saturation pulseoxymetry (%),1,15.366667,70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49249,10023117,104,104.0,mg/dL,Glucose (serum),Labs,9.050000,Z87891,OBSERVATION ADMIT,Medicare,...,58,2,2.0,Unknown,STAT,2.666667,Glucose (serum) (mg/dL),2,5.333333,116
49250,10019003,Obeys Commands,6.0,Unknown,GCS - Motor Response,Neurological,1.600000,Z87891,EW EMER.,Other,...,72,1.1,1.1,mg/dL,STAT,6.000000,GCS - Motor Response,6,36.000000,432
49251,10023117,No response,1.0,Unknown,GCS - Motor Response,Neurological,4.950000,Z87891,OBSERVATION ADMIT,Medicare,...,58,___,133.0,mEq/L,STAT,2.566667,GCS - Motor Response,2,5.133333,116
49252,10037861,No movement,0.0,Unknown,Strength L Arm,Neurological,23.433333,Z87891,EW EMER.,Medicare,...,79,21,21.0,mg/dL,STAT,9.183333,Strength L Arm,6,55.100000,474


Fisrt display Combine + FI None
Second display OneHot enconding    subject_id value_chartevent  valuenum_chartevent valueuom_chartevent  \
0    10014078              103                103.0                  °F   
1    10011398               25                 25.0               mEq/L   
2    10011398               25                 25.0                  mA   
3    10011398             36.8                 36.8                  °C   
4    10023771              100                100.0                   %   

                        label_chartevent                     category  \
0                 Temperature Fahrenheit          Routine Vital Signs   
1                   TCO2 (calc) Arterial                         Labs   
2  Temporary Ventricular Stim Setting mA  Cardiovascular (Pacer Data)   
3                    Temperature Celsius          Routine Vital Signs   
4            O2 saturation pulseoxymetry                  Respiratory   

   time_since_admission_chartevent icd_code   

Unnamed: 0,subject_id,valuenum_chartevent,time_since_admission_chartevent,icd_code,age,value_labevent,valuenum_labevent,time_since_admission_labevent,combined,urgency_score,...,priority_ROUTINE,priority_STAT,priority_Unknown,value_chartevent,label_chartevent,valueuom_labevent,valueuom_chartevent,admission_location,race,category
0,10014078,103.0,4.850000,2724,60,-3,-3.0,12.316667,Temperature Fahrenheit (°F),6,...,0.0,0.0,1.0,132,312,13,33,1,6,17
1,10011398,25.0,7.900000,2724,67,1.4,1.4,4.150000,TCO2 (calc) Arterial (mEq/L),3,...,0.0,1.0,0.0,406,308,9,14,3,1,10
2,10011398,25.0,16.750000,2724,67,384,384.0,5.616667,Temporary Ventricular Stim Setting mA (mA),3,...,0.0,0.0,1.0,406,323,19,13,3,1,4
3,10011398,36.8,18.750000,2724,67,98,98.0,6.666667,Temperature Celsius (°C),3,...,0.0,0.0,1.0,568,311,4,32,3,1,17
4,10023771,100.0,22.750000,2724,70,1.2,1.2,15.366667,O2 saturation pulseoxymetry (%),1,...,1.0,0.0,0.0,120,211,9,0,3,8,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49249,10023117,104.0,9.050000,Z87891,58,2,2.0,2.666667,Glucose (serum) (mg/dL),2,...,0.0,1.0,0.0,133,135,9,19,1,8,10
49250,10019003,6.0,1.600000,Z87891,72,1.1,1.1,6.000000,GCS - Motor Response,6,...,0.0,1.0,0.0,1038,130,17,5,5,8,12
49251,10023117,1.0,4.950000,Z87891,58,___,133.0,2.566667,GCS - Motor Response,2,...,0.0,1.0,0.0,1027,130,13,5,5,8,12
49252,10037861,0.0,23.433333,Z87891,79,21,21.0,9.183333,Strength L Arm,6,...,0.0,1.0,0.0,1026,300,17,5,1,7,12


Thrid Display Label enconding None
Final


Unnamed: 0,valuenum_chartevent,time_since_admission_chartevent,icd_code,age,valuenum_labevent,time_since_admission_labevent,urgency_score,urgency_x_lab_delay,admission_x_age,admission_type_DIRECT EMER.,...,insurance_Other,priority_ROUTINE,priority_STAT,priority_Unknown,value_chartevent,admission_location,race,category,value_labevent_freq,combined_freq
0,103.0,4.85,2724,60,-3.0,12.316667,6,73.9,360,0.0,...,0.0,0.0,0.0,1.0,132,1,6,17,17,623
1,25.0,7.9,2724,67,1.4,4.15,3,12.45,201,0.0,...,1.0,0.0,1.0,0.0,406,3,1,10,241,157
2,25.0,16.75,2724,67,384.0,5.616667,3,16.85,201,0.0,...,1.0,0.0,0.0,1.0,406,3,1,4,12,8
3,36.8,18.75,2724,67,98.0,6.666667,3,20.0,201,0.0,...,1.0,0.0,0.0,1.0,568,3,1,17,170,162
4,100.0,22.75,2724,70,1.2,15.366667,1,15.366667,70,0.0,...,0.0,1.0,0.0,0.0,120,3,8,15,266,2447


ahhh


In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [None]:

param_dist = {
    'max_depth': [3, 5],
    'learning_rate': [0.05, 0.1],
    'gamma': [0, 0.5],
    'min_child_weight': [1, 3],
    'reg_alpha': [0, 1],
    'reg_lambda': [1, 2],
    'n_estimators': [100, 200]
}

# === Pipeline === #
pipeline = Pipeline([
    ('preprocessing', Treatment()),
    ('imputer', SimpleImputer(strategy='mean')),  # se necessário
    ('model', xgb.XGBClassifier(
        objective='multi:softprob',
        num_class=len(le.classes_),
        tree_method='hist',
        eval_metric='mlogloss',
        use_label_encoder=False,
        random_state=42
    ))
])

# === Randomized Search com pipeline === #
grid_search = GridSearchCV(
    pipeline
    param_grid=param_dist,
    scoring='precision_weighted',
    cv=3,
    n_jobs=-1,
    verbose=2 )

Fitting 3 folds for each of 20 candidates, totalling 60 fits




In [None]:
# === Treinar === #
grid_search.fit(X_train, y_train)

# === Resultados === #
print("Melhores parâmetros:", grid_search.best_params_)
print("Melhor accuracy em validação cruzada:", grid_search.best_score_)

In [None]:
# === Testar no conjunto de teste === #
y_pred = grid_search.predict(X_test)
print("\nAccuracy no teste:", accuracy_score(y_test, y_pred))
print("Precision (macro):", precision_score(y_test, y_pred, average='macro'))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))


In [None]:
# === Guardar pipeline === #
joblib.dump(grid_search.best_estimator_, 'modelo_pipeline_final.pkl')