# Install

In [31]:
# Uncomment to install the dependencies into the kernel's environment:
# %pip install xgboost scikit-learn pandas numpy seaborn

# Imports

In [47]:
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn import metrics
from sklearn.impute import KNNImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Dataset

In [48]:
# Load the balanced dataset. Forward slashes are used because the previous
# '..\models\...' literal relied on the invalid escapes '\m' and '\d' and only
# resolved on Windows; this form works on every platform.
# low_memory=False makes pandas read the file in one pass, so columns that mix
# numbers and text get a single consistent dtype.
df = pd.read_csv('../models/df_balanced_full.csv', low_memory=False)
display(df)

Unnamed: 0,value_chartevent,valuenum_chartevent,valueuom_chartevent,label_chartevent,category,time_since_admission_chartevent,icd_code,admission_type,insurance,race,admission_location,age,value_labevent,valuenum_labevent,valueuom_labevent,priority,time_since_admission_labevent
0,103,103.0,°F,Temperature Fahrenheit,Routine Vital Signs,4.850000,2724,EW EMER.,Medicaid,UNABLE TO OBTAIN,EMERGENCY ROOM,60,-3,-3.0,mEq/L,Unknown,12.316667
1,25,25.0,mEq/L,TCO2 (calc) Arterial,Labs,7.900000,2724,SURGICAL SAME DAY ADMISSION,Other,HISPANIC OR LATINO,PHYSICIAN REFERRAL,67,1.4,1.4,Unknown,STAT,4.150000
2,25,25.0,mA,Temporary Ventricular Stim Setting mA,Cardiovascular (Pacer Data),16.750000,2724,SURGICAL SAME DAY ADMISSION,Other,HISPANIC OR LATINO,PHYSICIAN REFERRAL,67,384,384.0,mm Hg,Unknown,5.616667
3,36.8,36.8,°C,Temperature Celsius,Routine Vital Signs,18.750000,2724,SURGICAL SAME DAY ADMISSION,Other,HISPANIC OR LATINO,PHYSICIAN REFERRAL,67,98,98.0,%,Unknown,6.666667
4,100,100.0,%,O2 saturation pulseoxymetry,Respiratory,22.750000,2724,ELECTIVE,Medicare,WHITE,PHYSICIAN REFERRAL,70,1.2,1.2,Unknown,ROUTINE,15.366667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49249,104,104.0,mg/dL,Glucose (serum),Labs,9.050000,Z87891,OBSERVATION ADMIT,Medicare,WHITE,EMERGENCY ROOM,58,2,2.0,Unknown,STAT,2.666667
49250,Obeys Commands,6.0,Unknown,GCS - Motor Response,Neurological,1.600000,Z87891,EW EMER.,Other,WHITE,TRANSFER FROM HOSPITAL,72,1.1,1.1,mg/dL,STAT,6.000000
49251,No response,1.0,Unknown,GCS - Motor Response,Neurological,4.950000,Z87891,OBSERVATION ADMIT,Medicare,WHITE,TRANSFER FROM HOSPITAL,58,___,133.0,mEq/L,STAT,2.566667
49252,No movement,0.0,Unknown,Strength L Arm,Neurological,23.433333,Z87891,EW EMER.,Medicare,UNKNOWN,EMERGENCY ROOM,79,21,21.0,mg/dL,STAT,9.183333


In [49]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,valuenum_chartevent,time_since_admission_chartevent,age,valuenum_labevent,time_since_admission_labevent
count,49254.0,49254.0,49254.0,49254.0,49254.0
mean,64.377868,11.649927,62.735189,108.838101,9.07176
std,689.927564,6.909472,14.341153,1418.442488,7.985474
min,-23.0,-1.8,29.0,-21.0,-20.3
25%,1.84,5.616667,56.0,3.8,3.85
50%,18.0,11.416667,63.0,15.0,9.016667
75%,83.0,17.4,72.0,48.0,15.233333
max,62656.0,24.0,91.0,62656.0,23.983333


In [50]:
# Summary (count, unique, top, freq) of every non-numeric column.
df.describe(exclude=np.number)

Unnamed: 0,value_chartevent,valueuom_chartevent,label_chartevent,category,icd_code,admission_type,insurance,race,admission_location,value_labevent,valueuom_labevent,priority
count,49254,49254,49254,49254,49254,49254,49254,49254,49254,49254,49254,49254
unique,1077,34,362,22,6,6,3,9,7,1044,29,3
top,1,Unknown,Heart Rate,Routine Vital Signs,2724,EW EMER.,Other,WHITE,EMERGENCY ROOM,___,mEq/L,STAT
freq,3893,17903,2482,11152,8209,25601,24221,37442,20280,4762,10605,23997


# Treatment

In [51]:
# Replace missing lab-event values with an explicit 'Unknown' marker.
# Only NaN entries are affected; other placeholder strings are left as they are.
df = df.fillna({'value_labevent': 'Unknown'})


In [52]:
# Count the missing values of each column and express them as a share of all rows.
missing = df.isna().sum()
missing_percent = missing.div(len(df)).mul(100)
print(
    pd.DataFrame(
        {
            "Missing": missing,
            "Percent (%)": missing_percent,
        }
    )
)

                                 Missing  Percent (%)
value_chartevent                       0          0.0
valuenum_chartevent                    0          0.0
valueuom_chartevent                    0          0.0
label_chartevent                       0          0.0
category                               0          0.0
time_since_admission_chartevent        0          0.0
icd_code                               0          0.0
admission_type                         0          0.0
insurance                              0          0.0
race                                   0          0.0
admission_location                     0          0.0
age                                    0          0.0
value_labevent                         0          0.0
valuenum_labevent                      0          0.0
valueuom_labevent                      0          0.0
priority                               0          0.0
time_since_admission_labevent          0          0.0


# XGBoost DMatrix

In [53]:
# Separate the target column (ICD code) from the predictor columns.
y = df['icd_code']
X = df.drop(columns=['icd_code'])

In [54]:

# Encode the ICD codes as consecutive integer class ids.
# The encoder is fitted on the original column rather than on `y`, because `y`
# is rebound to the encoded array below: fitting on `y` would make a second run
# of this cell fit on integers and lose the ICD codes stored in `le.classes_`.
le = LabelEncoder()
y_encoded = le.fit_transform(df['icd_code'])

y = y_encoded

In [55]:
# Names of all non-numeric (text) feature columns.
cats = list(X.select_dtypes(exclude=np.number).columns)

# Cast each of them to the pandas 'category' dtype in a single astype call.
X = X.astype({col: 'category' for col in cats})

In [56]:
# Check that every text column is now 'category' and the numeric columns are unchanged.
print(X.dtypes)

value_chartevent                   category
valuenum_chartevent                 float64
valueuom_chartevent                category
label_chartevent                   category
category                           category
time_since_admission_chartevent     float64
admission_type                     category
insurance                          category
race                               category
admission_location                 category
age                                   int64
value_labevent                     category
valuenum_labevent                   float64
valueuom_labevent                  category
priority                           category
time_since_admission_labevent       float64
dtype: object


In [57]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [58]:
# Wrap both splits in XGBoost's DMatrix container. enable_categorical=True lets
# XGBoost consume the pandas 'category' columns directly.
# `xgb` comes from the import cell at the top of the notebook, so it is not
# re-imported here.
dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
dtest = xgb.DMatrix(X_test, label=y_test, enable_categorical=True)

# Model

In [None]:
params = {
    # Learning task: multi-class classification returning per-class probabilities.
    "objective": "multi:softprob",
    "num_class": len(le.classes_),  # number of distinct diseases (encoded classes)
    "eval_metric": "merror",        # evaluation metric (alternative: mlogloss)
    # Tree booster settings.
    "tree_method": "hist",
    "max_depth": 3,
    "eta": 0.1,
    # Fixed seed for reproducible training.
    "seed": 42,
}

In [60]:
# Number of boosting rounds (trees added per class).
n = 100

# Train the booster on the training DMatrix. No `evals` list is passed, so no
# evaluation metric is reported while training.
model = xgb.train(params, dtrain, num_boost_round=n)

In [62]:
# With the 'multi:softprob' objective, predict() returns one probability per
# class for every row; the predicted class is the column with the highest one.
preds = model.predict(dtest)
y_pred = preds.argmax(axis=1)

# Overall metrics. Precision and recall are averaged across classes weighted by
# support; weighted recall is mathematically equal to accuracy, so those two
# lines print the same number.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall:", metrics.recall_score(y_test, y_pred, average='weighted'))

# Per-class breakdown; target_names maps the encoded ids back to the ICD codes.
print(classification_report(y_test, y_pred, target_names=le.classes_))


Acuraccy: 0.6251142016038981
Precision: 0.6191114836443844
Recall: 0.6251142016038981
              precision    recall  f1-score   support

        2724       0.71      0.72      0.71      1678
        4019       0.69      0.68      0.69      1582
        E039       0.60      0.58      0.59      1595
        E785       0.49      0.39      0.43      1735
        Z794       0.72      0.84      0.78      1618
      Z87891       0.51      0.56      0.53      1643

    accuracy                           0.63      9851
   macro avg       0.62      0.63      0.62      9851
weighted avg       0.62      0.63      0.62      9851

