In [2]:
import pandas as pd
import numpy as np
import os 
import seaborn as sns
import graphviz

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, ConfusionMatrixDisplay
from scipy.stats import randint
from sklearn.model_selection import train_test_split  # Correção aqui
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import KNNImputer

# Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# Encoders
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.base import BaseEstimator, TransformerMixin
import joblib


In [3]:
df = pd.read_csv('..\models\FINAL.csv', low_memory=False)
display(df)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,value_chart,valuenum_chartevent,chart_label,category,time_since_admission_chartevent,icd_code,admission_type,race,age,lab_results,lab_value_unit,priority,time_since_admission_labevent,death,BMI (kg/m2),Height (Inches),Weight (Lbs)
0,0,117271,65,65.0,Heart Rate,Routine Vital Signs,10.216667,2724,EW EMER.,WHITE,87,12.40,K/uL,STAT,4.933333,0,0.0,0.00,0.0
1,1,164988,Some resistance,4.0,Strength R Arm,Neurological,2.600000,2724,EW EMER.,WHITE,87,25.70,sec,STAT,4.933333,0,0.0,0.00,0.0
2,2,164461,100,100.0,O2 saturation pulseoxymetry,Respiratory,0.000000,2724,URGENT,UNKNOWN,88,3.40,m/uL,ROUTINE,23.250000,0,0.0,0.00,0.0
3,3,128539,0.9,0.9,Creatinine (serum),Labs,4.150000,2724,EW EMER.,WHITE,56,105.00,mEq/L,ROUTINE,4.150000,0,28.3,67.00,180.4
4,4,21639,3.6,3.6,Potassium (whole blood),Labs,9.500000,2724,ELECTIVE,WHITE,70,1.15,mmol/L,Unknown,15.666667,0,0.0,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65995,65995,275391,1,1.0,PICC Line Dressing Occlusive,Access Lines - Invasive,23.683333,E039,EW EMER.,BLACK/AFRICAN AMERICAN,48,13.20,K/uL,ROUTINE,19.550000,0,18.9,62.50,105.0
65996,65996,261802,24,24.0,HCO3 (serum),Labs,18.650000,E039,EW EMER.,WHITE,72,28.00,%,ROUTINE,14.200000,0,0.0,63.25,0.0
65997,65997,284264,0 Alert and calm,0.0,Richmond-RAS Scale,Pain/Sedation,23.683333,E039,EW EMER.,BLACK/AFRICAN AMERICAN,48,141.00,mEq/L,ROUTINE,13.533333,0,18.9,62.50,105.0
65998,65998,276005,4 - Chair - Transfer to chair/bed,4.0,Activity / Mobility (JH-HLM),Treatments,9.900000,E039,EW EMER.,WHITE,32,7.80,mg/dL,ROUTINE,17.400000,0,25.1,65.00,152.0


In [4]:
df = df.copy()

In [5]:
class Treatment(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        df = X.copy()
        self.variables_cat = ['priority']
        self.encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self.encoder.fit(df[self.variables_cat])
        self.ohe_columns = self.encoder.get_feature_names_out(self.variables_cat)

        self.label_encoders = {}
        variables_label = ['icd_code', 'category', 'race', 'lab_value_unit']
        
        for col in variables_label:
            if col in df.columns:
                le = LabelEncoder()
                le.fit(df[col].astype(str))
                self.label_encoders[col] = le

        return self

    def transform(self, X):
        df = X.copy()

        # Feature Interactions
        urgency_rank = {
            'Unknown': 0, 'ELECTIVE': 1, 'OBSERVATION ADMIT': 2,
            'SURGICAL SAME DAY ADMISSION': 3, 'URGENT': 4,
            'DIRECT EMER.': 5, 'EW EMER.': 6
        }
        
        if 'admission_type' in df.columns:
            df['urgency_score'] = df['admission_type'].map(urgency_rank)
            df['urgency_x_lab_delay'] = df['urgency_score'] * df['time_since_admission_labevent']
            df['admission_x_age'] = df['urgency_score'] * df['age']
            # Drop da variável original
            df.drop(columns=['admission_type'], inplace=True)

        # OneHotEncoding
        one_hot_encoded = self.encoder.transform(df[self.variables_cat])
        one_hot_df = pd.DataFrame(one_hot_encoded, columns=self.ohe_columns, index=df.index)
        df = pd.concat([df.drop(columns=self.variables_cat), one_hot_df], axis=1)

        # LabelEncoding 
        for col, le in self.label_encoders.items():
            if col in df.columns:
                df[col] = le.transform(df[col].astype(str))
            else:
                print(f"Aviso: Coluna '{col}' não encontrada em transform().")

        # Frequency encoding
        if 'value_chart' in df.columns:
            df['value_chart'] = df['value_chart'].astype(str)
            df['value_chart_freq'] = df['value_chart'].map(df['value_chart'].value_counts())
            df.drop(columns=['value_chart'], inplace=True)

        if 'chart_label' in df.columns:
            df['chart_label'] = df['chart_label'].astype(str)
            df['chart_label_freq'] = df['chart_label'].map(df['chart_label'].value_counts())
            df.drop(columns=['chart_label'], inplace=True)

        return df


In [6]:
le = LabelEncoder()
y = le.fit_transform(df['icd_code'])
X = df.drop('icd_code', axis=1)

print(le.classes_)
joblib.dump(le, 'label_encoder_icd.pk2')

['2724' '4019' 'E039' 'E785' 'Z794' 'Z87891']


['label_encoder_icd.pk2']

In [7]:
print(X.columns.tolist())


['Unnamed: 0.1', 'Unnamed: 0', 'value_chart', 'valuenum_chartevent', 'chart_label', 'category', 'time_since_admission_chartevent', 'admission_type', 'race', 'age', 'lab_results', 'lab_value_unit', 'priority', 'time_since_admission_labevent', 'death', 'BMI (kg/m2)', 'Height (Inches)', 'Weight (Lbs)']


In [8]:
treatment = Treatment()
X_processed = treatment.fit_transform(X)
joblib.dump(treatment, 'treatment.pk2')

['treatment.pk2']

In [9]:
# 0.2 of data test size
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.2, random_state=42
)

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

param_dist = {'n_estimators': randint(50,150),
              'max_depth': randint(3,10)}

rf = RandomForestClassifier(random_state=42, n_jobs=-1)
rand_search = RandomizedSearchCV(
    rf, 
    param_distributions=param_dist, 
    n_iter=5, 
    cv=5,
    n_jobs=-1
)

rand_search.fit(X_train, y_train)

In [11]:
best_rf = rand_search.best_estimator_
print('Best hyperparameters:',  rand_search.best_params_)

Best hyperparameters: {'max_depth': 9, 'n_estimators': 141}


In [12]:
rf2 = RandomForestClassifier( n_estimators= 141, criterion= 'entropy',
min_samples_split = 10,
max_depth = 9,
random_state= 42

)

In [13]:
rf2.fit(X_train, y_train)

In [14]:
y_pred = rf2.predict(X_test)

In [15]:

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


print(classification_report(y_test, y_pred))

Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2179
           1       1.00      1.00      1.00      2183
           2       1.00      1.00      1.00      2231
           3       1.00      1.00      1.00      2234
           4       1.00      1.00      1.00      2175
           5       1.00      1.00      1.00      2198

    accuracy                           1.00     13200
   macro avg       1.00      1.00      1.00     13200
weighted avg       1.00      1.00      1.00     13200

