In [17]:
import pandas as pd
import numpy as np
import os 
import seaborn as sns
import graphviz

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, ConfusionMatrixDisplay
from scipy.stats import randint
from sklearn.model_selection import train_test_split  # Correção aqui
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import KNNImputer

# Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# Encoders
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.base import BaseEstimator, TransformerMixin
import joblib


In [18]:
df = pd.read_csv('..\models\df_final.csv', low_memory=False)
display(df)

Unnamed: 0.1,Unnamed: 0,value_chart,valuenum_chartevent,chart_label,category,time_since_admission_chartevent,icd_code,admission_type,race,age,lab_results,lab_value_unit,priority,time_since_admission_labevent,death,BMI (kg/m2),Height (Inches),Weight (Lbs)
0,0,Full resistance,5.0,Strength L Arm,Neurological,13.116667,2724,SURGICAL SAME DAY ADMISSION,OTHER,66,138.0,mg/dL,Unknown,1.900000,0,0.0,71.00,0.00
1,1,1,1.0,20 Gauge Dressing Occlusive,Access Lines - Peripheral,4.183333,2724,EW EMER.,HISPANIC/LATINO - CUBAN,80,17.4,%,ROUTINE,4.200000,0,23.6,60.00,121.00
2,2,100,100.0,O2 saturation pulseoxymetry,Respiratory,5.316667,2724,ELECTIVE,WHITE,70,0.8,mg/dL,ROUTINE,20.000000,0,0.0,0.00,0.00
3,3,Some resistance,4.0,Strength L Leg,Neurological,8.500000,2724,EW EMER.,HISPANIC/LATINO - CUBAN,80,106.0,mEq/L,ROUTINE,4.200000,0,23.6,60.00,121.00
4,4,106,106.0,Non Invasive Blood Pressure mean,Routine Vital Signs,1.583333,2724,EW EMER.,WHITE,65,24.0,mEq/L,STAT,10.200000,0,0.0,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
439545,465283,74,74.0,Non Invasive Blood Pressure mean,Routine Vital Signs,14.450000,Z87891,EW EMER.,WHITE,89,11.9,sec,ROUTINE,10.083333,0,26.0,66.00,161.16
439546,465284,-1 Awakens to voice (eye opening/contact) > 10...,-1.0,Richmond-RAS Scale,Pain/Sedation,16.516667,Z87891,OBSERVATION ADMIT,WHITE,58,15.1,K/uL,STAT,2.566667,0,25.8,67.63,165.00
439547,465285,22,22.0,Peak Insp. Pressure,Respiratory,9.516667,Z87891,OBSERVATION ADMIT,WHITE,58,31.7,sec,STAT,7.200000,0,25.8,67.63,165.00
439548,465286,74,74.0,Non Invasive Blood Pressure mean,Routine Vital Signs,4.766667,Z87891,OBSERVATION ADMIT,WHITE,58,12.9,g/dL,STAT,5.483333,0,25.8,67.63,165.00


In [20]:
df = df.copy()

In [21]:
class Treatment(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        df = X.copy()
        self.variables_cat = ['priority']
        self.encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self.encoder.fit(df[self.variables_cat])
        self.ohe_columns = self.encoder.get_feature_names_out(self.variables_cat)

        self.label_encoders = {}
        variables_label = ['icd_code', 'category', 'race', 'lab_value_unit']
        
        for col in variables_label:
            if col in df.columns:
                le = LabelEncoder()
                le.fit(df[col].astype(str))
                self.label_encoders[col] = le

        return self

    def transform(self, X):
        df = X.copy()

        # Feature Interactions
        urgency_rank = {
            'Unknown': 0, 'ELECTIVE': 1, 'OBSERVATION ADMIT': 2,
            'SURGICAL SAME DAY ADMISSION': 3, 'URGENT': 4,
            'DIRECT EMER.': 5, 'EW EMER.': 6
        }
        
        if 'admission_type' in df.columns:
            df['urgency_score'] = df['admission_type'].map(urgency_rank)
            df['urgency_x_lab_delay'] = df['urgency_score'] * df['time_since_admission_labevent']
            df['admission_x_age'] = df['urgency_score'] * df['age']
            # Drop da variável original
            df.drop(columns=['admission_type'], inplace=True)

        # OneHotEncoding
        one_hot_encoded = self.encoder.transform(df[self.variables_cat])
        one_hot_df = pd.DataFrame(one_hot_encoded, columns=self.ohe_columns, index=df.index)
        df = pd.concat([df.drop(columns=self.variables_cat), one_hot_df], axis=1)

        # LabelEncoding 
        for col, le in self.label_encoders.items():
            if col in df.columns:
                df[col] = le.transform(df[col].astype(str))
            else:
                print(f"Aviso: Coluna '{col}' não encontrada em transform().")

        # Frequency encoding
        if 'value_chart' in df.columns:
            df['value_chart'] = df['value_chart'].astype(str)
            df['value_chart_freq'] = df['value_chart'].map(df['value_chart'].value_counts())
            df.drop(columns=['value_chart'], inplace=True)

        if 'chart_label' in df.columns:
            df['chart_label'] = df['chart_label'].astype(str)
            df['chart_label_freq'] = df['chart_label'].map(df['chart_label'].value_counts())
            df.drop(columns=['chart_label'], inplace=True)

        return df


In [22]:
le = LabelEncoder()
y = le.fit_transform(df['icd_code'])
X = df.drop('icd_code', axis=1)

print(le.classes_)
joblib.dump(le, 'label_encoder_icd.pk2')

['2724' '4019' 'E039' 'E785' 'Z794' 'Z87891']


['label_encoder_icd.pk2']

In [23]:
print(X.columns.tolist())


['Unnamed: 0', 'value_chart', 'valuenum_chartevent', 'chart_label', 'category', 'time_since_admission_chartevent', 'admission_type', 'race', 'age', 'lab_results', 'lab_value_unit', 'priority', 'time_since_admission_labevent', 'death', 'BMI (kg/m2)', 'Height (Inches)', 'Weight (Lbs)']


In [24]:
treatment = Treatment()
X_processed = treatment.fit_transform(X)
joblib.dump(treatment, 'treatment.pk2')

['treatment.pk2']

In [27]:
# 0.2 of data test size
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.2, random_state=42
)

In [None]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [None]:
y_pred = rf.predict(X_test)

In [None]:

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


print(classification_report(y_test, y_pred))