# Imports

In [2]:
from sklearn import datasets
import pandas as pd
import numpy as np
import os 
import seaborn as sns
from sklearn import svm
from sklearn import metrics

from sklearn.model_selection import train_test_split

# encoders
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [3]:
# Load the dataset from a CSV file located in the current working directory.
df = pd.read_csv('df_balanced_full.csv', low_memory=False)

In [4]:
# Render the loaded frame with the notebook's rich display (long frames are
# shown truncated, first and last rows only).
display(df)

Unnamed: 0,value_chartevent,valuenum_chartevent,valueuom_chartevent,label_chartevent,category,time_since_admission_chartevent,icd_code,admission_type,insurance,race,admission_location,age,value_labevent,valuenum_labevent,valueuom_labevent,priority,time_since_admission_labevent
0,103,103.0,°F,Temperature Fahrenheit,Routine Vital Signs,4.850000,2724,EW EMER.,Medicaid,UNABLE TO OBTAIN,EMERGENCY ROOM,60,-3,-3.00,mEq/L,Unknown,12.316667
1,25,25.0,mEq/L,TCO2 (calc) Arterial,Labs,7.900000,2724,SURGICAL SAME DAY ADMISSION,Other,HISPANIC OR LATINO,PHYSICIAN REFERRAL,67,1.4,1.40,Unknown,STAT,4.150000
2,25,25.0,mA,Temporary Ventricular Stim Setting mA,Cardiovascular (Pacer Data),16.750000,2724,SURGICAL SAME DAY ADMISSION,Other,HISPANIC OR LATINO,PHYSICIAN REFERRAL,67,384,384.00,mm Hg,Unknown,5.616667
3,36.8,36.8,°C,Temperature Celsius,Routine Vital Signs,18.750000,2724,SURGICAL SAME DAY ADMISSION,Other,HISPANIC OR LATINO,PHYSICIAN REFERRAL,67,98,98.00,%,Unknown,6.666667
4,100,100.0,%,O2 saturation pulseoxymetry,Respiratory,22.750000,2724,ELECTIVE,Medicare,WHITE,PHYSICIAN REFERRAL,70,1.2,1.20,Unknown,ROUTINE,15.366667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37195,0 Alert and calm,0.0,Unknown,Goal Richmond-RAS Scale,Pain/Sedation,3.583333,Z87891,EW EMER.,Other,WHITE,EMERGENCY ROOM,70,___,36.00,mm Hg,Unknown,4.133333
37196,127,127.0,bpm,Heart Rate,Routine Vital Signs,21.800000,Z87891,URGENT,Medicare,UNKNOWN,TRANSFER FROM HOSPITAL,81,38.2,38.20,%,ROUTINE,18.600000
37197,475,475.0,mL,Tidal Volume (observed),Respiratory,23.516667,Z87891,OBSERVATION ADMIT,Medicare,WHITE,TRANSFER FROM HOSPITAL,58,71,71.00,fL,STAT,15.233333
37198,12,12.0,insp/min,Respiratory Rate,Respiratory,11.916667,Z87891,SURGICAL SAME DAY ADMISSION,Medicare,WHITE,PHYSICIAN REFERRAL,72,7.37,7.37,units,Unknown,16.300000


In [5]:

# List every column of the frame (this includes the target column 'icd_code')
# and the distinct values of the target.
column_names = df.columns.tolist()
distinct_labels = df['icd_code'].unique()

print("Features: ", column_names)
print("Labels: ", distinct_labels)


Features:  ['value_chartevent', 'valuenum_chartevent', 'valueuom_chartevent', 'label_chartevent', 'category', 'time_since_admission_chartevent', 'icd_code', 'admission_type', 'insurance', 'race', 'admission_location', 'age', 'value_labevent', 'valuenum_labevent', 'valueuom_labevent', 'priority', 'time_since_admission_labevent']
Labels:  ['2724' '4019' 'E039' 'E785' 'Z794' 'Z87891']


In [6]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,value_chartevent,valuenum_chartevent,valueuom_chartevent,label_chartevent,category,time_since_admission_chartevent,icd_code,admission_type,insurance,race,admission_location,age,value_labevent,valuenum_labevent,valueuom_labevent,priority,time_since_admission_labevent
0,103.0,103.0,°F,Temperature Fahrenheit,Routine Vital Signs,4.85,2724,EW EMER.,Medicaid,UNABLE TO OBTAIN,EMERGENCY ROOM,60,-3.0,-3.0,mEq/L,Unknown,12.316667
1,25.0,25.0,mEq/L,TCO2 (calc) Arterial,Labs,7.9,2724,SURGICAL SAME DAY ADMISSION,Other,HISPANIC OR LATINO,PHYSICIAN REFERRAL,67,1.4,1.4,Unknown,STAT,4.15
2,25.0,25.0,mA,Temporary Ventricular Stim Setting mA,Cardiovascular (Pacer Data),16.75,2724,SURGICAL SAME DAY ADMISSION,Other,HISPANIC OR LATINO,PHYSICIAN REFERRAL,67,384.0,384.0,mm Hg,Unknown,5.616667
3,36.8,36.8,°C,Temperature Celsius,Routine Vital Signs,18.75,2724,SURGICAL SAME DAY ADMISSION,Other,HISPANIC OR LATINO,PHYSICIAN REFERRAL,67,98.0,98.0,%,Unknown,6.666667
4,100.0,100.0,%,O2 saturation pulseoxymetry,Respiratory,22.75,2724,ELECTIVE,Medicare,WHITE,PHYSICIAN REFERRAL,70,1.2,1.2,Unknown,ROUTINE,15.366667


# Treatment of String Features

In [7]:
# Column groups used by the encoding cells that follow.

# Target (icd_code)
target = 'icd_code'

# Numerical columns
numerical_cols = [
    'valuenum_chartevent',
    'valuenum_labevent',
    'time_since_admission_chartevent',
    'time_since_admission_labevent',
    'age',
]

# String columns with a low number of unique values -> one-hot encoding
variables_cat = [
    'admission_type',
    'insurance',
    'priority',
    'race',
    'admission_location',
    'category',
]

# String columns with a medium number of unique values -> frequency encoding
variables = [
    'value_chartevent',
    'label_chartevent',
    'valueuom_labevent',
    'valueuom_chartevent',
]

# String column with the highest number of unique values -> frequency encoding
lab = ['value_labevent']

In [8]:
# Build the feature frame from the four column groups (in this order) and pull
# out the target column; both are copies, so later edits do not touch `df`.
feature_columns = [*variables, *numerical_cols, *variables_cat, *lab]
X_raw = df[feature_columns].copy()
y = df[target].copy()

In [9]:
# Convert the target labels to integer class ids. The fitted encoder stays
# available as `le`.
le = LabelEncoder()
le.fit(y)
y_encoded = le.transform(y)

# From this point on `y` refers to the encoded labels.
y = y_encoded

In [10]:
# One-hot encode the low-cardinality string columns into a dense array.
categorical_part = X_raw[variables_cat]
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
one_hot_encoded = encoder.fit_transform(categorical_part)

# Wrap the array in a frame that shares X_raw's index so the rows line up.
one_hot_df = pd.DataFrame(
    data=one_hot_encoded,
    index=X_raw.index,
    columns=encoder.get_feature_names_out(variables_cat),
)

# Replace the original string columns with their indicator columns.
non_categorical_part = X_raw.drop(columns=variables_cat)
X_encoded = pd.concat([non_categorical_part, one_hot_df], axis=1)

In [11]:
# Frequency encoding: add a '<column>_freq' column holding, for every row, how
# often that row's value occurs in the column.
for name in variables:
    occurrences = X_encoded[name].value_counts()
    X_encoded[f"{name}_freq"] = X_encoded[name].map(occurrences)

# The original string columns are no longer needed once encoded.
X_encoded = X_encoded.drop(columns=variables)

# Missing-value indicators: 1 where the numerical value is NaN, 0 otherwise.
for name in numerical_cols:
    X_encoded[f"{name}_missing"] = X_encoded[name].isna().astype(int)

# Replace the NaNs in the numerical columns with the placeholder value -999.
X_encoded[numerical_cols] = X_encoded[numerical_cols].fillna(-999)

display(X_encoded)

Unnamed: 0,valuenum_chartevent,valuenum_labevent,time_since_admission_chartevent,time_since_admission_labevent,age,value_labevent,admission_type_DIRECT EMER.,admission_type_ELECTIVE,admission_type_EW EMER.,admission_type_OBSERVATION ADMIT,...,category_Treatments,value_chartevent_freq,label_chartevent_freq,valueuom_labevent_freq,valueuom_chartevent_freq,valuenum_chartevent_missing,valuenum_labevent_missing,time_since_admission_chartevent_missing,time_since_admission_labevent_missing,age_missing
0,103.0,-3.00,4.850000,12.316667,60,-3,0.0,0.0,1.0,0.0,...,0.0,131,474,8052,474,0,0,0,0,0
1,25.0,1.40,7.900000,4.150000,67,1.4,0.0,0.0,0.0,0.0,...,0.0,147,121,2204,1559,0,0,0,0,0
2,25.0,384.00,16.750000,5.616667,67,384,0.0,0.0,0.0,0.0,...,0.0,147,7,2052,57,0,0,0,0,0
3,36.8,98.00,18.750000,6.666667,67,98,0.0,0.0,0.0,0.0,...,0.0,17,124,4113,212,0,0,0,0,0
4,100.0,1.20,22.750000,15.366667,70,1.2,0.0,1.0,0.0,0.0,...,0.0,914,1863,2204,3052,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37195,0.0,36.00,3.583333,4.133333,70,___,0.0,0.0,1.0,0.0,...,0.0,385,294,2052,13476,0,0,0,0,0
37196,127.0,38.20,21.800000,18.600000,81,38.2,0.0,0.0,0.0,0.0,...,0.0,27,1903,4113,2307,0,0,0,0,0
37197,475.0,71.00,23.516667,15.233333,58,71,0.0,0.0,0.0,1.0,...,0.0,2,113,1270,351,0,0,0,0,0
37198,12.0,7.37,11.916667,16.300000,72,7.37,0.0,0.0,0.0,0.0,...,0.0,243,1846,1232,2751,0,0,0,0,0


In [12]:

# Frequency-encode the lab value column. Entries are cast to str first so every
# value is counted as text.
lab_values_as_text = X_encoded['value_labevent'].astype(str)

freq = lab_values_as_text.value_counts()
X_encoded['value_labevent_freq'] = lab_values_as_text.map(freq)

# Drop the original column now that its frequency feature exists.
X_encoded = X_encoded.drop(columns=['value_labevent'])

In [13]:
# Final feature matrix: `X` is another name for `X_encoded` (plain assignment,
# no copy is made).
X = X_encoded

# Missing Values (NaN)

In [14]:
# Remove every row that still has a NaN in any feature column, and apply the
# same row filter to the labels. Dropping rows from X alone would leave y with
# more entries than X, and train_test_split rejects inputs of different length.
# The mask is positional (a NumPy boolean array), so it works whether y is a
# NumPy array or a pandas Series. Re-running the cell is harmless: once X has
# no NaN left the mask is all True.
complete_rows = X.notna().all(axis=1).to_numpy()
y = y[complete_rows]
X = X.dropna()

# Model

In [15]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [16]:
# Keep the feature column labels so they can be re-attached to the training and
# test frames after scaling.
cols = X_train.columns

In [17]:
from sklearn.preprocessing import StandardScaler

# Standardize the features. The scaler is fitted on the training split only and
# the same fitted scaler is then applied to both splits.
scaler = StandardScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [18]:
# Re-wrap the scaled training array in a DataFrame carrying the original column
# labels. `cols` is passed as-is: wrapping it in a list (`[cols]`) makes pandas
# build a one-level MultiIndex whose labels are tuples instead of plain strings.
X_train = pd.DataFrame(X_train, columns=cols)

In [19]:
# Re-wrap the scaled test array in a DataFrame carrying the original column
# labels. `cols` is passed as-is: wrapping it in a list (`[cols]`) makes pandas
# build a one-level MultiIndex whose labels are tuples instead of plain strings.
X_test = pd.DataFrame(X_test, columns=cols)

In [20]:
# Summary statistics of the scaled training features.
X_train.describe()

Unnamed: 0,valuenum_chartevent,valuenum_labevent,time_since_admission_chartevent,time_since_admission_labevent,age,admission_type_DIRECT EMER.,admission_type_ELECTIVE,admission_type_EW EMER.,admission_type_OBSERVATION ADMIT,admission_type_SURGICAL SAME DAY ADMISSION,...,value_chartevent_freq,label_chartevent_freq,valueuom_labevent_freq,valueuom_chartevent_freq,valuenum_chartevent_missing,valuenum_labevent_missing,time_since_admission_chartevent_missing,time_since_admission_labevent_missing,age_missing,value_labevent_freq
count,29760.0,29760.0,29760.0,29760.0,29760.0,29760.0,29760.0,29760.0,29760.0,29760.0,...,29760.0,29760.0,29760.0,29760.0,29760.0,29760.0,29760.0,29760.0,29760.0,29760.0
mean,2.98447e-18,-1.9100610000000003e-17,-3.1325e-16,1.656978e-16,-2.043765e-16,-4.4408920000000007e-17,-5.4675500000000005e-17,1.069634e-16,-7.401487e-17,-1.9578130000000003e-17,...,2.244322e-17,-3.9812840000000006e-17,-3.7484950000000003e-17,-8.022257e-17,0.0,0.0,0.0,0.0,0.0,-3.151601e-17
std,1.000017,1.000017,1.000017,1.000017,1.000017,1.000017,1.000017,1.000017,1.000017,1.000017,...,1.000017,1.000017,1.000017,1.000017,0.0,0.0,0.0,0.0,0.0,1.000017
min,-0.1281597,-0.08971949,-1.939575,-3.666185,-2.343028,-0.2011226,-0.2081632,-1.03865,-0.4511031,-0.3769187,...,-0.6028174,-0.8931661,-1.492978,-1.391844,0.0,0.0,0.0,0.0,0.0,-0.4351847
25%,-0.09220701,-0.07219809,-0.8835067,-0.6493871,-0.4653846,-0.2011226,-0.2081632,-1.03865,-0.4511031,-0.3769187,...,-0.5151752,-0.6807061,-0.989747,-0.9512866,0.0,0.0,0.0,0.0,0.0,-0.4035749
50%,-0.06896052,-0.0642852,-0.03385179,-0.003971305,0.02141185,-0.2011226,-0.2081632,0.9627886,-0.4511031,-0.3769187,...,-0.4241621,-0.5390661,-0.07015558,0.08973616,0.0,0.0,0.0,0.0,0.0,-0.3508919
75%,0.02489177,-0.04097043,0.828404,0.7726096,0.647293,-0.2011226,-0.2081632,0.9627886,-0.4511031,-0.3769187,...,-0.21854,0.3453971,0.9736862,1.182533,0.0,0.0,0.0,0.0,0.0,-0.2464839
max,90.37288,44.19209,1.775865,1.865652,1.968598,4.972092,4.803922,0.9627886,2.216788,2.653092,...,2.674776,2.100159,1.293138,1.182533,0.0,0.0,0.0,0.0,0.0,3.026565


In [21]:
# Support vector classifier created with scikit-learn's default settings (no
# arguments are passed).
clf = svm.SVC()

In [None]:
# Fit the classifier on the scaled training features and their labels.
clf.fit(X_train, y_train)

In [None]:
# Predict a label for every row of the scaled test features.
y_pred = clf.predict(X_test)

In [None]:
# Share of test rows whose predicted label equals the true label.
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
# The target has more than two classes, so precision and recall need an
# explicit averaging mode: the default average='binary' raises a ValueError on
# multiclass targets. Macro averaging computes the metric per class and takes
# the unweighted mean, giving every class the same weight.

# Model precision: of the rows predicted as a class, what fraction truly belong to it?
print("Precision:", metrics.precision_score(y_test, y_pred, average="macro"))

# Model recall: of the rows that truly belong to a class, what fraction were predicted as it?
print("Recall:", metrics.recall_score(y_test, y_pred, average="macro"))
