# Imports

In [3]:
from sklearn import datasets
import pandas as pd
import numpy as np
import os 
import seaborn as sns
from sklearn import svm
from sklearn import metrics

from sklearn.model_selection import train_test_split

# encoders
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [4]:
# Load the dataset from a CSV in the working directory (presumably class-balanced,
# judging by the file name - confirm). low_memory=False makes pandas parse the
# file in a single pass instead of in chunks.
df = pd.read_csv('df_balanced_full.csv', low_memory=False)

In [5]:
# Show the loaded frame; the notebook truncates the rendering to the first and last rows.
display(df)

Unnamed: 0,value_chartevent,valuenum_chartevent,valueuom_chartevent,label_chartevent,category,time_since_admission_chartevent,icd_code,admission_type,insurance,race,admission_location,age,value_labevent,valuenum_labevent,valueuom_labevent,priority,time_since_admission_labevent
0,103,103.0,°F,Temperature Fahrenheit,Routine Vital Signs,4.850000,2724,EW EMER.,Medicaid,UNABLE TO OBTAIN,EMERGENCY ROOM,60,-3,-3.0,mEq/L,Unknown,12.316667
1,25,25.0,mEq/L,TCO2 (calc) Arterial,Labs,7.900000,2724,SURGICAL SAME DAY ADMISSION,Other,HISPANIC OR LATINO,PHYSICIAN REFERRAL,67,1.4,1.4,Unknown,STAT,4.150000
2,25,25.0,mA,Temporary Ventricular Stim Setting mA,Cardiovascular (Pacer Data),16.750000,2724,SURGICAL SAME DAY ADMISSION,Other,HISPANIC OR LATINO,PHYSICIAN REFERRAL,67,384,384.0,mm Hg,Unknown,5.616667
3,36.8,36.8,°C,Temperature Celsius,Routine Vital Signs,18.750000,2724,SURGICAL SAME DAY ADMISSION,Other,HISPANIC OR LATINO,PHYSICIAN REFERRAL,67,98,98.0,%,Unknown,6.666667
4,100,100.0,%,O2 saturation pulseoxymetry,Respiratory,22.750000,2724,ELECTIVE,Medicare,WHITE,PHYSICIAN REFERRAL,70,1.2,1.2,Unknown,ROUTINE,15.366667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49249,104,104.0,mg/dL,Glucose (serum),Labs,9.050000,Z87891,OBSERVATION ADMIT,Medicare,WHITE,EMERGENCY ROOM,58,2,2.0,Unknown,STAT,2.666667
49250,Obeys Commands,6.0,Unknown,GCS - Motor Response,Neurological,1.600000,Z87891,EW EMER.,Other,WHITE,TRANSFER FROM HOSPITAL,72,1.1,1.1,mg/dL,STAT,6.000000
49251,No response,1.0,Unknown,GCS - Motor Response,Neurological,4.950000,Z87891,OBSERVATION ADMIT,Medicare,WHITE,TRANSFER FROM HOSPITAL,58,___,133.0,mEq/L,STAT,2.566667
49252,No movement,0.0,Unknown,Strength L Arm,Neurological,23.433333,Z87891,EW EMER.,Medicare,UNKNOWN,EMERGENCY ROOM,79,21,21.0,mg/dL,STAT,9.183333


In [6]:

# List every column of the raw frame (this includes the target `icd_code`)
# followed by the distinct ICD codes that serve as class labels.
column_names = list(df.columns)
print("Features: ", column_names)

distinct_codes = df['icd_code'].unique()
print("Labels: ", distinct_codes)

Features:  ['value_chartevent', 'valuenum_chartevent', 'valueuom_chartevent', 'label_chartevent', 'category', 'time_since_admission_chartevent', 'icd_code', 'admission_type', 'insurance', 'race', 'admission_location', 'age', 'value_labevent', 'valuenum_labevent', 'valueuom_labevent', 'priority', 'time_since_admission_labevent']
Labels:  ['2724' '4019' 'E039' 'E785' 'Z794' 'Z87891']


In [7]:
# Preview the first five rows of the raw frame.
df.head()

Unnamed: 0,value_chartevent,valuenum_chartevent,valueuom_chartevent,label_chartevent,category,time_since_admission_chartevent,icd_code,admission_type,insurance,race,admission_location,age,value_labevent,valuenum_labevent,valueuom_labevent,priority,time_since_admission_labevent
0,103.0,103.0,°F,Temperature Fahrenheit,Routine Vital Signs,4.85,2724,EW EMER.,Medicaid,UNABLE TO OBTAIN,EMERGENCY ROOM,60,-3.0,-3.0,mEq/L,Unknown,12.316667
1,25.0,25.0,mEq/L,TCO2 (calc) Arterial,Labs,7.9,2724,SURGICAL SAME DAY ADMISSION,Other,HISPANIC OR LATINO,PHYSICIAN REFERRAL,67,1.4,1.4,Unknown,STAT,4.15
2,25.0,25.0,mA,Temporary Ventricular Stim Setting mA,Cardiovascular (Pacer Data),16.75,2724,SURGICAL SAME DAY ADMISSION,Other,HISPANIC OR LATINO,PHYSICIAN REFERRAL,67,384.0,384.0,mm Hg,Unknown,5.616667
3,36.8,36.8,°C,Temperature Celsius,Routine Vital Signs,18.75,2724,SURGICAL SAME DAY ADMISSION,Other,HISPANIC OR LATINO,PHYSICIAN REFERRAL,67,98.0,98.0,%,Unknown,6.666667
4,100.0,100.0,%,O2 saturation pulseoxymetry,Respiratory,22.75,2724,ELECTIVE,Medicare,WHITE,PHYSICIAN REFERRAL,70,1.2,1.2,Unknown,ROUTINE,15.366667


# Treatment of String Features

In [8]:
# Column groups that drive the encoding steps further down.

# String columns with a medium number of distinct values -> frequency encoding
variables = [
    'value_chartevent',
    'label_chartevent',
    'valueuom_labevent',
    'valueuom_chartevent',
]

# String column with the highest number of distinct values -> frequency encoding
# (kept in its own list because it is encoded separately)
lab = ['value_labevent']

# String columns with a low number of distinct values -> one-hot encoding
variables_cat = [
    'admission_type',
    'insurance',
    'priority',
    'race',
    'admission_location',
    'category',
]

# Numerical columns
numerical_cols = [
    'valuenum_chartevent',
    'valuenum_labevent',
    'time_since_admission_chartevent',
    'time_since_admission_labevent',
    'age',
]

# Target column (ICD code)
target = 'icd_code'

In [9]:
# Split the raw frame into the feature matrix and the target series.
# Copies are taken so later edits never write back into `df`.
feature_columns = variables + numerical_cols + variables_cat + lab
X_raw = df.loc[:, feature_columns].copy()
y = df.loc[:, target].copy()

In [10]:
# Encode the ICD codes as integer class ids.
# The encoder is fitted on the original string column `df[target]` rather than
# on `y`, because `y` is rebound to the encoded array below: fitting on `y`
# would make a second run of this cell fit on integers and lose the ICD code
# strings in `le.classes_` (used later as `target_names` in the reports).
le = LabelEncoder()
y_encoded = le.fit_transform(df[target])

# Downstream cells use `y` as the encoded target.
y = y_encoded

In [11]:
# One-hot encode the low-cardinality string columns and swap them into the
# feature frame in place of the original columns.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit(X_raw[variables_cat])
one_hot_encoded = encoder.transform(X_raw[variables_cat])

# Wrap the dense array in a frame that shares X_raw's index so the
# column-wise concat below aligns row by row.
one_hot_columns = encoder.get_feature_names_out(variables_cat)
one_hot_df = pd.DataFrame(one_hot_encoded, columns=one_hot_columns, index=X_raw.index)

remaining_features = X_raw.drop(columns=variables_cat)
X_encoded = pd.concat([remaining_features, one_hot_df], axis=1)

In [None]:
# Frequency-encode the medium-cardinality string columns: each value is
# replaced by the number of rows in which it occurs, stored as `<col>_freq`,
# and the original string columns are removed.
# NOTE(review): the counts are taken over the full dataset, i.e. before the
# train/test split done later in the notebook, so test rows contribute to the
# encoding - consider fitting the counts on the training rows only.
freq_features = {
    col + '_freq': X_encoded[col].map(X_encoded[col].value_counts())
    for col in variables
}
X_encoded = X_encoded.assign(**freq_features).drop(columns=variables)

# Missing-value indicators: one 0/1 column per numerical column.
missing_flags = X_encoded[numerical_cols].isna().astype(int).add_suffix('_missing')
X_encoded = pd.concat([X_encoded, missing_flags], axis=1)

# Fill the gaps themselves with the sentinel value -999.
X_encoded[numerical_cols] = X_encoded[numerical_cols].fillna(-999)

display(X_encoded)

Unnamed: 0,valuenum_chartevent,valuenum_labevent,time_since_admission_chartevent,time_since_admission_labevent,age,value_labevent,admission_type_DIRECT EMER.,admission_type_ELECTIVE,admission_type_EW EMER.,admission_type_OBSERVATION ADMIT,...,category_Treatments,value_chartevent_freq,label_chartevent_freq,valueuom_labevent_freq,valueuom_chartevent_freq,valuenum_chartevent_missing,valuenum_labevent_missing,time_since_admission_chartevent_missing,time_since_admission_labevent_missing,age_missing
0,103.0,-3.0,4.850000,12.316667,60,-3,0.0,0.0,1.0,0.0,...,0.0,163,623,10605,623,0,0,0,0,0
1,25.0,1.4,7.900000,4.150000,67,1.4,0.0,0.0,0.0,0.0,...,0.0,191,157,2878,2053,0,0,0,0,0
2,25.0,384.0,16.750000,5.616667,67,384,0.0,0.0,0.0,0.0,...,0.0,191,8,2726,71,0,0,0,0,0
3,36.8,98.0,18.750000,6.666667,67,98,0.0,0.0,0.0,0.0,...,0.0,25,162,5471,281,0,0,0,0,0
4,100.0,1.2,22.750000,15.366667,70,1.2,0.0,1.0,0.0,0.0,...,0.0,1201,2447,2878,3993,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49249,104.0,2.0,9.050000,2.666667,58,2,0.0,0.0,0.0,1.0,...,0.0,111,253,2878,1769,0,0,0,0,0
49250,6.0,1.1,1.600000,6.000000,72,1.1,0.0,0.0,1.0,0.0,...,0.0,401,510,9394,17903,0,0,0,0,0
49251,1.0,133.0,4.950000,2.566667,58,___,0.0,0.0,0.0,1.0,...,0.0,59,510,10605,17903,0,0,0,0,0
49252,0.0,21.0,23.433333,9.183333,79,21,0.0,0.0,1.0,0.0,...,0.0,49,351,9394,17903,0,0,0,0,0


In [13]:

# Frequency-encode `value_labevent`. Values are cast to str first so that
# every entry is counted and looked up as a string.
lab_values = X_encoded['value_labevent'].astype(str)

freq = lab_values.value_counts()
X_encoded['value_labevent_freq'] = lab_values.map(freq)

# The raw string column is no longer needed once its frequency is stored.
X_encoded = X_encoded.drop(columns=['value_labevent'])

In [14]:
# Final feature matrix; `X` is another name for the same object as `X_encoded` (no copy is made).
X = X_encoded

# Model

In [16]:
# Hold out 20% of the rows for testing; random_state=42 fixes the shuffle so the
# split is reproducible. No `stratify` argument is passed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

In [17]:
# Remember the column labels so the scaled data can be wrapped back into DataFrames later.
cols = X_train.columns

In [18]:
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler learns its statistics from the
# training rows only and then applies them to both splits.
scaler = StandardScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [19]:
# Wrap the scaled training data back into a DataFrame with the original column
# labels. `cols` is passed directly: wrapping it in a list (`[cols]`) would make
# pandas build MultiIndex columns instead of the flat labels.
X_train = pd.DataFrame(X_train, columns=cols)

In [20]:
# Wrap the scaled test data back into a DataFrame with the original column
# labels. `cols` is passed directly: wrapping it in a list (`[cols]`) would make
# pandas build MultiIndex columns instead of the flat labels.
X_test = pd.DataFrame(X_test, columns=cols)

In [21]:
# Summary statistics of the scaled training features (sanity check on the scaling).
X_train.describe()

Unnamed: 0,valuenum_chartevent,valuenum_labevent,time_since_admission_chartevent,time_since_admission_labevent,age,admission_type_DIRECT EMER.,admission_type_ELECTIVE,admission_type_EW EMER.,admission_type_OBSERVATION ADMIT,admission_type_SURGICAL SAME DAY ADMISSION,...,value_chartevent_freq,label_chartevent_freq,valueuom_labevent_freq,valueuom_chartevent_freq,valuenum_chartevent_missing,valuenum_labevent_missing,time_since_admission_chartevent_missing,time_since_admission_labevent_missing,age_missing,value_labevent_freq
count,39403.0,39403.0,39403.0,39403.0,39403.0,39403.0,39403.0,39403.0,39403.0,39403.0,...,39403.0,39403.0,39403.0,39403.0,39403.0,39403.0,39403.0,39403.0,39403.0,39403.0
mean,-8.655699e-18,6.852428e-18,-1.0819620000000002e-17,2.055729e-16,-3.9446550000000004e-17,-6.52784e-17,4.5081770000000005e-17,7.618819e-17,-3.426214e-18,1.298355e-17,...,-4.4901440000000006e-17,-2.4524480000000003e-17,-7.717998000000001e-17,2.9573640000000003e-17,0.0,0.0,0.0,0.0,0.0,-1.956549e-17
std,1.000013,1.000013,1.000013,1.000013,1.000013,1.000013,1.000013,1.000013,1.000013,1.000013,...,1.000013,1.000013,1.000013,1.000013,0.0,0.0,0.0,0.0,0.0,1.000013
min,-0.1266021,-0.09500497,-1.945728,-3.673713,-2.348718,-0.2001029,-0.2074658,-1.04128,-0.4525082,-0.3754003,...,-0.6017929,-0.896389,-1.498511,-1.386515,0.0,0.0,0.0,0.0,0.0,-0.4347421
25%,-0.09076861,-0.07629107,-0.8727323,-0.6528758,-0.4681807,-0.2001029,-0.2074658,-1.04128,-0.4525082,-0.3754003,...,-0.5168804,-0.6797891,-0.9912797,-0.9517002,0.0,0.0,0.0,0.0,0.0,-0.4034098
50%,-0.06736129,-0.0678177,-0.0336253,-0.006595903,0.01936605,-0.2001029,-0.2074658,0.9603568,-0.4525082,-0.3754003,...,-0.4193571,-0.539777,-0.05744726,0.09087802,0.0,0.0,0.0,0.0,0.0,-0.3502178
75%,0.026557,-0.04327364,0.8320051,0.7710248,0.6462119,-0.2001029,-0.2074658,0.9603568,-0.4525082,-0.3754003,...,-0.2360806,0.3062788,0.9762504,1.185025,0.0,0.0,0.0,0.0,0.0,-0.2511204
max,90.43808,47.23879,1.786851,1.865531,1.969553,4.997428,4.820071,0.9603568,2.209905,2.663823,...,2.670282,2.072585,1.295345,1.185025,0.0,0.0,0.0,0.0,0.0,3.034397


In [22]:
# Baseline model: SVC with scikit-learn's default hyperparameters (no arguments passed).
clf = svm.SVC()

In [23]:
# Train the baseline SVC on the scaled training split.
clf.fit(X_train, y_train)

In [24]:
# Predict class ids for the held-out test split with the baseline model.
y_pred = clf.predict(X_test)

In [25]:
from sklearn import metrics
from sklearn.metrics import classification_report

# Score the predictions currently held in `y_pred` against the test labels.
# Precision and recall are support-weighted averages over the classes.
accuracy = metrics.accuracy_score(y_test, y_pred)
weighted_precision = metrics.precision_score(y_test, y_pred, average='weighted')
weighted_recall = metrics.recall_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", weighted_precision)
print("Recall:", weighted_recall)

# Per-class breakdown, labelled with the original ICD code strings.
report = classification_report(y_test, y_pred, target_names=le.classes_)
print(report)

Accuracy: 0.6186173992488072
Precision: 0.6369445435286155
Recall: 0.6186173992488072
              precision    recall  f1-score   support

        2724       0.67      0.76      0.71      1678
        4019       0.57      0.59      0.58      1582
        E039       0.64      0.48      0.55      1595
        E785       0.72      0.15      0.25      1735
        Z794       0.71      0.99      0.83      1618
      Z87891       0.50      0.77      0.61      1643

    accuracy                           0.62      9851
   macro avg       0.64      0.62      0.59      9851
weighted avg       0.64      0.62      0.58      9851



## With Linear Kernel

### C = 1.0

In [None]:
# Linear-kernel SVC with regularisation parameter C=1.0.
clf1 = svm.SVC(kernel='linear', C=1.0)

In [None]:
# Train the C=1.0 linear model on the scaled training split.
clf1.fit(X_train, y_train)

In [None]:
# Overwrite `y_pred` with the C=1.0 linear model's predictions for the test split.
y_pred = clf1.predict(X_test)

In [60]:
# Accuracy of the predictions currently held in `y_pred`.
linear_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", linear_accuracy)

Accuracy: 0.5889757385037052


In [61]:
from sklearn import metrics
from sklearn.metrics import classification_report


# Support-weighted precision/recall and the per-class report for the
# predictions currently held in `y_pred`.
weighted_precision = metrics.precision_score(y_test, y_pred, average='weighted')
weighted_recall = metrics.recall_score(y_test, y_pred, average='weighted')

print("Precision:", weighted_precision)
print("Recall:", weighted_recall)

report = classification_report(y_test, y_pred, target_names=le.classes_)
print(report)

Precision: 0.5841213677957854
Recall: 0.5889757385037052
              precision    recall  f1-score   support

        2724       0.65      0.68      0.66      1678
        4019       0.46      0.56      0.51      1582
        E039       0.69      0.43      0.53      1595
        E785       0.48      0.24      0.32      1735
        Z794       0.71      0.99      0.83      1618
      Z87891       0.52      0.65      0.58      1643

    accuracy                           0.59      9851
   macro avg       0.58      0.59      0.57      9851
weighted avg       0.58      0.59      0.57      9851



### C = 100.0

In [62]:
# Linear-kernel SVC with regularisation parameter C=100.0.
clf100 = svm.SVC(kernel='linear', C=100.0)

In [None]:
# Train the C=100.0 linear model on the scaled training split.
clf100.fit(X_train, y_train)

In [None]:
# Predict with the C=100 model before scoring. Without this step `y_pred`
# still holds the predictions of the previously evaluated model, and the
# metrics in this section would not describe `clf100` at all.
y_pred = clf100.predict(X_test)

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

In [None]:
from sklearn import metrics
from sklearn.metrics import classification_report


# Support-weighted precision/recall and the per-class report.
# NOTE(review): this cell scores whatever `y_pred` currently holds - make sure
# it was produced by the model this section is about.
weighted_precision = metrics.precision_score(y_test, y_pred, average='weighted')
weighted_recall = metrics.recall_score(y_test, y_pred, average='weighted')

print("Precision:", weighted_precision)
print("Recall:", weighted_recall)

report = classification_report(y_test, y_pred, target_names=le.classes_)
print(report)

### C = 1000.0

In [None]:
# Linear-kernel SVC with regularisation parameter C=1000.0, matching this
# section's heading (the value was previously 100.0, which merely repeated the
# preceding experiment).
clf1000 = svm.SVC(kernel='linear', C=1000.0)

In [None]:
# Train the `clf1000` linear model on the scaled training split.
clf1000.fit(X_train, y_train)

In [None]:
from sklearn import metrics
from sklearn.metrics import classification_report

# Predict with `clf1000` before scoring. Without this step `y_pred` still
# holds the predictions of a previously evaluated model, and the metrics
# below would not describe `clf1000` at all.
y_pred = clf1000.predict(X_test)

# Overall accuracy plus support-weighted precision and recall.
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Precision:", metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall:", metrics.recall_score(y_test, y_pred, average='weighted'))

# Per-class breakdown, labelled with the original ICD code strings.
print(classification_report(y_test, y_pred, target_names=le.classes_))