In [1]:
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler
import numpy as np

# Step 1: Load the datasets
X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv').squeeze()
X_test = pd.read_csv('X_test.csv')
y_test = pd.read_csv('y_test.csv').squeeze()

In [2]:
original_feature_names = X_train.columns
rf = LGBMClassifier(random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

n = 400  
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1][:n]  


# Step 3: Map the top 10 features to their names
top_10_feature_names = original_feature_names[indices[:10]]

# Print top 10 feature names
print("Top 10 features selected by LightGBM:")
for i, feature in enumerate(top_10_feature_names, start=1):
    print(f"{i}. {feature}")


X_train_top_n = X_train.iloc[:, indices]
X_test_top_n = X_test.iloc[:, indices]

print(f"Top {n} features selected by Random Forest.")

[LightGBM] [Info] Number of positive: 19677, number of negative: 71432
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.025378 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 945310
[LightGBM] [Info] Number of data points in the train set: 91109, number of used features: 4263
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.215972 -> initscore=-1.289296
[LightGBM] [Info] Start training from score -1.289296
Top 10 features selected by LightGBM:
1. glucose_d1_min_icu_id_mean
2. glucose_d1_value_range
3. age_counts
4. age
5. arf_apache
6. d1_glucose_max_indicator
7. glucose_apache_d1_max_ratio
8. glucose_d1_min_icu_id_std
9. glucose_d1_max_icu_id_norm_rank
10. glucose_d1_h1_std
Top 400 features selected by Random Forest.


In [3]:
y_pred = rf.predict(X_test)
y_pred_proba = rf.predict_proba(X_test)[:, 1] 

print("Classification Report:")
print(classification_report(y_test, y_pred))

auc_score = roc_auc_score(y_test, y_pred_proba)
print("AUC-ROC Score:", auc_score)

Classification Report:
              precision    recall  f1-score   support

         0.0       0.87      0.94      0.90     30574
         1.0       0.69      0.49      0.58      8474

    accuracy                           0.84     39048
   macro avg       0.78      0.72      0.74     39048
weighted avg       0.83      0.84      0.83     39048

AUC-ROC Score: 0.8713104814670278


In [None]:
rf.fit(X_train_top_n, y_train)

In [None]:
y_pred = rf.predict(X_test_top_n)
y_pred_proba = rf.predict_proba(X_test_top_n)[:, 1] 

print("Classification Report:")
print(classification_report(y_test, y_pred))

auc_score = roc_auc_score(y_test, y_pred_proba)
print("AUC-ROC Score:", auc_score)

In [6]:
def get_apache_meta_features(df):

    df['d1_glucose_min_flag'] = np.where(df['d1_glucose_min'] > (120*.0555), 1, 0)

    df['SerumPotassium_apache'] = (df['d1_potassium_min'].values + df['d1_potassium_max'].values)/2
    df['SerumBicarb_apache'] = (df['d1_hco3_min'].values + df['d1_hco3_max'].values)/2

    df['temperature'] = 0
    df.loc[(df['temp_apache']< 36) | (df['temp_apache'] > 38.5), "temperature"] = 4

    df['arterial_pressure'] = 0
    df.loc[df['map_apache'] < 50, 'arterial_pressure'] = 4
    df.loc[(df['map_apache'] >= 50) &
           (df['map_apache'] < 70), 'arterial_pressure'] = 2
    df.loc[(df['map_apache'] >= 70) &
           (df['map_apache'] < 110), 'arterial_pressure'] = 0
    df.loc[(df['map_apache'] >= 110) &
           (df['map_apache'] < 130), 'arterial_pressure'] = 2
    df.loc[(df['map_apache'] >= 130) &
           (df['map_apache'] < 160), 'arterial_pressure'] = 3
    df.loc[df['map_apache'] >= 160, 'arterial_pressure'] = 4

    df['heart_rate_pulse'] = 0
    df.loc[df['heart_rate_apache'] < 40, 'heart_rate_pulse'] = 4
    df.loc[(df['heart_rate_apache'] >= 40) &
           (df['heart_rate_apache'] < 55), 'heart_rate_pulse'] = 3
    df.loc[(df['heart_rate_apache'] >= 55) &
           (df['heart_rate_apache'] < 70), 'heart_rate_pulse'] = 2
    df.loc[(df['heart_rate_apache'] >= 70) &
           (df['heart_rate_apache'] < 110), 'heart_rate_pulse'] = 0
    df.loc[(df['heart_rate_apache'] >= 110) &
           (df['heart_rate_apache'] < 140), 'heart_rate_pulse'] = 2
    df.loc[(df['heart_rate_apache'] >= 140) &
           (df['heart_rate_apache'] < 180), 'heart_rate_pulse'] = 3
    df.loc[df['heart_rate_apache'] >= 180, 'heart_rate_pulse'] = 4

    df['respiration_rate'] = 0
    df.loc[df['resprate_apache'] < 6, 'respiration_rate'] = 4
    df.loc[(df['resprate_apache'] >= 6) &
           (df['resprate_apache'] < 10), 'respiration_rate'] = 3
    df.loc[(df['resprate_apache'] >= 10) &
           (df['resprate_apache'] < 12), 'respiration_rate'] = 2
    df.loc[(df['resprate_apache'] >= 12) &
           (df['resprate_apache'] < 25), 'respiration_rate'] = 0
    df.loc[(df['resprate_apache'] >= 25) &
           (df['resprate_apache'] < 35), 'respiration_rate'] = 2
    df.loc[(df['resprate_apache'] >= 35) &
           (df['resprate_apache'] < 50), 'respiration_rate'] = 3
    df.loc[df['resprate_apache'] >= 50, 'respiration_rate'] = 4

    df['oxygenation_rate'] = 0
    df['pao2_apache'] = df['pao2_apache'].fillna(0)

    df.loc[df['pao2_apache'] < 200, 'oxygenation_rate'] = 0
    df.loc[(df['pao2_apache'] >= 200) &
           (df['pao2_apache'] < 350), 'oxygenation_rate'] = 2
    df.loc[(df['pao2_apache'] >= 350) &
           (df['pao2_apache'] < 500), 'oxygenation_rate'] = 3
    df.loc[df['pao2_apache'] >= 500, 'oxygenation_rate'] = 4

    df.drop(columns=['pao2_apache'], inplace=True)

    df['serum_bicarb'] = 0
    df.loc[df['SerumBicarb_apache'] < 15, 'serum_bicarb'] = 4
    df.loc[(df['SerumBicarb_apache'] >= 14) &
           (df['SerumBicarb_apache'] < 18), 'serum_bicarb'] = 3
    df.loc[(df['SerumBicarb_apache'] >= 18) &
           (df['SerumBicarb_apache'] < 22), 'serum_bicarb'] = 2
    df.loc[(df['SerumBicarb_apache'] >= 22) &
           (df['SerumBicarb_apache'] < 32), 'serum_bicarb'] = 0
    df.loc[(df['SerumBicarb_apache'] >= 32) &
           (df['SerumBicarb_apache'] < 41), 'serum_bicarb'] = 2
    df.loc[(df['SerumBicarb_apache'] >= 41) &
           (df['SerumBicarb_apache'] < 52), 'serum_bicarb'] = 3
    df.loc[df['SerumBicarb_apache'] >= 52, 'serum_bicarb'] = 4

    df['arterial_ph'] = 0

    df.loc[df['ph_apache'] < 7.15, 'arterial_ph'] = 4
    df.loc[(df['ph_apache'] >= 7.15) &
           (df['ph_apache'] < 7.25), 'arterial_ph'] = 3
    df.loc[(df['ph_apache'] >= 7.25) &
           (df['ph_apache'] < 7.33), 'arterial_ph'] = 2
    df.loc[(df['ph_apache'] >= 7.33) &
           (df['ph_apache'] < 7.5), 'arterial_ph'] = 0
    df.loc[(df['ph_apache'] >= 7.5) &
           (df['ph_apache'] < 7.6), 'arterial_ph'] = 1
    df.loc[(df['ph_apache'] >= 7.6) &
           (df['ph_apache'] < 7.7), 'arterial_ph'] = 3
    df.loc[df['ph_apache'] >= 7.7, 'arterial_ph'] = 4

    df.drop(columns=['ph_apache'], inplace=True)

    df['serum_sodium'] = 0
    df.loc[df['sodium_apache'] < 111, 'serum_sodium'] = 4
    df.loc[(df['sodium_apache'] >= 111) &
           (df['sodium_apache'] < 120), 'serum_sodium'] = 3
    df.loc[(df['sodium_apache'] >= 120) &
           (df['sodium_apache'] < 130), 'serum_sodium'] = 2
    df.loc[(df['sodium_apache'] >= 130) &
           (df['sodium_apache'] < 150), 'serum_sodium'] = 0
    df.loc[(df['sodium_apache'] >= 150) &
           (df['sodium_apache'] < 155), 'serum_sodium'] = 1
    df.loc[(df['sodium_apache'] >= 155) &
           (df['sodium_apache'] < 160), 'serum_sodium'] = 2
    df.loc[(df['sodium_apache'] >= 160) &
           (df['sodium_apache'] < 180), 'serum_sodium'] = 3
    df.loc[df['sodium_apache'] >= 180, 'serum_sodium'] = 4

    df['serum_potassium'] = 0
    df.loc[df['SerumPotassium_apache'] < 2.5, 'serum_potassium'] = 4
    df.loc[(df['SerumPotassium_apache'] >= 2.5) &
           (df['SerumPotassium_apache'] < 3), 'serum_potassium'] = 3
    df.loc[(df['SerumPotassium_apache'] >= 3) &
           (df['SerumPotassium_apache'] < 3.5), 'serum_potassium'] = 2
    df.loc[(df['SerumPotassium_apache'] >= 3.5) &
           (df['SerumPotassium_apache'] < 5.5), 'serum_potassium'] = 0
    df.loc[(df['SerumPotassium_apache'] >= 5.5) &
           (df['SerumPotassium_apache'] < 6), 'serum_potassium'] = 2
    df.loc[(df['SerumPotassium_apache'] >= 6) &
           (df['SerumPotassium_apache'] < 7), 'serum_potassium'] = 3
    df.loc[df['SerumPotassium_apache'] >= 7, 'serum_potassium'] = 4

    df['creatinine'] = 0
    df.loc[df['creatinine_apache'] < .62, 'creatinine'] = 4
    df.loc[(df['creatinine_apache'] >= .62) &
           (df['creatinine_apache'] < 1.47), 'creatinine'] = 0
    df.loc[(df['creatinine_apache'] >= 1.47) &
           (df['creatinine_apache'] < 1.98), 'creatinine'] = 4
    df.loc[(df['creatinine_apache'] >= 1.98) &
           (df['creatinine_apache'] < 3.39), 'creatinine'] = 2
    df.loc[df['creatinine_apache'] >= 3.39, 'creatinine'] = 8

    df['acute_renal_failure'] = 0
    df.loc[df['arf_apache'] == 1, 'acute_renal_failure'] = 4

    df['hematocrits'] = 0
    df.loc[df['creatinine_apache'] < 20, 'hematocrits'] = 4
    df.loc[(df['creatinine_apache'] >= 20) &
           (df['creatinine_apache'] < 30), 'hematocrits'] = 2
    df.loc[(df['creatinine_apache'] >= 30) &
           (df['creatinine_apache'] < 46), 'hematocrits'] = 0
    df.loc[(df['creatinine_apache'] >= 46) &
           (df['creatinine_apache'] < 50), 'hematocrits'] = 1
    df.loc[(df['creatinine_apache'] >= 50) &
           (df['creatinine_apache'] < 60), 'hematocrits'] = 2
    df.loc[df['creatinine_apache'] >= 60, 'hematocrits'] = 4

    df['white_blood_cells'] = 0
    df.loc[df['wbc_apache'] < 1, 'white_blood_cells'] = 4
    df.loc[(df['wbc_apache'] >= 1) &
           (df['wbc_apache'] < 3), 'white_blood_cells'] = 2
    df.loc[(df['wbc_apache'] >= 3) &
           (df['wbc_apache'] < 15), 'white_blood_cells'] = 0
    df.loc[(df['wbc_apache'] >= 15) &
           (df['wbc_apache'] < 20), 'white_blood_cells'] = 1
    df.loc[(df['wbc_apache'] >= 20) &
           (df['wbc_apache'] < 40), 'white_blood_cells'] = 2
    df.loc[df['wbc_apache'] >= 40, 'white_blood_cells'] = 4

    df['glasgow_comma_score_gcs'] = 0
    df.loc[(
                   df['gcs_eyes_apache'] + df['gcs_motor_apache'] +
                   df['gcs_verbal_apache']) <=3,
           'glasgow_comma_score_gcs'] = 12

    df.loc[(df['gcs_eyes_apache'] +
            df['gcs_motor_apache'] +
            df['gcs_verbal_apache']) > 3,
           'glasgow_comma_score_gcs'] = 15 - df.loc[
        (df['gcs_eyes_apache'] + df['gcs_motor_apache'] +
         df['gcs_verbal_apache']) > 3, 'gcs_eyes_apache'] + \
                                        df.loc[(df['gcs_eyes_apache'] + df['gcs_motor_apache'] +
                                                df['gcs_verbal_apache']) > 3, 'gcs_motor_apache'] + \
                                        df.loc[(df['gcs_eyes_apache'] + df['gcs_motor_apache'] +
                                                df['gcs_verbal_apache']) > 3, 'gcs_verbal_apache']

    df['age_death_prob'] = 0
    df.loc[df['wbc_apache'] < 45, 'age_death_prob'] = 0
    df.loc[(df['wbc_apache'] >= 45) &
           (df['wbc_apache'] < 55), 'age_death_prob'] = 2
    df.loc[(df['wbc_apache'] >= 55) &
           (df['wbc_apache'] < 65), 'age_death_prob'] = 3
    df.loc[(df['wbc_apache'] >= 65) &
           (df['wbc_apache'] < 75), 'age_death_prob'] = 5
    df.loc[df['wbc_apache'] >= 75, 'age_death_prob'] = 6

    df['inmunodeficiencia'] = 0
    df.loc[(df['aids'] +
            df['cirrhosis'] +
            df['hepatic_failure'] +
            df['immunosuppression'] +
            df['leukemia'] +
            df['lymphoma'] +
            df['solid_tumor_with_metastasis']) >= 1, 'inmunodeficiencia'] = 5

    df['post_operative'] = 0
    df.loc[df['apache_post_operative'] == 1, "post_operative"] = 3

    df['danger_level'] = df['temperature'] + \
                         df['arterial_pressure'] + df['heart_rate_pulse'] + \
                         df['respiration_rate'] + df['oxygenation_rate'] + \
                         df['serum_bicarb'] + df['arterial_ph'] + \
                         df['serum_sodium'] + df['serum_potassium'] + \
                         df['creatinine'] + df['acute_renal_failure'] + \
                         df['hematocrits'] + df['white_blood_cells'] + \
                         df['glasgow_comma_score_gcs'] + df['age_death_prob'] + \
                         df['inmunodeficiencia'] + df['post_operative']

    df.drop(columns=['temperature', 'arterial_pressure',
                     'heart_rate_pulse', 'respiration_rate',
                     'oxygenation_rate', 'serum_bicarb',
                     'arterial_ph', 'serum_sodium', 'serum_potassium',
                     'creatinine', 'acute_renal_failure', 'hematocrits',
                     'white_blood_cells', 'glasgow_comma_score_gcs',
                     'age_death_prob', 'inmunodeficiencia', 'post_operative'
                     ], inplace=True)

    return df['danger_level']

In [7]:
X_train['danger_level'] = get_apache_meta_features(X_train)

In [8]:
X_test['danger_level'] = get_apache_meta_features(X_test)

In [10]:
original_feature_names = X_train.columns
rf = LGBMClassifier(random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

n = 400  
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1][:n]  


# Step 3: Map the top 10 features to their names
top_10_feature_names = original_feature_names[indices[:10]]

# Print top 10 feature names
print("Top 10 features selected by LightGBM:")
for i, feature in enumerate(top_10_feature_names, start=1):
    print(f"{i}. {feature}")


X_train_top_n = X_train.iloc[:, indices]
X_test_top_n = X_test.iloc[:, indices]

print(f"Top {n} features selected by Random Forest.")

[LightGBM] [Info] Number of positive: 19677, number of negative: 71432
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.018757 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 945319
[LightGBM] [Info] Number of data points in the train set: 91109, number of used features: 4264
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.215972 -> initscore=-1.289296
[LightGBM] [Info] Start training from score -1.289296
Top 10 features selected by LightGBM:
1. glucose_d1_min_icu_id_mean
2. glucose_d1_value_range
3. age_counts
4. age
5. arf_apache
6. d1_glucose_max_indicator
7. glucose_apache_d1_max_ratio
8. glucose_d1_min_icu_id_std
9. glucose_d1_h1_std
10. glucose_d1_max_icu_id_norm_rank
Top 400 features selected by Random Forest.


In [11]:
y_pred = rf.predict(X_test)
y_pred_proba = rf.predict_proba(X_test)[:, 1] 

print("Classification Report:")
print(classification_report(y_test, y_pred))

auc_score = roc_auc_score(y_test, y_pred_proba)
print("AUC-ROC Score:", auc_score)

Classification Report:
              precision    recall  f1-score   support

         0.0       0.87      0.94      0.90     30574
         1.0       0.69      0.49      0.58      8474

    accuracy                           0.84     39048
   macro avg       0.78      0.72      0.74     39048
weighted avg       0.83      0.84      0.83     39048

AUC-ROC Score: 0.8713104814670278


In [12]:
rf.fit(X_train_top_n, y_train)

[LightGBM] [Info] Number of positive: 19677, number of negative: 71432
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.055218 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 96742
[LightGBM] [Info] Number of data points in the train set: 91109, number of used features: 400
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.215972 -> initscore=-1.289296
[LightGBM] [Info] Start training from score -1.289296


In [14]:
y_pred = rf.predict(X_test_top_n)
y_pred_proba = rf.predict_proba(X_test_top_n)[:, 1] 

print("Classification Report:")
print(classification_report(y_test, y_pred))

auc_score = roc_auc_score(y_test, y_pred_proba)
print("AUC-ROC Score:", auc_score)

Classification Report:
              precision    recall  f1-score   support

         0.0       0.87      0.94      0.90     30574
         1.0       0.69      0.50      0.58      8474

    accuracy                           0.84     39048
   macro avg       0.78      0.72      0.74     39048
weighted avg       0.83      0.84      0.83     39048

AUC-ROC Score: 0.8714201485698412


: 