In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import os


from tsfresh import extract_features
from tsfresh.feature_extraction import ComprehensiveFCParameters, EfficientFCParameters, MinimalFCParameters
from sklearn.impute import SimpleImputer


In [None]:
def extract_tsfresh_features(df):
    """Run tsfresh feature extraction with the minimal parameter set.

    The ``In-hospital_death`` and ``ICUType`` columns are removed first (when
    present) so that no features are computed from them. Rows are grouped by
    ``RecordID`` and ordered by ``Time``.

    Args:
        df: DataFrame containing at least ``RecordID`` and ``Time`` columns.
            The caller's frame is not modified.

    Returns:
        The DataFrame produced by ``extract_features`` with +inf / -inf
        entries replaced by NaN.
    """
    # Only drop the non-feature columns that are actually present
    excluded = [name for name in ('In-hospital_death', 'ICUType') if name in df.columns]
    series_frame = df.drop(columns=excluded)

    # Parallelize extraction across the CPU count reported by the OS
    raw_features = extract_features(
        series_frame,
        column_id='RecordID',
        column_sort='Time',
        default_fc_parameters=MinimalFCParameters(),
        n_jobs=os.cpu_count(),
    )

    # Infinite values are treated as missing
    return raw_features.replace([np.inf, -np.inf], np.nan)

In [None]:
# Load the three processed patient sets (a, b, c) from the 'loaded_data' folder
set_a, set_b, set_c = (
    pd.read_parquet(os.path.join('loaded_data', f'{set_letter}_patient_data_processed_2.parquet'))
    for set_letter in ('a', 'b', 'c')
)

# Set a -> training, set b -> validation, set c -> testing
training_x, validation_x, testing_x = (
    extract_tsfresh_features(patient_frame) for patient_frame in (set_a, set_b, set_c)
)
    

In [None]:
# Cache the extracted features so the extraction does not need to be repeated
for feature_frame, parquet_name in (
    (training_x, 'training_X_min_features.parquet'),
    (validation_x, 'validation_X_min_features.parquet'),
    (testing_x, 'testing_X_min_features.parquet'),
):
    feature_frame.to_parquet(parquet_name)

In [15]:
# Reload the cached feature tables written by the extraction step.
training_x = pd.read_parquet('training_X_min_features.parquet')
# Discard any feature derived from ICUType
training_x = training_x.loc[:, ~training_x.columns.str.contains('ICUType')]
# Align validation/test to exactly the training columns (same set, same order)
validation_x = pd.read_parquet('validation_X_min_features.parquet')[training_x.columns]
# The extraction step saves the test split as 'testing_X_min_features.parquet';
# read that same file so the notebook runs end to end on a fresh kernel.
test_x = pd.read_parquet('testing_X_min_features.parquet')[training_x.columns]

In [17]:
# Features that are NaN for every training row
all_missing_mask = training_x.isna().all(axis=0)
empty_cols = training_x.columns[all_missing_mask]
empty_cols

Index([], dtype='object')

In [18]:


# Remove the all-NaN training columns from every split so the schemas stay aligned
training_x, validation_x, test_x = (
    split_frame.drop(columns=empty_cols)
    for split_frame in (training_x, validation_x, test_x)
)

In [19]:
# Scaler instance shared by all three splits; it is fitted on the training split in the next cell
standardscaler = StandardScaler()

In [20]:
# Fit the scaling statistics on the training split only, then apply the same
# fitted transform to all three splits.
standardscaler.fit(training_x)
training_x_scaled, validation_x_scaled, test_x_scaled = (
    standardscaler.transform(split_frame)
    for split_frame in (training_x, validation_x, test_x)
)

In [21]:
# Wrap the standardized values back into DataFrames. The scaler output is built
# from the *_scaled results (not the unscaled frames), and the column labels and
# row index are restored from the corresponding original frame.
feature_columns = training_x.columns
training_x = pd.DataFrame(training_x_scaled, columns=feature_columns, index=training_x.index)
validation_x = pd.DataFrame(validation_x_scaled, columns=feature_columns, index=validation_x.index)
test_x = pd.DataFrame(test_x_scaled, columns=feature_columns, index=test_x.index)

In [None]:
# Median imputation: the medians are learned from the training split and then
# reused to fill missing values in all three splits.
imputer = SimpleImputer(strategy='median')
imputer.fit(training_x)
training_x, validation_x, test_x = (
    pd.DataFrame(imputer.transform(split_frame), columns=split_frame.columns, index=split_frame.index)
    for split_frame in (training_x, validation_x, test_x)
)

In [23]:
# Preview the first rows of the training features after the preceding cells
training_x.head()

Unnamed: 0_level_0,ALP__sum_values,ALP__median,ALP__mean,ALP__length,ALP__standard_deviation,ALP__variance,ALP__root_mean_square,ALP__maximum,ALP__absolute_maximum,ALP__minimum,...,pH__sum_values,pH__median,pH__mean,pH__length,pH__standard_deviation,pH__variance,pH__root_mean_square,pH__maximum,pH__absolute_maximum,pH__minimum
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
132540.0,-0.129376,-0.12141,-0.123435,0.577194,-0.309694,-0.084885,-0.249912,-0.207938,-0.272177,0.194261,...,-0.044405,-0.016974,-0.042276,0.577194,-0.03767,-0.031979,-0.04203,-0.037295,-0.038945,-0.006154
132543.0,0.476383,0.406187,0.448982,0.577194,-0.309694,-0.084885,0.080448,0.269782,0.010551,1.610969,...,-0.045172,-0.022248,-0.043032,0.577194,-0.040763,-0.031979,-0.041932,-0.039907,-0.041122,0.020078
132547.0,0.147899,0.333415,0.158544,-0.986212,0.421193,-0.043508,-0.079101,0.203889,-0.057525,0.194261,...,-0.034864,-0.016974,-0.037774,-0.986212,-0.034395,-0.031978,-0.043119,-0.03686,-0.036767,-0.032385
132548.0,-0.115899,-0.12141,-0.123435,-0.725644,-0.309694,-0.084885,-0.249912,-0.207938,-0.272177,0.194261,...,-0.040466,-0.022248,-0.043032,-0.725644,-0.040763,-0.031979,-0.041932,-0.039907,-0.041122,0.020078
132551.0,-0.691884,-0.649007,-0.683408,0.056059,-0.059853,-0.08005,0.319374,-0.207938,0.221371,-1.222447,...,-0.075087,-0.085543,-0.075785,0.056059,-0.031066,-0.031976,-0.027132,-0.039472,-0.028928,-0.126818


In [25]:
# Persist the cleaned feature matrices. The output folder is created first so a
# fresh checkout does not fail on a missing directory.
output_dir = 'extracted_features'
os.makedirs(output_dir, exist_ok=True)
training_x.to_parquet(os.path.join(output_dir, 'training_X_clean_min.parquet'))
validation_x.to_parquet(os.path.join(output_dir, 'validation_X_clean_min.parquet'))
test_x.to_parquet(os.path.join(output_dir, 'test_X_clean_min.parquet'))

In [24]:
# List every feature column name, one per line
for feature_name in training_x.columns:
    print(feature_name)

ALP__sum_values
ALP__median
ALP__mean
ALP__length
ALP__standard_deviation
ALP__variance
ALP__root_mean_square
ALP__maximum
ALP__absolute_maximum
ALP__minimum
ALT__sum_values
ALT__median
ALT__mean
ALT__length
ALT__standard_deviation
ALT__variance
ALT__root_mean_square
ALT__maximum
ALT__absolute_maximum
ALT__minimum
AST__sum_values
AST__median
AST__mean
AST__length
AST__standard_deviation
AST__variance
AST__root_mean_square
AST__maximum
AST__absolute_maximum
AST__minimum
Age__sum_values
Age__median
Age__mean
Age__length
Age__standard_deviation
Age__variance
Age__root_mean_square
Age__maximum
Age__absolute_maximum
Age__minimum
Albumin__sum_values
Albumin__median
Albumin__mean
Albumin__length
Albumin__standard_deviation
Albumin__variance
Albumin__root_mean_square
Albumin__maximum
Albumin__absolute_maximum
Albumin__minimum
BUN__sum_values
BUN__median
BUN__mean
BUN__length
BUN__standard_deviation
BUN__variance
BUN__root_mean_square
BUN__maximum
BUN__absolute_maximum
BUN__minimum
Bilirubin__s