In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import os


from tsfresh import extract_features
from tsfresh.feature_extraction import ComprehensiveFCParameters, EfficientFCParameters, MinimalFCParameters
from sklearn.impute import SimpleImputer


In [16]:
def extract_tsfresh_features(df):

    # Get the full row of data for the last timestamp for each RecordID
    last_data = df.sort_values("Time").groupby("RecordID").last()
    last_data = last_data.drop(columns=["Time"])
    print("last data index", last_data.index)
    # Rename columns (except RecordID) to indicate these are from the last time stamp
    last_data = last_data.rename(columns=lambda x: f"last_{x}" if x != "RecordID" else x)

   

    # Define the settings for feature extraction
    extraction_settings = {
        'abs_energy': None,
        'absolute_sum_of_changes': None,
        'agg_linear_trend': [{'attr': 'slope', 'chunk_len': 5, 'f_agg': 'mean'}, {'attr': 'intercept', 'chunk_len': 5, 'f_agg': 'mean'}],
        'approximate_entropy': [{'m': 2, 'r': 0.2}],
        'kurtosis': None,
        'skewness': None,
        'agg_autocorrelation': [{'f_agg': 'mean', 'maxlag': 10}],
        'fft_aggregated': [{'aggtype': 'centroid'}, {'aggtype': 'variance'}, {'aggtype': 'skew'}, {'aggtype': 'kurtosis'}],
        'fourier_entropy': [{'bins': 10}],
        'longest_strike_above_mean': None,
        'time_reversal_asymmetry_statistic': [{'lag': 1}],
        'lempel_ziv_complexity': [{'bins': 10}],
        'mean': None,
        'minimum': None,
        'maximum': None,
        'variance': None,
    }
# 
    if 'In-hospital_death' in df.columns:
        df = df.drop(columns=['In-hospital_death'])
    if 'ICUType' in df.columns:
        df = df.drop(columns=['ICUType'])
    
    # Extract features using tsfresh with parallelization
    features = extract_features(df, column_id='RecordID', column_sort='Time', default_fc_parameters=extraction_settings, n_jobs=os.cpu_count())
    print("features index", features.index)
    # Replace infinite values with NaNs
    features.replace([np.inf, -np.inf], np.nan, inplace=True)

     # Merge the tsfresh features with the last time stamp data
    final_features = features.merge(last_data, how="left", left_index=True, right_index=True)

    return final_features

In [17]:
set_a = pd.read_parquet(os.path.join('loaded_data', 'a_patient_data_processed_cluster.parquet'))
training_x = extract_tsfresh_features(set_a)

set_b = pd.read_parquet(os.path.join('loaded_data', 'b_patient_data_processed_cluster.parquet'))
validation_x = extract_tsfresh_features(set_b)

set_c = pd.read_parquet(os.path.join('loaded_data', 'c_patient_data_processed_cluster.parquet'))
test_x = extract_tsfresh_features(set_c)
    

last data index Index([132539.0, 132540.0, 132541.0, 132543.0, 132545.0, 132547.0, 132548.0,
       132551.0, 132554.0, 132555.0,
       ...
       142655.0, 142659.0, 142661.0, 142662.0, 142664.0, 142665.0, 142667.0,
       142670.0, 142671.0, 142673.0],
      dtype='float64', name='RecordID', length=3997)


Feature Extraction: 100%|██████████| 70/70 [00:38<00:00,  1.80it/s]



features index Index([132539.0, 132540.0, 132541.0, 132543.0, 132545.0, 132547.0, 132548.0,
       132551.0, 132554.0, 132555.0,
       ...
       142655.0, 142659.0, 142661.0, 142662.0, 142664.0, 142665.0, 142667.0,
       142670.0, 142671.0, 142673.0],
      dtype='float64', length=3997)
last data index Index([142675.0, 142676.0, 142680.0, 142683.0, 142688.0, 142690.0, 142691.0,
       142692.0, 142693.0, 142694.0,
       ...
       152839.0, 152840.0, 152841.0, 152842.0, 152848.0, 152849.0, 152851.0,
       152858.0, 152862.0, 152864.0],
      dtype='float64', name='RecordID', length=3993)


Feature Extraction: 100%|██████████| 70/70 [00:43<00:00,  1.62it/s]



features index Index([142675.0, 142676.0, 142680.0, 142683.0, 142688.0, 142690.0, 142691.0,
       142692.0, 142693.0, 142694.0,
       ...
       152839.0, 152840.0, 152841.0, 152842.0, 152848.0, 152849.0, 152851.0,
       152858.0, 152862.0, 152864.0],
      dtype='float64', length=3993)
last data index Index([152871.0, 152873.0, 152875.0, 152878.0, 152882.0, 152884.0, 152885.0,
       152886.0, 152887.0, 152890.0,
       ...
       163013.0, 163016.0, 163017.0, 163021.0, 163027.0, 163029.0, 163033.0,
       163034.0, 163035.0, 163037.0],
      dtype='float64', name='RecordID', length=3998)


Feature Extraction: 100%|██████████| 70/70 [00:47<00:00,  1.47it/s]



features index Index([152871.0, 152873.0, 152875.0, 152878.0, 152882.0, 152884.0, 152885.0,
       152886.0, 152887.0, 152890.0,
       ...
       163013.0, 163016.0, 163017.0, 163021.0, 163027.0, 163029.0, 163033.0,
       163034.0, 163035.0, 163037.0],
      dtype='float64', length=3998)


In [None]:
training_x.to_parquet('training_X_custom_features_2.parquet')
validation_x.to_parquet('validation_X_custom_features.parquet')    
test_x.to_parquet('testing_X_custom_features.parquet')

In [45]:
# training_x  = pd.read_parquet('training_X_min_features.parquet')
# validation_x = pd.read_parquet('validation_X_min_features.parquet')[training_x.columns]
# test_x = pd.read_parquet('testing_X_min_features.parquet')[training_x.columns]

In [18]:
# get columns that are completely empty
empty_cols = training_x.columns[training_x.isnull().all()]
empty_cols

Index(['Age__fft_aggregated__aggtype_"skew"',
       'Age__fft_aggregated__aggtype_"kurtosis"',
       'Gender__fft_aggregated__aggtype_"skew"',
       'Gender__fft_aggregated__aggtype_"kurtosis"',
       'Height__fft_aggregated__aggtype_"skew"',
       'Height__fft_aggregated__aggtype_"kurtosis"',
       'Weight__fft_aggregated__aggtype_"skew"',
       'Weight__fft_aggregated__aggtype_"kurtosis"'],
      dtype='object')

In [19]:
training_x = training_x.drop(columns=empty_cols)
validation_x = validation_x.drop(columns=empty_cols)
test_x = test_x.drop(columns=empty_cols)

In [20]:
standardscaler = StandardScaler()

In [21]:
training_x_scaled = standardscaler.fit_transform(training_x)
validation_x_scaled = standardscaler.transform(validation_x)
test_x_scaled = standardscaler.transform(test_x)

In [22]:
training_x = pd.DataFrame(training_x, columns=training_x.columns, index=training_x.index)
validation_x = pd.DataFrame(validation_x, columns=training_x.columns, index=validation_x.index)
test_x = pd.DataFrame(test_x, columns=training_x.columns, index=test_x.index)

In [23]:
imputer = SimpleImputer(strategy='median')
training_x = pd.DataFrame(imputer.fit_transform(training_x), columns=training_x.columns, index=training_x.index)
validation_x = pd.DataFrame(imputer.transform(validation_x), columns=validation_x.columns, index=validation_x.index)
test_x = pd.DataFrame(imputer.transform(test_x), columns=test_x.columns, index=test_x.index)

In [24]:
training_x.head()

Unnamed: 0,ALP__abs_energy,ALP__absolute_sum_of_changes,"ALP__agg_linear_trend__attr_""slope""__chunk_len_5__f_agg_""mean""","ALP__agg_linear_trend__attr_""intercept""__chunk_len_5__f_agg_""mean""",ALP__approximate_entropy__m_2__r_0.2,ALP__kurtosis,ALP__skewness,"ALP__agg_autocorrelation__f_agg_""mean""__maxlag_10","ALP__fft_aggregated__aggtype_""centroid""","ALP__fft_aggregated__aggtype_""variance""",...,last_RespRate,last_SaO2,last_SysABP,last_Temp,last_TroponinI,last_TroponinT,last_Urine,last_WBC,last_Weight,last_pH
132539.0,0.90704,0.0,0.0,-0.13892,0.0,0.0,0.0,0.0,3.014734e-15,4.512324e-14,...,1.340782,0.214272,-0.030791,0.972883,-0.08847,-0.146278,0.830447,-0.477987,-0.120163,-0.013711
132540.0,0.926339,0.0,0.0,-0.13892,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.063384,-0.232318,-0.676732,0.08547,-0.08847,-0.146278,0.501009,0.120932,-0.076606,-0.017041
132541.0,5.999162,1.050731,-0.03077,0.453033,0.083586,0.306111,0.495705,0.334863,4.976976,47.95748,...,-0.063384,-1.1255,0.384456,0.212243,-0.08847,-0.146278,-0.514759,-0.969407,-1.161171,-0.00039
132543.0,2.805955,0.0,0.0,0.24178,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.238905,0.214272,-0.030791,-0.041304,-0.08847,-0.146278,2.31292,-0.70834,0.054064,-0.013711
132545.0,0.887742,0.0,0.0,-0.13892,0.0,0.0,0.0,0.0,7.121964e-16,8.380014e-15,...,-1.116509,0.214272,-0.030791,-0.421623,-0.08847,-0.146278,-0.26768,-1.184403,-0.120163,-0.013711


In [26]:
training_x.to_parquet(os.path.join('extracted_features', 'training_X_clean_custom_2.parquet'))
validation_x.to_parquet(os.path.join('extracted_features', 'validation_X_clean_custom_2.parquet'))
test_x.to_parquet(os.path.join('extracted_features', 'test_X_clean_custom_2.parquet'))

In [1]:
import pandas as pd
import os

training_x = pd.read_parquet(os.path.join('extracted_features', 'training_X_clean.parquet'))

In [2]:
for i in training_x.columns:
    print(i)

