In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib

### target groups

In [3]:
# Groups of target column names; the cells below write one dataset per group.
target_grps = [
    ['K', 'CL', 'CO2', 'NA'],
    ['CREAT', 'BUN'],
    ['INR', 'PT'],
]

# Flat list of every target name, in group order.
labels = [name for group in target_grps for name in group]

### read targets

In [41]:
# Read the target table (first CSV column becomes the index) and keep only
# the columns named in `labels`, in that order.
targets = pd.read_csv("../data/targets.csv", index_col=0).loc[:, labels]
targets.head()

Unnamed: 0_level_0,K,CL,CO2,NA,CREAT,BUN,INR,PT
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
32606973,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
32625833,1.0,0.0,0.0,1.0,0.0,0.0,,
32642729,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
32646042,1.0,1.0,1.0,1.0,1.0,1.0,,
32655313,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0


### read and filter features

In [4]:
# Names of the features to keep, parsed as strings from a comma-delimited text file.
# NOTE(review): np.loadtxt treats '#' as a comment marker by default -- confirm
# that no feature name contains '#'.
selected_features = np.loadtxt("../output/selected_features.txt", dtype=str, delimiter=',')

# Read the full feature table (first CSV column becomes the index) and
# restrict it to the selected columns, in the order listed in the file.
features = pd.read_csv("../data/all_features_178_patients.csv", index_col=0).loc[:, selected_features]

In [9]:
# Preview the first five rows of the filtered feature table.
features.head(n=5)

Unnamed: 0_level_0,1/2 NS + KCL 20 mEq_med-ever_occurred,12/27-NH contacted to fax med list and l_med-days_since_first_value,12/27-NH contacted to fax med list and l_med-days_since_last_value,2/8/2011-MAR not sent from NH for last d_med-days_since_first_value,2/8/2011-MAR not sent from NH for last d_med-days_since_last_value,25HVD3_root-apex_value,25HVD3_root-baseline_value,25HVD3_root-first_value,25HVD3_root-last_value,25HVD3_root-nadir_value,...,warfarin_med-days_since_last_change,weight_demo_features,zinc chloride_med-ever_occurred,zinc gluconate_med-days_since_first_value,zinc gluconate_med-days_since_last_value,zinc oxide topical_med-days_since_first_value,zinc oxide topical_med-days_since_last_value,ziprasidone_med-ever_occurred,zolpidem_med-days_since_last_change,zolpidem_med-ever_occurred
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
32606973,0.0,,,,,,,,,,...,,46.0,0.0,,,,,0.0,,0.0
32625833,0.0,,,,,,,,,,...,,75.0,0.0,,,,,0.0,,0.0
32642729,0.0,,,,,,,,,,...,,119.0,0.0,,,,,0.0,,0.0
32646042,0.0,,,,,7.0,7.0,7.0,7.0,7.0,...,,81.0,0.0,,,,,0.0,,0.0
32655313,0.0,,,,,,,,,,...,,53.3,0.0,,,,,0.0,,0.0


### replace whitespace and special characters

In [20]:
### replace whitespace and special characters
### tetrad supports only alphanumeric and a few other characters like - and _
# The patterns are raw strings: '\s' and '\W' inside a plain string literal are
# invalid escape sequences (DeprecationWarning, SyntaxWarning on Python 3.12+).
# Step 1: every run of whitespace becomes a single '_'.
# Step 2: every remaining non-word character becomes the marker '-SC-'.
# NOTE(review): '\W' also matches '-', so existing hyphens are rewritten to
# '-SC-' as well, and re-running this cell rewrites the hyphens inside an
# earlier '-SC-' again. Run it once per freshly loaded `features`.
f_cols = features.columns
f_cols = f_cols.str.replace(r'\s+', '_', regex=True)
f_cols = f_cols.str.replace(r'\W', '-SC-', regex=True)
features.columns = f_cols

### create one dataset for each target group

In [33]:
for grp in target_grps:
    # The group's labels joined by '_' form the suffix of both output file names.
    grp_suffix = '_'.join(grp)

    ## keep only the rows that have a value for every target in this group
    target_subset = targets.loc[:, grp]
    row_mask = target_subset.notna().all(axis=1)

    ## selected rows of the targets, with 'target_' prepended to each column name
    target_subset = target_subset.loc[row_mask, :].rename(columns=lambda name: 'target_' + name)

    ## same rows of the features, without columns that are null on all of them
    features_subset = features.loc[row_mask, :].dropna(axis=1, how='all')

    ## fill the remaining gaps with each column's mean over the selected rows
    features_subset = features_subset.fillna(features_subset.mean())

    ## features joined with the prefixed targets on the index, written as CSV
    features_subset.join(target_subset).to_csv("../data/tetrad_input " + grp_suffix + ".csv")

    ## space-separated list of the surviving feature names (targets not included)
    with open('../data/feature_names_tetrad ' + grp_suffix + '.txt', 'w') as f:
        f.write(' '.join(features_subset.columns.tolist()))
    

### create a dataset with all labels

In [40]:
# Same preparation as the per-group datasets, but over every target column at once.
all_labels_suffix = '_'.join(labels)

## keep only the rows that have a value for every target
row_mask = targets.notna().all(axis=1)

## selected rows of the targets, with 'target_' prepended to each column name
target_subset = targets.loc[row_mask, :].copy()
target_subset.columns = ['target_' + name for name in target_subset.columns]

## same rows of the features, without columns that are null on all of them
features_subset = features.loc[row_mask, :].dropna(axis=1, how='all')

## fill the remaining gaps with each column's mean over the selected rows
features_subset = features_subset.fillna(features_subset.mean())

## features joined with the prefixed targets on the index, written as CSV
features_subset.join(target_subset).to_csv("../data/tetrad_input " + all_labels_suffix + ".csv")

## space-separated list of the surviving feature names (targets not included)
with open('../data/feature_names_tetrad ' + all_labels_suffix + '.txt', 'w') as f:
    f.write(' '.join(features_subset.columns.tolist()))
    

### add prefix to target columns

In [22]:
# Prefix every column of `targets` with 'target_'.
# Columns that already carry the prefix are left untouched, so running this
# cell a second time no longer produces names like 'target_target_K'.
# NOTE: this renames `targets` in place; cells that select the original,
# unprefixed labels from `targets` must run before this one.
t_cols = targets.columns.tolist()
t_cols = [c if c.startswith('target_') else 'target_' + c for c in t_cols]
targets.columns = t_cols

### write modified column names to file

In [23]:
# Write the current column names of `targets` as one space-separated line.
with open('../data/target_names_tetrad.txt', 'w') as names_file:
    names_file.write(' '.join(targets.columns))