In [283]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib

In [267]:
# Load the target table; the first CSV column is used as the row index.
targets = pd.read_csv("../data/targets.csv", index_col=0)

### add prefix to target columns

In [269]:
# Prepend 'target_' to every target column name; a later cell identifies
# target columns by this prefix.
# NOTE: re-running this cell without reloading `targets` prefixes the names again.
t_cols = ['target_' + name for name in targets.columns]
targets.columns = t_cols

### merge features and targets, without physicians data, no imputation

In [270]:
# Load the feature table; the first CSV column is used as the row index.
features = pd.read_csv("../data/all_features_178_patients.csv", index_col=0)

# Index-aligned left join: every feature row is kept and the matching target
# row (if any) is appended. Nothing is imputed at this stage.
df = features.join(targets, how='left')

# Persist the raw merged table, row index included (the to_csv default).
df.to_csv('../data/all_data_no_phys.csv')

### impute features and targets separately, then merge them
### fill missing features with average
### fill missing targets with mode

In [271]:
# Impute features: every missing value is replaced by the mean of its column.
# A column with no observed values has a NaN mean and therefore stays NaN.
f2 = features.fillna(features.mean())

# Drop target columns that are missing for every row; they have no mode to
# impute with. `axis` is passed by keyword because DataFrame.dropna accepts
# keyword arguments only from pandas 2.0 on (the positional form raises
# TypeError there). The result is bound back to `targets` so that later cells
# keep seeing the reduced table, exactly as with the former in-place drop.
targets = targets.dropna(axis='columns', how='all')

# Impute targets: every missing value is replaced by the most frequent value
# of its column. mode() returns its values sorted, so a tie resolves to the
# smallest of the tied values.
t2 = targets.apply(lambda col: col.fillna(col.mode()[0]))

In [272]:
### replace white and special characters
### tetrad supports only alphanumeric and a few other characters like - and _
def _tetrad_safe_names(names):
    """Return the column labels rewritten into Tetrad-safe names.

    Two passes are applied, in this order:

    1. every run of whitespace becomes a single ``_``;
    2. every remaining non-word character (anything other than letters,
       digits and ``_``) becomes the marker ``-SC-``.

    Digits are left unchanged. Note that ``-`` is itself a non-word
    character, so an existing hyphen is also rewritten to ``-SC-``; applying
    the function twice to the same labels therefore expands the markers again.

    Parameters
    ----------
    names : pandas.Index
        String column labels (anything exposing the ``.str`` accessor).

    Returns
    -------
    pandas.Index
        The rewritten labels, in the original order.
    """
    # Raw strings: '\s' and '\W' inside ordinary literals are invalid escape
    # sequences, which newer Python versions warn about at compile time.
    names = names.str.replace(r'\s+', '_', regex=True)
    return names.str.replace(r'\W', '-SC-', regex=True)


f_cols = _tetrad_safe_names(f2.columns)
f2.columns = f_cols

t2_cols = _tetrad_safe_names(t2.columns)
t2.columns = t2_cols

In [273]:
# Index-aligned left join of the imputed features with the imputed targets.
# This rebinds `df`, replacing any table previously stored under that name.
df = f2.join(t2, how='left')

# Persist the imputed table, row index included.
df.to_csv("../data/all_data_no_phys_imputed.csv", index=True)

### write the modified column names to file

In [240]:
# Write the feature and target column names, each as a single line of
# space-separated names.
# The encoding is pinned to UTF-8 so the name files match the CSV files written
# by DataFrame.to_csv (UTF-8 by default) instead of depending on the locale.
with open('../data/feature_names_tetrad.txt', 'w', encoding='utf-8') as f:
    f.write(' '.join(f2.columns.tolist()))

with open('../data/target_names_tetrad.txt', 'w', encoding='utf-8') as f:
    f.write(' '.join(t2.columns.tolist()))

### drop columns with no variance

In [241]:
# Per-column variance of the merged table (axis=0 reduces over the rows).
var = df.var(axis=0)

In [242]:
# Summary statistics of the per-column variances.
var.describe()

count    2.023200e+04
mean     1.565820e+07
std      8.714454e+08
min      0.000000e+00
25%      0.000000e+00
50%      2.816601e-03
75%      6.783239e-01
max      6.888025e+10
dtype: float64

In [243]:
## number of columns whose variance is exactly 0
var.eq(0).sum()

8262

In [244]:
## number of zero-variance columns whose name contains 'target_'
zero_var_cols = df.columns[var.eq(0)]
zero_var_cols.str.contains('target_').sum()

284

In [247]:
# Keep every column whose variance is not exactly 0. The mask is applied to
# df.columns by position, so `var` must be in the same order as the columns.
# A NaN variance compares as "not equal to 0", so such columns are kept too.
kept_columns = df.columns[var.ne(0)]
df2 = df.loc[:, kept_columns]

In [248]:
# Persist the table restricted to the columns kept by the variance filter.
df2.to_csv('../data/all_data_no_phys_imputed_nonzeroVariance.csv')

In [249]:
# Write the names of the surviving feature and target columns, each as a
# single line of space-separated names, in the column order of df2.
# The encoding is pinned to UTF-8 so the name files match the CSV files written
# by DataFrame.to_csv (UTF-8 by default) instead of depending on the locale.
with open('../data/feature_names_tetrad_nonZeroVar.txt', 'w', encoding='utf-8') as f:
    f.write(' '.join(df2.columns[df2.columns.isin(f2.columns)].tolist()))

with open('../data/target_names_tetrad_nonZeroVar.txt', 'w', encoding='utf-8') as f:
    f.write(' '.join(df2.columns[df2.columns.isin(t2.columns)].tolist()))

### Missing-value statistics

In [250]:
# Distribution, across feature columns, of the number of missing rows per column.
features.isna().sum().describe()

count    19807.000000
mean        96.773565
std         81.150797
min          0.000000
25%          0.000000
50%        144.000000
75%        175.000000
max        177.000000
dtype: float64

In [251]:
# Distribution, across target columns, of the number of missing rows per column.
targets.isna().sum().describe()

count    430.000000
mean     139.048837
std       52.491224
min        0.000000
25%      130.250000
50%      165.000000
75%      174.000000
max      177.000000
dtype: float64


### #samples per target

In [276]:
# Reload the targets from disk. This rebinds `targets` to the table as stored
# in the CSV, replacing whatever was previously held under that name.
targets = pd.read_csv("../data/targets.csv", index_col=0)

# Number of non-missing entries in each target column.
target_samples = targets.notna().sum()

# Name the index and the values; with header=True these names become the
# header row of the CSV written below.
target_samples.index.name = 'target'
target_samples.name = '#samples'

target_samples.to_csv("../data/target_sampleSize.csv", header=True)

target_samples.describe()