In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import SpectralClustering
from mvlearn.cluster import MultiviewKMeans, MultiviewSphericalKMeans
from sklearn.cluster import KMeans
import numpy as np
from sklearn.manifold import TSNE
from sklearn.metrics import normalized_mutual_info_score as nmi_score
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import StandardScaler
from scipy.io import savemat
from sklearn.ensemble import RandomForestRegressor
import os

In [None]:
# Working directory that all relative paths in this notebook are resolved against.
# The default is the original cluster location; set the CONSENSUS_CLUSTERING_WD
# environment variable to run the notebook from a different checkout.
wd = os.environ.get('CONSENSUS_CLUSTERING_WD',
                    '/cluster/work/borgw/SPSS/MultiOmicsAnalysis/ConsensusClustering/')
os.chdir(wd)

# load and process data
### Only use samples with serum sampled between day 0 and day 6 (387 samples)

In [None]:
# Feature selection stored as a pickled dict with 'features' and
# 'features_to_scale' entries.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
feature_dict = pd.read_pickle('selected_features.pkl')
selected_features, features_to_scale = (
    feature_dict[key] for key in ('features', 'features_to_scale')
)

# Episode-level table, indexed by its first column, restricted to rows whose
# 'sample.intv.d' value lies in the closed interval [0, 6].
data_episode = pd.read_csv('../olinks/spss_mlcb_olink_episode.csv', index_col=0)
sampling_day = data_episode['sample.intv.d']
within_window = (sampling_day >= 0) & (sampling_day <= 6)
data_episode = data_episode[within_window]

In [None]:
# Raw Olink NPX measurements, indexed by the first CSV column.
npx_raw = pd.read_csv('data/All Panels_2022_2020_raw_NPX_olink.csv', index_col=0)

# Daily clinical records: keep only sampling-day rows that have data available.
bcday = pd.read_csv('../data/spss_mlcb_day.csv', index_col=['sample.id'])
bcday = bcday[(bcday['redcap.event.name'] == 'day_sampling')
              & (bcday['data.available'] == 'yes')]

# Restrict to samples that also have NPX data. The explicit .copy() makes the
# column assignment below write to an independent frame instead of a slice.
bcday_raw = bcday[bcday.index.isin(npx_raw.index)].copy()
# Index-aligned assignment: samples absent from data_episode get NaN.
bcday_raw['age.at.bc'] = data_episode['age.at.bc.days']

# Reorder (and thereby restrict) the NPX rows to match the clinical rows.
npx_raw = npx_raw.loc[bcday_raw.index]
assert npx_raw.index.equals(bcday_raw.index), 'NPX and clinical rows are not aligned'

# Keep only samples that fall inside the episode table loaded earlier.
npx_raw_sub = npx_raw[npx_raw.index.isin(data_episode.index)]
bcday_raw_sub = bcday_raw[bcday_raw.index.isin(data_episode.index)].copy()
assert npx_raw_sub.index.equals(bcday_raw_sub.index), 'NPX and clinical subsets are not aligned'

data_episode_sub = data_episode[data_episode.index.isin(bcday_raw_sub.index)]
# Missing FiO2 is filled with 0.21 (presumably room air -- confirm with data owners).
bcday_raw_sub['fio2.high'] = bcday_raw_sub['fio2.high'].fillna(0.21)

In [None]:
# Per-column count and fraction of missing values, sorted by count.
missing = bcday_raw_sub.isnull().sum().sort_values()
n_rows = len(bcday_raw_sub)
missing_ratio = missing / n_rows

# Remove bookkeeping columns; names not present in the frame are ignored.
to_drop = ['sepsis.3.complete', 'redcap.event.name', 'data.available']
keep_mask = ~bcday_raw_sub.columns.isin(to_drop)
bcday_raw_sub = bcday_raw_sub.loc[:, keep_mask]

In [None]:
# Column groups selected by substring. The slices reproduce the original
# selection: every 'ccc' column except the last one, and every 'abx.change'
# column from the third one onwards.
ccc_cols = [name for name in data_episode_sub.columns if 'ccc' in name]
abx_cols = [name for name in data_episode_sub.columns if 'abx.change' in name]
yes_no, one_zero = ['yes', 'no'], [1, 0]

# yes/no (and male/female) answers encoded as 1/0.
CCCs = data_episode_sub[ccc_cols[:-1]].replace(yes_no, one_zero)
ABXs = data_episode_sub[abx_cols[2:]].replace(yes_no, one_zero)
Sex = data_episode_sub['sex'].replace(['male', 'female'], one_zero)
neo = data_episode_sub['neonate'].replace(yes_no, one_zero)

# One-hot encoding of the categorical episode descriptors.
focus = pd.get_dummies(data_episode_sub['focus.grp'], prefix='focus')
patho = pd.get_dummies(data_episode_sub['pathogen.grp'], prefix='patho')
cahai = pd.get_dummies(data_episode_sub['cahai'], prefix='cahai')
category = pd.get_dummies(data_episode_sub['category'], prefix='cate')
ethni = pd.get_dummies(data_episode_sub['ethnicity'], prefix='eth')

In [None]:
# NOTE: this overrides the 'features_to_scale' list loaded from the pickle earlier.
features_to_scale = ['temp.high', 'temp.low', 'rr.high', 'sato2.low', 'hr.high',
                     'cap.refill', 'sbp.low', 'map.low', 'gcs.low', 'wcc.high', 'wcc.low',
                     'anc.low', 'lymph.low', 'platelets.low', 'lactate.high', 'crea.high',
                     'fio2.high', 'pao2.fio2.ratio.calc', 'spo2.fio2.ratio', 'pao2.fio2', 'alat.high']
binary_features = ['niv', 'iv', 'hfnc', 'ecls', 'inotrope', 'crrt']

# Work on an explicit copy so the column assignment does not target a slice
# of the frame produced by the earlier column filter.
bcday_raw_sub = bcday_raw_sub.copy()
bcday_raw_sub[binary_features] = bcday_raw_sub[binary_features].replace(['yes', 'no'], [1, 0])

# Append the episode-level features column-wise (aligned on the sample index).
# `axis` must be passed by keyword: positional use was removed in pandas 2.0.
bcday_raw_sub = pd.concat(
    [bcday_raw_sub, Sex, CCCs, ABXs, focus, patho, cahai, category, ethni, neo],
    axis=1,
)

In [None]:
# Columns excluded from the analysis. The original list named 'fio2.high.abga'
# twice; the duplicate entry is removed (the resulting frame is unchanged).
todrop = ['fio2.high.abga', 'ecls.type',
          'inotrope.adr.dose', 'inotrope.noradr.dose', 'inotrope.dop.dose',
          'inotrope.dobut.dose', 'inotrope.vaso.dose', 'inotrope.milr.dose',
          'bili.direct', 'pct.high',
          'pao2.fio2.ratio', 'spo2.fio2.ratio', 'pao2.fio2.ratio.calc',
          'pao2.fio2']
# Names that are not present in the frame are simply ignored by the mask.
bcday_raw_sub = bcday_raw_sub.loc[:, ~bcday_raw_sub.columns.isin(todrop)]

In [None]:
# Column names of the physiological view, assembled from smaller groups.
_measurement_cols = [
    'temp.high', 'temp.low', 'rr.high', 'sato2.low', 'hr.high',
    'cap.refill', 'sbp.low', 'map.low', 'gcs.low', 'fio2.high',
    'wcc.high', 'wcc.low', 'anc.low', 'lymph.low', 'platelets.low',
    'paco2.high', 'pao2.low', 'lactate.high', 'bili.high', 'crea.high',
    'inr.high', 'alat.high',
]
_cons05_cols = ['cons05.' + part for part in ('resp', 'cvs', 'cns', 'ren', 'hep', 'hem', 'score')]
_pelod_cols = ['pelod.' + part for part in ('resp', 'cvs', 'cns', 'ren', 'hem', 'score')]
_psofa_cols = ['psofa.' + part for part in ('resp', 'cvs', 'cns', 'ren', 'hep', 'hem', 'score')]
_other_physio_cols = ['age.at.bc', 'ccc.summary', 'niv', 'iv', 'hfnc', 'ecls', 'inotrope', 'crrt']

physio_view_cols = (_measurement_cols + _cons05_cols + _pelod_cols
                    + _psofa_cols + _other_physio_cols)

# Column names of the contextual view, built per name prefix.
_ccc_cols = ['ccc.' + part for part in (
    'neuro', 'cardio', 'pulmo', 'uro', 'gastro', 'haemimmuno', 'metabol',
    'malform', 'onco', 'neo', 'surg', 'techdep', 'transplant')]
_abx_cols = ['abx.change.' + part for part in (
    'pen', 'oxa', 'pip', 'cla', 'cfz', 'cxm', 'cro', 'fep', 'mem', 'amk',
    'ery', 'van', 'cip', 'lzd', 'cli', 'mtz', 'sxt')]
_focus_cols = ['focus_' + part for part in (
    'abdominal', 'clabsi', 'cns', 'earnosethroat', 'endocarditis',
    'osteoarticular', 'other', 'pneumonia', 'primbsi', 'skin',
    'toxic_shock', 'uti', 'wound')]
_patho_cols = ['patho_' + part for part in (
    'candida', 'cons', 'ecoli', 'enterococcus', 'hinfluenzae', 'klebsiella',
    'nmeningitidis', 'othergneg', 'othergpos', 'paeruginosa', 'sagalactiae',
    'saureus', 'spneumoniae', 'spyogenes', 'viridansgroup')]
_cahai_cols = ['cahai_' + part for part in ('ca', 'eos', 'hai', 'los.ca', 'los.hai')]
_cate_cols = ['cate_' + part for part in ('comorbidity', 'healthy', 'neonate')]
_eth_cols = ['eth_' + part for part in ('african', 'asian', 'caucasian', 'jewish', 'mixed')]

contextual_view_cols = (_ccc_cols + _abx_cols + _focus_cols + _patho_cols
                        + _cahai_cols + _cate_cols + _eth_cols + ['neonate', 'sex'])

# Physiological columns first, contextual columns second.
all_cols = physio_view_cols + contextual_view_cols

In [None]:
# Display the number of contextual-view column names as a quick sanity check.
len(contextual_view_cols)

In [None]:
# Number of distinct non-missing values per column, computed once.
unique_counts = {
    col: bcday_raw_sub[col].dropna().unique().shape[0]
    for col in bcday_raw_sub.columns
}

# Columns that are always treated as continuous, whatever their cardinality.
# NOTE(review): a column listed here that already has more than two distinct
# values ends up twice in `continous_cols` -- confirm this is intended.
addtional_cols = [
    'cons05.resp', 'cons05.cvs', 'cons05.cns', 'cons05.ren',
    'cons05.hep', 'cons05.hem', 'pelod.ren',
]

# More than two distinct values -> continuous; otherwise binary.
continous_cols = [col for col, n_unique in unique_counts.items() if n_unique > 2]
continous_cols = continous_cols + addtional_cols
binary_cols = [
    col for col, n_unique in unique_counts.items()
    if n_unique <= 2 and col not in addtional_cols
]

In [None]:
# Persist the merged clinical table before splitting it into views.
bcday_raw_sub.to_csv('data/bcday_raw_npx.csv')

# Three raw (not yet imputed) views over the same samples.
contextual_view_raw, phyiso_view_raw = (
    bcday_raw_sub[binary_cols],
    bcday_raw_sub[continous_cols],
)
proteome_view_raw = npx_raw_sub

In [None]:
# Measurement columns whose missingness is shown in the bar chart.
physio_cols = [
    'temp.high', 'temp.low', 'rr.high', 'sato2.low', 'hr.high',
    'cap.refill', 'sbp.low', 'map.low', 'gcs.low', 'fio2.high',
    'wcc.high', 'wcc.low', 'anc.low', 'lymph.low', 'platelets.low',
    'paco2.high', 'pao2.low', 'lactate.high', 'bili.high', 'crea.high',
    'inr.high', 'alat.high',
]

missing = phyiso_view_raw[physio_cols]
# Fraction of missing entries per column, in ascending order.
missing_fraction = (missing.isnull().sum() / len(missing)).sort_values()

plt.figure(figsize=(9, 6))
ax = missing_fraction.plot.bar(fontsize=20)
ax.set_ylabel('missingness', fontsize=20)
plt.show()

In [None]:
# One histogram per proteomic feature. Each observation carries weight
# 1 / (number of rows), so bar heights are fractions of all samples.
for col in proteome_view_raw.columns:
    values = proteome_view_raw[col]
    sample_weights = np.ones_like(values) / len(values)
    plt.figure(figsize=(12, 9))
    plt.hist(values, weights=sample_weights, bins=20)
    plt.title(col, fontsize=20)
    # Saving is disabled; enable to write one PNG per feature:
    # plt.savefig('Proteome/{}.png'.format(col), dpi=300)
    plt.show()

In [None]:
# Fraction of samples with a missing value, one entry per proteomic feature.
proteome_missing_ratio = proteome_view_raw.isnull().sum() / len(proteome_view_raw)
n_proteome_features = len(proteome_missing_ratio)

plt.figure(figsize=(12, 9))
# The histogram is over features, so each feature is weighted by 1/n_features
# (the original divided by the number of samples, which does not normalise
# the bars to fractions of features).
plt.hist(proteome_missing_ratio,
         weights=np.ones(n_proteome_features) / max(n_proteome_features, 1),
         bins=25)
plt.xlabel('missing ratio', fontsize=20)
plt.ylabel('fraction of features', fontsize=20)
plt.title('Proteomic view missingness', fontsize=20)
plt.savefig('Proteomic view missingness.png', dpi=300)
plt.show()

In [None]:
# Fraction of samples with a missing value, one entry per physiological feature.
physio_missing_ratio = phyiso_view_raw.isnull().sum() / len(phyiso_view_raw)
n_physio_features = len(physio_missing_ratio)

plt.figure(figsize=(12, 9))
# The histogram is over features, so each feature is weighted by 1/n_features
# (the original divided by the number of samples, which does not normalise
# the bars to fractions of features).
plt.hist(physio_missing_ratio,
         weights=np.ones(n_physio_features) / max(n_physio_features, 1),
         bins=10)
plt.xlabel('missing ratio', fontsize=20)
plt.ylabel('fraction of features', fontsize=20)
plt.title('Physiological view missingness', fontsize=20)
plt.savefig('Physiological view missingness.png', dpi=300)
plt.show()

In [None]:
# Fraction of samples with a missing value, one entry per contextual feature.
contextual_missing_ratio = contextual_view_raw.isnull().sum() / len(contextual_view_raw)
n_contextual_features = len(contextual_missing_ratio)

plt.figure(figsize=(12, 9))
# Each feature is weighted by 1/n_features so bar heights are fractions of
# features. The original also passed density=True, which renormalises the
# histogram and overrides the weights; it is dropped here so this plot uses
# the same y-axis convention as the other missingness histograms.
plt.hist(contextual_missing_ratio,
         weights=np.ones(n_contextual_features) / max(n_contextual_features, 1),
         bins=10)
plt.xlabel('missing ratio', fontsize=20)
plt.ylabel('fraction of features', fontsize=20)
plt.title('Contextual view missingness', fontsize=20)
plt.savefig('Contextual view missingness.png', dpi=300)
plt.show()

## MICE imputation for `phyiso_view` and `proteome_view`
### Zero imputation for `contextual_view`

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
# Random-forest settings used inside the iterative imputer. They were tuned
# to get a good enough predictive performance for a restricted execution time.
rf_imputer_params = {
    'n_estimators': 4,
    'max_depth': 10,
    'bootstrap': True,
    'max_samples': 0.5,
    'n_jobs': -1,
    'random_state': 0,
}
impute_estimator = RandomForestRegressor(**rf_imputer_params)

# Iterative (round-robin) imputer with a fixed seed, capped at 20 iterations.
imp = IterativeImputer(estimator=impute_estimator, max_iter=20, random_state=0)

In [None]:
# Iteratively impute the proteomic view; fit_transform returns a bare array,
# so the sample index and feature names are re-attached.
proteome_view = pd.DataFrame(imp.fit_transform(proteome_view_raw),
                             index=proteome_view_raw.index,
                             columns=proteome_view_raw.columns)

# Same imputer object, re-fitted from scratch on the physiological view.
phyiso_view = pd.DataFrame(imp.fit_transform(phyiso_view_raw),
                           index=phyiso_view_raw.index,
                           columns=phyiso_view_raw.columns)

# Contextual features: missing values become 0. The original read from
# `contextual_view`, which is not defined before this line (NameError on a
# fresh kernel); the raw contextual view is the intended source.
contextual_view = contextual_view_raw.fillna(0)

In [None]:
# All three views must describe the same samples in the same order. The
# original assertion compared the physiological and contextual indices twice
# and never checked the proteomic view. Checks run before anything is written.
assert phyiso_view.index.equals(contextual_view.index), \
    'physiological and contextual views are not aligned'
assert phyiso_view.index.equals(proteome_view.index), \
    'physiological and proteomic views are not aligned'

phyiso_view.to_csv('data/PhysioView.csv')
proteome_view.to_csv('data/ProteomeView.csv')
contextual_view.to_csv('data/ContextualView.csv')

# log transform

In [None]:
# Reload the imputed views written earlier; the first CSV column is the sample index.
physio_view = pd.read_csv('data/PhysioView.csv', index_col=0)
proteome_view = pd.read_csv('data/ProteomeView.csv', index_col=0)
contextual_view = pd.read_csv('data/ContextualView.csv', index_col=0)

# Clinical view = physiological + contextual columns side by side.
# `axis`/`columns` are passed by keyword: the positional forms used
# originally raise TypeError from pandas 2.0 onwards.
clinical_view = pd.concat([physio_view, contextual_view], axis=1)
clinical_view = clinical_view.drop(columns=['pupils', 'crp.high'])

In [None]:
# Column names of the physiological view, assembled from smaller groups.
# (Same lists as defined earlier in the notebook, repeated so this section
# can be run on its own.)
_measurement_cols = [
    'temp.high', 'temp.low', 'rr.high', 'sato2.low', 'hr.high',
    'cap.refill', 'sbp.low', 'map.low', 'gcs.low', 'fio2.high',
    'wcc.high', 'wcc.low', 'anc.low', 'lymph.low', 'platelets.low',
    'paco2.high', 'pao2.low', 'lactate.high', 'bili.high', 'crea.high',
    'inr.high', 'alat.high',
]
_cons05_cols = ['cons05.' + part for part in ('resp', 'cvs', 'cns', 'ren', 'hep', 'hem', 'score')]
_pelod_cols = ['pelod.' + part for part in ('resp', 'cvs', 'cns', 'ren', 'hem', 'score')]
_psofa_cols = ['psofa.' + part for part in ('resp', 'cvs', 'cns', 'ren', 'hep', 'hem', 'score')]
_other_physio_cols = ['age.at.bc', 'ccc.summary', 'niv', 'iv', 'hfnc', 'ecls', 'inotrope', 'crrt']

physio_view_cols = (_measurement_cols + _cons05_cols + _pelod_cols
                    + _psofa_cols + _other_physio_cols)

# Column names of the contextual view, built per name prefix.
_ccc_cols = ['ccc.' + part for part in (
    'neuro', 'cardio', 'pulmo', 'uro', 'gastro', 'haemimmuno', 'metabol',
    'malform', 'onco', 'neo', 'surg', 'techdep', 'transplant')]
_abx_cols = ['abx.change.' + part for part in (
    'pen', 'oxa', 'pip', 'cla', 'cfz', 'cxm', 'cro', 'fep', 'mem', 'amk',
    'ery', 'van', 'cip', 'lzd', 'cli', 'mtz', 'sxt')]
_focus_cols = ['focus_' + part for part in (
    'abdominal', 'clabsi', 'cns', 'earnosethroat', 'endocarditis',
    'osteoarticular', 'other', 'pneumonia', 'primbsi', 'skin',
    'toxic_shock', 'uti', 'wound')]
_patho_cols = ['patho_' + part for part in (
    'candida', 'cons', 'ecoli', 'enterococcus', 'hinfluenzae', 'klebsiella',
    'nmeningitidis', 'othergneg', 'othergpos', 'paeruginosa', 'sagalactiae',
    'saureus', 'spneumoniae', 'spyogenes', 'viridansgroup')]
_cahai_cols = ['cahai_' + part for part in ('ca', 'eos', 'hai', 'los.ca', 'los.hai')]
_cate_cols = ['cate_' + part for part in ('comorbidity', 'healthy', 'neonate')]
_eth_cols = ['eth_' + part for part in ('african', 'asian', 'caucasian', 'jewish', 'mixed')]

contextual_view_cols = (_ccc_cols + _abx_cols + _focus_cols + _patho_cols
                        + _cahai_cols + _cate_cols + _eth_cols + ['neonate', 'sex'])

# Physiological columns first, contextual columns second.
all_cols = physio_view_cols + contextual_view_cols

# Columns that receive a log(1 + x) transform.
cols_to_log = ['alat.high',
               'anc.low',
               'bili.high',
               'cap.refill',
               'crea.high',
               'lactate.high',
               'lymph.low',
               'pao2.low',
               'wcc.high',
               'wcc.low',
               'cons05.score',
               'pelod.score',
               'psofa.score',
               'sato2.low',
               'fio2.high']

# Re-express the two columns as offsets from a reference value (100 for
# 'sato2.low', 0.21 for 'fio2.high') before taking logs.
# NOTE: these statements modify `clinical_view` in place, so this cell is not
# idempotent -- re-run the cell that builds `clinical_view` before re-running it.
clinical_view['sato2.low'] = 100 - clinical_view['sato2.low']
clinical_view['fio2.high'] = clinical_view['fio2.high'] - 0.21

# log1p(x) == log(1 + x), but stays accurate when x is close to zero.
clinical_view[cols_to_log] = np.log1p(clinical_view[cols_to_log])

# PCA on binary features

In [None]:
# Groups of columns that are each decomposed separately with PCA.
demographics = [
    'cate_neonate',
    'eth_asian', 'eth_caucasian', 'eth_african', 'eth_jewish', 'eth_mixed',
    'neonate', 'sex',
]

chronic_conditions = ['ccc.' + part for part in (
    'neuro', 'cardio', 'pulmo', 'uro', 'gastro', 'haemimmuno', 'metabol',
    'malform', 'onco', 'neo', 'surg', 'techdep', 'transplant',
)] + ['cate_comorbidity', 'cate_healthy']

cons_scores = ['cons05.' + part for part in ('resp', 'cvs', 'cns', 'ren', 'hep', 'hem')]

infections = ['focus_' + part for part in (
    'abdominal', 'clabsi', 'cns', 'earnosethroat', 'endocarditis',
    'osteoarticular', 'other', 'pneumonia', 'primbsi', 'skin',
    'toxic_shock', 'uti', 'wound',
)] + ['cahai_' + part for part in ('ca', 'eos', 'hai', 'los.ca', 'los.hai')]

pathogen = ['patho_' + part for part in (
    'candida', 'cons', 'ecoli', 'enterococcus', 'hinfluenzae', 'klebsiella',
    'nmeningitidis', 'othergneg', 'othergpos', 'paeruginosa', 'sagalactiae',
    'saureus', 'spneumoniae', 'spyogenes', 'viridansgroup',
)]

treatment = ['abx.change.' + part for part in (
    'pen', 'oxa', 'pip', 'cla', 'cfz', 'cxm', 'cro', 'fep', 'mem', 'amk',
    'ery', 'van', 'cip', 'lzd', 'cli', 'mtz', 'sxt',
)] + ['niv', 'iv', 'hfnc', 'ecls', 'inotrope', 'crrt']

# Group name -> member columns (insertion order is the processing order).
binarys = dict(
    demographics=demographics,
    chronic_condition=chronic_conditions,
    cons_scores=cons_scores,
    infections=infections,
    pathogen=pathogen,
    treatment=treatment,
)

In [None]:
from sklearn.decomposition import PCA
binary_cols = []
binary_data = []

# savefig does not create missing directories, so make sure the target exists.
os.makedirs('Binary_PCA_plots', exist_ok=True)

# Run a separate full-rank PCA on every group of columns in `binarys`.
for value in binarys.values():
    pca = PCA()
    components = pca.fit_transform(clinical_view[value])
    # NOTE: the principal components are labelled with the ORIGINAL column
    # names of the group (the i-th component takes the i-th name); they are no
    # longer the original variables. This requires at least as many samples as
    # columns in the group, otherwise the shapes do not match.
    subdata = pd.DataFrame(components,
                           index=clinical_view.index,
                           columns=value)

    # One histogram per component, written to disk and closed straight away
    # so figures do not pile up in memory.
    for name in value:
        fig = plt.figure(figsize=(9, 6))
        plt.hist(subdata[name], bins=50)
        plt.title(name, fontsize=20)
        plt.savefig('Binary_PCA_plots/PCA_{}.png'.format(name))
        plt.close(fig)

    binary_cols.extend(value)
    binary_data.append(subdata)

# Everything in `all_cols` that did not go through a group PCA stays as is.
continous_cols = [col for col in all_cols if col not in binary_cols]
clinical_continous = clinical_view[continous_cols]
# `axis` must be a keyword argument from pandas 2.0 onwards.
clinical_binary_pcas = pd.concat(binary_data, axis=1)

In [None]:
# Standardise both blocks column-wise. New frames are built from the scaled
# arrays instead of assigning into `clinical_continous`, which is a column
# selection of `clinical_view` (chained assignment).
sc = StandardScaler()
clinical_continous = pd.DataFrame(sc.fit_transform(clinical_continous),
                                  index=clinical_continous.index,
                                  columns=clinical_continous.columns)
# The scaler is re-fitted here, so `sc` ends up holding the PCA-block statistics.
clinical_binary_pcas = pd.DataFrame(sc.fit_transform(clinical_binary_pcas),
                                    index=clinical_binary_pcas.index,
                                    columns=clinical_binary_pcas.columns)

# `axis` must be a keyword argument from pandas 2.0 onwards.
clinical_view_new = pd.concat([clinical_continous, clinical_binary_pcas], axis=1)
clinical_view_new.to_csv('data/ClinicalViewStandardized_binary_pca.csv')

In [None]:
# Write the proteomic view to disk.
# NOTE(review): the file name says "Standardized", but no scaling of
# `proteome_view` is visible in this notebook -- confirm this is intended.
proteome_standardized_path = 'data/ProteomeViewStandardized.csv'
proteome_view.to_csv(proteome_standardized_path)

In [None]:
# Two-column covariate matrix (age, sex), one row per sample.
age_sex_covariates = np.vstack(
    (physio_view['age.at.bc'].values, contextual_view['sex'].values)
).T

# MATLAB input: covariates plus the standardised clinical view.
kcc_input_sub = {
    'Cov': age_sex_covariates,
    'Clinical': clinical_view_new.values,
}
savemat('data/ClinicalView_AgeSex.mat', kcc_input_sub)

In [None]:
# Two-column covariate matrix (age, sex), one row per sample.
age_sex_covariates = np.vstack(
    (physio_view['age.at.bc'].values, contextual_view['sex'].values)
).T

# MATLAB input: covariates plus the proteomic view.
kcc_input_sub = {
    'Cov': age_sex_covariates,
    'Proteome': proteome_view.values,
}
savemat('data/ProteomeView_AgeSex.mat', kcc_input_sub)

## assign sample id to KCC processed data

In [None]:
def _attach_sample_ids(csv_path, sample_index, n_components):
    """Label a header-less KCC result file and write it back to the same path.

    The file is read without a header, its columns are named 'KCC 1' ..
    'KCC <n_components>', its rows are indexed by `sample_index`, and the
    labelled table overwrites `csv_path`. Returns the labelled frame.

    NOTE: because the input file is overwritten, a second call on the same
    path reads the already-labelled file (header row and index column included).
    """
    kcc = pd.read_csv(csv_path, header=None)
    kcc.columns = ['KCC {}'.format(i) for i in range(1, n_components + 1)]
    kcc.index = sample_index
    kcc.to_csv(csv_path)
    return kcc


# Attach sample ids to the clinical and proteomic KCC outputs for every K.
for k in [2, 3, 4, 5, 6]:
    clinical_kcc = _attach_sample_ids(
        'data/TwoViewsKCC_clinical_K_{}.csv'.format(k), clinical_view_new.index, k)
    npx_kcc = _attach_sample_ids(
        'data/TwoViewsKCC_proteome_K_{}.csv'.format(k), proteome_view.index, k)