In [525]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA, NMF, KernelPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import MinMaxScaler
import random

from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit, KFold, LeaveOneOut # or StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
import pingouin as pg 
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import MultiTaskLasso

OUTDATED_IGNORE=1

## 1. Read in the subjects 

Isolate subjects that should be kept for the PCA 

In [526]:
def get_clinic(iFile='../../DerivedData/Global.csv'):
    clinic = pd.read_csv('../../DerivedData/Global.csv', sep=';')
    to_keep = ['ParticipantID', 'Cognitive-CompositeScore', 'Communication-CompositeScore', 'Motor-CompositeScore','Cognitive-ScaledScore',
                 'ReceptiveCom-ScaledScore','ExpressiveCom-ScaledScore','FineMotor-ScaledScore','GrossMotor-ScaledScore', 'IMDScore', 
              'Sex', 'Gabirth', 'Pregnancy-size', 'Oxygen-totaldays', 
              'ParenteralNutrition>21d', 'ChronicLungDisease']
    clinic= clinic[to_keep]
    
    clinic.rename( columns ={
                'ParticipantID' : 'subject_id',
                'Cognitive-CompositeScore' : 'Cognitive',
                'Communication-CompositeScore' : 'Language', 
                'Motor-CompositeScore' : 'Motor',
                'Gabirth': 'birth_age', 
                'Pregnancy-size' : 'MultiPreg',
                'Oxygen-totaldays': 'Oxygen_days',
                'ParenteralNutrition>21d': 'ParenteralNutrition_21d'
                }, inplace=True)
    
    for col in ['IMDScore', 'birth_age']:
        clinic[col] = clinic[col].astype(str)
        clinic[col] = clinic[col].apply(lambda x: x.replace(',','.'))
        clinic[col] = clinic[col].astype(np.float16)
        
    clinic.loc[clinic['Sex'] == 'Female', 'Sex_cat'] = 1
    clinic.loc[clinic['Sex'] == 'Male', 'Sex_cat'] = 0
    
    clinic.loc[clinic['ParenteralNutrition_21d'] == 'Yes', 'ParenteralNutrition_21d_cat'] = 1
    clinic.loc[clinic['ParenteralNutrition_21d'] == 'No', 'ParenteralNutrition_21d_cat'] = 0
      
    clinic.loc[clinic['ChronicLungDisease'] == 'Yes', 'ChronicLungDisease_cat'] = 1
    clinic.loc[clinic['ChronicLungDisease'] == 'No', 'ChronicLungDisease_cat'] = 0

    clinic['MultiPreg'] = clinic['MultiPreg'] -1
    
    return clinic

def _get_meanWM(group):
    

    ex_WM = pd.read_csv('../../DerivedData/extreme_pairs_mean_diffusion_metrics_over_WM.csv', index_col=0)
    mod_WM = pd.read_csv('../../DerivedData/moderate_pairs_mean_diffusion_metrics_over_WM.csv', index_col=0)

    WM = pd.concat([ex_WM, mod_WM])
    WM.rename(columns={'matched_ID' : 'control_ID'}, inplace=True)
        
    cols = [col for col in WM.columns if group in col]
    cols = [col for col in cols if 'post' not in col]
        
    WM = WM[cols]
    WM = WM.set_axis(['meanWM_'+col.split('_')[1] for col in cols] , axis=1, inplace=False)
    WM.reset_index(drop=True, inplace=True)
    WM.rename(columns={'meanWM_ID': 'subject_id'}, inplace =True)
    
    return WM 
        
def import_WM():
    
    preterm_WM = _get_meanWM('preterm')
    control_WM = _get_meanWM('control')

    WM = pd.concat([preterm_WM, control_WM])
    return WM

In [527]:
# read in the metric files saved as csv
controls = pd.read_csv('../../DerivedData/extracted_diffusion_metrics_control_group_mergedLR.csv', index_col=0)
preterms = pd.read_csv('../../DerivedData/extracted_diffusion_metrics_preterm_group_mergedLR.csv', index_col=0)

### create pairing - as there is fewer preterms, use their IDs to find matches with controls 
matched = pd.read_csv('../../DerivedData/subject_matching.csv', index_col=0)
matched = matched[matched['preterm_ID'].isin(preterms['subject_id'].values)]

#get matched controls
controls = controls[controls.subject_id.isin(matched.matched_ID_with_outcome.values)]

### get IDs for evaluations 
preterm_ids = preterms.subject_id.values
control_ids = controls.subject_id.values

### concatenate the two 
df = pd.concat([preterms, controls])
df = pd.merge(df, get_clinic(), how="inner", on=["subject_id"])

## get WM 
df = pd.merge(df, import_WM(), how='inner', on=['subject_id'])

#### subjects for the PCA

In [528]:
miss_ids = list(df[df['Cognitive'].isna()].subject_id.values)

ct = df[df['subject_id'].isin(miss_ids) & df['subject_id'].isin(control_ids)].subject_id.values
pt = df[df['subject_id'].isin(miss_ids) & df['subject_id'].isin(preterm_ids)].subject_id.values

req = len(pt) - len(ct)
ids = df[~df['subject_id'].isin(miss_ids) & df['subject_id'].isin(control_ids)].subject_id.values

print('Required number of random controls to select: {}'.format(req))

np.random.seed(42)
random_controls = random.choices(population=ids, k=req)
miss_ids.extend( random_controls)

print('Final number of subjects for PCA: {}'.format(len(miss_ids)))


Required number of random controls to select: 9
Final number of subjects for PCA: 30


In [529]:
### fix miss_Ids so it's reproducible

In [530]:
miss_ids = ['CC00997BN25','CC00301XX04','CC00632XX14','CC00889BN24',
         'CC00525XX14', 'CC00621XX11', 'CC00747XX22', 'CC00326XX13',
         'CC00576XX16','CC00385XX15','CC00889AN24','CC01038XX16',
         'CC01005XX07','CC01077XX14','CC00805XX13','CC00427XX15',
         'CC01042XX12','CC00383XX13','CC00653XX10','CC01014XX08',
         'CC00178XX14','CC00082XX09','CC00150AN02','CC00091XX10',
         'CC00111XX04','CC00716XX15','CC00584XX16','CC00667XX16',
         'CC00566XX14','CC00477XX16']

### Split dataset 
into: 
1. data for PCA (~30 subjects) 
2. hold-out - 25% of the remaining 
3. Train-test


In [531]:
from sklearn.model_selection import train_test_split

df_pca = df[df.subject_id.isin(miss_ids)].copy()

df_sub = df[~df.subject_id.isin(miss_ids)].copy()
df_sub = df_sub.sample(frac=1).reset_index(drop=True)

df_pred, df_hold = train_test_split(df_sub, test_size=0.25)

print('Nmber of subjects for PCA: {}'.format(len(df_pca)))
print('Number of subjects in hold-out set: {}'.format(len(df_hold)))
print('Number of subjects in train-out set: {}'.format(len(df_train)))



Nmber of subjects for PCA: 30
Number of subjects in hold-out set: 22
Number of subjects in train-out set: 66


In [532]:
# global settings 
seed=42
N=50

#independent = ['Motor', 'Language', 'Cognitive']
independent = ['Cognitive-ScaledScore',
       'ReceptiveCom-ScaledScore', 'ExpressiveCom-ScaledScore',
       'FineMotor-ScaledScore', 'GrossMotor-ScaledScore']
sss = LeaveOneOut()

## 2. set up scalers & inputers

In [533]:
def scale_train_features(arr_train, arr_test, scaling_strategy, return_scaler=False):
    
    if scaling_strategy == 'min_max':
        scl = MinMaxScaler()
    else: 
        scl = StandardScaler()
        
    scl.fit(arr_train)
    
    if return_scaler == True:
        return scl.transform(arr_train), scl.transform(arr_test), scl
    else:
        return scl.transform(arr_train), scl.transform(arr_test)
    

def inpute_median(arr_train, arr_test, return_medians=False):
    
    
    
    if len(arr_train[0]) == 1:
        #print('I am in the worng loop')
        md = np.nanmedian(arr_train)
        arr_train[np.where(np.isnan(arr_train))] = md
        arr_test[np.where(np.isnan(arr_test))] = md
        
        if return_medians == True:
            return arr_train, arr_test, md
        else: 
            return arr_train, arr_test
    
    else:
        mds = []
        for col in range(len(arr_train[0])):
            
            md = np.nanmedian(arr_train[:,col])
            #print(md)
            arr_train[:,col][np.where(np.isnan(arr_train[:,col]))] = md
            arr_test[:,col][np.where(np.isnan(arr_test[:,col]))] = md
            
            mds.append(md)
        
        if return_medians == True:
            return arr_train, arr_test, mds
        else: 
            return arr_train, arr_test
    
def initiate_itr_dict(outcomes):
    
    results = {}
    
    for outcome in outcomes:
        results[outcome] = {}
        results[outcome]['test_pred'] = []
        results[outcome]['test_true'] = []
        #results[outcome]['train_pred'] = []
        #results[outcome]['train_true'] = []
    
    return results

    
def initiate_global_dict(outcomes):
    
    results = {}
    
    for outcome in outcomes:
        results[outcome] = {}
        results[outcome]['test_r2'] = []
        results[outcome]['test_mae'] = []
        results[outcome]['test_corr'] = []
        #results[outcome]['train_r2'] = []
        #results[outcome]['train_mae'] = []
        #results[outcome]['train_corr'] = []
    
    return results

def run_dummy(X_train, X_test, y_train, y_test):
    
    reg = DummyRegressor(strategy='mean')
    reg.fit(X_train, y_train)
    
    y_pred_test = reg.predict(X_test)
    #y_pred_train = reg.predict(X_train)
    
    return int(y_pred_test[0]), int(y_test[0])#, int(y_pred_train[0]), int(y_train[0])

def run_L_regression(X_train, X_test, y_train, y_test):
    
    reg = LinearRegression()
    #reg = SVR(kernel='rbf')
    #reg = GradientBoostingRegressor()
    #reg.set_params(**dct)
    reg.fit(X_train, y_train)
    
    y_pred_test = reg.predict(X_test)
    #y_pred_train = reg.predict(X_train)
    
    return int(y_pred_test[0]), int(y_test[0])#, int(y_pred_train[0]), int(y_train[0])

def run_regression(X_train, X_test, y_train, y_test, dct):
    
    #reg = LinearRegression()
    #reg = SVR(kernel='rbf')
    reg = GradientBoostingRegressor()
    reg.set_params(**dct)
    reg.fit(X_train, y_train)
    
    y_pred_test = reg.predict(X_test)
    #y_pred_train = reg.predict(X_train)
    
    return int(y_pred_test[0]), int(y_test[0])#, int(y_pred_train[0]), int(y_train[0])

def evaluate(itr_results, out_results, outcomes):
    
    for outcome in independent:
        out_results[outcome]['test_r2'].append( r2_score(y_true = itr_results[outcome]['test_true'], y_pred=itr_results[outcome]['test_pred']))
        #dummy_results[outcome]['train_r2'].append( r2_score(y_true = itr_results[outcome]['train_true'], y_pred=itr_results[outcome]['train_pred']))
        
        out_results[outcome]['test_mae'].append( mean_absolute_error(y_true = itr_results[outcome]['test_true'], y_pred=itr_results[outcome]['test_pred']))
        #dummy_results[outcome]['train_mae'].append( mean_absolute_error(y_true = itr_results[outcome]['train_true'], y_pred=itr_results[outcome]['train_pred']))
    
        out_results[outcome]['test_corr'].append(np.corrcoef(itr_results[outcome]['test_true'], itr_results[outcome]['test_pred'])[0,1])
        #dummy_results[outcome]['train_corr'].append(np.corrcoef(itr_results[outcome]['train_true'],itr_results[outcome]['train_pred'])[0,1])

    return out_results   

def run_linear(X, y, outcomes, N, sss, pca=None, num_pcs=None, scale=True):
    dummy_results = initiate_global_dict(outcomes=outcomes)
    prediction_results = initiate_global_dict(outcomes=outcomes)


    for _ in range(N):
        itr_results = initiate_itr_dict(outcomes=outcomes) 
        prd_results = initiate_itr_dict(outcomes=outcomes)
 
        for train_index, test_index in sss.split(X):
        
               
                X_train, X_test = X[train_index], X[test_index] 
                y_train, y_test = y[train_index], y[test_index]
                
                if scale == True:
                    # inpute median
                    X_train, X_test = inpute_median(arr_train=X_train, arr_test=X_test)
                    # scale 
                    X_train, X_test = scale_train_features(arr_train=X_train, arr_test= X_test, scaling_strategy='standard_scaler')
    
                if pca != None:
                        #print('Transforming with {} PCs'.format(num_pcs))
                        X_train = pca.transform(X_train)[:,:num_pcs]
                        X_test = pca.transform(X_test)[:,:num_pcs]
            
                for i, outcome in enumerate(outcomes):
            
                    test_pred, test_true = run_dummy(X_train=X_train, X_test=X_test, y_train=y_train[:,i], y_test=y_test[:,i])

                    itr_results[outcome]['test_pred'].append(test_pred)
                    itr_results[outcome]['test_true'].append(test_true)
                    test_pred, test_true = run_L_regression(X_train, X_test, y_train[:,i], y_test[:,i])

                    prd_results[outcome]['test_pred'].append(test_pred)
                    prd_results[outcome]['test_true'].append(test_true)

        dummy_results = evaluate(itr_results=itr_results, out_results=dummy_results, outcomes=outcomes)
        prediction_results = evaluate(itr_results=prd_results, out_results=prediction_results, outcomes=outcomes)
    return dummy_results, prediction_results


def get_linear_results(dummy_results, prediction_results):
    for key in dummy_results.keys():
        print()
        print(key)
        for sub_key in dummy_results[key].keys():
            print('Repeated Dummy:')
            print('mean {} {}: {}(std {}) '.format(sub_key.split('_')[1], sub_key.split('_')[0],
                                    np.round(np.mean(dummy_results[key][sub_key]),3), 
                                      np.round(np.std(dummy_results[key][sub_key]),3)))
            print('Regression:')
            print('mean {} {}: {}(std {}) '.format(sub_key.split('_')[1], sub_key.split('_')[0],
                                    np.round(np.mean(prediction_results[key][sub_key]),3), 
                                      np.round(np.std(prediction_results[key][sub_key]),3))) 
     
def validate_linear(X_train, y_train, X_test, y_test, outcomes, pca=None, num_pcs=None, permut=50, scale=True):
    
    
    if scale == True:
        # inpute median
        X_train, X_test = inpute_median(arr_train=X_train, arr_test=X_test)
        # scale 
        X_train, X_test = scale_train_features(arr_train=X_train, arr_test=X_test, scaling_strategy='standard_scaler')
      
    if pca != None:
        #print('Transforming with {} PCs'.format(num_pcs))
        X_train = pca.transform(X_train)[:,:num_pcs]
        X_test = pca.transform(X_test)[:,:num_pcs]
                    
    print('DUMMY')
    ### baseline model 
    for i, outcome in enumerate(outcomes):

        r2 = []
        mae = []
        corr = []
    
        for _ in range(permut):

            ### single model 
            clf = DummyRegressor(strategy='mean')
    
            clf.fit(X_train, y_train[:,i])
            y_pred = clf.predict(X_test)
    
            r2.append(r2_score(y_pred=y_pred, y_true=y_test[:,i]))                                   
            mae.append(mean_absolute_error(y_pred=y_pred, y_true=y_test[:,i]))
            corr.append(np.corrcoef(y_pred, y_test[:,i])[0,1])
    
        print(outcome)
        print('R2: {}(std {})'.format(np.round(np.mean(r2),3), np.round(np.std(r2),3)))
        print('MAE: {}(std {})'.format(np.round(np.mean(mae),3), np.round(np.std(mae),3)))
        print('corr: {}(std {})'.format(np.round(np.mean(corr),3), np.round(np.std(corr),3)))

 
    print('REGRESSION:')
    ### single model 
    for i, outcome in enumerate(outcomes):

        r2 = []
        mae = []
        corr = []
    
        for _ in range(permut):

            ### single model 
            clf = LinearRegression()
            #clf.set_params(**prediction_params[best_single_model])
            #clf.set_params(**best_single)
    
            clf.fit(X_train, y_train[:,i])
            y_pred = clf.predict(X_test)
    
            r2.append(r2_score(y_pred=y_pred, y_true=y_test[:,i]))                                   
            mae.append(mean_absolute_error(y_pred=y_pred, y_true=y_test[:,i]))
            corr.append(np.corrcoef(y_pred, y_test[:,i])[0,1])
    
        print(outcome)
        print('R2: {}(std {})'.format(np.round(np.nanmean(r2),3), np.round(np.nanstd(r2),3)))
        print('MAE: {}(std {})'.format(np.round(np.nanmean(mae),3), np.round(np.nanstd(mae),3)))
        print('corr: {}(std {})'.format(np.round(np.nanmean(corr),3), np.round(np.nanstd(corr),3)))
     

    

## 2.1 Linear regression

1. IMD as predictor
2. IMD + clinical 
3. WM 
4. WM + IMD 
5. WM + IMD + clinical

In [534]:
features = ['IMDScore']
#features=['IMDScore', 'birth_age', 'Sex_cat', 'MultiPreg', 
#                'Oxygen_days', 'ParenteralNutrition_21d_cat', 
#                'ChronicLungDisease_cat']
#features = ['meanWM_AD', 'meanWM_RD', 'meanWM_MD',
#       'meanWM_FA', 'meanWM_NDI', 'meanWM_ODI']
#features = ['meanWM_AD', 'meanWM_RD', 'meanWM_MD',
#       'meanWM_FA', 'meanWM_NDI', 'meanWM_ODI', 'IMDScore']
#features = ['meanWM_AD', 'meanWM_RD', 'meanWM_MD',
#       'meanWM_FA', 'meanWM_NDI', 'meanWM_ODI', 'IMDScore','birth_age', 'Sex_cat', 'MultiPreg', 
#                'Oxygen_days', 'ParenteralNutrition_21d_cat', 
#                'ChronicLungDisease_cat']

X = df_pred[features].values
y = df_pred[independent].values

print('*** optimisation *** \n')

dummy, prediction = run_linear(X=X, y=y, outcomes=independent, N=N, sss=sss)
get_linear_results(dummy_results=dummy, prediction_results=prediction)


print('*** validation *** \n')
### validation 
## final evaluation 
X_train = df_pred[features].values
y_train = df_pred[independent].values

X_test = df_hold[features].values
y_test = df_hold[independent].values

validate_linear(X_train=X_train, y_train=y_train,
                X_test=X_test, y_test=y_test, 
                outcomes=independent, permut=50)

*** optimisation *** 



  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stdd

  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stdd

  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stdd

  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stdd

  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stdd

  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stdd

  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stdd

  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stdd

  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]



Cognitive-ScaledScore
Repeated Dummy:
mean r2 test: -0.003(std 0.0) 
Regression:
mean r2 test: -0.006(std 0.0) 
Repeated Dummy:
mean mae test: 1.727(std 0.0) 
Regression:
mean mae test: 1.682(std 0.0) 
Repeated Dummy:
mean corr test: nan(std nan) 
Regression:
mean corr test: 0.194(std 0.0) 

ReceptiveCom-ScaledScore
Repeated Dummy:
mean r2 test: -0.058(std 0.0) 
Regression:
mean r2 test: -0.063(std 0.0) 
Repeated Dummy:
mean mae test: 2.818(std 0.0) 
Regression:
mean mae test: 2.758(std 0.0) 
Repeated Dummy:
mean corr test: nan(std nan) 
Regression:
mean corr test: 0.054(std 0.0) 

ExpressiveCom-ScaledScore
Repeated Dummy:
mean r2 test: -0.009(std 0.0) 
Regression:
mean r2 test: -0.005(std 0.0) 
Repeated Dummy:
mean mae test: 2.197(std 0.0) 
Regression:
mean mae test: 2.136(std 0.0) 
Repeated Dummy:
mean corr test: nan(std nan) 
Regression:
mean corr test: 0.175(std 0.0) 

FineMotor-ScaledScore
Repeated Dummy:
mean r2 test: -0.009(std 0.0) 
Regression:
mean r2 test: -0.105(std 0.0) 
R

### GBR optimisation

In [535]:
def run_optimisation(X, y, outcomes, N, sss, space,
                     pca=None, num_pcs=None, itr=15, scale=True):
    
    overall_prediction_results = {}
    prediction_params = {}
    overall_mutitask_results = {}
    multitask_params = {}
    
    for search in range(itr):
    
        prediction_results = initiate_global_dict(outcomes=outcomes)
        multitask_results = initiate_global_dict(outcomes=outcomes)
        
        ### get parameters
        params_single = {}
        for key in space.keys():
            params_single[key] = random.choices(population=space[key], k=1)[0]
        
        params_multi = {}
        for key in space_multi.keys():
            params_multi[key] = random.choices(population=space_multi[key], k=1)[0]
        print('Optimization {}/{}'.format(search, itr))
        
        for _ in range(1):
        
            multi_results = initiate_itr_dict(outcomes=outcomes) 
            prd_results = initiate_itr_dict(outcomes=outcomes)
            
            for train_index, test_index in sss.split(X):
                X_train, X_test = X[train_index], X[test_index] 
                y_train, y_test = y[train_index], y[test_index]
        
                if scale == True:
                    # inpute median
                    X_train, X_test = inpute_median(arr_train=X_train, arr_test=X_test)
                    # scale 
                    X_train, X_test = scale_train_features(arr_train=X_train, arr_test=X_test, scaling_strategy='standard_scaler')
        
                if pca != None:
                    #print('Transforming with {} PCs'.format(num_pcs))
                    X_train = pca.transform(X_train)[:,:num_pcs]
                    X_test = pca.transform(X_test)[:,:num_pcs]
                    
        
                for i, outcome in enumerate(outcomes):
            
                    ### prediction
                    test_pred, test_true = run_regression(X_train, X_test, y_train[:,i], y_test[:,i], dct=params_single)

                    prd_results[outcome]['test_pred'].append(test_pred)
                    prd_results[outcome]['test_true'].append(test_true)
                    
                ### multitask setting
                reg = MultiTaskLasso(**params_multi,max_iter=10000)
                reg.fit(X_train , y_train)
                y_pred = reg.predict(X_test )
                
                for i, outcome in enumerate(outcomes):
                    multi_results[outcome]['test_pred'].append(int(y_pred[0,i]))
                    multi_results[outcome]['test_true'].append(int(y_test[0,i]))
            
            prediction_results = evaluate(itr_results=prd_results, out_results=prediction_results, outcomes=independent)
            multitask_results = evaluate(itr_results=multi_results, out_results=multitask_results, outcomes=independent)
        overall_prediction_results[search] = prediction_results
        overall_mutitask_results[search] = multitask_results
    
        prediction_params[search] = params_single
        multitask_params[search] = params_multi
        
    score = overall_prediction_results[0].keys()[1]
    
    ### get the best
    r = []
    for key in overall_prediction_results:
        r.append(np.mean(overall_prediction_results[key][score]['test_mae']))
    
    best_single_model = np.argmin(np.asarray(r))

    r = []
    for key in overall_mutitask_results:
        r.append(np.mean(overall_mutitask_results[key][score]['test_mae']))
    
    best_multi_model = np.argmin(np.asarray(r))
    
    return overall_prediction_results[best_single_model], overall_mutitask_results[best_multi_model], prediction_params[best_single_model], multitask_params[ best_multi_model]

def get_opt_results(dummy_results, prediction_results):
    for key in dummy_results.keys():
        print()
        print(key)
        for sub_key in dummy_results[key].keys():
            print('Repeated SINGLE:')
            print('mean {} {}: {}(std {}) '.format(sub_key.split('_')[1], sub_key.split('_')[0],
                                    np.round(np.mean(dummy_results[key][sub_key]),3), 
                                      np.round(np.std(dummy_results[key][sub_key]),3)))
            print('Regression MULTI')
            print('mean {} {}: {}(std {}) '.format(sub_key.split('_')[1], sub_key.split('_')[0],
                                    np.round(np.mean(prediction_results[key][sub_key]),3), 
                                      np.round(np.std(prediction_results[key][sub_key]),3))) 

            
def validate_single(X_train, X_test, y_train, y_test, setting, outcomes,
                    pca=None, num_pcs=None, permut = 50, scale=True):
    
    if scale == True:
        # inpute median
        X_train, X_test = inpute_median(arr_train=X_train, arr_test=X_test)
        # scale 
        X_train, X_test = scale_train_features(arr_train=X_train, arr_test=X_test, scaling_strategy='standard_scaler')
    
    if pca != None:
        #print('Transforming with {} PCs'.format(num_pcs))
        X_train = pca.transform(X_train)[:,:num_pcs]
        X_test = pca.transform(X_test)[:,:num_pcs]
                    
    print('SINGLE:')
    for i, outcome in enumerate(outcomes):

        r2 = []
        mae = []
        corr = []
    
        for _ in range(permut):

            ### single model 
            clf = GradientBoostingRegressor(**setting, random_state=None)
    
            clf.fit(X_train, y_train[:,i])
            y_pred = clf.predict(X_test)
    
            r2.append(r2_score(y_pred=y_pred, y_true=y_test[:,i]))                                   
            mae.append(mean_absolute_error(y_pred=y_pred, y_true=y_test[:,i]))
            corr.append(np.corrcoef(y_pred, y_test[:,i])[0,1])
    
        print(outcome)
        print('R2: {}(std {})'.format(np.round(np.nanmean(r2),3), np.round(np.nanstd(r2),3)))
        print('MAE: {}(std {})'.format(np.round(np.nanmean(mae),3), np.round(np.nanstd(mae),3)))
        print('corr: {}(std {})'.format(np.round(np.nanmean(corr),3), np.round(np.nanstd(corr),3)))
     
        
def validate_multi(X_train, X_test, y_train, y_test, setting, outcomes,
                   pca=None, num_pcs=None, permut = 50, scale=True):
    if scale == True:
        # inpute median
        X_train, X_test = inpute_median(arr_train=X_train, arr_test=X_test)
        # scale 
        X_train, X_test = scale_train_features(arr_train=X_train, arr_test=X_test, scaling_strategy='standard_scaler')
     
    if pca != None:
        #print('Transforming with {} PCs'.format(num_pcs))
        X_train = pca.transform(X_train)[:,:num_pcs]
        X_test = pca.transform(X_test)[:,:num_pcs]
                    
    ### multi model
    print('MULTI:')
    results_r2 = {}
    results_mae = {}
    results_corr = {}

    for outcome in outcomes:
        results_r2[outcome] = []
        results_mae[outcome] = []
        results_corr[outcome] = []


    for _ in range(permut):
        clf = MultiTaskLasso(**setting, random_state=None, max_iter=10000)
    
    
        clf.fit(X_train , y_train)
        y_pred = clf.predict(X_test )

        for i, outcome in enumerate(independent):
        
            results_r2[outcome].append(r2_score(y_pred=y_pred[:,i], y_true=y_test[:,i]))
            results_mae[outcome].append(mean_absolute_error(y_pred=y_pred[:,i], y_true=y_test[:,i]))
            results_corr[outcome].append(np.corrcoef(y_pred[:,i], y_test[:,i])[0,1])

        
    for outcome in independent:
        print(outcome)
        print('R2: {}(std {})'.format(np.round(np.nanmean(results_r2[outcome]),3), np.round(np.nanstd(results_r2[outcome]),3)))
        print('MAE: {}(std {})'.format(np.round(np.nanmean(results_mae[outcome]),3), np.round(np.nanstd(results_mae[outcome]),3)))
        print('corr: {}(std {})'.format(np.round(np.nanmean(results_corr[outcome]),3), np.round(np.nanstd(results_corr[outcome]),3)))
    
     

In [536]:
space = {}
space['loss']= ['absolute_error', 'squared_error', 'huber']
space['learning_rate'] = [ 0.001, 0.01, 0.1, 1]
space['n_estimators']= [5,10,25,50]
space['max_depth']= [3,5,10,25]

space_multi = {}
space_multi['alpha'] = [0.001, 0.01, 0.01, 1, 10, 100]
space_multi['fit_intercept'] = [True, False]

In [537]:
features = ['IMDScore']
#features=['IMDScore', 'birth_age', 'Sex_cat', 'MultiPreg', 
#                'Oxygen_days', 'ParenteralNutrition_21d_cat', 
#                'ChronicLungDisease_cat']
#features = ['meanWM_AD', 'meanWM_RD', 'meanWM_MD',
#       'meanWM_FA', 'meanWM_NDI', 'meanWM_ODI']
#features = ['meanWM_AD', 'meanWM_RD', 'meanWM_MD',
#       'meanWM_FA', 'meanWM_NDI', 'meanWM_ODI', 'IMDScore']
#features = ['meanWM_AD', 'meanWM_RD', 'meanWM_MD',
#       'meanWM_FA', 'meanWM_NDI', 'meanWM_ODI', 'IMDScore','birth_age', 'Sex_cat', 'MultiPreg', 
#                'Oxygen_days', 'ParenteralNutrition_21d_cat', 
#                'ChronicLungDisease_cat']

X = df_pred[features].values
y = df_pred[independent].values

print('*** optimisation *** \n')

single, multi, best_single, best_multi  = run_optimisation(X=X, y=y, 
                                                           outcomes=independent, 
                                                           N=N, sss=sss, space=space,
                                                           itr=15)
get_opt_results(dummy_results=single, prediction_results=multi)

print('BEST SINGLE: {}',format(best_single))
print('BEST MULTI: {}',format(best_multi))

X_train = df_pred[features].values
y_train = df_pred[independent].values

X_test = df_hold[features].values
y_test = df_hold[independent].values

validate_single(X_train=X_train, X_test=X_test,
                y_train=y_train, y_test=y_test, 
                setting=best_single, outcomes=independent, permut = 50)

validate_multi(X_train=X_train, X_test=X_test,
                y_train=y_train, y_test=y_test, 
                setting=best_multi, outcomes=independent, permut = 50)

*** optimisation *** 

Optimization 0/15


KeyboardInterrupt: 

## SIMILAR but add PCA before the training

### Linear

In [None]:
#features = ['IMDScore']
#features=['IMDScore', 'birth_age', 'Sex_cat', 'MultiPreg', 
#                'Oxygen_days', 'ParenteralNutrition_21d_cat', 
#                'ChronicLungDisease_cat']
#features = ['meanWM_AD', 'meanWM_RD', 'meanWM_MD',
#       'meanWM_FA', 'meanWM_NDI', 'meanWM_ODI']
#features = ['meanWM_AD', 'meanWM_RD', 'meanWM_MD',
#       'meanWM_FA', 'meanWM_NDI', 'meanWM_ODI', 'IMDScore']
features = ['meanWM_AD', 'meanWM_RD', 'meanWM_MD',
       'meanWM_FA', 'meanWM_NDI', 'meanWM_ODI', 'IMDScore','birth_age', 'Sex_cat', 'MultiPreg', 
                'Oxygen_days', 'ParenteralNutrition_21d_cat', 
                'ChronicLungDisease_cat']

X = df_pred[features].values
y = df_pred[independent].values


X_for_PCA = df_pca[features].values 
X_for_PCA, _ = inpute_median(arr_train=X_for_PCA, arr_test=X_for_PCA)
X_for_PCA, _ = scale_train_features(arr_train=X_for_PCA, arr_test=X_for_PCA, scaling_strategy='standard')
                                    
### PCA 
pcs= min(len(X_for_PCA), len(X_for_PCA[0])) -1 
pca = PCA(n_components=pcs).fit(X_for_PCA)
num_pcs = np.argwhere( np.cumsum(pca.explained_variance_ratio_)>0.95)[0][0] + 1
print('Number of PCs selected: {}'.format(num_pcs))                                           

print('*** optimisation *** \n')

dummy, prediction = run_linear(X=X, y=y, 
                               outcomes=independent, N=N, sss=sss, 
                               pca = pca, num_pcs = num_pcs)
                                    
get_linear_results(dummy_results=dummy, prediction_results=prediction)


print('*** validation *** \n')
### validation 
## final evaluation 
X_train = df_pred[features].values
y_train = df_pred[independent].values

X_test = df_hold[features].values
y_test = df_hold[independent].values

validate_linear(X_train=X_train, y_train=y_train,
                X_test=X_test, y_test=y_test, 
                outcomes=independent, permut=50, 
                pca = pca, num_pcs = num_pcs)

### Optimisation 

In [None]:
space = {}
space['loss']= ['absolute_error', 'squared_error', 'huber']
space['learning_rate'] = [ 0.001, 0.01, 0.1, 1]
space['n_estimators']= [5,10,25,50]
space['max_depth']= [3,5,10,25]

space_multi = {}
space_multi['alpha'] = [0.001, 0.01, 0.01, 1, 10, 100]
space_multi['fit_intercept'] = [True, False]

In [None]:
#features = ['IMDScore']
#features=['IMDScore', 'birth_age', 'Sex_cat', 'MultiPreg', 
#                'Oxygen_days', 'ParenteralNutrition_21d_cat', 
#                'ChronicLungDisease_cat']
#features = ['meanWM_AD', 'meanWM_RD', 'meanWM_MD',
#       'meanWM_FA', 'meanWM_NDI', 'meanWM_ODI']
features = ['meanWM_AD', 'meanWM_RD', 'meanWM_MD',
       'meanWM_FA', 'meanWM_NDI', 'meanWM_ODI', 'IMDScore']
#features = ['meanWM_AD', 'meanWM_RD', 'meanWM_MD',
#       'meanWM_FA', 'meanWM_NDI', 'meanWM_ODI', 'IMDScore','birth_age', 'Sex_cat', 'MultiPreg', 
#                'Oxygen_days', 'ParenteralNutrition_21d_cat', 
#                'ChronicLungDisease_cat']

X = df_pred[features].values
y = df_pred[independent].values

X_for_PCA = df_pca[features].values 
X_for_PCA, _ = inpute_median(arr_train=X_for_PCA, arr_test=X_for_PCA)
X_for_PCA, _ = scale_train_features(arr_train=X_for_PCA, arr_test=X_for_PCA, scaling_strategy='standard')
                                    
### PCA 
pcs= min(len(X_for_PCA), len(X_for_PCA[0])) -1 
pca = PCA(n_components=pcs).fit(X_for_PCA)
num_pcs = np.argwhere( np.cumsum(pca.explained_variance_ratio_)>0.95)[0][0] + 1
print('Number of PCs selected: {}'.format(num_pcs))                                           


print('*** optimisation *** \n')

single, multi, best_single, best_multi  = run_optimisation(X=X, y=y, 
                                                           outcomes=independent, 
                                                           N=N, sss=sss, space=space,
                                                           itr=15, 
                                                          pca = pca, num_pcs=num_pcs)
get_opt_results(dummy_results=single, prediction_results=multi)

print('BEST SINGLE: {}',format(best_single))
print('BEST MULTI: {}',format(best_multi))

X_train = df_pred[features].values
y_train = df_pred[independent].values

X_test = df_hold[features].values
y_test = df_hold[independent].values

validate_single(X_train=X_train, X_test=X_test,
                y_train=y_train, y_test=y_test, 
                setting=best_single, outcomes=independent, permut = 50, 
                pca = pca, num_pcs=num_pcs)

validate_multi(X_train=X_train, X_test=X_test,
                y_train=y_train, y_test=y_test, 
                setting=best_multi, outcomes=independent, permut = 50, 
                pca = pca, num_pcs=num_pcs)

## and finally on BUNDLES 

In [None]:
pair_names = [
    'M1-Brainstem', 'S1-Brainstem','Paracentral-Brainstem',
    'M1-Caud', 'S1-Caud','Paracentral-Caud',
    'M1-Lenti', 'S1-Lenti','Paracentral-Lenti', 
    'M1-Thalfus', 'S1-Thalfus','Paracentral-Thalfus', 
    'M1L-M1R', 'S1L-S1R',
    'S1-M1' ]

metrics = ['AD', 'RD', 'FA', 'MD', 'NDI', 'ODI']

In [None]:
## features 
features = [col + '_' + metric for col in pair_names for metric in metrics]
features.extend(['IMDScore', 'birth_age', 'Sex_cat', 'MultiPreg', 
                'Oxygen_days', 'ParenteralNutrition_21d_cat', 
                'ChronicLungDisease_cat'])

In [None]:
# ===== LINEAR ==== # 

X = df_pred[features].values
y = df_pred[independent].values


X_for_PCA = df_pca[features].values 
X_for_PCA, _ = inpute_median(arr_train=X_for_PCA, arr_test=X_for_PCA)
X_for_PCA, _ = scale_train_features(arr_train=X_for_PCA, arr_test=X_for_PCA, scaling_strategy='standard')
                                    
### PCA 
pcs= min(len(X_for_PCA), len(X_for_PCA[0])) -1 
pca = PCA(n_components=pcs).fit(X_for_PCA)
num_pcs = np.argwhere( np.cumsum(pca.explained_variance_ratio_)>0.95)[0][0] + 1
print('Number of PCs selected: {}'.format(num_pcs))                                           

print('*** optimisation *** \n')

dummy, prediction = run_linear(X=X, y=y, 
                               outcomes=independent, N=N, sss=sss, 
                               pca = pca, num_pcs = num_pcs)
                                    
get_linear_results(dummy_results=dummy, prediction_results=prediction)


print('*** validation *** \n')
### validation 
## final evaluation 
X_train = df_pred[features].values
y_train = df_pred[independent].values

X_test = df_hold[features].values
y_test = df_hold[independent].values

validate_linear(X_train=X_train, y_train=y_train,
                X_test=X_test, y_test=y_test, 
                outcomes=independent, permut=50, 
                pca = pca, num_pcs = num_pcs)

In [None]:
X = df_pred[features].values
y = df_pred[independent].values

X_for_PCA = df_pca[features].values 
X_for_PCA, _ = inpute_median(arr_train=X_for_PCA, arr_test=X_for_PCA)
X_for_PCA, _ = scale_train_features(arr_train=X_for_PCA, arr_test=X_for_PCA, scaling_strategy='standard')
                                    
### PCA 
pcs= min(len(X_for_PCA), len(X_for_PCA[0])) -1 
pca = PCA(n_components=pcs).fit(X_for_PCA)
num_pcs = np.argwhere( np.cumsum(pca.explained_variance_ratio_)>0.95)[0][0] + 1
print('Number of PCs selected: {}'.format(num_pcs))                                           


print('*** optimisation *** \n')

single, multi, best_single, best_multi  = run_optimisation(X=X, y=y, 
                                                           outcomes=independent, 
                                                           N=N, sss=sss, space=space,
                                                           itr=15, 
                                                          pca = pca, num_pcs=num_pcs)
get_opt_results(dummy_results=single, prediction_results=multi)

print('BEST SINGLE: {}',format(best_single))
print('BEST MULTI: {}',format(best_multi))

X_train = df_pred[features].values
y_train = df_pred[independent].values

X_test = df_hold[features].values
y_test = df_hold[independent].values

validate_single(X_train=X_train, X_test=X_test,
                y_train=y_train, y_test=y_test, 
                setting=best_single, outcomes=independent, permut = 50, 
                pca = pca, num_pcs=num_pcs)

validate_multi(X_train=X_train, X_test=X_test,
                y_train=y_train, y_test=y_test, 
                setting=best_multi, outcomes=independent, permut = 50, 
                pca = pca, num_pcs=num_pcs)

In [None]:
## features 
sets =  [['AD', 'RD'], ['FA', 'MD'], ['NDI', 'ODI']]

features1 = [col + '_' + metric for col in pair_names for metric in sets[0]]
features2 = [col + '_' + metric for col in pair_names for metric in sets[1]]
features3 = [col + '_' + metric for col in pair_names for metric in sets[2]]

features4 = 'IMDScore'

#features4 = ['IMDScore', 'birth_age', 'Sex_cat', 'MultiPreg', 
#                'Oxygen_days', 'ParenteralNutrition_21d_cat', 
#                'ChronicLungDisease_cat']

In [None]:
X = np.zeros((len(df_pred),len(df_pred.columns)))
y =  df_pred[independent].values

y_train = y 

X_test = np.zeros((len(df_hold),len(df_hold.columns)))
y_test = df_hold[independent].values

cum_pcs = 0
for ft in [features1, features2, features3]:
    X_for_PCA =  df_pca[ft].values
    X_for_PCA, _, mds = inpute_median(arr_train=X_for_PCA, arr_test=X_for_PCA, return_medians=True)
    X_for_PCA, _, scl = scale_train_features(arr_train=X_for_PCA, arr_test=X_for_PCA, scaling_strategy='standard', return_scaler=True)
    
    ### PCA 
    pcs= min(len(X_for_PCA), len(X_for_PCA[0])) -1 
    pca = PCA(n_components=pcs).fit(X_for_PCA)
    num_pcs = np.argwhere( np.cumsum(pca.explained_variance_ratio_)>0.95)[0][0] + 1
    print('Number of PCs selected: {}'.format(num_pcs))
    
    Xpt = df_pred[ft].values
    Xtst = df_hold[ft].values
    
    for col in range(len(ft)):
        Xpt[:,col][np.where(np.isnan(Xpt[:,col]))] = mds[col]
        Xtst[:,col][np.where(np.isnan(Xtst[:,col]))] = mds[col]
        
    Xpt = scl.transform(Xpt)
    Xpt = pca.transform(Xpt)[:,:num_pcs]
    
    Xtst = scl.transform(Xtst)
    Xtst = pca.transform(Xtst)[:,:num_pcs]
    
    #print(np.shape(Xpt))

    X[:, cum_pcs: cum_pcs+ num_pcs] = Xpt
    X_test[:, cum_pcs: cum_pcs+ num_pcs] = Xtst
    
    cum_pcs = cum_pcs+ num_pcs
 




In [None]:
if features4 == 'IMDScore':
    
    X[:,cum_pcs] = df_pred[features4].values
    md_IMD = np.nanmedian(df_pred[features4].values)

    X[:,cum_pcs][np.where(np.isnan(X[:,cum_pcs]))] = md_IMD
    X = X[:, :cum_pcs+1]



    X_test[:, cum_pcs] = df_hold[features4].values
    X_test[:,cum_pcs][np.where(np.isnan(X_test[:,cum_pcs]))] = md_IMD
    X_test = X_test[:, :cum_pcs+1]

    sc = StandardScaler()
    sc.fit(X)

    X = sc.transform(X)
    X_test = sc.transform(X_test)
    X_train = X
    
else: 
    
    X = X[:,:cum_pcs]
    X_train = X
    X_test = X_test[:, :cum_pcs]
                               
### 2nd level PCA 
pcs= min(len(X), len(X[0])) -1 
pca = PCA(n_components=pcs).fit(X)
num_pcs = np.argwhere( np.cumsum(pca.explained_variance_ratio_)>0.95)[0][0] + 1
print('Overall: Number of PCs selected: {}'.format(num_pcs)) 

In [None]:
# ===== LINEAR ==== # 
                                     
print('*** optimisation *** \n')

dummy, prediction = run_linear(X=X, y=y, 
                               outcomes=independent, N=N, sss=sss, pca =pca, num_pcs=num_pcs, scale=False)
                                    
get_linear_results(dummy_results=dummy, prediction_results=prediction)

print('*** validation *** \n')
### validation 

validate_linear(X_train=X_train, y_train=y_train,
                X_test=X_test, y_test=y_test, 
                outcomes=independent, permut=50, pca =pca, num_pcs=num_pcs, scale=False)

In [None]:
                                        
print('*** optimisation *** \n')

single, multi, best_single, best_multi  = run_optimisation(X=X, y=y, 
                                                           outcomes=independent, 
                                                           N=N, sss=sss, space=space,
                                                           itr=15, 
                                                          pca = pca, num_pcs=num_pcs, 
                                                          scale= False)
get_opt_results(dummy_results=single, prediction_results=multi)

print('BEST SINGLE: {}',format(best_single))
print('BEST MULTI: {}',format(best_multi))



validate_single(X_train=X_train, X_test=X_test,
                y_train=y_train, y_test=y_test, 
                setting=best_single, outcomes=independent, permut = 50, 
                pca = pca, num_pcs=num_pcs, scale=False)

validate_multi(X_train=X_train, X_test=X_test,
                y_train=y_train, y_test=y_test, 
                setting=best_multi, outcomes=independent, permut = 50, 
                pca = pca, num_pcs=num_pcs, scale=False)

In [None]:
scores = ['Cognitive-ScaledScore',
       'ReceptiveCom-ScaledScore', 'ExpressiveCom-ScaledScore',
       'FineMotor-ScaledScore', 'GrossMotor-ScaledScore']
for pair in pair_names:
    for metric in metrics:
        bundle = pair+'_'+metric
        for outcome in scores:
            c = np.corrcoef(df_sub[bundle], df_sub[outcome])[0,1]
        
            if c > 0.25:
                print(outcome, bundle, c)

In [None]:
res = {}
res['Motor'] = []
res['Language'] = []
res['Cognitive'] = []


for _ in range(1000):
  for i in range(len(y_test[0])):

    mean_ar =  np.random.normal( np.nanmean(y_train[:,i]),1,len(y_test))
    #print(independent[i])
    #print(np.corrcoef(y_test[:,i], mean_ar)[0,1])
    
    res[independent[i]].append(np.corrcoef(y_test[:,i], mean_ar)[0,1])

In [None]:
for key in res.keys():
    print('{}: {}(std {})'.format(key, np.round(np.mean(res[key]),3),np.round(np.std(res[key]),3) ))