### Data cleaning
This notebook aims to:
1. remove bundles containing subthalamus.
2. remove cortico-cortical bundles involving Paracentral
3. merge left and right bundles into the mean (effect of hemi and group not significant in ANCOVA analysis) 
4. Additionally, once we decide which NODDI metrics to keep (pre or post), drop the other set here as well. 


In [1]:
import pandas as pd 
import numpy as np 

In [6]:
# Left/right bundle pairs that are averaged into a single bundle further
# down. Each entry is [left-hemisphere name, right-hemisphere name]; the
# names are combined with a metric suffix to look up columns in the tables.
pairs = [
    # M1 bundles
    ['M1L-Brainstem', 'M1R-Brainstem'],
    ['M1L-CaudL', 'M1R-CaudR'],
    ['M1L-LentiL', 'M1R-LentiR'],
    ['M1L-ThalfusL', 'M1R-ThalfusR'],
    # Paracentral bundles
    ['ParacentralL-Brainstem', 'ParacentralR-Brainstem'],
    ['ParacentralL-CaudL', 'ParacentralR-CaudR'],
    ['ParacentralL-LentiL', 'ParacentralR-LentiR'],
    ['ParacentralL-ThalfusL', 'ParacentralR-ThalfusR'],
    # S1 bundles
    ['S1L-Brainstem', 'S1R-Brainstem'],
    ['S1L-CaudL', 'S1R-CaudR'],
    ['S1L-LentiL', 'S1R-LentiR'],
    ['S1L-ThalfusL', 'S1R-ThalfusR'],
    # S1-M1 bundle
    ['S1L-M1L', 'S1R-M1R'],
]

In [7]:
# Read in the per-group metric tables saved as csv (first csv column -> index).
controls = pd.read_csv('../../DerivedData/extracted_diffusion_metrics_control_group.csv', index_col=0)
preterms = pd.read_csv('../../DerivedData/extracted_diffusion_metrics_preterm_group.csv', index_col=0)

# Metric suffixes that are averaged across hemispheres for every entry of `pairs`.
metrics = ['FA', 'MD', 'RD', 'AD', 'NDI_post', 'NDI_pre', 'ODI_post', 'ODI_pre']

# Cortico-cortical bundles involving Paracentral; any column whose name
# contains one of these strings is dropped.
paracentral_bundles = ['M1R-ParacentralR', 'S1L-ParacentralL', 'ParacentralL-ParacentralR',
                       'M1L-ParacentralL', 'S1R-ParacentralR']


def merge_left_right(frame, bundle_pairs, metric_names):
    """Replace left/right bundle columns by their element-wise mean.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding one column per '<bundle>_<metric>' combination.
    bundle_pairs : list of [left_name, right_name]
        Bundle names to merge; both '<left_name>_<metric>' and
        '<right_name>_<metric>' must exist in ``frame`` for every metric.
    metric_names : list of str
        Metric suffixes to merge for each pair.

    Returns
    -------
    pandas.DataFrame
        New frame: the columns of ``frame`` that were not part of any pair
        (original order), followed by the averaged columns in
        pair-by-metric order. ``frame`` itself is not modified.

    Raises
    ------
    KeyError
        If an expected '<bundle>_<metric>' column is missing.
    """
    merged = {}
    merged_away = []
    for left, right in bundle_pairs:
        # Hemisphere-free label: 'M1L-CaudL' -> 'M1-Caud',
        # 'S1L-Brainstem' -> 'S1-Brainstem'.
        label = left.replace('L-', '-')
        if label.endswith('L'):
            label = label[:-1]

        for metric in metric_names:
            left_col = left + '_' + metric
            right_col = right + '_' + metric
            # .values: plain positional average of the two columns.
            merged[label + '_' + metric] = (frame[left_col].values + frame[right_col].values) / 2
            merged_away.extend([left_col, right_col])

    # Build all averaged columns at once and append them in a single concat,
    # rather than inserting/dropping columns one by one.
    merged_frame = pd.DataFrame(merged, index=frame.index)
    return pd.concat([frame.drop(columns=merged_away), merged_frame], axis=1)


# 1. remove subthalamus
# 2. remove cortico-cortical bundles involving Paracentral
cols_to_keep = [col for col in controls.columns
                if 'Subthal' not in col
                and not any(bundle in col for bundle in paracentral_bundles)]

# The column list is derived from the control table and applied to both groups.
# .copy() gives independent frames instead of selections of the raw tables,
# which pandas may otherwise flag (SettingWithCopyWarning) when modified.
controls = controls[cols_to_keep].copy()
preterms = preterms[cols_to_keep].copy()

# 3. remove asymmetries: merge left and right bundles into their mean
controls = merge_left_right(controls, pairs, metrics)
preterms = merge_left_right(preterms, pairs, metrics)

# 4. remove the 'pre' NODDI metrics, keeping only the 'post' ones
cols_to_keep = [col for col in controls.columns if '_pre' not in col]
controls = controls[cols_to_keep].copy()
preterms = preterms[cols_to_keep].copy()

# rename columns: strip the now redundant '_post' suffix (literal match, not a regex)
controls.columns = controls.columns.str.replace('_post', '', regex=False)
preterms.columns = preterms.columns.str.replace('_post', '', regex=False)

controls.to_csv('../../DerivedData/extracted_diffusion_metrics_control_group_mergedLR.csv')
preterms.to_csv('../../DerivedData/extracted_diffusion_metrics_preterm_group_mergedLR.csv')