In [63]:
import os
import pandas as pd
import glob 

In [64]:
import scipy
import numpy as np
import scipy.io as spio


def loadmat(filename):
    '''
    Load a MATLAB ``.mat`` file and turn its structs into plain Python containers.

    Call this instead of ``spio.loadmat`` directly: the raw loader returns
    MATLAB structs as ``mat_struct`` objects, which are awkward to traverse.
    This wrapper converts them to nested dictionaries.

    Parameters
    ----------
    filename : str
        Path of the ``.mat`` file; passed unchanged to ``spio.loadmat``.

    Returns
    -------
    dict
        The dictionary produced by ``spio.loadmat`` (called with
        ``struct_as_record=False, squeeze_me=True``) in which every top-level
        ``mat_struct`` value has been replaced by a nested dict. Inside a
        converted struct, ndarrays (cell arrays and numeric arrays alike)
        become nested lists. Top-level values that are not structs are
        returned exactly as the loader produced them.
    '''
    # Newer SciPy releases expose the struct class as scipy.io.matlab.mat_struct
    # and deprecate the old ``mio5_params`` module path; older releases only
    # have the latter. Resolve it once so the helpers work with both.
    mat_struct = getattr(spio.matlab, 'mat_struct', None)
    if mat_struct is None:
        mat_struct = spio.matlab.mio5_params.mat_struct

    def _check_keys(d):
        '''
        Replace every top-level mat_struct value of ``d`` by a nested dict.
        The dictionary is modified in place and returned.
        '''
        for key in d:
            if isinstance(d[key], mat_struct):
                d[key] = _todict(d[key])
        return d

    def _todict(matobj):
        '''
        Recursively build a nested dictionary from a mat_struct object.
        '''
        d = {}
        for strg in matobj._fieldnames:
            elem = matobj.__dict__[strg]
            if isinstance(elem, mat_struct):
                d[strg] = _todict(elem)
            elif isinstance(elem, np.ndarray):
                d[strg] = _tolist(elem)
            else:
                d[strg] = elem
        return d

    def _tolist(ndarray):
        '''
        Recursively build a list from an ndarray (cell arrays are loaded as
        ndarrays), converting any mat_struct elements to dictionaries and any
        nested ndarrays to nested lists.
        '''
        elem_list = []
        for sub_elem in ndarray:
            if isinstance(sub_elem, mat_struct):
                elem_list.append(_todict(sub_elem))
            elif isinstance(sub_elem, np.ndarray):
                elem_list.append(_tolist(sub_elem))
            else:
                elem_list.append(sub_elem)
        return elem_list

    data = spio.loadmat(filename, struct_as_record=False, squeeze_me=True)
    return _check_keys(data)

In [65]:
# Directory that holds the DICOM parameter CSVs read by the following cells.
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
homedir='/media/raghuram/My Passport/dicom_seg/TCGA-LGG'

In [66]:
# Make homedir the working directory so the relative CSV paths used below resolve against it.
os.chdir(homedir)

In [67]:
# Table of DICOM acquisition parameters (field strength, scanner, TR, sequence name, file path, ...).
dicom_file_params_df=pd.read_csv('dicom_file_params.csv')

In [68]:
# Sanity check: (rows, columns) of the loaded parameter table.
dicom_file_params_df.shape

(24398, 9)

In [69]:
# Lookup tables, one per MR contrast; they are used to map a DICOM scanning
# sequence name onto a canonical sequence label. Files are read in the order listed.
t1_post_samples_df, t1_pre_samples_df, flair_samples, t2_pre_samples = (
    pd.read_csv(csv_name)
    for csv_name in (
        't1_post_samples.csv',
        't1_pre_samples.csv',
        'flair_samples.csv',
        't2_pre_samples.csv',
    )
)

In [None]:
# t1_post_samples_df['Sequence Name']

In [70]:
# Canonical sequence label -> raw contents of the matching lookup table.
# `.values` is the table's full 2-D array, so a membership test against it
# succeeds when ANY cell of the table equals the value being looked up.
sequence_mapper_dict = dict(
    T1CE=t1_post_samples_df.values,
    T1W=t1_pre_samples_df.values,
    T2W=t2_pre_samples.values,
    T2F=flair_samples.values,
)

In [71]:
# Give every row of dicom_file_params_df one canonical sequence label
# (T1CE / T1W / T2W / T2F) based on its 'scanning_seq_mri' value.
# The list must end up with exactly one entry per row, because it is assigned
# as a new column afterwards; rows that match no lookup table, or whose lookup
# raises, therefore get None instead of being skipped.
mapped_sequence = []
for idx, row in dicom_file_params_df.iterrows():
    label = None
    try:
        # Precedence matters: the first table that contains the name wins.
        for candidate in ('T1CE', 'T1W', 'T2W', 'T2F'):
            if row['scanning_seq_mri'] in sequence_mapper_dict[candidate]:
                label = candidate
                break
    except Exception as e:
        print('{}, {} in row {}'.format(e, row['scanning_seq_mri'], idx))
    mapped_sequence.append(label)

# Surface unmapped rows instead of letting them pass unnoticed.
unmapped_count = sum(label is None for label in mapped_sequence)
if unmapped_count:
    print('{} rows could not be mapped to a sequence label'.format(unmapped_count))

In [72]:
# Attach the per-row sequence label and persist the table.
# NOTE(review): this overwrites the same 'dicom_file_params.csv' that was read as
# input, so a later run starts from the already-augmented file, not the raw one.
dicom_file_params_df['mat_file_sequence'] = mapped_sequence
dicom_file_params_df.to_csv('dicom_file_params.csv', index=False)

In [73]:
# Extract the patient identifier from each DICOM path: it is the first path
# component that follows '/TCGA-LGG'.
# The list is assigned as a column afterwards, so it needs exactly one entry
# per row; a path that cannot be parsed yields None instead of being skipped.
patient_name_list = []
for idx, row in dicom_file_params_df.iterrows():
    try:
        patient_name = row['filename'].split('/TCGA-LGG')[1].split('/')[1]
    except Exception as e:
        print('Error {} at index {}'.format(e, idx))
        patient_name = None
    patient_name_list.append(patient_name)

In [74]:
# Attach the patient identifier and persist the table.
# NOTE(review): overwrites the input 'dicom_file_params.csv' in place again.
dicom_file_params_df['patient_name'] = patient_name_list
dicom_file_params_df.to_csv('dicom_file_params.csv', index=False)

In [79]:
# Folder containing the texture .mat files.
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
mat_files_dir = '/home/raghuram/Desktop/radiomics/TEXTURES/mat_folder'

In [80]:
# From here on, relative paths (globs, loadmat, to_csv) resolve against mat_files_dir.
os.chdir(mat_files_dir)

In [81]:
# All .mat files in the working directory; glob does not sort its results.
mat_files_list = glob.glob('*.mat')

In [89]:
# Number of .mat files whose name ends in '_T1CE.mat'.
len(glob.glob('*_T1CE.mat'))

106

In [90]:
# Load the first .mat file of the list into `data`.
data = loadmat(mat_files_list[0])

In [91]:
# Show the last file name to check the '<patient>_<sequence>.mat' naming pattern.
mat_files_list[-1 ]

'TCGA-CS-5394_T2W.mat'

In [92]:
# Name of the .mat file each row corresponds to: '<patient_name>_<mat_file_sequence>.mat'.
dicom_file_params_df['mat_file_name'] = (
    dicom_file_params_df['patient_name']
    + '_'
    + dicom_file_params_df['mat_file_sequence']
    + '.mat'
)

In [93]:
# Keep a single row per .mat file name (modifies the frame in place).
# NOTE(review): keep='first' means the parameters of whichever row comes first
# in the table stand in for the whole series.
dicom_file_params_df.drop_duplicates(subset=['mat_file_name'], keep='first', inplace=True)

In [94]:
# 'patient_name' was only needed to build 'mat_file_name'; remove it in place.
# NOTE(review): in-place drop - re-running this cell alone raises because the column is already gone.
dicom_file_params_df.drop(columns=['patient_name'], inplace=True)

In [95]:
# Persist the de-duplicated table; the relative path resolves against the
# current working directory (mat_files_dir after the chdir above).
dicom_file_params_df.to_csv('dicom_file_params_df_with_mat_files.csv', index=False)

In [96]:
# .mat files found on disk that have no matching row in the DICOM parameter table.
missing = set(mat_files_list) - set(dicom_file_params_df['mat_file_name'])

In [97]:
# missing

In [98]:
# Show the first .mat file name in the list.
mat_files_list[0]

'TCGA-CS-5393_T1CE.mat'

In [99]:
# Preview the table after de-duplication (one row per .mat file name).
dicom_file_params_df.head()

Unnamed: 0,mag_field_strength,scanner_manufacturer,scanner_model,repetition_time,excitation_time,scanning_seq_mri,filename,mat_file_sequence,mat_file_name
0,1.5,GE MEDICAL SYSTEMS,GENESIS_SIGNA,600.0,9.0,AXIAL T1,/media/raghuram/My Passport/dicom_seg/TCGA-LGG...,T1W,TCGA-CS-4941_T1W.mat
23,1.5,GE MEDICAL SYSTEMS,GENESIS_SIGNA,3000.0,30.0,AXIAL T2,/media/raghuram/My Passport/dicom_seg/TCGA-LGG...,T2W,TCGA-CS-4941_T2W.mat
73,1.5,GE MEDICAL SYSTEMS,GENESIS_SIGNA,10002.0,157.5,AXIAL FLAIR,/media/raghuram/My Passport/dicom_seg/TCGA-LGG...,T2F,TCGA-CS-4941_T2F.mat
101,1.5,GE MEDICAL SYSTEMS,GENESIS_SIGNA,650.0,9.0,AXIAL T1 POST GAD FATSAT,/media/raghuram/My Passport/dicom_seg/TCGA-LGG...,T1CE,TCGA-CS-4941_T1CE.mat
126,3.0,Philips Medical Systems,Intera Achieva,500.000214,10.0,T1 AX SE,/media/raghuram/My Passport/dicom_seg/TCGA-LGG...,T1W,TCGA-CS-4942_T1W.mat


In [100]:
def extract_flatten_features(mat_data, filename, params_df=None, max_experiment=25):
    '''
    Flatten the texture features of one loaded .mat file into a CSV.

    For every experiment listed in ``mat_data['textures']['List']`` whose
    number is at most ``max_experiment``, the nested feature dictionary
    ``mat_data['textures'][<experiment>]`` is flattened into one row (nested
    keys joined with '_'), merged with the acquisition parameters of
    ``filename`` and tagged with the experiment settings. All rows are
    written to ``<filename without extension>_features.csv`` in the current
    working directory.

    Parameters
    ----------
    mat_data : dict
        Result of ``loadmat`` for the file; must contain ``'textures'`` with
        a ``'List'`` mapping of ``'Experiment<N>'`` to a comma-separated
        settings string whose first three items are ``<name>=<value>`` pairs
        for scale (float), algorithm (str) and number of grey levels (int).
    filename : str
        Name of the .mat file; used as the merge key against
        ``params_df['mat_file_name']`` and to derive the output file name.
    params_df : pandas.DataFrame, optional
        Acquisition-parameter table with a ``'mat_file_name'`` column.
        Defaults to the notebook-level ``dicom_file_params_df``.
    max_experiment : int, optional
        Highest experiment number to keep (default 25).

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If no experiment with number <= ``max_experiment`` is present.

    Notes
    -----
    The merge is an inner join, so a ``filename`` that is absent from
    ``params_df`` produces a CSV with a header but no rows.
    '''
    if params_df is None:
        # Fall back to the table built earlier in the notebook.
        params_df = dicom_file_params_df

    # pd.io.json.json_normalize has been deprecated since pandas 1.0 in favour
    # of the top-level pd.json_normalize; keep the old path only as a fallback.
    normalize = getattr(pd, 'json_normalize', None)
    if normalize is None:
        normalize = pd.io.json.json_normalize

    # Read from the argument, not from a notebook global, so the function
    # always processes the file it was handed.
    textures = mat_data['textures']

    features_flattened = []
    for experiment_, values in textures['List'].items():
        experiment_number = int(experiment_.split('Experiment')[1])
        if experiment_number > max_experiment:
            # Skip rather than stop: the keys are not guaranteed to be ordered.
            continue

        settings = values.split(',')
        scale_ = float(settings[0].split('=')[1])
        algo_ = settings[1].split('=')[1]
        ng_ = int(settings[2].split('=')[1])

        flattened_df = normalize(textures[experiment_], sep='_')
        flattened_df['mat_file_name'] = filename
        flattened_df_merged = pd.merge(flattened_df, params_df, on='mat_file_name', how='inner')
        flattened_df_merged['experiment_number'] = experiment_number
        flattened_df_merged['scale'] = scale_
        flattened_df_merged['algo'] = algo_
        flattened_df_merged['ng'] = ng_
        features_flattened.append(flattened_df_merged)

    if not features_flattened:
        raise ValueError(
            'No experiments numbered <= {} found in {}'.format(max_experiment, filename)
        )

    features_df_concat = pd.concat(features_flattened, ignore_index=True)
    # splitext removes only the final extension, so dots elsewhere in the
    # path do not truncate the output name.
    features_df_concat.to_csv(os.path.splitext(filename)[0]+'_features'+'.csv', index=False)

In [101]:
# extract_flatten_features(data)

In [102]:
# Convert every .mat file that has a row in the DICOM table into a features
# CSV; files listed in `missing` are only reported.
for idx, mat_file in enumerate(mat_files_list):
    if mat_file not in missing:
        print('Mat file {} at index {} of total length {} being processed'.format(mat_file, idx, len(mat_files_list)))
        data = loadmat(mat_file)
        extract_flatten_features(data, mat_file)
    else:
        print('{} not in dicom df, skipping'.format(mat_file))

Mat file TCGA-CS-5393_T1CE.mat at index 0 of total length 408 being processed
Mat file TCGA-FG-6690_T2F.mat at index 1 of total length 408 being processed
TCGA-CS-4944_T1CE.mat not in dicom df, skipping
Mat file TCGA-HT-7688_T2W.mat at index 3 of total length 408 being processed
Mat file TCGA-CS-4941_T1W.mat at index 4 of total length 408 being processed
Mat file TCGA-CS-5397_T2W.mat at index 5 of total length 408 being processed
Mat file TCGA-HT-7473_T1CE.mat at index 6 of total length 408 being processed
Mat file TCGA-FG-6689_T1W.mat at index 7 of total length 408 being processed
TCGA-DU-7008_T2F.mat not in dicom df, skipping
TCGA-FG-6690_T1CE.mat not in dicom df, skipping
Mat file TCGA-DU-5853_T1CE.mat at index 10 of total length 408 being processed
TCGA-HT-A5Rb_T1CE.mat not in dicom df, skipping
Mat file TCGA-DU-8167_T2F.mat at index 12 of total length 408 being processed
Mat file TCGA-HT-7882_T2F.mat at index 13 of total length 408 being processed
Mat file TCGA-DU-8163_T2W.mat at 

Mat file TCGA-DU-A5TW_T2F.mat at index 115 of total length 408 being processed
Mat file TCGA-HT-7694_T1CE.mat at index 116 of total length 408 being processed
Mat file TCGA-HT-7694_T1W.mat at index 117 of total length 408 being processed
Mat file TCGA-HT-7874_T2W.mat at index 118 of total length 408 being processed
Mat file TCGA-FG-6691_T2F.mat at index 119 of total length 408 being processed
Mat file TCGA-CS-6669_T2F.mat at index 120 of total length 408 being processed
Mat file TCGA-DU-6410_T1W.mat at index 121 of total length 408 being processed
Mat file TCGA-HT-7882_T2W.mat at index 122 of total length 408 being processed
Mat file TCGA-DU-5849_T2W.mat at index 123 of total length 408 being processed
Mat file TCGA-DU-6542_T2W.mat at index 124 of total length 408 being processed
TCGA-DU-6410_T1CE.mat not in dicom df, skipping
TCGA-DU-6400_T1W.mat not in dicom df, skipping
Mat file TCGA-HT-8111_T2F.mat at index 127 of total length 408 being processed
Mat file TCGA-CS-5393_T2F.mat at in

Mat file TCGA-FG-6691_T2W.mat at index 227 of total length 408 being processed
Mat file TCGA-CS-4944_T2F.mat at index 228 of total length 408 being processed
Mat file TCGA-HT-7475_T2F.mat at index 229 of total length 408 being processed
TCGA-CS-4942_T1CE.mat not in dicom df, skipping
Mat file TCGA-HT-7468_T1W.mat at index 231 of total length 408 being processed
Mat file TCGA-HT-7686_T2W.mat at index 232 of total length 408 being processed
Mat file TCGA-HT-7680_T2W.mat at index 233 of total length 408 being processed
Mat file TCGA-HT-7874_T2F.mat at index 234 of total length 408 being processed
Mat file TCGA-DU-8165_T1CE.mat at index 235 of total length 408 being processed
Mat file TCGA-HT-7604_T2W.mat at index 236 of total length 408 being processed
Mat file TCGA-HT-7693_T2W.mat at index 237 of total length 408 being processed
Mat file TCGA-HT-7602_T1W.mat at index 238 of total length 408 being processed
TCGA-FG-6690_T1W.mat not in dicom df, skipping
TCGA-CS-5390_T1CE.mat not in dicom 

Mat file TCGA-HT-7874_T1W.mat at index 337 of total length 408 being processed
Mat file TCGA-FG-6689_T2F.mat at index 338 of total length 408 being processed
Mat file TCGA-DU-8164_T2F.mat at index 339 of total length 408 being processed
Mat file TCGA-FG-7643_T2W.mat at index 340 of total length 408 being processed
Mat file TCGA-HT-7604_T1CE.mat at index 341 of total length 408 being processed
Mat file TCGA-DU-6542_T1CE.mat at index 342 of total length 408 being processed
Mat file TCGA-HT-7475_T1W.mat at index 343 of total length 408 being processed
Mat file TCGA-CS-6665_T2F.mat at index 344 of total length 408 being processed
Mat file TCGA-DU-8162_T1W.mat at index 345 of total length 408 being processed
Mat file TCGA-HT-8563_T2W.mat at index 346 of total length 408 being processed
Mat file TCGA-HT-7860_T1W.mat at index 347 of total length 408 being processed
Mat file TCGA-DU-8166_T2F.mat at index 348 of total length 408 being processed
Mat file TCGA-DU-7306_T2F.mat at index 349 of tota

In [134]:
# Switch to the folder holding the per-file '*_features.csv' tables.
# NOTE(review): extract_flatten_features writes its CSVs relative to the working
# directory in effect when it runs; nothing visible in this notebook moves them
# into csv_folder - confirm how they get there.
homedir = '/home/raghuram/Desktop/radiomics/TEXTURES/csv_folder'
os.chdir(homedir)

# One list of feature CSVs per sequence label (glob does not sort its results).
t1_list = glob.glob('*_T1W_features.csv')
t2_list = glob.glob('*_T2W_features.csv')
t1ce_list = glob.glob('*_T1CE_features.csv')
flair_list = glob.glob('*_T2F_features.csv')

In [162]:
def create_npy_files(sequence_list, sequence_name, max_repetition_time=800,
                     experiment_numbers=range(1, 26)):
    '''
    Write one pooled CSV per experiment for a single MR sequence.

    Despite its name, this function writes CSV files, not ``.npy`` files.
    For each experiment number, the rows of that experiment are collected from
    every file in ``sequence_list``, rows whose ``repetition_time`` exceeds
    ``max_repetition_time`` are removed, and the result is saved as
    ``seq_file_<experiment>_<sequence_name>.csv`` in the current working
    directory. The shape before and after the filter is printed.

    Parameters
    ----------
    sequence_list : iterable of str
        Paths of per-file feature CSVs; each must contain the columns
        ``experiment_number`` and ``repetition_time``.
    sequence_name : str
        Label used in the output file names (e.g. 'T1CE').
    max_repetition_time : float or None, optional
        Rows with a larger ``repetition_time`` are dropped (default 800).
        Rows whose ``repetition_time`` is missing are dropped as well, since
        the comparison is False for NaN. Pass None to disable the filter.
    experiment_numbers : iterable of int, optional
        Experiment numbers to export (default 1..25).

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``sequence_list`` is empty.
    '''
    # Read every input once; the files do not change between experiments.
    frames = [pd.read_csv(seq_file) for seq_file in sequence_list]
    if not frames:
        raise ValueError('sequence_list is empty; nothing to export for {}'.format(sequence_name))

    for expt_num in experiment_numbers:
        df_list = [frame[frame['experiment_number'] == expt_num] for frame in frames]
        df_concat = pd.concat(df_list)
        print('Shape before dropping is {}'.format(df_concat.shape))
        if max_repetition_time is not None:
            df_concat = df_concat[df_concat['repetition_time']<=max_repetition_time]
        # NOTE(review): the row index is written as the first CSV column because
        # index=False is not passed; kept as is for downstream compatibility.
        df_concat.to_csv('seq_file'+'_'+str(expt_num)+'_'+sequence_name+'.csv')
        print('Shape after dropping is {}'.format(df_concat.shape))
        

In [164]:
# Export the pooled per-experiment CSVs for the T1CE feature files.
create_npy_files(t1ce_list, 'T1CE')

Shape before dropping is (67, 58)
Shape after dropping is (40, 58)
Shape before dropping is (67, 58)
Shape after dropping is (40, 58)
Shape before dropping is (67, 58)
Shape after dropping is (40, 58)
Shape before dropping is (67, 58)
Shape after dropping is (40, 58)
Shape before dropping is (67, 58)
Shape after dropping is (40, 58)
Shape before dropping is (67, 58)
Shape after dropping is (40, 58)
Shape before dropping is (67, 58)
Shape after dropping is (40, 58)
Shape before dropping is (67, 58)
Shape after dropping is (40, 58)
Shape before dropping is (67, 58)
Shape after dropping is (40, 58)
Shape before dropping is (67, 58)
Shape after dropping is (40, 58)
Shape before dropping is (67, 58)
Shape after dropping is (40, 58)
Shape before dropping is (67, 58)
Shape after dropping is (40, 58)
Shape before dropping is (67, 58)
Shape after dropping is (40, 58)
Shape before dropping is (67, 58)
Shape after dropping is (40, 58)
Shape before dropping is (67, 58)
Shape after dropping is (40,

In [25]:
# Load the first T1W feature CSV for inspection.
df = pd.read_csv(t1_list[0])

In [106]:
# Which .mat file the first rows of this feature table come from.
df['mat_file_name'].head()

0    TCGA-HT-7879_T1W.mat
1    TCGA-HT-7879_T1W.mat
2    TCGA-HT-7879_T1W.mat
3    TCGA-HT-7879_T1W.mat
4    TCGA-HT-7879_T1W.mat
Name: mat_file_name, dtype: object

In [107]:
# Repetition time recorded for the first rows of this feature table.
df['repetition_time'].head()

0    6.773
1    6.773
2    6.773
3    6.773
4    6.773
Name: repetition_time, dtype: float64

In [114]:
# Shape of the subset with repetition_time above 800 (the cut-off used in create_npy_files).
df[(df['repetition_time'] > 800)].shape

(0, 56)

In [147]:
# Load the third T2W feature CSV for inspection (rebinds `df`).
df = pd.read_csv(t2_list[2])

In [148]:
# Preview the flattened texture features together with the merged acquisition parameters.
df.head()

Unnamed: 0,GLCM_Contrast,GLCM_Correlation,GLCM_Dissimilarity,GLCM_Energy,GLCM_Entropy,GLCM_Homogeneity,GLCM_SumAverage,GLCM_Variance,GLRLM_GLN,GLRLM_GLV,...,scanner_model,repetition_time,excitation_time,scanning_seq_mri,filename,mat_file_sequence,experiment_number,scale,algo,ng
0,0.482198,0.953506,0.380219,0.063643,4.350379,0.82484,0.071843,0.081026,0.136421,0.001076,...,SIGNA EXCITE,4000.0,103.464,AX T2 FR-FSE RF2 150,/media/raghuram/My Passport/dicom_seg/TCGA-LGG...,T2W,1,0.5,Equal,8
1,1.530081,0.963313,0.771797,0.020712,6.076781,0.702815,0.034019,0.081459,0.06705,0.002299,...,SIGNA EXCITE,4000.0,103.464,AX T2 FR-FSE RF2 150,/media/raghuram/My Passport/dicom_seg/TCGA-LGG...,T2W,2,0.5,Equal,16
2,5.66159,0.966072,1.551333,0.006189,7.934774,0.560861,0.016537,0.081482,0.032895,0.003921,...,SIGNA EXCITE,4000.0,103.464,AX T2 FR-FSE RF2 150,/media/raghuram/My Passport/dicom_seg/TCGA-LGG...,T2W,3,0.5,Equal,32
3,22.145322,0.966755,3.103957,0.00191,9.784935,0.423382,0.008158,0.081316,0.017018,0.006664,...,SIGNA EXCITE,4000.0,103.464,AX T2 FR-FSE RF2 150,/media/raghuram/My Passport/dicom_seg/TCGA-LGG...,T2W,4,0.5,Equal,64
4,0.169202,0.912215,0.167422,0.205621,2.805948,0.916581,0.077254,0.015058,0.259342,0.001037,...,SIGNA EXCITE,4000.0,103.464,AX T2 FR-FSE RF2 150,/media/raghuram/My Passport/dicom_seg/TCGA-LGG...,T2W,5,0.5,Uniform,8
