In [1]:
import glob
import os

import numpy as np
import pandas as pd
import scipy.io

In [2]:
def data_extractor(roi, p, key, layers, subjs):
    '''
    Function to extract data from .mat files of a single roi in all subjects
    and combine them into a single dataframe
    ---
    roi: the region of interest. case sensitive to the filename in your work directory
    p: path to the directory holding the layer folders; files are read from
       <p>/<layer>/<subject>/<roi>/*.mat (a trailing separator on p is optional)
    key: dataframe with the columns 'Alexnet_id' and 'Imagenet_id', mapping the
         included alexnet_id to imagenet_id. It is left unmodified.
    layers: iterable of layer (model) folder names
    subjs: iterable of subject folder names
    ---
    returns combined dataframe with the columns 'Imagenet_id', 'Subject' and 'Layer'
    followed by the data columns, labelled 3, 4, ... Rows containing a NaN in any
    column (this includes rows whose Alexnet_id is not in the key) are dropped.
    raises ValueError if a <layer>/<subject>/<roi> folder contains no .mat files
    '''
    #Number of metadata columns (Alexnet_id, Subject, Layer) placed before the data
    n_meta = 3
    frames = []

    for mod in layers:
        for subj in subjs:
            #Path construction, e.g. bfd\best_features\1000\conv1\Subj1\VC
            folder = os.path.join(p, mod, subj, roi)
            #Sorted so that the row order does not depend on the file system
            files = sorted(glob.glob(os.path.join(folder, "*.mat")))
            if not files:
                raise ValueError("No .mat files found in " + folder)

            for f in files:
                data = pd.DataFrame(scipy.io.loadmat(f)['best_feature_data'])
                #Data columns are numbered after the metadata columns
                data.columns = range(n_meta, n_meta + data.shape[1])

                #Transform the file name into the Alexnet id,
                #e.g. from n01443537_22563 to 1443537
                #NOTE(review): [2:] drops the 'n' and the first digit, so an id whose
                #first digit is not 0 loses a significant digit - confirm against the key
                alexnet_id = os.path.basename(f).split('_')[0][2:]

                #Label every row of this file, however many samples it holds,
                #so ids and data cannot get out of step
                data.insert(0, 'Layer', mod)
                data.insert(0, 'Subject', subj)
                data.insert(0, 'Alexnet_id', alexnet_id)
                frames.append(data)

    #Nothing was requested (empty layers or subjs): return an empty result
    if not frames:
        return pd.DataFrame(columns = ['Imagenet_id', 'Subject', 'Layer'])

    #Concatenate everything once instead of growing a dataframe inside the loop
    df = pd.concat(frames, axis = 0, ignore_index = True)

    #Transform the key into a dictionary (string Alexnet_id -> Imagenet_id)
    #without touching the caller's dataframe
    key_map = key.set_index(key['Alexnet_id'].astype('str'))['Imagenet_id'].to_dict()

    #Create a new column, Imagenet_id in the master df, by mapping the dic with the Alexnet id values
    df.insert(0, 'Imagenet_id', df['Alexnet_id'].map(key_map))

    #Drop rows with nan values (ids missing from the key get a nan Imagenet_id and are
    #therefore not included), then drop the Alexnet_id column
    df = df.dropna().drop(columns = ['Alexnet_id'])

    #Set the Imagenet_id column to int
    df['Imagenet_id'] = df['Imagenet_id'].astype('int')

    return df.reset_index(drop = True)

In [3]:
def create_multi_index(x, indices):
    '''
    Duplicate multiple lists sequentially to be used in a multi-index dataframe
    e.g.:
    list = [1, 2, 3]
    after broadcasting onto a 9-row dataframe,
    list = [1, 2, 3, 1, 2, 3, 1, 2, 3]
    ----
    x: number of rows in a dataframe. Use x = 1 to calculate it by multiplying the len of lists together
    indices: list (of lists) to be duplicated to match length x.
    ----
    returns a list holding one tiled list per entry of indices, in the same order.
    Each list is repeated a whole number of times (x // its length), so it only
    reaches length x when x is a multiple of its length. An empty list stays empty.
    '''
    #Materialise once so that any iterable of sequences is accepted
    levels = [list(index) for index in indices]

    #x is the number of rows in the dataframe; calculated by multiplying the len of lists together, or provided beforehand
    if x == 1:
        for level in levels:
            x *= len(level)

    #Floor division keeps the repeat count exact. An empty list has nothing to tile
    #(and would otherwise divide by zero), so it is passed through as an empty list
    return [level * int(x // len(level)) if level else [] for level in levels]