In [1]:
# Predict Who is Alcoholic
import pandas as pd
import numpy as np

# Data loading code:
def import_eeg_file(file_obj, df_type='wide', optimize=True):
    """
    Import a single EEG trial file and return it as a wide or long dataframe.

    The file must begin with at least four consecutive lines starting with
    ``#``.  The first of them is parsed for the subject id and the
    alcoholic flag; the fourth is parsed for the match condition and the
    error flag.  Everything after the leading header is read as
    space-separated ``trial sensor sample value`` rows, with any further
    ``#`` lines skipped as comments.

    Parameters
    ----------
    file_obj
        A file-like object.  Text streams (anything deriving from
        ``io.TextIOBase``, e.g. the result of ``open(<filename>)`` or an
        ``io.StringIO``) are used directly; anything else (e.g. a
        ``GzipFile`` or ``io.BytesIO``) is wrapped in a ``TextIOWrapper``.
        The stream must support ``tell()`` and ``seek()``.
    df_type : str, opt
        ``'wide'`` pivots to one row per sample and one column per
        (subject, trial, alcoholic, match, err, sensor) combination.
        ``'long'`` keeps one row per reading, indexed by
        (subject, trial, alcoholic, match, err, sample).
        Any other value returns the flat, un-indexed frame.
    optimize : bool, opt
        If True, downcast ``trial``/``sample`` to the smallest unsigned
        integer type, store ``value`` as float32 and turn ``sensor``,
        ``match`` and ``subject`` into categoricals.

    Returns
    -------
    pandas.DataFrame
        The data from this file in a DataFrame object.

    Raises
    ------
    IndexError
        If the file starts with fewer than four ``#`` header lines.
    """
    # Local import keeps the function self-contained when copied between cells.
    from io import TextIOBase, TextIOWrapper

    def parse_subject(line):
        # Drops the first two and last four characters -- presumably the
        # "# " prefix and a ".rd" + newline suffix around the subject id.
        return line[2:-4]

    def parse_alcoholic(line):
        # Fourth character of the recording name encodes the group.
        return line.strip('# ')[3] == 'a'

    def parse_match(line):
        # Second space-separated token before the first comma.
        token = line.strip('# ').split(',')[0].split(' ')[1]
        return token if token in ('nomatch', 'obj', 'match') else None

    def parse_err(line):
        # Error trials carry exactly three tokens, the last being "err".
        tokens = line.strip('# ').split(',')[0].split(' ')
        return len(tokens) == 3 and tokens[2] == 'err'

    # Any text stream already yields str; only binary streams need decoding.
    if isinstance(file_obj, TextIOBase):
        text_obj = file_obj
    else:
        text_obj = TextIOWrapper(file_obj)

    header = []
    while True:
        loc = text_obj.tell()
        newline = text_obj.readline()
        # startswith() is safe on the empty string readline() returns at EOF.
        if newline.startswith('#'):
            header.append(newline)
        else:
            # First non-header line (or EOF): rewind so read_csv sees it.
            text_obj.seek(loc)
            break

    if len(header) < 4:
        raise IndexError(
            "expected at least 4 leading '#' header lines, found %d" % len(header)
        )

    subject = parse_subject(header[0])
    alcoholic = parse_alcoholic(header[0])
    match = parse_match(header[3])
    err = parse_err(header[3])

    df = pd.read_csv(text_obj, sep=' ', header=None,
                     names=['trial', 'sensor', 'sample', 'value'],
                     comment='#')
    df['alcoholic'] = alcoholic
    df['match'] = match
    df['err'] = err
    df['subject'] = subject

    # .copy() so the dtype assignments below act on an independent frame
    # rather than on a column selection of the parsed one.
    df = df[['subject', 'trial', 'alcoholic', 'match', 'err',
             'sensor', 'sample', 'value']].copy()

    if optimize:
        df[['trial', 'sample']] = df[['trial', 'sample']].apply(pd.to_numeric, downcast='unsigned')
        df['value'] = df['value'].astype(np.float32)
        df['sensor'] = pd.Categorical(df['sensor'])
        df['match'] = pd.Categorical(df['match'])
        df['subject'] = pd.Categorical(df['subject'])

    if df_type == 'wide':
        df = df.pivot_table(values='value', index='sample',
                            columns=['subject', 'trial', 'alcoholic', 'match', 'err', 'sensor'])

    if df_type == 'long':
        df = df.set_index(['subject', 'trial', 'alcoholic', 'match', 'err', 'sample'])

    return df

In [2]:
import glob
import gzip
import os

pd.set_option('display.max_rows', None)

fileloc = r'./Data/SMNI_CMI_TRAIN/'
# os.path.join keeps the pattern valid on every OS (the separator is not
# hard-coded); sorting makes the load order reproducible between runs.
fileList = sorted(glob.glob(os.path.join(fileloc, '**', '*.gz'), recursive=True))

# One metadata record per file; turned into the `bridge` frame after the loop
# instead of growing a DataFrame row by row.
bridge_records = []
trial_no_Dict = {}

for file in fileList:
    # The context manager guarantees the gzip handle is released.
    with gzip.open(file, 'rb') as fa:
        dfa = import_eeg_file(fa)

    # <...>/<patient_no>/<trial_no>
    trial_no = os.path.basename(file)
    patient_no = os.path.basename(os.path.dirname(file))
    alcoholic = 'co2a' in patient_no

    # Each file holds a single match/err value, so the first column's
    # entry on those levels describes the whole file.
    match = dfa.columns.get_level_values('match')[0]
    err = dfa.columns.get_level_values('err')[0]

    bridge_records.append({'patient_no': patient_no, 'trial_no': trial_no,
                           'alcoholic': alcoholic, 'match': match,
                           'err': err, 'file_loc': file})

    # Flatten the 6-level column index down to the channel (last level).
    dfa_column_list = [column_key[-1] for column_key in dfa.columns.tolist()]

    dfa.columns = dfa_column_list
    dfa['trial_no'] = trial_no
    trial_no_Dict[trial_no] = dfa

# Explicit columns keep the frame well-formed even when no file matched.
bridge = pd.DataFrame(bridge_records,
                      columns=['patient_no', 'trial_no', 'alcoholic',
                               'match', 'err', 'file_loc'])
bridge['alcoholic'] = bridge['alcoholic'].astype(bool)
bridge['err'] = bridge['err'].astype(bool)

In [3]:
# Preview the first rows of the per-file metadata table (`bridge`).
bridge.head()

Unnamed: 0,patient_no,trial_no,alcoholic,match,err,file_loc
0,co2a0000364,co2a0000364.rd.000.gz,True,obj,False,./Data/SMNI_CMI_TRAIN\co2a0000364\co2a0000364....
1,co2a0000364,co2a0000364.rd.002.gz,True,obj,False,./Data/SMNI_CMI_TRAIN\co2a0000364\co2a0000364....
2,co2a0000364,co2a0000364.rd.007.gz,True,nomatch,False,./Data/SMNI_CMI_TRAIN\co2a0000364\co2a0000364....
3,co2a0000364,co2a0000364.rd.009.gz,True,match,False,./Data/SMNI_CMI_TRAIN\co2a0000364\co2a0000364....
4,co2a0000364,co2a0000364.rd.010.gz,True,obj,False,./Data/SMNI_CMI_TRAIN\co2a0000364\co2a0000364....


In [4]:
# Stack the per-trial frames held in trial_no_Dict into one frame:
# rows from every trial, one column per channel plus `trial_no`.
dfList = list(trial_no_Dict.values())
trail_no_channels = pd.concat(dfList)
trail_no_channels.head()

Unnamed: 0_level_0,AF1,AF2,AF7,AF8,AFZ,C1,C2,C3,C4,C5,...,POZ,PZ,T7,T8,TP7,TP8,X,Y,nd,trial_no
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-2.146,1.129,-16.856001,-10.02,-0.987,-1.129,2.747,-2.716,-0.926,-8.586,...,-6.266,-2.797,-6.805,-3.886,-9.338,-3.082,-5.269,-5.636,-8.901,co2a0000364.rd.000.gz
1,-2.146,0.641,-7.09,-7.09,-1.475,-1.617,-1.16,-3.204,-2.879,-11.515,...,-8.708,-4.262,-9.247,-5.839,-12.268,-4.059,-5.758,-2.706,-7.924,co2a0000364.rd.000.gz
2,-1.658,-0.336,7.558,1.211,-0.987,0.336,0.305,1.678,2.492,-11.027,...,-9.196,-4.262,-8.27,-4.374,-8.85,-3.571,-2.828,1.689,-3.042,co2a0000364.rd.000.gz
3,-0.682,-0.824,19.277,10.488,-0.01,0.824,0.305,-0.275,2.981,-6.632,...,-6.755,-2.797,-3.876,0.02,-1.526,-1.129,1.567,5.595,4.771,co2a0000364.rd.000.gz
4,2.248,0.641,23.183001,13.906,2.431,2.777,2.258,4.608,5.91,1.18,...,-3.337,-0.844,1.495,4.415,4.822,2.777,5.961,9.013,11.607,co2a0000364.rd.000.gz


In [6]:
# Attach the per-file metadata to every reading via an outer join on
# `trial_no`. Merging on a column ignores the frames' indexes, so the
# result gets a fresh integer index and the `sample` index is not kept.
dataset = pd.merge(trail_no_channels, bridge, how='outer', on='trial_no')
dataset.tail()

Unnamed: 0,AF1,AF2,AF7,AF8,AFZ,C1,C2,C3,C4,C5,...,TP8,X,Y,nd,trial_no,patient_no,alcoholic,match,err,file_loc
153595,11.424,14.181,2.441,14.13,12.4,-3.103,-0.244,-5.107,-2.401,0.183,...,1.536,-9.816,8.708,6.154,co2c0000347.rd.081.gz,co2c0000347,False,match,False,./Data/SMNI_CMI_TRAIN\co2c0000347\co2c0000347....
153596,9.959,16.622,-2.93,18.524,12.889,-3.591,0.732,-6.083,0.041,-1.77,...,3.977,-2.981,10.173,7.619,co2c0000347.rd.081.gz,co2c0000347,False,match,False,./Data/SMNI_CMI_TRAIN\co2c0000347\co2c0000347....
153597,8.006,16.622,-7.324,16.083,11.912,-3.591,1.221,-7.548,1.506,-4.211,...,6.907,11.18,11.149,9.572,co2c0000347.rd.081.gz,co2c0000347,False,match,False,./Data/SMNI_CMI_TRAIN\co2c0000347\co2c0000347....
153598,6.053,13.692,-11.719,8.759,9.471,-3.591,1.221,-8.525,1.017,-6.653,...,7.884,26.316,10.661,12.502,co2c0000347.rd.081.gz,co2c0000347,False,match,False,./Data/SMNI_CMI_TRAIN\co2c0000347\co2c0000347....
153599,4.1,8.321,-15.625,0.458,6.053,-4.079,0.732,-9.013,-0.448,-8.118,...,7.395,34.617001,8.219,13.479,co2c0000347.rd.081.gz,co2c0000347,False,match,False,./Data/SMNI_CMI_TRAIN\co2c0000347\co2c0000347....


In [7]:
# Write the merged dataset to Data/out.csv, creating the parent
# directory first if it does not exist yet.
from pathlib import Path  
filepath = Path('Data/out.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
dataset.to_csv(filepath)  