In [1]:
# Predict Who is Alcoholic
import os

import numpy as np
import pandas as pd

# Data loading code:
def import_eeg_file(file_obj, df_type='wide', optimize=True):
    """
    Import a single EEG trial file and return it as a wide or long DataFrame.

    The file must begin with a run of lines starting with ``#`` (the header),
    followed by space-separated data rows ``trial sensor sample value``.
    Header line 0 supplies the subject id and the alcoholic flag; header
    line 3 supplies the match condition and the error flag.  Any further
    ``#`` lines among the data rows are ignored.

    Parameters
    ----------
    file_obj
        A file-like object, such as a GzipFile, a TextIOWrapper, a regular
        file (such as from `open(<filename>)`) or any other text stream
        (e.g. ``io.StringIO``).  Binary streams are wrapped in a
        TextIOWrapper; text streams are used as they are.  The stream must
        support ``tell`` and ``seek``.
    df_type : str, opt
        'wide' pivots the data so that each sample is a row and each
        (subject, trial, alcoholic, match, err, sensor) combination is a
        column.  'long' keeps one row per reading, indexed by
        (subject, trial, alcoholic, match, err, sample).  Any other value
        returns the flat, un-indexed frame.
    optimize : bool, opt
        True if you want data types to be coerced into their minimum sizes
        (unsigned ints, float32, categoricals), False if you don't.

    Returns
    -------
    pandas.DataFrame
        The data from this file in a DataFrame object.

    Raises
    ------
    IndexError
        If the file has fewer than four leading header lines (this includes
        an empty file).
    """
    from io import TextIOBase, TextIOWrapper

    def parse_subject(line):
        # Drop the first two and the last four characters of the line,
        # e.g. "# co2a0000364.rd\n" -> "co2a0000364".
        return line[2:-4]

    def parse_alcoholic(line):
        # The 4th character of the stripped line marks the group: 'a' means alcoholic.
        return line.strip('# ')[3] == 'a'

    def parse_match(line):
        # Second space-separated token before the first comma; None if unrecognised.
        token = line.strip('# ').split(',')[0].split(' ')[1]
        return token if token in ('nomatch', 'obj', 'match') else None

    def parse_err(line):
        # NOTE(review): a space before the comma ("... err , trial N") yields a
        # trailing empty token, making the length 4 and this test False --
        # confirm against the real header layout.
        tokens = line.strip('# ').split(',')[0].split(' ')
        return len(tokens) == 3 and tokens[2] == 'err'

    # Text streams are used directly; anything else is assumed to be binary.
    if isinstance(file_obj, TextIOBase):
        text_obj = file_obj
    else:
        text_obj = TextIOWrapper(file_obj)

    # Collect the leading '#' lines, then rewind to the first non-header line
    # so that read_csv starts at the data.
    header = []
    while True:
        loc = text_obj.tell()
        newline = text_obj.readline()
        # startswith() is safe on the empty string returned at end-of-file.
        if newline.startswith('#'):
            header.append(newline)
        else:
            text_obj.seek(loc)
            break

    if len(header) < 4:
        raise IndexError(
            "expected at least 4 leading '#' header lines, found %d" % len(header))

    subject = parse_subject(header[0])
    alcoholic = parse_alcoholic(header[0])
    match = parse_match(header[3])
    err = parse_err(header[3])

    df = pd.read_csv(text_obj, sep=' ', header=None,
                     names=['trial', 'sensor', 'sample', 'value'],
                     comment='#')
    df['alcoholic'] = alcoholic
    df['match'] = match
    df['err'] = err
    df['subject'] = subject

    # .copy() makes the reordered frame independent of the original, so the
    # column assignments below do not trigger chained-assignment warnings.
    df = df[['subject', 'trial', 'alcoholic', 'match', 'err',
             'sensor', 'sample', 'value']].copy()

    if optimize:
        df[['trial', 'sample']] = df[['trial', 'sample']].apply(pd.to_numeric, downcast='unsigned')
        df['value'] = df['value'].astype(np.float32)
        df['sensor'] = pd.Categorical(df['sensor'])
        df['match'] = pd.Categorical(df['match'])
        df['subject'] = pd.Categorical(df['subject'])

    if df_type == 'wide':
        df = df.pivot_table(values='value', index='sample',
                            columns=['subject', 'trial', 'alcoholic', 'match', 'err', 'sensor'])
    elif df_type == 'long':
        df = df.set_index(['subject', 'trial', 'alcoholic', 'match', 'err', 'sample'])

    return df

In [2]:
import gzip
import glob

#pd.set_option('display.max_rows', None)

def data_load (fileLoc = ''):
    """
    Load every gzip-compressed EEG trial file found below a folder.

    The folder is searched recursively for ``*.gz`` files.  Each file is
    parsed with ``import_eeg_file``; files that fail to parse are reported
    on stdout and skipped.

    Parameters
    ----------
    fileLoc : str, opt
        Root folder to search.  An empty string searches relative to the
        current working directory.

    Returns
    -------
    bridge : pandas.DataFrame
        One row per successfully loaded file with the columns
        ``patient_no`` (name of the folder containing the file),
        ``trial_no`` (file name), ``alcoholic`` (True when the folder name
        contains 'co2a'), ``match``, ``err`` and ``file_loc`` (path of the
        file).  Empty, but with these columns, when no file was loaded.
    trial_no_Dict : dict
        Maps each file name to its wide DataFrame: a ``sample`` column, one
        column per channel and a ``trial_no`` column holding the file name.
    """
    bridge_columns = ['patient_no', 'trial_no', 'alcoholic', 'match', 'err', 'file_loc']

    # os.path.join keeps the pattern portable; sorting makes the row order
    # independent of the order in which the file system lists entries.
    fileList = sorted(glob.glob(os.path.join(fileLoc, '**', '*.gz'), recursive = True))

    records = []
    trial_no_Dict = {}

    for file in fileList:
        try:
            # The context manager guarantees the gzip handle is closed.
            with gzip.open(file, 'rb') as fa:
                dfa = import_eeg_file(fa)
        except Exception as exc:
            # Best effort: report the unreadable file and carry on.
            print (file, repr(exc))
            continue

        patient_no = os.path.basename(os.path.dirname(file))
        trial_no = os.path.basename(file)
        alcoholic = 'co2a' in patient_no

        # Column levels are (subject, trial, alcoholic, match, err, sensor);
        # a single file carries one match value and one err value.
        match = dfa.columns.levels[3][0]
        err = dfa.columns.levels[4][0]

        records.append({'patient_no': patient_no, 'trial_no': trial_no,
                        'alcoholic': alcoholic, 'match': match, 'err': err,
                        'file_loc': file})

        # Flatten the column MultiIndex down to the channel name (last level).
        # A plain list is used so that new columns can be added afterwards.
        dfa.columns = dfa.columns.get_level_values(-1).tolist()
        dfa['trial_no'] = trial_no
        dfa = dfa.reset_index()
        trial_no_Dict[trial_no] = dfa

    # Building the frame once avoids quadratic row-by-row appends and yields
    # the expected columns even when no file was loaded.
    bridge = pd.DataFrame(records, columns = bridge_columns)
    bridge = bridge.astype({'alcoholic': bool, 'err': bool})

    return bridge, trial_no_Dict

In [3]:
def data_save (df, fileLoc = '', fileName = ''):
    """
    Write a DataFrame to ``<fileLoc>/<fileName>.csv``.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to export; its index is written as the first column.
    fileLoc : str, opt
        Target folder.  An empty string writes relative to the current
        working directory.
    fileName : str, opt
        File name without the ``.csv`` extension.
    """
    # os.path.join uses the separator of the running platform instead of a
    # hard-coded backslash.
    df.to_csv(os.path.join(fileLoc, fileName + '.csv'))

In [4]:
# Root folder of the EEG recordings; adjust it to your own folder/file layout.
fileloc = r'./Data/SMNI_CMI_TEST/'

# bridge: one metadata row per trial file; trial_no_Dict: file name -> samples frame.
bridge, trial_no_Dict = data_load(fileLoc=fileloc)

In [5]:
# Preview the first five rows of the per-file metadata table.
bridge.head(n=5)

Unnamed: 0,patient_no,trial_no,alcoholic,match,err,file_loc
0,co2a0000364,co2a0000364.rd.030.gz,True,obj,False,./Data/SMNI_CMI_TEST\co2a0000364\co2a0000364.r...
1,co2a0000364,co2a0000364.rd.032.gz,True,obj,False,./Data/SMNI_CMI_TEST\co2a0000364\co2a0000364.r...
2,co2a0000364,co2a0000364.rd.034.gz,True,obj,False,./Data/SMNI_CMI_TEST\co2a0000364\co2a0000364.r...
3,co2a0000364,co2a0000364.rd.036.gz,True,obj,False,./Data/SMNI_CMI_TEST\co2a0000364\co2a0000364.r...
4,co2a0000364,co2a0000364.rd.038.gz,True,obj,False,./Data/SMNI_CMI_TEST\co2a0000364\co2a0000364.r...


In [6]:
# Stack the per-trial frames (in dictionary order) into one table.
dfList = list(trial_no_Dict.values())
trail_no_channels = pd.concat(dfList)
trail_no_channels.head(n=5)

Unnamed: 0,sample,AF1,AF2,AF7,AF8,AFZ,C1,C2,C3,C4,...,POZ,PZ,T7,T8,TP7,TP8,X,Y,nd,trial_no
0,0,-6.083,-7.64,-0.682,6.46,-4.71,7.762,-0.773,24.017,-2.319,...,-0.6,2.391,-7.741,-8.921,-1.862,1.76,-2.441,-5.341,-3.56,co2a0000364.rd.030.gz
1,1,-4.618,-9.593,0.295,2.065,-3.733,-7.375,-0.285,-21.881001,0.61,...,-0.112,1.414,-6.276,-7.456,-3.326,1.272,-4.395,-5.341,-4.537,co2a0000364.rd.030.gz
2,2,-2.177,-7.151,-1.17,-5.259,-1.292,-2.981,1.18,-3.815,2.075,...,-0.6,-0.051,-2.37,-4.527,-2.838,-0.682,-4.883,-4.364,-5.025,co2a0000364.rd.030.gz
3,3,1.241,0.173,-3.611,-8.189,1.638,2.391,1.668,-5.28,-1.343,...,-1.577,-1.027,1.048,-2.574,-1.862,-2.635,-2.441,-3.876,-3.56,co2a0000364.rd.030.gz
4,4,2.706,8.962,-5.564,-4.283,4.079,2.391,1.18,0.58,-1.343,...,-2.553,-1.516,0.071,-3.062,-0.885,-3.611,0.488,-3.876,-0.631,co2a0000364.rd.030.gz


In [7]:
# Attach the bridge metadata to every sample row through the shared trial_no key.
dataset = pd.merge(trail_no_channels, bridge, how='outer', on='trial_no')
dataset.tail(n=5)

Unnamed: 0,sample,AF1,AF2,AF7,AF8,AFZ,C1,C2,C3,C4,...,TP8,X,Y,nd,trial_no,patient_no,alcoholic,match,err,file_loc
153595,251,-17.302999,-24.058001,-7.965,-35.136002,-19.094,2.268,-2.909,1.638,-7.111,...,-14.74,-35.613998,-18.443001,-9.43,co2c0000347.rd.117.gz,co2c0000347,False,match,False,./Data/SMNI_CMI_TEST\co2c0000347\co2c0000347.r...
153596,252,-15.839,-22.105,-12.848,-31.23,-17.629,1.292,-3.398,-0.315,-8.087,...,-16.693001,-40.985001,-20.884001,-13.824,co2c0000347.rd.117.gz,co2c0000347,False,match,False,./Data/SMNI_CMI_TEST\co2c0000347\co2c0000347.r...
153597,253,-14.374,-18.198999,-19.684,-26.347,-15.188,0.315,-2.909,-2.757,-7.599,...,-16.205,-46.355999,-23.813999,-17.731001,co2c0000347.rd.117.gz,co2c0000347,False,match,False,./Data/SMNI_CMI_TEST\co2c0000347\co2c0000347.r...
153598,254,-12.909,-15.269,-20.66,-22.929001,-13.723,-0.173,-2.421,-3.245,-6.622,...,-13.763,-49.285999,-25.278999,-18.219,co2c0000347.rd.117.gz,co2c0000347,False,match,False,./Data/SMNI_CMI_TEST\co2c0000347\co2c0000347.r...
153599,255,-12.421,-13.804,-13.824,-21.952,-12.746,0.315,-1.933,-2.268,-5.646,...,-9.857,-48.308998,-24.302,-16.266001,co2c0000347.rd.117.gz,co2c0000347,False,match,False,./Data/SMNI_CMI_TEST\co2c0000347\co2c0000347.r...


In [8]:
# Folder that receives the exported CSV.
fileloc = r'Data'

data_save(df=dataset, fileLoc=fileloc, fileName='test_set')