In [1]:
import numpy as np
import pandas as pd
import re
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
import os
from tqdm import tqdm

In [17]:
# Directory that holds the raw "*_output.xlsx" shock-session workbooks, and the
# directory that receives the transformed CSV files.
input_path = '/Users/yunyihuang/Desktop/gl_data/COCAINE/SHOCK'
output_path = '/Users/yunyihuang/Desktop/gl_data/NEW_TRIAL/COC_SHOCK'

# Filename patterns consumed by transform_shock:
#   parsers[0] captures (room, cohort, trial), e.g. 'BSB273BC08HSSHOCK-1_output.xlsx'
#              -> ('BSB273B', 'C08', 'SHOCK')
#   parsers[1] captures (cohort, trial) for filenames that start with the cohort token.
# The last character of the room code is matched with [A-Z0-9]; a '|' inside a
# character class is a literal pipe, not alternation, so it must not appear there.
parsers = [r"(\A[A-Z]+[0-9]+[A-Z0-9])(C[0-9]{2})HS(PRESHOCK|SHOCK)",
           r"(\AC[0-9]{2})HS(PRESHOCK|SHOCK)"]

In [18]:
# Every entry of the input directory in alphabetical order, minus the macOS
# Finder metadata file.
files = sorted(entry for entry in os.listdir(input_path) if entry != '.DS_Store')
files

['BSB273BC08HSSHOCK-1_output.xlsx',
 'BSB273BC08HSSHOCK-2_output.xlsx',
 'BSB273BC08HSSHOCK-3_output.xlsx',
 'BSB273BC08HSSHOCK_output.xlsx',
 'BSB273BC09HSPRESHOCK-1_output.xlsx',
 'BSB273BC09HSPRESHOCK-2_output.xlsx',
 'BSB273BC09HSPRESHOCK_output.xlsx',
 'BSB273BC09HSSHOCK-1_output.xlsx',
 'BSB273BC09HSSHOCK-2_output.xlsx',
 'BSB273BC09HSSHOCK-3_output.xlsx',
 'BSB273BC09HSSHOCK_output.xlsx',
 'BSB273BC11HSPRESHOCK-2_output.xlsx',
 'BSB273BC11HSPRESHOCK-3_output.xlsx',
 'BSB273BC11HSPRESHOCK-4_output.xlsx',
 'BSB273BC11HSPRESHOCK_output.xlsx',
 'BSB273BC11HSSHOCK-2_output.xlsx',
 'BSB273BC11HSSHOCK-3_output.xlsx',
 'BSB273BC11HSSHOCK-4_output.xlsx',
 'BSB273BC11HSSHOCK_output.xlsx',
 'BSB273BC13HSPRESHOCK2_output.xlsx',
 'BSB273BC13HSPRESHOCK3_output.xlsx',
 'BSB273BC13HSPRESHOCK_output.xlsx',
 'BSB273BC13HSSHOCK2_output.xlsx',
 'BSB273BC13HSSHOCK3_output.xlsx',
 'BSB273BC13HSSHOCK_output.xlsx',
 'BSB273BC14HSPRESHOCK2_output.xlsx',
 'BSB273BC14HSPRESHOCK3_output.xlsx',
 'BSB273BC14

In [19]:
# Subject -> RFID lookup table; the first CSV column is used as the row index.
# transform_shock left-joins its output against this table on 'subject'.
df_rfid_coc = pd.read_csv(filepath_or_buffer='rfid_cocaine.csv', index_col=0)
df_rfid_coc

Unnamed: 0,subject,rfid
0,F1,933000120124701
1,F2,933000120124703
2,F3,933000120117342
3,F4,933000120117333
4,F5,933000120117347
...,...,...
1206,M2084,933000320525501
1207,M2085,933000320525508
1208,M2086,933000320525502
1209,M2087,933000320525510


In [21]:
def _parse_shock_filename(file, parsers):
    """Extract ``(room, cohort, trial_id)`` from a shock workbook filename.

    Filenames starting with ``'C'`` are matched with ``parsers[1]`` (cohort and
    trial only; the room is reported as ``'N/A'``). Every other filename is
    matched with ``parsers[0]`` (room, cohort, trial).

    Returns:
        tuple: ``(room, cohort, trial_id)`` where ``cohort`` is an ``int``
        obtained by dropping the first character of the captured cohort token.

    Raises:
        ValueError: if the selected pattern does not match ``file``.
    """
    has_room = file[0] != 'C'
    pattern = parsers[0] if has_room else parsers[1]
    matches = re.findall(pattern, file)
    if not matches:
        raise ValueError(
            "Cannot parse shock filename {!r} with pattern {!r}".format(file, pattern)
        )
    if has_room:
        room, cohort, trial_id = matches[0]
    else:
        cohort, trial_id = matches[0]
        room = 'N/A'
    return room, int(cohort[1:]), trial_id


def transform_shock(input_path, file, parsers, output_dir=None, rfid_table=None):
    """Convert one raw shock-session workbook into a tidy, one-row-per-subject CSV.

    The workbook is read with ``pd.read_excel``, transposed so that each subject
    becomes a row, typed, enriched with metadata parsed from the filename,
    left-joined with the RFID lookup table, and written to
    ``<output_dir>/<file minus 'output.xlsx'>transformed.csv``.

    Args:
        input_path: Directory containing ``file``.
        file: Workbook filename; expected to end in ``'_output.xlsx'``.
        parsers: Two regex patterns; see ``_parse_shock_filename``.
        output_dir: Destination directory. Defaults to the notebook-level
            ``output_path`` so existing three-argument calls keep working.
        rfid_table: DataFrame with at least ``subject`` and ``rfid`` columns.
            Defaults to the notebook-level ``df_rfid_coc``.

    Returns:
        None. The result is written to disk.

    Raises:
        ValueError: if ``file`` does not match the selected filename pattern,
            or a required column label is missing from the workbook.
    """
    # Fall back to the notebook globals only when the caller did not supply them.
    if output_dir is None:
        output_dir = output_path
    if rfid_table is None:
        rfid_table = df_rfid_coc

    # Fail fast on an unrecognised filename, before any file I/O.
    room, cohort, trial_id = _parse_shock_filename(file, parsers)

    df_raw = pd.read_excel(os.path.join(input_path, file))

    # The number of distinct Python ints in row 5 is taken as the number of
    # subject columns; anything to the right of them is discarded.
    # NOTE(review): row 5 is presumably the 'Box' row -- confirm against the workbook layout.
    num_subjects = len({v for v in df_raw.iloc[5, :].values if isinstance(v, int)})
    if df_raw.shape[1] > num_subjects + 1:
        df_raw = df_raw.iloc[:, :num_subjects + 1]

    # One row per subject; the original column labels become the 'index' column.
    df_raw = df_raw.T.reset_index()

    # The first transposed row carries the field names. Work on an explicit copy
    # so later assignments never write into a view of df_raw.
    df = df_raw.iloc[1:].copy()
    df.columns = df_raw.iloc[0]
    df = df.reset_index(drop=True)
    df = df.drop(columns=['Filename', 'Experiment', 'Group', 'MSN', 'FR'])

    # Change data types based on the column label.
    int_columns = ['box', 'total shocks', 'total reward']
    for col in df.columns.tolist():
        name = col.lower()
        if ('active' in name) or ('reward' in name) or (name in int_columns):
            df[col] = df[col].astype('int32')
        elif 'date' in name:
            df[col] = df[col].apply(lambda x: datetime.strptime(x, "%Y-%m-%d").date())
        elif 'time' in name:
            df[col] = df[col].apply(lambda x: datetime.strptime(x, "%H:%M:%S").time())

    # Collapse the per-reward columns into two list-valued columns.
    colnames = df.columns.tolist()
    reward_shock_begin = colnames.index('Reward # Got Shock 1')
    reward_col_begin = colnames.index('Reward 1')
    reward_col_end = colnames.index('Reward 201')
    collapsed = colnames[reward_shock_begin:reward_col_end + 1]

    df['Rewards Got Shock'] = df.iloc[:, reward_shock_begin:reward_col_begin].values.tolist()
    df['Reward Timestamps'] = df.iloc[:, reward_col_begin:reward_col_end + 1].values.tolist()
    df = df.drop(columns=collapsed)

    # Metadata parsed from the filename, identical for every subject in the file.
    df['Room'] = room
    df['Cohort'] = cohort
    df['Trial ID'] = trial_id
    df['Drug'] = 'Cocaine'

    # Fix the column order, sort by subject, and normalise the labels.
    ordered = ['Subject', 'Room', 'Cohort', 'Trial ID', 'Drug', 'Box',
               'Start Time', 'End Time', 'Start Date', 'End Date',
               'Total Active Lever Presses', 'Total Inactive Lever Presses',
               'Total Shocks', 'Total Reward',
               'Rewards After First Shock', 'Rewards Got Shock', 'Reward Timestamps']
    df = df[ordered].sort_values(by='Subject').reset_index(drop=True)
    df = df.rename(columns=str.lower)

    df = pd.merge(df, rfid_table, how='left', on=['subject'])
    # Move the last merged column to the front.
    merged_cols = df.columns.tolist()
    df = df[[merged_cols[-1]] + merged_cols[:-1]]
    df.columns = df.columns.str.replace(' ', '_')

    # Subjects missing from the lookup get the sentinel -999. The left join
    # upcasts the column to float when any subject is unmatched, so cast back to
    # integers to keep the IDs written without a trailing '.0'.
    if 'rfid' in df.columns:
        df['rfid'] = df['rfid'].fillna(-999)
        if pd.api.types.is_float_dtype(df['rfid']):
            df['rfid'] = df['rfid'].astype('int64')

    # Store the final output as CSV; stripping 'output.xlsx' keeps the trailing '_'.
    filename = file[:-len('output.xlsx')] + 'transformed.csv'
    df.to_csv(os.path.join(output_dir, filename))

In [22]:
# Transform every workbook collected in `files`, showing a progress bar.
for i, workbook in enumerate(tqdm(files)):
    transform_shock(input_path, workbook, parsers)

100%|█████████████████████████████████████████| 102/102 [00:09<00:00, 10.78it/s]
