In [1]:
import numpy as np
import pandas as pd
import re
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
import os
from tqdm import tqdm

In [11]:
# Subject -> RFID lookup table used when the drug is cocaine.
# The first CSV column is the saved row index, so it is used as the frame index.
RFID_COC = pd.read_csv(filepath_or_buffer='rfid_cocaine.csv', index_col=0)
RFID_COC.head(n=5)

Unnamed: 0,subject,rfid
0,F1,933000120124701
1,F2,933000120124703
2,F3,933000120117342
3,F4,933000120117333
4,F5,933000120117347


In [12]:
# Subject -> RFID lookup table used when the drug is oxycodone.
# The first CSV column is the saved row index, so it is used as the frame index.
# NOTE(review): the preview of this table shows the same float value
# (933000100000000.0) for every listed subject, so the RFIDs in the CSV may have
# been rounded before it was saved -- verify against the source sheet.
RFID_OXY = pd.read_csv(filepath_or_buffer='rfid_oxycodone.csv', index_col=0)
RFID_OXY.head(n=5)

Unnamed: 0,subject,rfid
0,F101,933000100000000.0
1,F102,933000100000000.0
2,F103,933000100000000.0
3,F104,933000100000000.0
4,F105,933000100000000.0


In [41]:
# Folder the raw Excel export is read from / folder the transformed CSV is written to.
input_path = 'input'
output_path = 'output'

# File-name patterns, anchored at the start of the name (\A):
#   parsers[0] -> (room, cohort, trial id), e.g. 'BSB273B', 'C08', 'LGA01'
#   parsers[1] -> (cohort, trial id) for names that begin directly with the cohort
# The room suffix class used to be written `[A-Z|0-9]{1}`; inside a character
# class `|` is a literal pipe, not alternation, so it has been removed.
# NOTE: `[OXY]*` is also a character class (any run of the letters O, X, Y),
# not the literal word "OXY"; it is left unchanged here.
parsers = [r"(\A[A-Z]+[0-9]+[A-Z0-9])(C[0-9]{2})HS[OXY]*((?:LGA|SHA)[0-9]{2})",
           r"(\AC[0-9]{2})HS[OXY]*((?:LGA|SHA)[0-9]{2})"]

# Drug label; the RFID merge below compares it case-insensitively with
# 'cocaine' and 'oxycodone'.
drug = 'Cocaine'
file = 'BSB273BC08HSLGA01_output.xlsx'

In [42]:
# import data and transpose, then turn the old column labels into a regular column
filepath = os.path.join(input_path, file)
df_raw = pd.read_excel(filepath).T.reset_index()

# modify the header
new_header = df_raw.iloc[0]        # first transposed row holds the field names
# Work on an explicit copy: mutating the bare slice `df_raw[1:]` is chained
# assignment, which pandas only tolerates with a SettingWithCopyWarning.
df = df_raw.iloc[1:].copy()        # everything except the header row
df.columns = new_header
df = df.reset_index(drop=True)
df = df.drop(['Filename', 'Experiment', 'Group', 'MSN', 'FR'], axis=1)

# change data types
cols = df.columns.tolist()
for col in cols:
    name = col.lower()
    # 'inactive' contains 'active', so inactive columns are cast here as well.
    if ('active' in name) or ('reward' in name) or ('timeout' in name) or (name == 'box'):
        df[col] = df[col].astype('int32')
    elif 'date' in name:
        df[col] = df[col].apply(lambda x: datetime.strptime(x, "%Y-%m-%d").date())
    # Checked after the integer branch, so 'timeout' columns never reach it.
    elif 'time' in name:
        df[col] = df[col].apply(lambda x: datetime.strptime(x, "%H:%M:%S").time())

In [43]:
# group the timestamps
# Each group of per-press columns starts at a known label and runs up to the
# start of the next group; the last group runs to the current end of the frame.
colnames = list(df.columns)
active_col_begin, inactive_col_begin, reward_col_begin, timeout_col_begin = [
    colnames.index(first_label)
    for first_label in ('Active 1', 'Inactive 1', 'Reward 1', 'Timeout Press 1')
]
# Captured before any list column is appended, so the new columns can never
# fall inside the timeout slice.
idx_end = len(colnames)

spans = [
    ('Active Timestamps', active_col_begin, inactive_col_begin),
    ('Inactive Timestamps', inactive_col_begin, reward_col_begin),
    ('Reward Timestamps', reward_col_begin, timeout_col_begin),
    ('Timeout Timestamps', timeout_col_begin, idx_end),
]
for label, lo, hi in spans:
    # One Python list per row, holding that row's values for the whole group.
    df[label] = df.iloc[:, lo:hi].values.tolist()

In [44]:
# reorganize the columns
# The per-press columns now live inside the list columns, so drop the originals.
timestamp_col_begin = df.columns.tolist().index('Active Timestamps')
df = df.drop(df.columns[active_col_begin:timestamp_col_begin], axis=1)
df = df.rename(columns={"Reward": "Reward Presses"})
# Count only the non-zero entries of each timeout list.
df['Timeout Presses'] = df['Timeout Timestamps'].apply(lambda stamps: sum(1 for t in stamps if t != 0))

# parse the filename
# Names starting with 'C' are tried as cohort-only first (no room); if that
# fails they fall back to the room pattern, so a room code that happens to
# start with 'C' is still parsed instead of crashing.
if file.startswith('C'):
    attempts = [(parsers[1], False), (parsers[0], True)]
else:
    attempts = [(parsers[0], True)]

for pattern, has_room in attempts:
    found = re.findall(pattern, file)
    if not found:
        continue
    parser = pattern
    if has_room:
        room, cohort, trial_id = found[0]
    else:
        cohort, trial_id = found[0]
        room = 'N/A'
    break
else:
    # Previously this surfaced as a bare IndexError from `re.findall(...)[0]`.
    raise ValueError("file name {!r} does not match any of the parsers".format(file))

# Scalars broadcast to every row; the cohort is stored without its leading 'C'.
df['Room'] = room
df['Cohort'] = int(cohort[1:])
df['Trial ID'] = trial_id
df['Drug'] = drug

# get the final output
new_columns = ['Subject','Room','Cohort','Trial ID','Drug','Box','Start Time','End Time','Start Date','End Date',
               'Active Lever Presses','Inactive Lever Presses','Reward Presses','Timeout Presses',
               'Active Timestamps','Inactive Timestamps','Reward Timestamps','Timeout Timestamps']
df = df[new_columns]

In [45]:
df = df.rename(columns=str.lower)

# Pick the RFID table for this drug. An unknown label now fails loudly instead
# of leaving `rfid_to_merge` undefined (or, worse, still bound to the table
# chosen by an earlier run of this cell).
drug_key = drug.lower()
if drug_key == 'cocaine':
    rfid_to_merge = RFID_COC
elif drug_key == 'oxycodone':
    rfid_to_merge = RFID_OXY
else:
    raise ValueError("drug must be 'cocaine' or 'oxycodone', got {!r}".format(drug))

# Left join keeps every row of df; subjects missing from the RFID table get NaN.
df = pd.merge(df, rfid_to_merge, how='left', on=['subject'])

# Move the last column to the front. After the merge this is 'rfid', given that
# the RFID tables previewed above carry only 'subject' and 'rfid'.
old_columns = df.columns.tolist()
new_columns = [old_columns[-1]] + old_columns[:-1]
df = df[new_columns]
df.columns = df.columns.str.replace(' ','_')

# -999 marks rows whose subject had no RFID match.
# NOTE(review): the displayed result shows rfid as float (e.g. 933000300000000.0);
# confirm downstream consumers expect that representation for an identifier.
df = df.fillna({'rfid':-999})
df

Unnamed: 0,rfid,subject,room,cohort,trial_id,drug,box,start_time,end_time,start_date,end_date,active_lever_presses,inactive_lever_presses,reward_presses,timeout_presses,active_timestamps,inactive_timestamps,reward_timestamps,timeout_timestamps
0,933000300000000.0,F801,BSB273B,8,LGA01,Cocaine,1,09:16:44,15:42:38,2019-07-11,2019-07-11,113,2,86,27,"[2309, 2316, 2317, 2349, 2410, 2446, 2526, 254...","[2342, 2343, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2309, 2349, 2410, 2446, 2526, 2546, 2922, 312...","[2316, 2317, 3539, 5848, 5849, 7465, 9309, 107..."
1,933000300000000.0,F802,BSB273B,8,LGA01,Cocaine,2,09:16:49,15:42:38,2019-07-11,2019-07-11,101,8,68,33,"[307, 315, 422, 423, 2693, 2694, 2925, 3242, 3...","[471, 519, 519, 523, 804, 808, 4719, 20603, 0,...","[307, 422, 2693, 2925, 3242, 3472, 3787, 4539,...","[315, 423, 2694, 4783, 4785, 5317, 8143, 8849,..."
2,933000300000000.0,F803,BSB273B,8,LGA01,Cocaine,3,09:16:52,15:42:38,2019-07-11,2019-07-11,5,11,5,0,"[2788, 3120, 4917, 7356, 16818, 0, 0, 0, 0, 0,...","[2805, 2922, 3182, 3183, 3277, 3365, 8349, 164...","[2788, 3120, 4917, 7356, 16818, 0, 0, 0, 0, 0,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,933000300000000.0,F804,BSB273B,8,LGA01,Cocaine,4,09:16:54,15:42:38,2019-07-11,2019-07-11,20,9,16,4,"[4551, 4723, 4729, 6709, 6711, 6859, 8197, 824...","[6854, 11123, 14652, 14672, 14673, 14675, 2044...","[4551, 4723, 6709, 6859, 8197, 8241, 8801, 131...","[4729, 6711, 14688, 19069, 0, 0, 0, 0, 0, 0, 0..."
4,933000300000000.0,F805,BSB273B,8,LGA01,Cocaine,5,09:16:58,15:42:38,2019-07-11,2019-07-11,144,0,127,17,"[100, 166, 166, 167, 188, 213, 492, 516, 628, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[100, 166, 188, 213, 492, 516, 628, 688, 886, ...","[166, 167, 2868, 8858, 12183, 13597, 14155, 14..."
5,-999.0,,BSB273B,8,LGA01,Cocaine,6,09:17:07,15:42:38,2019-07-11,2019-07-11,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6,933000300000000.0,F807,BSB273B,8,LGA01,Cocaine,7,09:17:14,15:42:38,2019-07-11,2019-07-11,1,9,1,0,"[9737, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[3678, 3679, 3681, 7403, 7406, 7458, 12089, 18...","[9737, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7,933000300000000.0,F808,BSB273B,8,LGA01,Cocaine,8,09:17:17,15:42:38,2019-07-11,2019-07-11,39,1,32,7,"[3707, 3768, 5724, 8521, 8559, 8954, 10635, 11...","[762, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[3707, 3768, 5724, 8521, 8559, 8954, 10635, 11...","[12514, 14140, 15565, 15993, 18188, 18413, 188..."
8,933000300000000.0,F809,BSB273B,8,LGA01,Cocaine,9,09:17:20,15:42:38,2019-07-11,2019-07-11,20,1,15,5,"[6887, 8963, 8969, 8971, 11775, 11776, 11776, ...","[11660, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[6887, 8963, 11775, 11805, 11986, 12074, 12207...","[8969, 8971, 11776, 11776, 18781, 0, 0, 0, 0, ..."
9,933000300000000.0,F810,BSB273B,8,LGA01,Cocaine,10,09:17:23,15:42:38,2019-07-11,2019-07-11,1,5,1,0,"[9693, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[11760, 17164, 20411, 20414, 20415, 0, 0, 0, 0...","[9693, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [46]:
# Name the output after the input: '<session>_output.xlsx' -> '<session>_transformed.csv'.
# The old `file[:-11]` slice silently mangled any name that did not end in
# exactly 'output.xlsx'.
stem = os.path.splitext(file)[0]
if stem.endswith('_output'):
    stem = stem[:-len('_output')]
filename = stem + '_transformed.csv'

# Create the output folder on first use instead of failing inside to_csv.
os.makedirs(output_path, exist_ok=True)
df.to_csv(os.path.join(output_path, filename))

In [None]:
def _parse_filename(file, parsers):
    """Return the (room, cohort, trial_id) strings encoded in a file name.

    Names starting with 'C' are tried against ``parsers[1]`` (cohort and trial
    only; room is reported as 'N/A') and then against ``parsers[0]``. All other
    names are tried against ``parsers[0]`` only.

    Raises:
        ValueError: if none of the attempted patterns matches ``file``.
    """
    if file.startswith('C'):
        attempts = [(parsers[1], False), (parsers[0], True)]
    else:
        attempts = [(parsers[0], True)]

    for pattern, has_room in attempts:
        found = re.findall(pattern, file)
        if not found:
            continue
        if has_room:
            room, cohort, trial_id = found[0]
        else:
            cohort, trial_id = found[0]
            room = 'N/A'
        return room, cohort, trial_id

    raise ValueError("file name {!r} does not match any of the parsers".format(file))


def transform_data(input_path, file, parsers, drug):
    """Load one Excel export and reshape it into one row per box.

    The sheet is transposed so that its first row supplies the field names.
    The per-press columns ('Active 1'..., 'Inactive 1'..., 'Reward 1'...,
    'Timeout Press 1'...) are collapsed into four list-valued columns, and the
    room, cohort and trial id parsed from the file name are attached to every row.

    Args:
        input_path: folder that contains ``file``.
        file: name of the Excel file to read.
        parsers: two regex patterns; ``parsers[0]`` captures (room, cohort,
            trial id) and ``parsers[1]`` captures (cohort, trial id).
        drug: label written unchanged into the 'Drug' column.

    Returns:
        DataFrame with the columns Subject, Room, Cohort, Trial ID, Drug, Box,
        Start/End Time, Start/End Date, the four press counts and the four
        timestamp lists.

    Raises:
        ValueError: if the file name matches no parser, or if one of the
            marker columns ('Active 1', 'Inactive 1', 'Reward 1',
            'Timeout Press 1') is missing.
    """
    # import data and transpose
    filepath = os.path.join(input_path, file)
    df_raw = pd.read_excel(filepath).T.reset_index()

    # modify the header: the first transposed row holds the field names.
    # An explicit copy avoids mutating a slice of df_raw (chained assignment).
    df = df_raw.iloc[1:].copy()
    df.columns = df_raw.iloc[0]
    df = df.reset_index(drop=True)
    df = df.drop(['Filename', 'Experiment', 'Group', 'MSN', 'FR'], axis=1)

    # change data types
    for col in df.columns.tolist():
        name = col.lower()
        # 'inactive' contains 'active', so inactive columns are cast here too.
        if ('active' in name) or ('reward' in name) or ('timeout' in name) or (name == 'box'):
            df[col] = df[col].astype('int32')
        elif 'date' in name:
            df[col] = df[col].apply(lambda x: datetime.strptime(x, "%Y-%m-%d").date())
        # Checked after the integer branch, so 'timeout' columns never reach it.
        elif 'time' in name:
            df[col] = df[col].apply(lambda x: datetime.strptime(x, "%H:%M:%S").time())

    # group the timestamps
    colnames = df.columns.tolist()
    active_col_begin = colnames.index('Active 1')
    inactive_col_begin = colnames.index('Inactive 1')
    reward_col_begin = colnames.index('Reward 1')
    timeout_col_begin = colnames.index('Timeout Press 1')
    # Fix the end of the raw columns BEFORE appending anything. The previous
    # open-ended slice `timeout_col_begin:` was evaluated after the first three
    # list columns had been added, so they leaked into 'Timeout Timestamps' and
    # inflated 'Timeout Presses' by three.
    raw_col_end = len(colnames)
    spans = [
        ('Active Timestamps', active_col_begin, inactive_col_begin),
        ('Inactive Timestamps', inactive_col_begin, reward_col_begin),
        ('Reward Timestamps', reward_col_begin, timeout_col_begin),
        ('Timeout Timestamps', timeout_col_begin, raw_col_end),
    ]
    for label, lo, hi in spans:
        df[label] = df.iloc[:, lo:hi].values.tolist()

    # reorganize the columns: the per-press columns are now inside the lists
    df = df.drop(colnames[active_col_begin:raw_col_end], axis=1)
    df = df.rename(columns={"Reward": "Reward Presses"})
    # Count only the non-zero entries of each timeout list.
    df['Timeout Presses'] = df['Timeout Timestamps'].apply(lambda stamps: sum(1 for t in stamps if t != 0))

    # parse the filename
    room, cohort, trial_id = _parse_filename(file, parsers)

    # Scalars broadcast to every row; the cohort is stored without its leading 'C'.
    df['Room'] = room
    df['Cohort'] = int(cohort[1:])
    df['Trial ID'] = trial_id
    df['Drug'] = drug

    # get the final output
    new_columns = ['Subject','Room','Cohort','Trial ID','Drug','Box','Start Time','End Time','Start Date','End Date',
                   'Active Lever Presses','Inactive Lever Presses','Reward Presses','Timeout Presses',
                   'Active Timestamps','Inactive Timestamps','Reward Timestamps','Timeout Timestamps']
    return df[new_columns]

In [None]:
if __name__ == "__main__":
    # Placeholders: fill these in before running the batch conversion.
    input_path = '__file path for input excel LGA or SHA data__'
    output_path = '__file path for output transformed LGA or SHA data__'
    # parsers[0] captures (room, cohort, trial id); parsers[1] captures
    # (cohort, trial id). The room suffix class was `[A-Z|0-9]{1}`; `|` is a
    # literal pipe inside a character class, so it has been removed.
    parsers = [r"(\A[A-Z]+[0-9]+[A-Z0-9])(C[0-9]{2})HS[OXY]*((?:LGA|SHA)[0-9]{2})",
               r"(\AC[0-9]{2})HS[OXY]*((?:LGA|SHA)[0-9]{2})"]
    drug = '__use oxycodone or cocaine__'

    # Keep only Excel workbooks; skip hidden entries such as .DS_Store and
    # Excel's '~$' lock files, which would otherwise crash read_excel mid-batch.
    files = [entry for entry in sorted(os.listdir(input_path))
             if entry.lower().endswith(('.xlsx', '.xls'))
             and not entry.startswith(('.', '~$'))]

    # Create the output folder up front instead of failing on the first to_csv.
    os.makedirs(output_path, exist_ok=True)

    # NOTE(review): unlike the single-file cells earlier in this notebook, this
    # loop neither lower-cases the column names nor merges the RFID table --
    # confirm that the batch output is meant to differ.
    for fname in tqdm(files):
        df = transform_data(input_path, fname, parsers, drug)
        # '<session>_output.xlsx' -> '<session>_transformed.csv'; replaces the
        # magic `[:-11]` slice, which mangled names with any other ending.
        stem = os.path.splitext(fname)[0]
        if stem.endswith('_output'):
            stem = stem[:-len('_output')]
        filename = stem + '_transformed.csv'
        df.to_csv(os.path.join(output_path, filename))