In [1]:
import pandas as pd
import re
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
import os
from tqdm import tqdm

In [2]:
# Lookup table read from rfid_cocaine.csv; transform_shock left-joins it on
# 'subject' to obtain the 'rfid' column.
df_rfid_coc = pd.read_csv(
    '/Users/yunyihuang/Desktop/DataStream/data_preprocessing/rfid_cocaine.csv',
    index_col=0,
)

In [10]:
# Every entry of the 'problem_original' folder is a file that must be
# re-ingested; the names are kept in sorted order.
problem_fp = '/Users/yunyihuang/George Lab Dropbox/George_Lab/Experiments/DataStream/DataSource/excel_output_files/COCAINE/SHOCK/problem_original'
files_to_reingest = sorted(os.listdir(problem_fp))
files_to_reingest

['MED1110C05HSSHOCK01_output.xlsx']

In [11]:
# Columns (and their order) kept in the final transformed SHOCK table.
characteristics_SHOCK = [
    # identification
    'rfid',
    'subject',
    'room',
    'cohort',
    'trial_id',
    'drug',
    'box',
    # session timing
    'start_time',
    'end_time',
    'start_date',
    'end_date',
    # per-session totals
    'total_active_lever_presses',
    'total_inactive_lever_presses',
    'total_shocks',
    'total_reward',
    'rewards_after_first_shock',
    # list-valued columns
    'rewards_got_shock',
    'reward_timestamps',
]

In [12]:
# Folder holding the Excel output files, and folder receiving the transformed CSVs.
input_path = '/Users/yunyihuang/George Lab Dropbox/George_Lab/Experiments/DataStream/DataSource/excel_output_files/COCAINE/SHOCK'
output_path = '/Users/yunyihuang/Desktop/gl_data/TRIAL/COC_SHOCK'

# File-name patterns used by transform_shock:
#   parsers[0] -> (room, cohort, shock id)
#   parsers[1] -> (cohort, shock id), for names that start with the cohort code
# NOTE(review): `[COCAINE]*` is a character class (any run of the letters
# C/O/A/I/N/E), not the literal word, and `[A-Z|0-9]` also admits a literal '|'.
# Left unchanged here because tightening them could change which files match.
parsers = [
    r"(\A[A-Z]+[0-9]+[A-Z|0-9]{1})(C[0-9]{2})HS[COCAINE]*((?:PRESHOCK[0-9]*|SHOCK[0-9]*))",
    r"(\AC[0-9]{2})HS((?:PRESHOCK[0-9]*|SHOCK[0-9]*))",
]

# Names containing any of these substrings are never processed.
_SKIP_MARKERS = ('Backup', 'miscalculated', 'duplicates', 'C21', 'problem')

# Keep only the entries of input_path that pass the exclusions above AND are
# listed in files_to_reingest.
files = [
    name
    for name in sorted(os.listdir(input_path))
    if name != '.DS_Store'
    and not any(marker in name for marker in _SKIP_MARKERS)
    and name in files_to_reingest
]
files

['MED1110C05HSSHOCK01_output.xlsx']

In [13]:
# Number of files selected for this run.
len(files)

1

In [8]:
def reformat_shock_id(shock_id, cohort):
    """Map the shock id parsed from a file name to the stored trial id.

    Args:
        shock_id: id captured from the file name, e.g. 'PRESHOCK02' or 'SHOCK01'.
        cohort: cohort number.

    Returns:
        'PRESHOCK' for any id containing 'PRESHOCK'. For cohorts 1 to 5,
        'SHOCK_V<n>' where <n> is the integer formed by the characters after
        the first five of ``shock_id``. For every other cohort, 'SHOCK_V3'.
    """
    if 'PRESHOCK' in shock_id:
        return 'PRESHOCK'

    if cohort not in range(1, 6):
        return 'SHOCK_V3'

    # Drop the leading 'SHOCK' and normalise the number ('01' -> 1).
    version = int(shock_id[5:])
    return 'SHOCK' + '_V' + str(version)
    
def process_datapoints(lst):
    """Return the datapoints with trailing zero padding removed.

    Zeros at the end of ``lst`` are treated as padding and dropped; zeros
    elsewhere are kept. The input is NOT modified: the previous version popped
    the padding off the caller's list in place, which silently altered the
    data the list came from.

    Args:
        lst: sequence of datapoints (list or tuple).

    Returns:
        A new list holding the values up to and including the last non-zero
        entry, or None when nothing is left (empty input or all zeros).
    """
    end = len(lst)
    # Walk back from the tail until the last non-zero value is found.
    while end > 0 and lst[end - 1] == 0:
        end -= 1

    if end == 0:
        return None
    return list(lst[:end])

def count_datapoints(lst):
    """Count the datapoints that remain once trailing zero padding is ignored.

    Zeros at the end of ``lst`` are treated as padding; zeros elsewhere count
    as datapoints. The input is NOT modified: the previous version popped the
    padding off the caller's list as a side effect of counting.

    Args:
        lst: sequence of datapoints (list or tuple).

    Returns:
        The number of entries up to and including the last non-zero one, or
        None when there are none (empty input or all zeros).
    """
    count = len(lst)
    # Walk back from the tail until the last non-zero value is found.
    while count > 0 and lst[count - 1] == 0:
        count -= 1

    if count == 0:
        return None
    return count

In [94]:
# NOTE(review): in notebook order this call sits above the cell that defines
# transform_shock, so a fresh-kernel top-to-bottom run fails here with a
# NameError. Move it below the definition (or delete it) before sharing.
transform_shock(input_path, 'MTF134BC19HSPRESHOCK02_output.xlsx', parsers)

## Main Code

In [14]:
def _parse_shock_filename(file, parsers):
    """Split a SHOCK output file name into (room, cohort, shock_id).

    Names starting with 'C' are matched with ``parsers[1]`` (cohort and shock
    id only, room is None); all other names are matched with ``parsers[0]``
    (room, cohort, shock id). The cohort code 'Cnn' is returned as the int nn.

    Raises:
        ValueError: if the selected pattern does not match the file name.
    """
    # '-' is replaced by '0' before matching -- presumably so that dashed
    # names still satisfy the patterns; confirm against the naming scheme.
    modified_filename = file.replace('-', '0')

    has_room = file[0] != 'C'
    parser = parsers[0] if has_room else parsers[1]
    matches = re.findall(parser, modified_filename)
    if not matches:
        raise ValueError(
            'Cannot parse SHOCK file name {!r} with pattern {!r}'.format(file, parser)
        )

    if has_room:
        room, cohort, shock_id = matches[0]
    else:
        room = None
        cohort, shock_id = matches[0]

    return room, int(cohort[1:]), shock_id


def transform_shock(input_path, file, parsers):
    """Transform one SHOCK Excel output file into a per-subject table.

    The sheet is read with one column per subject, transposed to one row per
    subject, typed, and the per-reward columns are folded into two list-valued
    columns. Room, cohort and trial id are derived from the file name, the
    rfid is joined in by subject, and the result is written as
    '<name>_transformed.csv'.

    Relies on names defined elsewhere in the notebook: ``df_rfid_coc`` (rfid
    lookup), ``characteristics_SHOCK`` (output columns), ``output_path``
    (destination folder), ``process_datapoints`` and ``reformat_shock_id``.

    Args:
        input_path: folder containing ``file``.
        file: Excel file name; the output name is built by removing its last
            11 characters ('output.xlsx') and appending 'transformed.csv'.
        parsers: two regex patterns, see ``_parse_shock_filename``.

    Returns:
        The transformed DataFrame (the same data that is written to CSV).

    Raises:
        ValueError: if the file name cannot be parsed, or (from list.index)
            if an expected 'Reward ...' column is missing.
    """
    filepath = os.path.join(input_path, file)
    df_raw = pd.read_excel(filepath)

    # Remove extra columns: the number of distinct integers in row 5 is taken
    # as the number of subjects; anything beyond label column + subjects goes.
    # NOTE(review): confirm which field row 5 holds in the exported sheet.
    num_subjects = len({i for i in df_raw.iloc[5, :].values if isinstance(i, int)})
    if df_raw.shape[1] > num_subjects + 1:
        df_raw = df_raw.iloc[:, :num_subjects + 1]

    # One row per subject; the first transposed row carries the field names.
    df_raw = df_raw.T.reset_index()
    new_header = df_raw.iloc[0]
    # Explicit copy: the frame is modified below and must not be a view-like
    # slice of df_raw (the notebook silences the warning pandas would raise).
    df = df_raw[1:].copy()
    df.columns = new_header
    df = df.reset_index(drop=True)
    df = df.drop(['Filename', 'Experiment', 'Group', 'MSN', 'FR'], axis=1)

    # Change data types, chosen from the column name.
    int_columns = ['box', 'total shocks', 'total reward']
    for col in df.columns.tolist():
        name = col.lower()
        if ('active' in name) or ('reward' in name) or (name in int_columns):
            df[col] = df[col].astype('int32')
        elif 'date' in name:
            df[col] = df[col].apply(lambda x: datetime.strptime(x, "%Y-%m-%d").date())
        elif 'time' in name:
            df[col] = df[col].apply(lambda x: datetime.strptime(x, "%H:%M:%S").time())

    # Fold the per-reward columns into two list-valued columns.
    colnames = df.columns.tolist()
    reward_shock_begin = colnames.index('Reward # Got Shock 1')
    reward_col_begin = colnames.index('Reward 1')
    reward_col_end = colnames.index('Reward 201')

    df['Rewards Got Shock'] = df.iloc[:, reward_shock_begin:reward_col_begin].values.tolist()
    df['Rewards Got Shock'] = df['Rewards Got Shock'].apply(process_datapoints)
    df['Reward Timestamps'] = df.iloc[:, reward_col_begin:reward_col_end + 1].values.tolist()
    df['Reward Timestamps'] = df['Reward Timestamps'].apply(process_datapoints)

    # Drop the now-redundant columns by label (previously a DataFrame slice
    # was passed to drop(), which only worked because iterating a DataFrame
    # yields its column labels).
    df = df.drop(columns=colnames[reward_shock_begin:reward_col_end + 1])

    # Metadata derived from the file name.
    room, cohort, shock_id = _parse_shock_filename(file, parsers)
    trial_id = reformat_shock_id(shock_id, cohort)

    df['room'] = [room] * len(df)
    df['cohort'] = [cohort] * len(df)
    df['trial_id'] = [trial_id] * len(df)
    df['drug'] = ['cocaine'] * len(df)

    # Final output: lower-case names, rfid joined by subject, snake_case.
    df = df.rename(columns=str.lower)
    df = pd.merge(df, df_rfid_coc, how='left', on=['subject'])
    df.columns = df.columns.str.replace(' ', '_')
    # Subjects without a match in the lookup get the sentinel rfid -999.
    df = df.fillna({'rfid': -999})
    df['rfid'] = df['rfid'].astype('int64')
    df = df[characteristics_SHOCK]
    df = df.sort_values(by='box')

    # Store the final output in csv.
    filename = file[:-11] + 'transformed.csv'
    df.to_csv(os.path.join(output_path, filename))

    return df

In [15]:
# Transform every selected file, with a progress bar over the file list.
for shock_file in tqdm(files):
    transform_shock(input_path, shock_file, parsers)

100%|████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.98it/s]


## Sketch

In [124]:
# File used by the scratch cell below; its name starts with 'C', so the
# parsers[1] branch (no room) is taken there.
file = 'C03HSSHOCK01.2_output.xlsx'

In [130]:
# Scratch re-run of the transform_shock steps on the single `file` chosen above;
# it ends by displaying the frame instead of writing a CSV.
# NOTE(review): the statements below duplicate the body of transform_shock line
# for line, so any change there must be repeated here (or this cell replaced by
# a call once the function returns its frame).
filepath = os.path.join(input_path, file)
df_raw = pd.read_excel(filepath)

# remove extra
# The number of distinct integers in row 5 is taken as the subject count;
# columns beyond (label column + subjects) are cut off.
num_subjects = len(set([i for i in df_raw.iloc[5,:].values if isinstance(i, int)]))
if df_raw.shape[1] > num_subjects+1:
    df_raw = df_raw.iloc[:,:num_subjects+1]

# After the transpose each row is one original column of the sheet.
df_raw = df_raw.T
df_raw.reset_index(inplace=True)

# modify the header
new_header = df_raw.iloc[0]   #grab the first row for the header
df = df_raw[1:]               #take the data except the header row
df.columns = new_header 
df.reset_index(drop=True, inplace=True)
df.drop(['Filename', 'Experiment', 'Group', 'MSN', 'FR'], axis=1, inplace=True)

# change data types
# The target type is picked from the lower-cased column name.
cols = df.columns.tolist()
int_columns = ['box','total shocks','total reward']

for col in cols:
    name = col.lower()
    if ('active' in name) or ('reward' in name) or (name in int_columns):
        df[col] = df[col].astype('int32')
    elif ('date' in name):
        df[col] = df[col].apply(lambda x: datetime.strptime(x, "%Y-%m-%d").date())
    elif ('time' in name):
        df[col] = df[col].apply(lambda x: datetime.strptime(x, "%H:%M:%S").time())
    else:
        pass

# reorganize the columns
colnames = df.columns.tolist()

# Positions of the first 'Reward # Got Shock' column and of the first/last
# 'Reward' columns; the slices below depend on this column order.
reward_shock_begin = colnames.index('Reward # Got Shock 1')
reward_col_begin = colnames.index('Reward 1')
reward_col_end = colnames.index('Reward 201')

# Collapse each column range into one list per row, then strip trailing zeros.
df['Rewards Got Shock'] = df.iloc[:,reward_shock_begin:reward_col_begin].values.tolist()
df['Rewards Got Shock'] = df['Rewards Got Shock'].apply(process_datapoints)
df['Reward Timestamps'] = df.iloc[:,reward_col_begin:reward_col_end+1].values.tolist()
df['Reward Timestamps'] = df['Reward Timestamps'].apply(process_datapoints)

# Drop the original per-reward columns now that they are folded into lists.
df.drop(df.iloc[:, reward_shock_begin:reward_col_end+1], inplace=True, axis=1)

modified_filename = file.replace('-','0')
# parse the file name
# Names starting with 'C' carry no room: parsers[1] yields (cohort, shock id).
if file[0] == 'C':
    parser = parsers[1]
    cohort, shock_id = re.findall(parser, modified_filename)[0]
    room = None
else:
    parser = parsers[0]
    room, cohort, shock_id = re.findall(parser, modified_filename)[0]

# 'Cnn' -> nn
cohort = int(cohort[1:])
trial_id = reformat_shock_id(shock_id, cohort)

df['room'] = [room] * len(df)
df['cohort'] = [cohort] * len(df)
df['trial_id'] = [trial_id] * len(df)
df['drug'] = ['cocaine'] * len(df)

# get the final output
df.rename(columns=str.lower,inplace=True)
df = pd.merge(df, df_rfid_coc,  how='left', on = ['subject'])
df.columns = df.columns.str.replace(' ','_')
# Subjects without a match in df_rfid_coc get the sentinel rfid -999.
df.fillna({'rfid':-999}, inplace=True)
df['rfid'] = df['rfid'].astype('int64')
df = df[characteristics_SHOCK]
df = df.sort_values(by='box')

df

Unnamed: 0,rfid,subject,room,cohort,trial_id,drug,box,start_time,end_time,start_date,end_date,total_active_lever_presses,total_inactive_lever_presses,total_shocks,total_reward,rewards_after_first_shock,rewards_got_shock,reward_timestamps
0,933000120138648,F309,,3,SHOCK_V1,cocaine,1,08:54:04,10:03:41,2018-03-15,2018-03-15,26,5,6,23,21,"[2, 3, 10, 15, 16, 19]","[308, 345, 402, 440, 506, 666, 843, 979, 1158,..."
1,933000120138642,F310,,3,SHOCK_V1,cocaine,2,08:54:08,10:03:41,2018-03-15,2018-03-15,27,0,8,25,21,"[4, 6, 9, 11, 12, 13, 21, 24]","[8, 35, 59, 84, 107, 238, 331, 578, 790, 963, ..."
2,933000120138559,F311,,3,SHOCK_V1,cocaine,3,08:54:11,10:03:41,2018-03-15,2018-03-15,31,7,7,27,26,"[1, 5, 6, 15, 18, 19, 22]","[17, 39, 126, 175, 259, 305, 458, 565, 638, 84..."
3,933000120138556,F312,,3,SHOCK_V1,cocaine,4,08:54:13,10:03:41,2018-03-15,2018-03-15,28,0,8,26,20,"[6, 7, 10, 11, 14, 20, 22, 23]","[333, 368, 451, 538, 649, 692, 793, 943, 1112,..."
4,933000120138647,F313,,3,SHOCK_V1,cocaine,5,08:54:16,10:03:41,2018-03-15,2018-03-15,25,1,6,22,15,"[7, 9, 10, 11, 17, 19]","[36, 67, 101, 171, 219, 288, 583, 924, 988, 13..."
5,933000120138652,F314,,3,SHOCK_V1,cocaine,6,08:54:22,10:03:41,2018-03-15,2018-03-15,3,1,1,2,1,[1],"[152, 423]"
6,933000120138634,F315,,3,SHOCK_V1,cocaine,7,08:54:25,10:03:41,2018-03-15,2018-03-15,14,0,4,13,7,"[6, 7, 8, 12]","[14, 109, 252, 793, 996, 1261, 1546, 1870, 216..."
7,933000120138633,F316,,3,SHOCK_V1,cocaine,8,08:54:29,10:03:41,2018-03-15,2018-03-15,26,0,8,25,21,"[4, 8, 9, 11, 18, 19, 22, 24]","[15, 51, 86, 145, 220, 388, 620, 760, 877, 108..."
8,933000120138394,M359,,3,SHOCK_V1,cocaine,9,08:54:47,10:03:41,2018-03-15,2018-03-15,30,0,6,23,21,"[2, 3, 7, 11, 15, 20]","[32, 79, 125, 366, 494, 813, 851, 1068, 1362, ..."
9,933000120138389,M360,,3,SHOCK_V1,cocaine,10,08:54:50,10:03:41,2018-03-15,2018-03-15,0,0,0,0,0,,


In [131]:
# (rows, columns) of the scratch result above.
df.shape

(16, 18)