In [109]:
import numpy as np
import pandas as pd
import re
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
import os
from tqdm import tqdm

In [110]:
# Per-drug RFID tables; column 0 of each CSV is used as the index.
# transform_pr left-joins one of these onto the session data by 'subject'.
RFID_COC = pd.read_csv('rfid_cocaine.csv', index_col=0)
RFID_OXY = pd.read_csv('rfid_oxycodone.csv', index_col=0)

In [111]:
# Columns (and their order) kept in every transformed PR table.
characteristics_PR = [
    # identity of the subject / session
    'rfid',
    'subject',
    'room',
    'cohort',
    'trial_id',
    'drug',
    'box',
    # session timing
    'start_time',
    'end_time',
    'start_date',
    'end_date',
    # derived and raw response measures
    'breakpoint',
    'last_ratio',
    'ratios',
    'active_lever_presses',
    'inactive_lever_presses',
    'reward_presses',
]

In [112]:
# Cocaine lookup: number of rewards earned -> ratio value.
# transform_pr uses this mapping to derive the 'breakpoint' column.
coc_lr = [0, 1, 2, 4, 6, 9, 12, 15, 20, 25, 32, 40, 50, 62, 77, 95, 118, 145, 178]
coc_rewards = np.arange(len(coc_lr))  # one reward count per schedule entry (0..18)
coc_lr_dict = {reward: ratio for reward, ratio in zip(coc_rewards, coc_lr)}
coc_lr_dict

{0: 0,
 1: 1,
 2: 2,
 3: 4,
 4: 6,
 5: 9,
 6: 12,
 7: 15,
 8: 20,
 9: 25,
 10: 32,
 11: 40,
 12: 50,
 13: 62,
 14: 77,
 15: 95,
 16: 118,
 17: 145,
 18: 178}

In [113]:
# Oxycodone lookup: number of rewards earned -> ratio value.
# transform_pr uses this mapping to derive the 'breakpoint' column.
oxy_lr = (
    [0]
    + [step for step in range(1, 10) for _ in range(2)]  # 1,1, 2,2, ..., 9,9
    + [10]
    + list(range(10, 49))                                # 10, 11, ..., 48
    + [50, 60, 70, 80, 90]
    + [100] * 5                                          # schedule plateaus at 100
)
oxy_rewards = np.arange(len(oxy_lr))  # one reward count per schedule entry (0..68)
oxy_lr_dict = {reward: ratio for reward, ratio in zip(oxy_rewards, oxy_lr)}
oxy_lr_dict

{0: 0,
 1: 1,
 2: 1,
 3: 2,
 4: 2,
 5: 3,
 6: 3,
 7: 4,
 8: 4,
 9: 5,
 10: 5,
 11: 6,
 12: 6,
 13: 7,
 14: 7,
 15: 8,
 16: 8,
 17: 9,
 18: 9,
 19: 10,
 20: 10,
 21: 11,
 22: 12,
 23: 13,
 24: 14,
 25: 15,
 26: 16,
 27: 17,
 28: 18,
 29: 19,
 30: 20,
 31: 21,
 32: 22,
 33: 23,
 34: 24,
 35: 25,
 36: 26,
 37: 27,
 38: 28,
 39: 29,
 40: 30,
 41: 31,
 42: 32,
 43: 33,
 44: 34,
 45: 35,
 46: 36,
 47: 37,
 48: 38,
 49: 39,
 50: 40,
 51: 41,
 52: 42,
 53: 43,
 54: 44,
 55: 45,
 56: 46,
 57: 47,
 58: 48,
 59: 50,
 60: 60,
 61: 70,
 62: 80,
 63: 90,
 64: 100,
 65: 100,
 66: 100,
 67: 100,
 68: 100}

In [163]:
# retrieve the last ratio: the schedule entry that follows the breakpoint
def get_last_ratio(lr_dict, breakpoint):
    """Return the ratio that comes right after ``breakpoint`` in ``lr_dict``.

    Parameters
    ----------
    lr_dict : dict
        Schedule mapping reward count -> ratio value; the order of its values
        is the order of the schedule.
    breakpoint : number or None
        A ratio value taken from ``lr_dict`` (not a reward count).

    Returns
    -------
    The schedule entry following the first occurrence of ``breakpoint``, or
    ``None`` when ``breakpoint`` is missing (None / 0 / NaN), is not a ratio of
    the schedule, or is the final entry and therefore has no successor.

    Notes
    -----
    When a schedule repeats a ratio (e.g. ``1, 1, 2, 2``) the first occurrence
    is used, so the returned value can equal ``breakpoint`` itself.
    """
    # None, 0 and NaN all mean that no breakpoint was reached.
    if not breakpoint or np.isnan(breakpoint):
        return None

    lr_list = list(lr_dict.values())
    # The breakpoint is a ratio VALUE, so membership has to be tested against
    # the values. Testing against the dict keys (reward counts) rejected every
    # valid breakpoint larger than the highest reward count.
    if breakpoint not in lr_list:
        return None

    idx = lr_list.index(breakpoint) + 1
    # The last schedule entry has nothing after it.
    if idx >= len(lr_list):
        return None
    return lr_list[idx]

# return valid list of datapoints
def process_datapoints(lst):
    """Strip trailing zeros from ``lst`` and return it, or ``None`` if nothing is left.

    The trimming is done in place: the list that is returned is the same
    object that was passed in.
    """
    while True:
        # stop on an empty list or on the first non-zero tail element
        if not lst or not (lst[-1] == 0):
            break
        lst.pop()

    return lst if len(lst) != 0 else None

# count valid data points
def count_datapoints(lst):
    """Return how many entries of ``lst`` remain once trailing zeros are ignored.

    Unlike a pop-based trim, this only reads ``lst``: the caller's sequence is
    left untouched, and any indexable sequence (list, tuple) is accepted.

    Returns ``None`` when ``lst`` is ``None``, empty, or contains only zeros.
    """
    if not lst:
        return None

    # Walk back from the end until the first non-zero entry.
    valid = len(lst)
    while valid > 0 and lst[valid - 1] == 0:
        valid -= 1

    return valid if valid else None
    
# standardize trial id
def process_trial_id(tid):
    """Left-pad the numeric part of a trial id to two characters, e.g. 'PR1' -> 'PR01'.

    ``tid`` is split at its first digit; everything before is kept verbatim
    and everything from that digit on is padded with '0' up to width 2.
    Raises IndexError when ``tid`` contains no digit.
    """
    pos = 0
    # Index-based scan on purpose: running off the end raises IndexError.
    while True:
        if tid[pos].isdigit():
            break
        pos += 1

    prefix = tid[:pos]
    number = tid[pos:]
    # `number` always starts with a digit, so zfill pads exactly like rjust(2, "0").
    return prefix + number.zfill(2)

### Cocaine (coc)

In [114]:
# Cocaine PR settings.
# NOTE: input_path, output_path and parsers are shared names that the
# oxycodone settings cell assigns again, so whichever settings cell ran last wins.
drugA = 'cocaine'
input_path = '/Users/yunyihuang/George Lab Dropbox/George_Lab/Experiments/DataStream/DataSource/excel_output_files/COCAINE/PR'
output_path = '/Users/yunyihuang/Desktop/gl_data/TRIAL/COC_PR'

# File-name parsers consumed by transform_pr:
#   parsers[0] -> (room, cohort, trial_id)
#   parsers[1] -> (cohort, trial_id), used for names that start with 'C'
# NOTE(review): `[COCAINE|OXY]*`, `[OXY]*` and `[A-Z|0-9]` are character
# classes, not alternations, so they are looser than they read. Left
# unchanged here because they match the file names currently processed.
parsers = [
    r"(\A[A-Z]+[0-9]+[A-Z|0-9]{1})(C[0-9]{2})HS[COCAINE|OXY]*((?:LGA|SHA|PR|TREATMENT)[0-9]+)_output",
    r"(\AC[0-9]{2})HS[OXY]*((?:LGA|SHA|PR|TREATMENT)[0-9]+)_output",
]

In [93]:
# Cocaine PR workbooks to convert: every entry of input_path, sorted by name,
# except the macOS '.DS_Store' file and names containing 'C21' or 'duplicates'.
coc_files = [
    name
    for name in sorted(os.listdir(input_path))
    if name != '.DS_Store'
    and not any(tag in name for tag in ('C21', 'duplicates'))
]
coc_files

['BSB273BC08HSPR01_output.xlsx',
 'BSB273BC08HSPR02_output.xlsx',
 'BSB273BC08HSPR03_output.xlsx',
 'BSB273BC09HSPR01_output.xlsx',
 'BSB273BC09HSPR02_output.xlsx',
 'BSB273BC09HSPR03_output.xlsx',
 'BSB273BC11HSPR01_output.xlsx',
 'BSB273BC11HSPR02_output.xlsx',
 'BSB273BC11HSPR03_output.xlsx',
 'BSB273BC13HSPR01_output.xlsx',
 'BSB273BC13HSPR02_output.xlsx',
 'BSB273BC13HSPR03_output.xlsx',
 'BSB273BC14HSPR01_output.xlsx',
 'BSB273BC14HSPR02_output.xlsx',
 'BSB273BC14HSPR03_output.xlsx',
 'BSB273BC15HSCOCAINETREATMENT1_output.xlsx',
 'BSB273BC15HSCOCAINETREATMENT2_output.xlsx',
 'BSB273BC15HSCOCAINETREATMENT3_output.xlsx',
 'BSB273BC15HSCOCAINETREATMENT4_output.xlsx',
 'BSB273BC15HSPR01_output.xlsx',
 'BSB273BC15HSPR02_output.xlsx',
 'BSB273BC15HSPR03_output.xlsx',
 'BSB273BC16HSPR01_output.xlsx',
 'BSB273BC16HSPR02_output.xlsx',
 'BSB273BC16HSPR03_output.xlsx',
 'BSB273BC17HSPR01_output.xlsx',
 'BSB273BC17HSPR02_output.xlsx',
 'BSB273BC17HSPR03_output.xlsx',
 'BSB273CC08HSPR01_outpu

In [94]:
# Number of cocaine PR workbooks selected for conversion.
len(coc_files)

174

### Oxycodone (oxy)

In [123]:
# Oxycodone PR settings.
# NOTE: this assigns input_path, output_path and parsers again, replacing the
# cocaine values; anything run afterwards reads and writes the oxycodone folders.
drugB = 'oxycodone'
input_path = '/Users/yunyihuang/George Lab Dropbox/George_Lab/Experiments/DataStream/DataSource/excel_output_files/OXYCODONE/PR'
output_path = '/Users/yunyihuang/Desktop/gl_data/TRIAL/OXY_PR'

# File-name parsers consumed by transform_pr:
#   parsers[0] -> (room, cohort, trial_id)
#   parsers[1] -> (cohort, trial_id), used for names that start with 'C'
# NOTE(review): `[COCAINE|OXY]*`, `[OXY]*` and `[A-Z|0-9]` are character
# classes, not alternations, so they are looser than they read. Left
# unchanged here because they match the file names currently processed.
parsers = [
    r"(\A[A-Z]+[0-9]+[A-Z|0-9]{1})(C[0-9]{2})HS[COCAINE|OXY]*((?:LGA|SHA|PR|TREATMENT)[0-9]+)_output",
    r"(\AC[0-9]{2})HS[OXY]*((?:LGA|SHA|PR|TREATMENT)[0-9]+)_output",
]

In [121]:
# Oxycodone PR workbooks to convert: every entry of input_path, sorted by name,
# except the macOS '.DS_Store' file and names containing any excluded tag.
oxy_files = [
    name
    for name in sorted(os.listdir(input_path))
    if name != '.DS_Store'
    and not any(tag in name for tag in ('DISSECTION', 'PRETREATMENT', 'C21', 'duplicates'))
]
oxy_files

['BSB273BC04HSOXYPR1_output.xlsx',
 'BSB273BC04HSOXYPR2_output.xlsx',
 'BSB273BC04HSOXYTREATMENT1_output.xlsx',
 'BSB273BC04HSOXYTREATMENT2_output.xlsx',
 'BSB273BC04HSOXYTREATMENT3_output.xlsx',
 'BSB273BC04HSOXYTREATMENT4_output.xlsx',
 'BSB273BC05HSOXYPR1_output.xlsx',
 'BSB273BC05HSOXYPR2_output.xlsx',
 'BSB273BC05HSOXYTREATMENT1_output.xlsx',
 'BSB273BC05HSOXYTREATMENT2_output.xlsx',
 'BSB273BC05HSOXYTREATMENT3_output.xlsx',
 'BSB273BC05HSOXYTREATMENT4_output.xlsx',
 'BSB273BC06HSOXYPR01_output.xlsx',
 'BSB273BC06HSOXYPR02_output.xlsx',
 'BSB273BC06HSOXYTREATMENT1_output.xlsx',
 'BSB273BC06HSOXYTREATMENT2_output.xlsx',
 'BSB273BC06HSOXYTREATMENT3_output.xlsx',
 'BSB273BC06HSOXYTREATMENT4_output.xlsx',
 'BSB273BC07HSOXYPR01_output.xlsx',
 'BSB273BC07HSOXYPR02_output.xlsx',
 'BSB273BC07HSOXYTREATMENT1_output.xlsx',
 'BSB273BC07HSOXYTREATMENT2_output.xlsx',
 'BSB273BC07HSOXYTREATMENT3_output.xlsx',
 'BSB273BC07HSOXYTREATMENT4_output.xlsx',
 'BSB273BC09HSOXYPR01_output.xlsx',
 'BSB273

In [100]:
# Number of oxycodone PR workbooks selected for conversion.
len(oxy_files)

361

## Main Code

In [164]:
def transform_pr(input_path, output_path, file, parsers, drug):
    """Transform one raw PR session workbook into a per-subject CSV.

    The workbook is read with ``pd.read_excel`` and transposed so that each row
    is one subject. The columns are then typed and the per-reward columns
    collapsed into a single ``ratios`` list. The room, cohort and trial id
    parsed from the file name are attached, ``breakpoint`` and ``last_ratio``
    are derived from the drug's schedule, and the drug's RFID table is
    left-joined on ``subject``. The result is written to ``output_path``.

    Parameters
    ----------
    input_path : str
        Directory that contains ``file``.
    output_path : str
        Directory the transformed CSV is written to.
    file : str
        Workbook file name. Its last 11 characters (``output.xlsx`` for the
        files used in this notebook) are replaced by ``transformed.csv``.
    parsers : list of str
        Two regular expressions: ``parsers[0]`` yields (room, cohort,
        trial_id); ``parsers[1]`` yields (cohort, trial_id) and is used when
        the file name starts with ``'C'`` (room is then ``None``).
    drug : str
        ``'cocaine'`` or ``'oxycodone'`` (case-insensitive).

    Returns
    -------
    None
        The table is written to disk rather than returned.

    Raises
    ------
    ValueError
        If ``drug`` is not a supported drug.
    IndexError
        If the file name does not match the selected parser.
    """
    # Resolve the drug-specific tables first. An unsupported drug then fails
    # with a clear message before any file is read, instead of leaving
    # rfid_to_merge / lr_dict unbound further down.
    drug_key = drug.lower()
    if drug_key == 'cocaine':
        rfid_to_merge = RFID_COC
        lr_dict = coc_lr_dict
    elif drug_key == 'oxycodone':
        rfid_to_merge = RFID_OXY
        lr_dict = oxy_lr_dict
    else:
        raise ValueError("unsupported drug %r: expected 'cocaine' or 'oxycodone'" % (drug,))

    # import data and transpose
    filepath = os.path.join(input_path, file)
    df_raw = pd.read_excel(filepath)
    # the distinct integer entries found in row 5 give the number of subjects
    num_subjects = len(set([i for i in df_raw.iloc[5,:].values if isinstance(i, int)]))

    # drop any columns beyond the label column plus one column per subject
    if df_raw.shape[1] > num_subjects+1:
        df_raw = df_raw.iloc[:,:num_subjects+1]

    df_raw = df_raw.T
    df_raw.reset_index(inplace=True)

    # modify the header
    new_header = df_raw.iloc[0]   #grab the first row for the header
    # explicit copy: the frame is modified below, so it must not be a slice of df_raw
    df = df_raw[1:].copy()        #take the data except the header row
    df.columns = new_header
    df.reset_index(drop=True, inplace=True)
    df.drop(['Filename', 'Experiment', 'Group', 'MSN', 'FR'], axis=1, inplace=True)

    # change data types
    cols = df.columns.tolist()
    int_columns = ['box','last ratio']

    for col in cols:
        name = col.lower()
        # 'active' also matches the inactive-lever column; 'reward' also
        # matches every per-reward column ('Reward 1', ...)
        if ('active' in name) or ('reward' in name) or (name in int_columns):
            df[col] = df[col].astype('int32')
        elif ('date' in name):
            df[col] = df[col].apply(lambda x: datetime.strptime(x, "%Y-%m-%d").date())
        elif ('time' in name):
            df[col] = df[col].apply(lambda x: datetime.strptime(x, "%H:%M:%S").time())
        else:
            pass

    # reorganize the columns: everything from 'Reward 1' onwards is folded into
    # one list per subject (trailing zeros stripped), then the originals are dropped
    colnames = df.columns.tolist()
    reward_col_begin = colnames.index('Reward 1')
    df['ratios'] = df.iloc[:, reward_col_begin:].values.tolist()
    df['ratios'] = df['ratios'].apply(process_datapoints)
    points_col_begin = df.columns.tolist().index('ratios')
    df.drop(df.iloc[:, reward_col_begin:points_col_begin], inplace=True, axis=1)
    df.rename(columns={"Reward": "Reward Presses"}, inplace=True)

    # parse the file name (raises IndexError when the name does not match)
    if file[0] == 'C':
        parser = parsers[1]
        cohort, trial_id = re.findall(parser, file)[0]
        room = None
    else:
        parser = parsers[0]
        room, cohort, trial_id = re.findall(parser, file)[0]

    df['room'] = [room] * len(df)
    cohort = int(cohort[1:])      # 'C04' -> 4
    df['cohort'] = [cohort] * len(df)
    trial_id = process_trial_id(trial_id)
    df['trial_id'] = [trial_id] * len(df)
    df['drug'] = [drug] * len(df)

    # normalise the column names: lower case, underscores instead of spaces
    df.rename(columns=str.lower,inplace=True)
    df.columns = df.columns.str.replace(' ','_')

    # calculate special variables
    df['breakpoint'] = df['reward_presses'].apply(lambda x: lr_dict[x] if x in lr_dict else None)
    df['last_ratio'] = df['breakpoint'].apply(lambda x: get_last_ratio(lr_dict, x))

    # merge in the RFID; subjects without a match get the sentinel -999
    df = pd.merge(df, rfid_to_merge,  how='left', on = ['subject'])
    df.columns = df.columns.str.replace(' ','_')
    df.fillna({'rfid':-999}, inplace=True)
    df['rfid'] = df['rfid'].astype('int64')
    df = df[characteristics_PR]
    df = df.sort_values(by='box')

    # '<name>_output.xlsx' -> '<name>_transformed.csv'
    filename = file[:-11] + 'transformed.csv'
    df.to_csv(os.path.join(output_path, filename))

In [161]:
# Pick the first oxycodone workbook for a single-file trial run of transform_pr.
# The Sketch cell further down reads this same `file` variable.
file = oxy_files[0]
file

'BSB273BC04HSOXYPR1_output.xlsx'

In [162]:
# Single-file run with the oxycodone drug label; input_path, output_path and
# parsers are whatever the most recently executed settings cell assigned.
transform_pr(input_path, output_path, file, parsers, drugB)

In [97]:
# Convert every cocaine PR workbook.
# NOTE: input_path / output_path / parsers are shared with the oxycodone
# settings cell, which overwrites them. Re-run the cocaine settings cell right
# before this loop so the files are read from and written to the cocaine folders.
for coc_file in tqdm(coc_files):
    transform_pr(input_path, output_path, coc_file, parsers, drugA)

100%|███████████████████████████████████████████| 174/174 [00:13<00:00, 12.49it/s]


In [103]:
# Convert every oxycodone PR workbook.
# NOTE: relies on input_path / output_path / parsers holding the oxycodone
# values, i.e. on the oxycodone settings cell being the last settings cell run.
for oxy_file in tqdm(oxy_files):
    transform_pr(input_path, output_path, oxy_file, parsers, drugB)

100%|███████████████████████████████████████████| 361/361 [00:28<00:00, 12.64it/s]


## Sketch

In [132]:
# Drug label used by the step-by-step Sketch cell below; it stands in for the `drug` argument of transform_pr.
drug = 'oxycodone'

In [155]:
# Scratch copy of the body of transform_pr, run at top level so the
# intermediate frame can be inspected. It reads the notebook globals
# input_path, file, parsers and drug, so those must be set by earlier cells.
# Unlike transform_pr it neither sorts by 'box' nor writes a CSV; it just
# displays the resulting frame.

# import data and transpose
filepath = os.path.join(input_path, file)
df_raw = pd.read_excel(filepath)
# the distinct integer entries found in row 5 give the number of subjects
num_subjects = len(set([i for i in df_raw.iloc[5,:].values if isinstance(i, int)]))

# drop any columns beyond the label column plus one column per subject
if df_raw.shape[1] > num_subjects+1:
    df_raw = df_raw.iloc[:,:num_subjects+1]

df_raw = df_raw.T
df_raw.reset_index(inplace=True)

# modify the header
new_header = df_raw.iloc[0]   #grab the first row for the header
df = df_raw[1:]               #take the data except the header row
df.columns = new_header 
df.reset_index(drop=True, inplace=True)
df.drop(['Filename', 'Experiment', 'Group', 'MSN', 'FR'], axis=1, inplace=True)

# change data types
cols = df.columns.tolist()
int_columns = ['box','last ratio']

for col in cols:
    name = col.lower()
    # 'active' also matches the inactive-lever column; 'reward' also matches
    # every per-reward column ('Reward 1', ...)
    if ('active' in name) or ('reward' in name) or (name in int_columns):
        df[col] = df[col].astype('int32')
    elif ('date' in name):
        df[col] = df[col].apply(lambda x: datetime.strptime(x, "%Y-%m-%d").date())
    elif ('time' in name):
        df[col] = df[col].apply(lambda x: datetime.strptime(x, "%H:%M:%S").time())
    else:
        pass

# reorganize the columns: everything from 'Reward 1' onwards is folded into one
# list per subject (trailing zeros stripped), then the originals are dropped
colnames = df.columns.tolist()
reward_col_begin = colnames.index('Reward 1')
df['ratios'] = df.iloc[:, reward_col_begin:].values.tolist()
df['ratios'] = df['ratios'].apply(process_datapoints)
points_col_begin = df.columns.tolist().index('ratios')
df.drop(df.iloc[:, reward_col_begin:points_col_begin], inplace=True, axis=1)
df.rename(columns={"Reward": "Reward Presses"}, inplace=True)

# parse the file name (raises IndexError when the name does not match)
if file[0] == 'C':
    parser = parsers[1]
    cohort, trial_id = re.findall(parser, file)[0]
    room = None
else:
    parser = parsers[0]
    room, cohort, trial_id = re.findall(parser, file)[0]

df['room'] = [room] * len(df)
cohort = int(cohort[1:])  # 'C04' -> 4
df['cohort'] = [cohort] * len(df)
trial_id = process_trial_id(trial_id)
df['trial_id'] = [trial_id] * len(df)
df['drug'] = [drug] * len(df)

# merge in the RFID and reorganize the column formats
df.rename(columns=str.lower,inplace=True)
df.columns = df.columns.str.replace(' ','_')
# NOTE(review): two independent ifs -- any other drug value leaves
# rfid_to_merge / lr_dict undefined (or stale from an earlier run of this cell)
if drug.lower() == 'cocaine':
    rfid_to_merge = RFID_COC
    lr_dict = coc_lr_dict
if drug.lower() == 'oxycodone':
    rfid_to_merge = RFID_OXY
    lr_dict = oxy_lr_dict


# calculate special variables
df['breakpoint'] = df['reward_presses'].apply(lambda x: lr_dict[x] if x in lr_dict else None)
df['last_ratio'] = df['breakpoint'].apply(lambda x: get_last_ratio(lr_dict, x))

# left join keeps every session row; subjects without a match get rfid -999
df = pd.merge(df, rfid_to_merge,  how='left', on = ['subject'])
df.columns = df.columns.str.replace(' ','_')
df.fillna({'rfid':-999}, inplace=True)
df['rfid'] = df['rfid'].astype('int64')
df = df[characteristics_PR]

df

Unnamed: 0,rfid,subject,room,cohort,trial_id,drug,box,start_time,end_time,start_date,end_date,breakpoint,last_ratio,ratios,active_lever_presses,inactive_lever_presses,reward_presses
0,933000320046909,F401,BSB273B,4,PR01,oxycodone,1,17:05:38,18:43:32,2019-06-28,2019-06-28,0,,,0,0,0
1,933000320046925,F402,BSB273B,4,PR01,oxycodone,2,17:05:41,18:43:32,2019-06-28,2019-06-28,0,,,0,0,0
2,933000320046901,F404,BSB273B,4,PR01,oxycodone,4,17:05:46,18:43:32,2019-06-28,2019-06-28,0,,,0,0,0
3,933000320046914,F406,BSB273B,4,PR01,oxycodone,6,17:05:52,18:43:32,2019-06-28,2019-06-28,0,,,0,0,0
4,933000320047117,F410,BSB273B,4,PR01,oxycodone,10,17:06:03,18:43:32,2019-06-28,2019-06-28,0,,,0,1,0
5,-999,0,BSB273B,4,PR01,oxycodone,12,17:06:10,18:43:33,2019-06-28,2019-06-28,0,,,0,0,0
6,933000320047454,F413,BSB273B,4,PR01,oxycodone,13,17:06:15,18:43:33,2019-06-28,2019-06-28,0,,,0,0,0
7,933000320047486,F416,BSB273B,4,PR01,oxycodone,16,17:06:29,18:43:33,2019-06-28,2019-06-28,0,,,0,0,0
8,933000320047452,F415,BSB273B,4,PR01,oxycodone,15,17:06:25,18:57:14,2019-06-28,2019-06-28,1,1.0,,0,3,1
9,933000320046906,F408,BSB273B,4,PR01,oxycodone,8,17:05:56,19:33:26,2019-06-28,2019-06-28,1,1.0,,0,0,2
