In [72]:
import re
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import xlrd
from openpyxl import load_workbook
import os
from datetime import datetime

In [73]:
# Columns (and their order) kept in every transformed shock-session table.
characteristics_SHOCK = [
    'rfid',
    'subject',
    'room',
    'cohort',
    'trial_id',
    'drug',
    'box',
    'start_time',
    'end_time',
    'start_date',
    'end_date',
    'total_active_lever_presses',
    'total_inactive_lever_presses',
    'total_shocks',
    'total_reward',
    'rewards_after_first_shock',
    'rewards_got_shock',
    'reward_timestamps',
]

In [74]:
# Subject -> RFID lookup table; the first CSV column is used as the index.
# transform_old_shock left-merges this frame on 'subject' and reads its 'rfid' column.
rfid_coc = pd.read_csv('/Users/yunyihuang/Desktop/gl_data/rfid_cocaine.csv', index_col=0)

In [75]:
# get all the sheetnames in one excel workbook
def get_sheetnames_xlsx(file_name):
    """Return the list of worksheet names in an .xlsx workbook.

    The workbook is opened in openpyxl's read-only mode, so no cell data is
    loaded; only the sheet titles are read.

    Parameters
    ----------
    file_name : str or path-like
        Path of the workbook to inspect.

    Returns
    -------
    list of str
        The value of the workbook's ``sheetnames`` attribute.
    """
    wb = load_workbook(file_name, read_only=True, keep_links=False)
    try:
        return wb.sheetnames
    finally:
        # A read-only workbook keeps its file handle open until it is
        # explicitly closed; without this every call leaks one handle.
        wb.close()

# clean subject id
def clean_subject_id(sid):
    """Normalise a raw subject label to the form ``<F|M><digits...>``.

    The label is upper-cased, everything before the sex letter is discarded,
    and anything from the first '.' after that letter onwards is dropped
    (e.g. a file extension).

    Parameters
    ----------
    sid : str
        Raw subject label, e.g. ``'c05_m551.txt'``.

    Returns
    -------
    str
        Cleaned id, e.g. ``'M551'``.

    Raises
    ------
    ValueError
        If the label contains neither 'F' nor 'M'.
    """
    sid = sid.upper()
    # 'M' takes precedence when both letters are present, which is what the
    # original pair of independent `if` statements did.
    if 'M' in sid:
        char = 'M'
    elif 'F' in sid:
        char = 'F'
    else:
        # Previously this path crashed with UnboundLocalError on `char`.
        raise ValueError("subject id %r contains neither 'F' nor 'M'" % (sid,))

    idx = sid.index(char)
    return sid[idx:].split('.')[0]

# return valid list of datapoints
def process_datapoints(lst):
    """Strip trailing zeros from a sequence of data points.

    Trailing zeros are treated as padding; zeros that appear before the last
    non-zero value are kept. The input is NOT modified: a new list is
    returned, so the caller's data survives the call.

    Parameters
    ----------
    lst : list
        Data points, possibly padded with zeros at the end.

    Returns
    -------
    list or None
        A new list without the trailing zeros, or ``None`` when nothing is
        left (empty input or all zeros).
    """
    end = len(lst)
    # Walk back from the end instead of popping, so the argument is untouched.
    while end > 0 and lst[end - 1] == 0:
        end -= 1

    if end == 0:
        return None
    return list(lst[:end])

# count valid data points
def count_datapoints(lst):
    """Count the data points that remain once trailing zeros are ignored.

    Trailing zeros are treated as padding; zeros before the last non-zero
    value are counted. The input is NOT modified (the previous version popped
    the padding off the caller's list as a side effect of counting).

    Parameters
    ----------
    lst : list
        Data points, possibly padded with zeros at the end.

    Returns
    -------
    int or None
        Number of entries up to and including the last non-zero value, or
        ``None`` when there are none (empty input or all zeros).
    """
    end = len(lst)
    while end > 0 and lst[end - 1] == 0:
        end -= 1

    if end == 0:
        return None
    return end

In [76]:
# Source folder holding the raw shock workbooks, and destination folder for the CSVs.
input_path = '/Users/yunyihuang/George Lab Dropbox/George_Lab/Team GWAS/SA data/Cocaine/shock'
output_path = '/Users/yunyihuang/Desktop/gl_data/GEORGE/COC_SHOCK'

# Full paths of the workbooks whose file name contains 'modified_final', in sorted order.
workbooks = [
    os.path.join(input_path, fname)
    for fname in sorted(os.listdir(input_path))
    if 'modified_final' in fname
]
workbooks

['/Users/yunyihuang/George Lab Dropbox/George_Lab/Team GWAS/SA data/Cocaine/shock/C01.COC.SHOCK_modified_final.xlsx',
 '/Users/yunyihuang/George Lab Dropbox/George_Lab/Team GWAS/SA data/Cocaine/shock/C02.COC.SHOCK_modified_final.xlsx',
 '/Users/yunyihuang/George Lab Dropbox/George_Lab/Team GWAS/SA data/Cocaine/shock/C03.COC.SHOCKDATA_modified_final.xlsx',
 '/Users/yunyihuang/George Lab Dropbox/George_Lab/Team GWAS/SA data/Cocaine/shock/C04.COC.SHOCKDATA_modified_final.xlsx',
 '/Users/yunyihuang/George Lab Dropbox/George_Lab/Team GWAS/SA data/Cocaine/shock/C05.COC.SHOCKDATA_modified_final.xlsx',
 '/Users/yunyihuang/George Lab Dropbox/George_Lab/Team GWAS/SA data/Cocaine/shock/C08.COC.SHOCKDATA_modified_final.xlsx',
 '/Users/yunyihuang/George Lab Dropbox/George_Lab/Team GWAS/SA data/Cocaine/shock/C09.COC.SHOCKDATA_modified_final.xlsx',
 '/Users/yunyihuang/George Lab Dropbox/George_Lab/Team GWAS/SA data/Cocaine/shock/C10COCAINESHOCK_modified_final.xlsx',
 '/Users/yunyihuang/George Lab Dro

In [77]:
# Sanity check: how many workbooks matched the 'modified_final' filter.
len(workbooks)

9

In [80]:
# Batch run: transform every sheet of every workbook and print progress
# (sheet count per workbook, then each sheet name as it is processed).
# NOTE(review): transform_old_shock is defined in a later cell of this notebook
# ("Main Code"); that cell has to be executed before this one.
# `wb`, `worksheets` and `ws` stay bound to the last iteration's values afterwards.
for wb in workbooks:
    worksheets = sorted(get_sheetnames_xlsx(wb))
    print(len(worksheets))
    for ws in worksheets:
        print(ws)
        transform_old_shock(wb,ws)

3
SHOCK01_9-6-17
SHOCK02_9-11-17
SHOCK03_9-13-17
3
PRESHOCK_11-21-17
SHOCK01_11-27-17
SHOCK02_11-29-17
4
PRESHOCK_3-13-2018
SHOCK01_3-15-2018
SHOCK02_3-20-2018
SHOCK03_3-22-2018
4
PRESHOCK_5-21-2018
SHOCK01_5-23-2018
SHOCK02_5-29-2018
SHOCK03_5-31-2018
4
PRESHOCK_9-4-2018
SHOCK01_9-6-2018
SHOCK02_9-11-2018
SHOCK03_9-13-2018
1
SHOCK03_8-2-2019
2
PRESHOCK_10-29-19
SHOCK03_10-30-2019
2
PRESHOCK_2-5-2020
SHOCK03_2-6-2020
2
PRESHOCK_5-6-2020
SHOCK03_5-7-2020


In [120]:
# Show the sheet names currently bound to `worksheets` (whatever cell assigned it last).
worksheets

['PRESHOCK_3-13-2018',
 'SHOCK01_3-15-2018',
 'SHOCK02_3-20-2018',
 'SHOCK03_3-22-2018']

In [64]:
# Pick one workbook and its first sheet (in sorted name order) by position,
# for a single-sheet trial run; the selected sheet name is displayed.
wb = workbooks[4]
worksheets = sorted(get_sheetnames_xlsx(wb))
ws = worksheets[0]
ws

'PRESHOCK_9-4-2018'

In [70]:
# Subjects listed in an existing trial-shock CSV.
# transform_old_shock reads the global `existed` and drops rows whose subject is in it,
# so this cell must run before that function is called.
df_existed = pd.read_csv('/Users/yunyihuang/Desktop/testDS_public_trial_shock.csv')
existed = df_existed['subject'].to_numpy()

In [69]:
# Transform only the workbook/sheet currently bound to `wb` / `ws`; writes one CSV into output_path.
transform_old_shock(wb,ws)

## Main Code

In [79]:
def transform_old_shock(wb, ws, existed_subjects=None):
    """Convert one sheet of a shock workbook into a tidy CSV, one row per subject.

    Steps:
      1. Read the sheet and transpose it; the first transposed row supplies
         the column names.
      2. Drop columns that are zero/missing for every subject.
      3. Collapse the per-event columns ('Reward # Got Shock 1' up to
         'Reward 1', and 'Reward 1' up to 'Rewards After First Shock') into
         two list-valued columns with trailing zeros removed.
      4. Add cohort / trial / drug metadata and placeholder end time/date.
      5. Attach RFIDs from the global ``rfid_coc`` (-999 when no match), keep
         the columns in ``characteristics_SHOCK``, parse start time/date,
         sort by subject and drop subjects that were already exported.
      6. Write ``<first 3 chars of workbook name>_<sheet>_transformed.csv``
         into the global ``output_path``.

    Parameters
    ----------
    wb : str
        Workbook path. It is joined onto the global ``input_path``, so a bare
        file name and an absolute path both work. Characters 1-2 of the file
        name are parsed as the cohort number (e.g. 'C05...' -> 5).
    ws : str
        Sheet name; the part before the first '_' is the raw trial id
        (e.g. 'SHOCK01_9-6-17' -> 'SHOCK01').
    existed_subjects : array-like, optional
        Subjects to exclude from the output. Defaults to the notebook-level
        global ``existed`` so existing calls keep their behaviour.

    Returns
    -------
    None
        The result is written to disk.

    Raises
    ------
    ValueError
        If one of the three marker columns is missing from the sheet, or a
        start time/date does not match '%H:%M:%S' / '%m/%d/%Y'.
    KeyError
        If a column listed in ``characteristics_SHOCK`` is absent after the
        clean-up (for instance because it was all zeros and got dropped).
    """
    if existed_subjects is None:
        # Backward-compatible fallback to the notebook global.
        existed_subjects = existed

    filepath = os.path.join(input_path, wb)
    df_raw = pd.read_excel(filepath, sheet_name=ws).T.reset_index()

    # First transposed row is the header, the rest is data. Work on an explicit
    # copy so the assignments below never write into a slice of df_raw.
    df = df_raw[1:].copy()
    df.columns = df_raw.iloc[0]

    # get rid of 0s: columns that are zero for every subject are dropped,
    # the zeros in the surviving columns are restored afterwards.
    df = df.replace(0, np.nan).dropna(how='all', axis=1).fillna(0)

    # group timestamp columns
    colnames = df.columns.tolist()
    reward_shock_begin = colnames.index('Reward # Got Shock 1')
    reward_col_begin = colnames.index('Reward 1')
    reward_col_end = colnames.index('Rewards After First Shock')
    df['Rewards Got Shock'] = df.iloc[:, reward_shock_begin:reward_col_begin].values.tolist()
    df['Rewards Got Shock'] = df['Rewards Got Shock'].apply(process_datapoints)
    df['Reward Timestamps'] = df.iloc[:, reward_col_begin:reward_col_end].values.tolist()
    df['Reward Timestamps'] = df['Reward Timestamps'].apply(process_datapoints)
    # The two new list columns were appended at the end, so the positions
    # recorded in `colnames` still identify the per-event columns to remove.
    df = df.drop(columns=colnames[reward_shock_begin:reward_col_end])

    # add extra info
    workbook_name = os.path.basename(wb)
    cohort = int(workbook_name[1:3])
    raw_trial_id = ws.split('_')[0]

    if raw_trial_id == 'PRESHOCK':
        trial_id = raw_trial_id
    elif cohort in range(1, 6):
        # Cohorts 1-5: 'SHOCK01' -> 'SHOCK_V1', 'SHOCK02' -> 'SHOCK_V2', ...
        trial_id = raw_trial_id.replace('0', '_V')
    else:
        # Every non-PRESHOCK sheet of the other cohorts is labelled SHOCK_V3.
        trial_id = 'SHOCK_V3'

    df['room'] = None
    df['cohort'] = cohort
    df['trial_id'] = trial_id
    df['drug'] = 'cocaine'
    # Placeholder end time/date: midnight and 0001-01-01.
    df['end_time'] = datetime.min.time()
    df['end_date'] = datetime.min.date()

    # reorganize columns
    df = df.rename(columns=str.lower)
    df.columns = df.columns.str.replace(' ', '_')
    dff = pd.merge(df, rfid_coc, how='left', on=['subject'])
    # Subjects without an RFID match get the sentinel -999 so the column can be integer.
    dff = dff.fillna({'rfid': -999})
    dff['rfid'] = dff['rfid'].astype('int64')
    dff = dff[characteristics_SHOCK].copy()
    dff['start_time'] = dff['start_time'].apply(lambda x: datetime.strptime(x, "%H:%M:%S").time())
    dff['start_date'] = dff['start_date'].apply(lambda x: datetime.strptime(x, "%m/%d/%Y").date())
    dff = dff.sort_values(by='subject')
    dff = dff[~dff['subject'].isin(existed_subjects)]

    filename = workbook_name[:3] + '_' + ws.split('.')[0] + '_transformed.csv'
    dff.to_csv(os.path.join(output_path, filename))

## Sketch

In [66]:
# Scratch cell: runs the first half of transform_old_shock step by step at top level,
# on the `wb` / `ws` currently bound in the notebook, and displays the intermediate frame.
# Nothing is written to disk here. Every name below becomes a notebook global.
filepath = os.path.join(input_path, wb)
df_raw = pd.read_excel(filepath, sheet_name = ws).T.reset_index()

# modify the header
new_header = df_raw.iloc[0]     #grab the first row for the header
df = df_raw[1:]                 #take the data except the header row
df.columns = new_header 

# get rid of 0s
# (columns that are zero for every row are dropped; remaining zeros are restored by fillna)
df.replace(0, np.nan, inplace=True)
df.dropna(how='all', axis=1, inplace=True)
df.fillna(0,inplace=True)

# group timestamp columns
# The per-event columns between the marker names are collapsed into one list per row,
# with trailing zeros stripped by process_datapoints, and then removed.
colnames = df.columns.tolist()
reward_shock_begin = colnames.index('Reward # Got Shock 1')
reward_col_begin = colnames.index('Reward 1')
reward_col_end = colnames.index('Rewards After First Shock')
df['Rewards Got Shock'] = df.iloc[:,reward_shock_begin:reward_col_begin].values.tolist()
df['Rewards Got Shock'] = df['Rewards Got Shock'].apply(process_datapoints)
df['Reward Timestamps'] = df.iloc[:,reward_col_begin:reward_col_end].values.tolist()
df['Reward Timestamps'] = df['Reward Timestamps'].apply(process_datapoints)
df.drop(df.iloc[:, reward_shock_begin:reward_col_end], inplace=True, axis=1)

# add extra info
# cohort = characters 1-2 of the workbook file name; raw trial id = sheet name up to the first '_'
cohort = int(wb.split('/')[-1][1:3])
raw_trial_id = ws.split('_')[0]

# PRESHOCK is kept as is; cohorts 1-5 map 'SHOCK0n' -> 'SHOCK_Vn'; everything else is 'SHOCK_V3'
if raw_trial_id == 'PRESHOCK':
    trial_id = raw_trial_id
elif cohort in range(1,6):
    trial_id = raw_trial_id.replace('0','_V')
else:
    trial_id = 'SHOCK_V3'

# constant metadata columns; end_time / end_date are placeholders (midnight, 0001-01-01)
df['room'] = [None] * len(df)
df['cohort'] = [cohort] * len(df)
df['trial_id'] = [trial_id] * len(df)
df['drug'] = ['cocaine'] * len(df)
df['end_time'] = [datetime.min.time()] * len(df)
df['end_date'] = [datetime.min.date()] * len(df)
df

Unnamed: 0,Subject,Start Date,Start Time,Box,Total Inactive Lever Presses,Total Reward,Total Active Lever Presses,Total Shocks,Rewards After First Shock,Rewards Got Shock,Reward Timestamps,room,cohort,trial_id,drug,end_time,end_date
1,F501,09/04/2018,08:14:10,1,2.0,34.0,39.0,9.0,33.0,"[1.0, 2.0, 10.0, 12.0, 14.0, 16.0, 22.0, 24.0,...","[2.0, 33.0, 66.0, 106.0, 141.0, 195.0, 262.0, ...",,5,PRESHOCK,cocaine,00:00:00,0001-01-01
2,F502,09/04/2018,08:14:13,2,5.0,0.0,0.0,0.0,0.0,,,,5,PRESHOCK,cocaine,00:00:00,0001-01-01
3,F503,09/04/2018,08:14:16,3,3.0,38.0,97.0,12.0,33.0,"[5.0, 6.0, 7.0, 15.0, 16.0, 19.0, 21.0, 23.0, ...","[41.0, 63.0, 83.0, 105.0, 169.0, 195.0, 219.0,...",,5,PRESHOCK,cocaine,00:00:00,0001-01-01
4,F504,09/04/2018,08:14:18,4,7.0,26.0,52.0,9.0,24.0,"[2.0, 6.0, 8.0, 13.0, 16.0, 19.0, 24.0, 25.0, ...","[60.0, 138.0, 247.0, 279.0, 305.0, 364.0, 582....",,5,PRESHOCK,cocaine,00:00:00,0001-01-01
5,F505,09/04/2018,08:14:20,5,0.0,22.0,22.0,7.0,21.0,"[1.0, 8.0, 10.0, 12.0, 15.0, 16.0, 22.0]","[15.0, 44.0, 142.0, 353.0, 474.0, 738.0, 1033....",,5,PRESHOCK,cocaine,00:00:00,0001-01-01
6,F506,09/04/2018,08:14:22,6,0.0,19.0,21.0,5.0,16.0,"[3.0, 5.0, 7.0, 12.0, 14.0]","[897.0, 931.0, 1005.0, 1058.0, 1147.0, 1349.0,...",,5,PRESHOCK,cocaine,00:00:00,0001-01-01
7,F508,09/04/2018,08:14:28,8,0.0,26.0,34.0,9.0,22.0,"[4.0, 5.0, 8.0, 11.0, 17.0, 20.0, 22.0, 24.0, ...","[7.0, 36.0, 67.0, 90.0, 114.0, 277.0, 358.0, 5...",,5,PRESHOCK,cocaine,00:00:00,0001-01-01
8,M551,09/04/2018,08:14:33,9,10.0,2.0,3.0,1.0,1.0,[1.0],"[284.0, 3020.0]",,5,PRESHOCK,cocaine,00:00:00,0001-01-01
9,M552,09/04/2018,08:14:38,10,2.0,3.0,3.0,0.0,0.0,,"[21.0, 426.0, 649.0]",,5,PRESHOCK,cocaine,00:00:00,0001-01-01
10,M553,09/04/2018,08:14:41,11,1.0,0.0,0.0,0.0,0.0,,,,5,PRESHOCK,cocaine,00:00:00,0001-01-01
