In [18]:
import pandas as pd
import numpy as np

def unstack_data(df):
    rows = []
    rfids = [i for i in df.rfid.unique()]

    for rfid in rfids:
        temp = df[df.rfid==rfid]
        temp = temp.reset_index(drop=True)
        temp.sort_values(['trial_id'],inplace=True)
        temp.set_index('trial_id',inplace=True)
        temp_row = temp.iloc[:,1:].unstack().to_frame().sort_index(level=1).T
        temp_row.insert(loc=0, column='rfid', value=temp.rfid.unique())
        temp_row = temp_row.reset_index(drop=True)
        rows.append(temp_row)
        
    return rows


def deserialize_data(ts):
    if pd.isnull(ts):
        return []
    return [float(i) for i in ts.split()]

## Notes table used for filtering

In [19]:
note = pd.read_csv('/Users/yunyihuang/George Lab Dropbox/George_Lab/Experiments/DataStream/DataStream_Database_01-07-2023/Raw/trial_note.csv')
note.drop('#',axis=1,inplace=True)
note.head()

Unnamed: 0,rfid,subject,cohort,sex,drug,experiment_group,trial_id,start_date,code,to_do,note
0,933000320047006,M987,9,Male,Cocaine,Drug,LGA01,2019-10-08,Disconnect,Keep,
1,933000320186832,M1072,10,Male,Cocaine,Drug,LGA02,2020-01-14,Died,Discard,
2,933000320188172,M1565,15,Male,Cocaine,Drug,LGA01,2021-01-13,Tangled,Keep,
3,933000320188113,M1553,15,Male,Cocaine,Drug,LGA06,2021-01-21,Tangled,Discard,
4,933000320187895,M1572,15,Male,Cocaine,Drug,LGA11,2021-01-28,Sick/wound,Keep,


In [20]:
note['drug'] = note['drug'].str.lower()
note['to_do'] = note['to_do'].apply(lambda x: x.lower().strip())

note_filter = note[note['to_do']=='discard'][['rfid','trial_id','drug','to_do']]
note_filter.reset_index(drop=True,inplace=True)

In [21]:
note_filter.replace('SHOCK','SHOCK_V3',inplace=True)
note_filter.replace('PRE-SHOCK','PRESHOCK',inplace=True)
note_filter.replace('TREAMENT02','TREATMENT02',inplace=True)
note_filter.replace('TREAMENT03','TREATMENT03',inplace=True)
note_filter.replace('PRE-SHOCK','PRESHOCK',inplace=True)

## SHA

In [22]:
sha = pd.read_csv('/Users/yunyihuang/George Lab Dropbox/George_Lab/Experiments/DataStream/DataStream_Database_01-07-2023/Raw/trial_sha.csv')
sha.head()

Unnamed: 0,rfid,subject,room,cohort,trial_id,drug,box,start_time,end_time,start_date,end_date,active_lever_presses,inactive_lever_presses,reward_presses,timeout_presses,active_timestamps,inactive_timestamps,reward_timestamps,timeout_timestamps
0,933000320188614,M1658,BSB273D,16,SHA07,cocaine,6.0,08:36:10,11:02:11,2021-03-15,2021-03-15,2.0,3.0,2.0,0.0,6909 7087,69 74 7044,6909 7087,
1,933000320047019,M975,BSB273E,9,SHA07,cocaine,7.0,08:57:46,10:59:13,2019-10-01,2019-10-01,5.0,3.0,4.0,1.0,396 3634 6222 6236 6257,31 6126 6716,396 3634 6222 6257,6236
2,933000320049155,M1472,BSB273E,14,SHA08,cocaine,4.0,08:18:25,11:18:54,2020-10-15,2020-10-15,24.0,1.0,20.0,4.0,46 47 77 659 760 1093 1244 1443 1446 1524 2569...,83,46 77 659 760 1093 1244 1443 1524 2569 2617 32...,47 1446 3295 6864
3,933000320188719,M1780,BSB273E,17,SHA09,cocaine,12.0,08:30:08,10:42:40,2021-06-16,2021-06-16,3.0,0.0,3.0,0.0,219 616 4898,,219 616 4898,
4,933000320047169,F911,BSB273B,9,SHA02,cocaine,11.0,07:55:29,10:50:16,2019-09-24,2019-09-24,1.0,1.0,1.0,0.0,4807,4760,4807,


In [23]:
sha['active_timestamps'] = sha['active_timestamps'].apply(deserialize_data)
sha['inactive_timestamps'] = sha['inactive_timestamps'].apply(deserialize_data)
sha['reward_timestamps'] = sha['reward_timestamps'].apply(deserialize_data)
sha['timeout_timestamps'] = sha['timeout_timestamps'].apply(deserialize_data)

In [24]:
sha_trimmed = sha

In [25]:
sha_keep = pd.merge(sha_trimmed, note_filter, how='left', left_on=['rfid','trial_id','drug'], right_on=['rfid','trial_id','drug'])
sha_keep = sha_keep[~(sha_keep['to_do']=='discard')]
sha_keep.drop(columns=['drug','to_do'],inplace=True)
sha_keep.shape

(13468, 18)

In [27]:
sha_keep.isna().sum()

rfid                         0
subject                      0
room                      1965
cohort                       0
trial_id                     0
box                       1419
start_time                   0
end_time                     0
start_date                   0
end_date                     0
active_lever_presses      1419
inactive_lever_presses    1419
reward_presses            1419
timeout_presses           1419
active_timestamps            0
inactive_timestamps          0
reward_timestamps            0
timeout_timestamps           0
dtype: int64

In [28]:
data_missing = sha_keep[np.isnan(sha_keep.active_lever_presses)]
data_missing

Unnamed: 0,rfid,subject,room,cohort,trial_id,box,start_time,end_time,start_date,end_date,active_lever_presses,inactive_lever_presses,reward_presses,timeout_presses,active_timestamps,inactive_timestamps,reward_timestamps,timeout_timestamps
58,933000120117348,M7,,1,SHA02,,00:00:00,00:00:00,2017-08-01,0001-01-01,,,,,"[5717.75, 5822.46, 5823.46]","[462.65, 463.81, 464.77, 467.46, 5964.38, 6128...","[5717.75, 5822.46]",[5823.46]
59,933000120138295,F98,,2,SHA02,,00:00:00,00:00:00,2017-10-18,0001-01-01,,,,,"[154.12, 158.81, 561.98, 2053.02, 2061.26]",[2093.05],"[154.12, 561.98, 2053.02]","[158.81, 2061.26]"
60,933000120138655,M357,,3,SHA02,,00:00:00,00:00:00,2018-02-06,0001-01-01,,,,,"[89.68, 96.33, 96.54, 116.78, 174.39, 182.44, ...","[75.65, 103.29, 119.19, 131.34, 298.02, 301.21...","[89.68, 116.78, 174.39, 271.05, 373.6, 411.6, ...","[96.33, 96.54, 182.44, 280.96, 374.4, 413.81, ..."
61,933000120138393,M465,,4,SHA02,,00:00:00,00:00:00,2018-04-17,0001-01-01,,,,,[],[5202.82],[],[]
62,933000120138720,F118,,1,SHA02,,00:00:00,00:00:00,2018-07-31,0001-01-01,,,,,[],"[567.83, 569.6, 573.52, 577.02, 579.07, 584.6,...",[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13469,933000120138754,F523,,5,SHA04,,00:00:00,00:00:00,2018-08-02,0001-01-01,,,,,"[286.36, 2770.32, 2772.83]",[5418.55],"[286.36, 2770.32]",[2772.83]
13495,933000120117326,M69,,1,SHA02,,00:00:00,00:00:00,2017-08-01,0001-01-01,,,,,"[301.84, 302.83]","[0.95, 132.12, 323.11, 858.31, 858.67, 861.59,...",[301.84],[302.83]
13496,933000120138300,F104,,2,SHA02,,00:00:00,00:00:00,2017-10-18,0001-01-01,,,,,"[302.84, 2097.86, 2127.45]",[],"[302.84, 2097.86, 2127.45]",[]
13497,933000120138671,M464,,4,SHA02,,00:00:00,00:00:00,2018-04-17,0001-01-01,,,,,[109.58],"[141.73, 142.66, 1816.36, 1818.81, 7098.9, 714...",[109.58],[]


In [29]:
data_missing.cohort.value_counts()

1    521
2    235
3    234
5    231
4    198
Name: cohort, dtype: int64

In [30]:
data_missing.subject.value_counts()

M154    19
M153    19
M175    18
M178    18
M177    18
        ..
M462     7
M467     7
M465     7
M466     7
M464     3
Name: subject, Length: 142, dtype: int64

In [32]:
data_missing.isna().sum()

rfid                         0
subject                      0
room                      1419
cohort                       0
trial_id                     0
box                       1419
start_time                   0
end_time                     0
start_date                   0
end_date                     0
active_lever_presses      1419
inactive_lever_presses    1419
reward_presses            1419
timeout_presses           1419
active_timestamps            0
inactive_timestamps          0
reward_timestamps            0
timeout_timestamps           0
dtype: int64

In [40]:
np.isnan(9)

False

In [48]:
# checck
fp = '/Users/yunyihuang/Desktop/trial_shock.csv'
f = pd.read_csv(fp)
f.head()

Unnamed: 0,rfid,subject,room,cohort,trial_id,drug,box,start_time,end_time,start_date,end_date,total_active_lever_presses,total_inactive_lever_presses,total_shocks,total_reward,rewards_after_first_shock,rewards_got_shock,reward_timestamps
0,933000320047293,M851,BSB273B,8,SHOCK_V3,cocaine,9,09:01:37,10:02:06,2019-08-02,2019-08-02,7,1,1,4,2,2,101 516 2079 2208
1,933000320047584,F907,BSB273B,9,SHOCK_V3,cocaine,7,09:16:37,10:18:30,2019-10-30,2019-10-30,3,4,1,3,0,3,1049 1243 1457
2,933000320187227,F1108,BSB273B,11,PRESHOCK,cocaine,8,08:12:49,09:28:39,2020-05-06,2020-05-06,3,4,1,3,0,3,74 96 394
3,933000320186956,M1151,BSB273B,11,SHOCK_V3,cocaine,9,07:35:00,08:44:14,2020-05-07,2020-05-07,4,0,2,3,1,2 3,158 487 1292
4,933000320048757,F1314,BSB273B,13,SHOCK_V3,cocaine,6,09:49:09,10:57:01,2020-09-09,2020-09-09,5,8,0,4,0,,2137 2188 2976 3012


In [49]:
f['active_timestamps'] = f['active_timestamps'].apply(deserialize_data)
f['inactive_timestamps'] = f['inactive_timestamps'].apply(deserialize_data)
f['reward_timestamps'] = f['reward_timestamps'].apply(deserialize_data)
f['timeout_timestamps'] = f['timeout_timestamps'].apply(deserialize_data)

f = pd.merge(f, note_filter, how='left', left_on=['rfid','trial_id','drug'], right_on=['rfid','trial_id','drug'])
f = f[~(f['to_do']=='discard')]
f.drop(columns=['drug','to_do'],inplace=True)
f.shape

KeyError: 'active_timestamps'

In [47]:
f.isna().sum()

rfid                         0
subject                      0
room                      3714
cohort                       0
trial_id                     0
box                       2648
start_time                   0
end_time                     0
start_date                   0
end_date                     0
active_lever_presses         0
inactive_lever_presses       0
reward_presses               0
timeout_presses            439
active_timestamps            0
inactive_timestamps          0
reward_timestamps            0
timeout_timestamps           0
dtype: int64