In [1]:
import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import gzip

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Convert every mouse pickle in the working directory into gzip-compressed
# .npy files: a 3D float16 trace array, two 2D float16 response arrays and a
# per-presentation label array.

# Columns that identify one stimulus presentation (one row of every output).
PRESENTATION_KEYS = ['ophys_session_id', 'stimulus_presentations_id']
# Timestamps attached to the 38 samples of each exploded 'trace' array.
TRACE_TIMES = np.linspace(-0.5, 0.7333333333333334, 38)


def _save_npy_gz(path, array):
    """Write `array` to `path` as a gzip-compressed .npy file."""
    # The context manager closes the gzip stream even if np.save raises.
    with gzip.GzipFile(path, "w") as f:
        np.save(f, array)


def _pivot_by_cell(frame, value_column):
    """Return `value_column` as a (presentation x cell_specimen_id) table."""
    return frame.pivot(index=PRESENTATION_KEYS, columns='cell_specimen_id', values=value_column)


def _presentation_label(frame, value_column):
    """Return one value of `value_column` per presentation.

    Each row of the pivoted table is back-filled across cells so that its
    first column holds the first non-missing value of that row.
    """
    # DataFrame.bfill replaces the deprecated fillna(method='bfill').
    return _pivot_by_cell(frame, value_column).bfill(axis=1).iloc[:, 0].values


files = os.listdir()
# Test the extension itself: output names below strip exactly '.pkl' from fn.
# dtype=bool keeps the mask usable as an index when the directory is empty.
pklidx = np.array([f.endswith('.pkl') for f in files], dtype=bool)
mousefiles = np.array(files)[pklidx]
# NOTE(review): this also selects per-session '*sess*.pkl' files if they are
# present in the directory -- confirm that is intended.

# the four slc mice with giant files...
# mousefiles = ['Slc_mouse464207.pkl', 'Slc_mouse464204.pkl', 'Slc_mouse462544.pkl', 'Slc_mouse462468.pkl'] # 'Slc_mouse461946.pkl'
# mousefiles = ['Sst_mouse476970.pkl']

for fn in tqdm(mousefiles):
    df = pd.read_pickle(fn)
    stem = fn[:-len('.pkl')]

    # Create smaller df with only essential columns.
    # (The row index is left alone: every pivot below keys off columns, and
    # an int16 index would wrap once a file has more than 32767 rows.)
    unneeded = ['exposure_level','sex','cre_line','targeted_structure','imaging_depth','ophys_experiment_id','dff']
    df_sm = df.drop(columns=unneeded)

    # df.pivot doesn't work if you have previously cast values to float16!
    df_sm = df_sm.astype({'dff_bc':'float64', 'dff_stim500':'float64'})

    # Explode out the trace arrays. Only the columns the pivot needs are
    # carried along, so the other columns are not replicated 38 times.
    trace_long = (
        df[PRESENTATION_KEYS + ['cell_specimen_id', 'trace']]
        .explode(column='trace')
        .reset_index(drop=True)
    )
    if len(trace_long) % len(TRACE_TIMES):
        raise ValueError(
            f'{fn}: exploded trace length {len(trace_long)} is not a multiple of {len(TRACE_TIMES)}'
        )
    # Add the corresponding timestamps
    trace_long['time'] = np.tile(TRACE_TIMES, len(trace_long) // len(TRACE_TIMES))
    # Pivot to multi-indexed df
    trace_wide = trace_long.pivot(index=PRESENTATION_KEYS + ['time'], columns='cell_specimen_id', values='trace')
    # Hack to create 3D matrix by stacking along the time dimension
    time_data = np.stack(
        [np.half(trace_wide.loc[:, :, ti].values) for ti in TRACE_TIMES],
        axis=2,
    )

    # Save
    _save_npy_gz(stem + '_dffts.npy.gz', time_data)

    dff_bc = np.half(_pivot_by_cell(df_sm, 'dff_bc').values)
    _save_npy_gz(stem + '_dffbc.npy.gz', dff_bc)

    dff_stim500 = np.half(_pivot_by_cell(df_sm, 'dff_stim500').values)
    _save_npy_gz(stem + '_dffstim.npy.gz', dff_stim500)

    # Per-presentation labels, one column each, in this fixed order.
    img = _presentation_label(df_sm, 'image_name')
    chg = _presentation_label(df_sm, 'is_change')
    omi = _presentation_label(df_sm, 'omitted')
    ses = _presentation_label(df_sm, 'session_number')
    label_data = np.stack([img,chg,omi,ses], axis=1)
    _save_npy_gz(stem + '_label.npy.gz', label_data)
    

100%|██████████| 17/17 [04:22<00:00, 15.45s/it]


In [18]:
# For fixing the multi-session files (not aligned...)
# Collect the per-session pickles: names containing 'sess' that end in '.pkl'.
files = os.listdir()
# endswith() tests the real extension rather than any '.pkl' substring, and
# dtype=bool keeps the mask a valid index even when the directory is empty.
pklidx = np.array(['sess' in f and f.endswith('.pkl') for f in files], dtype=bool)
mousefiles = np.array(files)[pklidx] # all the mouse files with sessions

def myfun(x):
    """Extract the mouse id from a file name.

    The name is split on '_' and the first five characters (the length of
    'mouse') are stripped from the second field, e.g.
    'Slc_mouse462468_sess910985315.pkl' -> '462468'.
    """
    fields = x.split('_')
    mouse_field = fields[1]
    return mouse_field[len('mouse'):]
myfunvec = np.vectorize(myfun)
# Mouse id parsed from every session file, aligned with `mousefiles`.
mouse_ids = myfunvec(mousefiles)
slc_mice = np.unique(mouse_ids) # the slc mouse ids with sessions

# Merge all session pickles of one mouse into a single per-mouse pickle.
for mo in tqdm(slc_mice):
    # Compare parsed ids rather than testing `mo in f`: a substring test over
    # the whole file name can also match digits inside a session id.
    moidx = mouse_ids == mo
    mofiles = np.array(mousefiles)[moidx] # the pkl files for one slc mouse with sessions
    print(mofiles)
    session_frames = [pd.read_pickle(fn) for fn in tqdm(mofiles)]
    df = pd.concat(session_frames) # WARNING this can get big
    # Output name = the first two '_'-separated fields (e.g. 'Slc_mouse462468'),
    # taken from the name itself instead of a fixed 15-character slice.
    out_stem = '_'.join(mofiles[0].split('_')[:2])
    df.to_pickle(f'{out_stem}.pkl')

    

  0%|          | 0/4 [00:00<?, ?it/s]

['Slc_mouse462468_sess910985315.pkl' 'Slc_mouse462468_sess903918949.pkl'
 'Slc_mouse462468_sess902193346.pkl' 'Slc_mouse462468_sess908734047.pkl'
 'Slc_mouse462468_sess902618223.pkl' 'Slc_mouse462468_sess910345059.pkl']


100%|██████████| 6/6 [00:20<00:00,  3.44s/it]
 25%|██▌       | 1/4 [00:27<01:23, 27.75s/it]

['Slc_mouse462544_sess910971181.pkl' 'Slc_mouse462544_sess915587736.pkl'
 'Slc_mouse462544_sess922743776.pkl' 'Slc_mouse462544_sess911449165.pkl'
 'Slc_mouse462544_sess914306708.pkl' 'Slc_mouse462544_sess914797752.pkl'
 'Slc_mouse462544_sess913834848.pkl']


100%|██████████| 7/7 [00:17<00:00,  2.51s/it]
 50%|█████     | 2/4 [00:53<00:53, 26.73s/it]

['Slc_mouse464204_sess918718550.pkl' 'Slc_mouse464204_sess914163299.pkl'
 'Slc_mouse464204_sess928146339.pkl' 'Slc_mouse464204_sess918116930.pkl'
 'Slc_mouse464204_sess929255931.pkl' 'Slc_mouse464204_sess929688369.pkl']


100%|██████████| 6/6 [00:10<00:00,  1.80s/it]
 75%|███████▌  | 3/4 [01:10<00:22, 22.36s/it]

['Slc_mouse464207_sess928142719.pkl' 'Slc_mouse464207_sess918719819.pkl'
 'Slc_mouse464207_sess922168593.pkl' 'Slc_mouse464207_sess920317769.pkl'
 'Slc_mouse464207_sess923202821.pkl' 'Slc_mouse464207_sess924400171.pkl'
 'Slc_mouse464207_sess919432737.pkl' 'Slc_mouse464207_sess926186616.pkl']


100%|██████████| 8/8 [00:09<00:00,  1.16s/it]
100%|██████████| 4/4 [01:25<00:00, 21.27s/it]


In [21]:
# List the merged per-mouse Slc pickles (no 'sess' in the name).
files = os.listdir()
# endswith() tests the real extension rather than any '.pkl' substring, and
# dtype=bool keeps the mask a valid index even when the directory is empty.
pklidx = np.array(
    ['sess' not in f and 'Slc' in f and f.endswith('.pkl') for f in files],
    dtype=bool,
)
mousefiles = np.array(files)[pklidx]
print(mousefiles)

['Slc_mouse464207.pkl' 'Slc_mouse464204.pkl' 'Slc_mouse462544.pkl'
 'Slc_mouse461946.pkl' 'Slc_mouse462468.pkl']
