# Data preprocessing

After preparing the data with `glm_data_prep`, load each recording's data, do a bit of cleaning and normalization, and then save each unit's data as a separate file in its own dedicated folder.

In [1]:
# imports
import sys
import joblib
from pathlib import Path
import pandas as pd
from sklearn.preprocessing import Normalizer
from fcutils.path import to_yaml, from_yaml
import warnings
# silence pandas PerformanceWarning messages for the rest of the session
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

# extend the module search path before importing from the project's
# `analysis` package below (presumably it lives in one of these locations)
sys.path.append("./")
sys.path.append(r"C:\Users\Federico\Documents\GitHub\pysical_locomotion")
from analysis.ephys.utils import get_recording_names


# folder from which load() reads the per-recording "<REC>_bouts.h5" files
cache = Path(r"D:\Dropbox (UCL)\Rotation_vte\Locomotion\analysis\ephys\GLM\data")
# root folder under which save() creates one sub-folder per unit
base_dir = Path(r"D:\Dropbox (UCL)\Rotation_vte\Locomotion\analysis\ephys\GLM")

# YAML file that save() reads and rewrites to track which units were exported
metadatafile = base_dir / "metadata.yaml"

Connecting root@127.0.0.1:3306


## Load data 
and remove rows with nans.

In [2]:
def load(REC):
    """Read one recording's bouts table from the cache and drop unusable rows.

    Args:
        REC: recording name; the table is read from ``cache / "<REC>_bouts.h5"``
            under the HDF key ``"data"``.

    Returns:
        DataFrame without the 35-60 cm curvature columns and without any row
        that contains a NaN.
    """
    discarded_curvatures = [
        "curv_35cm",
        "curv_40cm",
        "curv_45cm",
        "curv_50cm",
        "curv_55cm",
        "curv_60cm",
    ]

    bouts = pd.read_hdf(cache / (REC + "_bouts.h5"), key="data")
    bouts = bouts.reset_index(drop=True).drop(columns=discarded_curvatures)

    # NOTE(review): this second reset does not pass drop=True, so the previous
    # index is inserted as a new first column. clean() selects its variables by
    # column position, so confirm this extra column is intended before changing it.
    bouts = bouts.reset_index()

    # discard every row that holds at least one NaN
    return bouts.dropna()

## Normalize and clean

Improve the column names and add squared variables.

In [3]:
def clean(rec_data, n_variables=17):
    """Rename unit columns and add squared regressors, modifying the frame in place.

    Args:
        rec_data: DataFrame whose first ``n_variables`` columns are the
            variables and whose unit columns are labelled with plain ``int``s.
            It is modified in place.
        n_variables: number of leading columns treated as variables
            (default 17, the value previously hard-coded here).

    Returns:
        Tuple ``(rec_data, units, variables)``: the same (mutated) frame, the
        list of original integer unit labels, and the list of variable column
        names including the added ``"v_squared"`` and ``"omega_squared"``.
    """
    columns = list(rec_data.columns)
    units = [c for c in columns if isinstance(c, int)]

    # taken before renaming, so this holds the original labels of the leading columns
    variables = columns[:n_variables]

    # rename unit columns: 302 -> "unit_302"
    rec_data.rename(columns={u: f"unit_{u}" for u in units}, inplace=True)

    # add squared variables
    rec_data["v_squared"] = rec_data["v"] ** 2
    rec_data["omega_squared"] = rec_data["omega"] ** 2
    variables += ["v_squared", "omega_squared"]
    return rec_data, units, variables

In [4]:
def normalize(rec_data, variables):
    """Min-max scale the variable columns and rescale all remaining columns.

    Args:
        rec_data: DataFrame holding the variable columns plus the firing-rate
            columns (every column not listed in ``variables``).
        variables: names of the columns to min-max normalize.

    Returns:
        Tuple ``(data, X_min, X_max)``: the recombined frame (normalized
        variables first, then the rescaled remaining columns) and the
        per-column minima and maxima needed to undo the normalization.
    """
    # split behavioral variables from units firing rates
    X, FR = rec_data[variables], rec_data.drop(variables, axis=1)

    # go from firing rate to p(spike | ms)
    # NOTE(review): the 200 divisor is kept from the original code; confirm it
    # matches the bin size / sampling used upstream.
    FR = FR / 200

    # normalize columns of X to [0, 1]
    X_min = X.min()
    X_max = X.max()

    # a constant column has zero range; dividing by it would give 0/0 = NaN for
    # the whole column, so use 1 there and the column normalizes to 0 instead
    X_range = X_max - X_min
    X_range = X_range.where(X_range != 0, 1)
    X = (X - X_min) / X_range
    # x_reconstructed = y * (X_max - X_min) + X_min

    # put everything back together
    data = pd.concat([X, FR], axis=1)
    return data, X_min, X_max

## Save
Save each unit's data in a dedicated folder/file and update the metadata.

In [5]:
def process_unit_data(data, unit: str, activity_only=False, variable_names=None):
    """Extract the table for a single unit, naming its activity column ``p_spike``.

    Args:
        data: DataFrame containing the variable columns and the ``unit`` column.
        unit: name of the column holding this unit's activity.
        activity_only: when True, return only the activity column; otherwise
            return the variable columns followed by the activity column.
        variable_names: names of the variable columns to include. When omitted,
            the notebook-level ``variables`` list is used, which keeps existing
            calls working.

    Returns:
        A new DataFrame with the selected columns, where ``unit`` has been
        renamed to ``"p_spike"``. ``data`` itself is not modified.
    """
    if activity_only:
        selected = [unit]
    else:
        if variable_names is None:
            # backward-compatible fallback on the global set by the run cell
            variable_names = variables
        selected = [*variable_names, unit]

    # column selection and rename both return new objects, so `data` is untouched
    return data[selected].rename(columns={unit: "p_spike"})


def save(REC, data, units, X_min, X_max):
    """Write each unit's data, shuffles and normalizers to disk and record it in the metadata.

    Units whose ``"<REC>_<unit>"`` key is already present in the metadata file
    are skipped. The metadata file is rewritten after every exported unit.

    Args:
        REC: recording name, used in folder names and metadata keys.
        data: DataFrame holding the ``"unit_<unit>"`` column and the
            ``"<unit>_shuffle_<k>"`` columns (k = 0..99) for each unit.
        units: iterable of unit identifiers.
        X_min: per-variable minima, stored in each unit's ``normalizers`` folder.
        X_max: per-variable maxima, stored in each unit's ``normalizers`` folder.
    """
    metadata = from_yaml(metadatafile) or dict()

    for unit in units:
        print(unit)

        entry_key = f"{REC}_{unit}"
        if entry_key in metadata:
            # already exported on a previous run
            continue

        unit_folder = base_dir / f"{REC}_unit_{unit}"
        unit_folder.mkdir(exist_ok=True)

        # normalizers, needed to map normalized variables back to raw values
        normalizers_folder = unit_folder / "normalizers"
        normalizers_folder.mkdir(exist_ok=True)
        X_min.to_hdf(normalizers_folder / "X_min.h5", key="data")
        X_max.to_hdf(normalizers_folder / "X_max.h5", key="data")

        # the unit's own data: variables + p_spike
        data_file = unit_folder / "data.parquet"
        process_unit_data(data, f"unit_{unit}").to_parquet(data_file)

        # shuffled activity: one single-column file per shuffle
        shuffles_folder = unit_folder / "shuffles"
        shuffles_folder.mkdir(exist_ok=True)
        for shuffle in range(100):
            shuffled = process_unit_data(
                data, f"{unit}_shuffle_{shuffle}", activity_only=True
            )
            shuffled.to_parquet(shuffles_folder / f"shuffle_{shuffle}.parquet")

        metadata[entry_key] = {
            "recording": REC,
            "folder": str(unit_folder),
            "shuffles_folder": str(shuffles_folder),
            "unit": unit,
            "unit_data": str(data_file),
            "glm_fitted": False,
        }

        # written inside the loop so that an interrupted run keeps its progress
        to_yaml(metadatafile, metadata)

# Run

In [6]:
# Recordings to process. Restrict this list to the ones still to be done to
# avoid loading the same recordings again.
rec_to_do = get_recording_names()
# bare expression: shows the list as the cell's output
rec_to_do

In [7]:
# Process every selected recording: load -> clean -> normalize -> save.
for REC in rec_to_do:
    try:
        rec_data = load(REC)
    except FileNotFoundError:
        # no cached bouts file for this recording
        print(f"{REC} not found")
        continue
    except Exception as err:
        # still skip the recording, but report the actual reason instead of
        # mislabelling it as "not found"; KeyboardInterrupt is no longer swallowed
        print(f"{REC} could not be loaded: {err!r}")
        continue
    print(f"Doing {REC}")
    # `variables` is also read as a global by process_unit_data()
    rec_data, units, variables = clean(rec_data)
    data, X_min, X_max = normalize(rec_data, variables)
    # release the un-normalized frame before the write-heavy save step
    del rec_data
    save(REC, data, units, X_min, X_max)

    

Doing FC_220408_BAA1101192_hairpin
302
390
447
458
459
484
274
541
301
578
595
641
485
290
670
287
285
663
145
289
91
95
216
520
217
239
244
248
249
358
350
319
232
Doing FC_220409_BAA1101192_hairpin
573
558
552
428
376
366
571
Doing FC_220410_BAA1101192_hairpin
606
410
139
141
285
414
233
223
219
212
478
461
79
107
505
321
340
80
235
87
124
117
100
289
Doing FC_220411_BAA1101192_hairpin
590
292
280
425
572
293
545
543
554
557
562
565
567
551
417
397
400
470
481
467
387
347
357
71
75
83
88
99
174
182
183
185
262
266
273
312
320
329
375
17
Doing FC_220412_BAA1101192_hairpin
142
295
296
302
303
305
312
620
278
624
629
632
638
650
653
657
663
627
143
567
603
139
133
132
115
610
608
120
606
141
97
89
86
532
70
559
77
251
75
28
18
6
Doing FC_220413_BAA1101192_hairpin
643
443
447
306
452
459
460
469
584
635
606
633
629
627
625
617
608
585
392
565
521
531
534
382
561
564
512
555
373
363
43
71
78
80
85
96
111
371
128
126
150
157
287
288
338
339
137
171
182
189
17
12
7
6
3
Doing FC_220414_BAA11