# Data preprocessing

After having prepared the data with `glm_data_prep`, load each recording's data, do a bit of cleaning and normalization, and then save each unit's data into a dedicated folder as a separate file.

In [1]:
# imports
import sys
import joblib
from pathlib import Path
import pandas as pd
from sklearn.preprocessing import Normalizer
from fcutils.path import to_yaml, from_yaml
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

sys.path.append("./")
sys.path.append(r"C:\Users\Federico\Documents\GitHub\pysical_locomotion")
from analysis.ephys.utils import get_recording_names


# NOTE: both paths below are absolute, machine-specific locations; edit them
# before running this notebook on another computer.
# Folder from which `load` reads the per-recording "<REC>_bouts.h5" files.
cache = Path(r"D:\Dropbox (UCL)\Rotation_vte\Locomotion\analysis\ephys\GLM\data")
# Root folder under which `save` creates one "<REC>_unit_<unit>" folder per unit.
base_dir = Path(r"D:\Dropbox (UCL)\Rotation_vte\Locomotion\analysis\ephys\GLM")

# YAML file read and rewritten by `save`; holds one entry per processed unit.
metadatafile = base_dir / "metadata.yaml"

Connecting root@127.0.0.1:3306


## Load data 
Load a recording's bouts data and remove rows containing NaNs.

In [2]:
def load(REC):
    """
    Read the bouts table of one recording and discard incomplete rows.

    Arguments:
        REC: recording name; the file read is `cache / (REC + "_bouts.h5")`,
            using the HDF key "data".

    Returns:
        DataFrame without any row containing a NaN, re-indexed from 0.
    """
    bouts_file = cache / (REC + "_bouts.h5")
    bouts = pd.read_hdf(bouts_file, key="data")
    # NOTE: an earlier version also dropped the curv_35cm ... curv_60cm columns here.

    # drop rows with nans, then renumber the surviving rows
    return bouts.dropna().reset_index(drop=True)

## Normalize and clean

Give the unit columns descriptive names (`unit_<id>`) and add squared versions of `v` and `omega`.

In [3]:
def clean(rec_data, n_variables=17):
    """
    Rename unit columns and add squared behavioral variables.

    Unit columns are recognised by their label being an integer; they are
    renamed from `<id>` to `unit_<id>`. Columns with any other label are
    left untouched.

    Arguments:
        rec_data: DataFrame for one recording. It is modified IN PLACE
            (columns renamed, two columns appended) and also returned.
        n_variables: number of leading columns that hold the behavioral
            variables. Defaults to 17, the value previously hard-coded.

    Returns:
        rec_data: the same DataFrame, after renaming/adding columns.
        units: list of the original integer unit labels (not the renamed ones).
        variables: list with the first `n_variables` column names followed by
            "v_squared" and "omega_squared".
    """
    from numbers import Integral

    original_columns = list(rec_data.columns)

    # Integral also matches numpy integer labels (np.int64 etc.), which a plain
    # isinstance(c, int) check would silently miss, leaving `units` empty.
    units = [c for c in original_columns if isinstance(c, Integral)]

    # behavioral variables are taken by position, before any renaming
    variables = original_columns[:n_variables]

    # rename unit columns
    rec_data.rename(columns={c: f"unit_{c}" for c in units}, inplace=True)

    # add squared variables
    rec_data["v_squared"] = rec_data["v"] ** 2
    rec_data["omega_squared"] = rec_data["omega"] ** 2
    variables += ["v_squared", "omega_squared"]
    return rec_data, units, variables

In [4]:
def normalize(rec_data, variables):
    """
    Standardize the behavioral variables and rescale every other column.

    Arguments:
        rec_data: DataFrame holding both the behavioral variables and the
            remaining (unit activity) columns.
        variables: names of the behavioral-variable columns.

    Returns:
        data: DataFrame with the standardized variables first, followed by
            the remaining columns divided by 200.
        X_mean: per-variable mean used for the standardization.
        X_std: per-variable standard deviation used for the standardization.
            The original values can be recovered as `x * X_std + X_mean`.
    """
    # behavioral variables on one side, all remaining columns on the other
    behavior = rec_data[variables]
    activity = rec_data.drop(columns=variables)

    # the original note describes this as going from firing rate to
    # p(spike | ms); the origin of the 200 divisor is not shown here - TODO confirm
    activity = activity / 200

    # z-score each behavioral variable (min-max scaling was an earlier alternative)
    X_mean, X_std = behavior.mean(), behavior.std()
    behavior = behavior.sub(X_mean, axis="columns").div(X_std, axis="columns")

    # put everything back together
    data = pd.concat([behavior, activity], axis=1)
    return data, X_mean, X_std

## Save
Save each unit's data to a dedicated folder/file and update the metadata.

In [5]:
def process_unit_data(data, unit:str, activity_only=False, variables=None):
    """
    Extract the table to be saved for a single unit (or one of its shuffles).

    Arguments:
        data: DataFrame with the behavioral variables and the activity columns.
        unit: name of the activity column to extract.
        activity_only: if True only the activity column is kept, otherwise
            the behavioral variables are included as well.
        variables: names of the behavioral-variable columns. When omitted the
            notebook-level `variables` global is used, as in earlier versions
            of this function.

    Returns:
        A new DataFrame in which the activity column is named "p_spike".

    Raises:
        NameError: if `variables` is needed but neither passed nor defined
            at notebook level.
    """
    if activity_only:
        columns = [unit]
    else:
        if variables is None:
            # backward-compatible fallback on the notebook-level global
            try:
                variables = globals()["variables"]
            except KeyError:
                raise NameError(
                    "name 'variables' is not defined: pass `variables=` explicitly"
                ) from None
        columns = list(variables) + [unit]

    # selecting with a list returns a new frame, so `data` is never modified
    return data[columns].rename(columns={unit: 'p_spike'})


def save(REC, data, units, X_mean, X_std, region, variables=None, n_shuffles=100):
    """
    Save each unit's data in a dedicated folder and register it in the metadata.

    For every unit not yet present in the metadata file this creates
    `base_dir / "<REC>_unit_<unit>"` containing:
        - normalizers/<REC>_mean.h5 and normalizers/<REC>_std.h5
        - data.parquet: behavioral variables + the unit's "p_spike" column
        - shuffles/shuffle_<i>.parquet: the "<unit>_shuffle_<i>" column only
    The metadata file is rewritten after each unit.

    Arguments:
        REC: recording name.
        data: DataFrame with the behavioral variables, "unit_<unit>" columns
            and "<unit>_shuffle_<i>" columns.
        units: iterable of unit identifiers.
        X_mean: per-variable mean used for normalization.
        X_std: per-variable standard deviation used for normalization.
        region: brain region label stored in the metadata.
        variables: names of the behavioral-variable columns. Defaults to the
            index of `X_mean`, i.e. the columns that were normalized, so the
            function does not depend on a notebook-level global.
        n_shuffles: number of shuffle columns to save per unit (default 100).
    """
    if variables is None:
        variables = list(X_mean.index)

    metadata = from_yaml(metadatafile) or dict()
    for unit in units:
        print(unit)
        entry_name = f"{REC}_{unit}"
        if entry_name in metadata:
            # already processed in a previous run
            continue

        unit_folder = base_dir / f"{REC}_unit_{unit}"
        normalizers_folder = unit_folder / "normalizers"
        shuffles_folder = unit_folder / "shuffles"
        unit_data_file = unit_folder / "data.parquet"
        for folder in (unit_folder, normalizers_folder, shuffles_folder):
            folder.mkdir(parents=True, exist_ok=True)

        # save normalizers
        X_mean.to_hdf(normalizers_folder / f"{REC}_mean.h5", key="data")
        X_std.to_hdf(normalizers_folder / f"{REC}_std.h5", key="data")

        # save unit data
        process_unit_data(data, f"unit_{unit}", variables=variables).to_parquet(unit_data_file)

        # save shuffled units
        for shuffle in range(n_shuffles):
            process_unit_data(
                data, f"{unit}_shuffle_{shuffle}", activity_only=True
            ).to_parquet(shuffles_folder / f"shuffle_{shuffle}.parquet")

        metadata[entry_name] = dict(
            recording=REC,
            folder=str(unit_folder),
            shuffles_folder=str(shuffles_folder),
            unit=unit,
            unit_data=str(unit_data_file),
            glm_fitted=False,
            region=region,
        )

        # rewritten after every unit; presumably so an interrupted run can
        # resume from where it stopped - TODO confirm
        to_yaml(metadatafile, metadata)

# Run

In [6]:
# Choose the brain region whose recordings will be processed; `REGION` is also
# the label passed to `save` and stored in each unit's metadata entry.
REGION = "CUN/PPN"
# names of the recordings to process, as returned for that region
rec_to_do = get_recording_names(region=REGION)
# display the list as the cell output
rec_to_do

In [7]:
# Process every selected recording: load -> clean -> normalize -> save.
# Recordings that cannot be loaded are reported and skipped (best effort).
for REC in rec_to_do:
    try:
        rec_data = load(REC)
    except FileNotFoundError as e:
        print(f"{REC} not found {e}")
        continue
    except Exception as e:
        # still skip, but do not mislabel other failures as a missing file
        print(f"{REC} could not be loaded ({type(e).__name__}): {e}")
        continue
    print(f"Doing {REC}")
    rec_data, units, variables = clean(rec_data)
    # normalize returns the per-variable mean and standard deviation
    data, X_mean, X_std = normalize(rec_data, variables)
    del rec_data  # drop the reference to the un-normalized frame before saving
    save(REC, data, units, X_mean, X_std, REGION)

    

Doing FC_210715_AAA1110750_r5_hairpin
833
818
817
811
798
733
Doing FC_210716_AAA1110750_r6_hairpin
753
774
781
785
794
832
747
Doing FC_210720_AAA1110750_hairpin
328
324
323
320
316
308
FC_210721_AAA1110750_hairpin not found File D:\Dropbox (UCL)\Rotation_vte\Locomotion\analysis\ephys\GLM\data\FC_210721_AAA1110750_hairpin_bouts.h5 does not exist
Doing FC_210722_AAA1110750_hairpin
349
348
345
Doing FC_211022_BAA110516_hairpin
711
700
695
708
600
595
592
530
551
554
Doing FC_211027_BAA110516_hairpin
545
461
Doing FC_211214_BAA110517_hairpin
379
368
Doing FC_220114_BAA110517_hairpin
501
489
Doing FC_220117_BAA110517_hairpin
638
560
565
589
632
615
629
593
Doing FC_220119_BAA110517_hairpin
405
344
351
355
385
361
378
380
358
Doing FC_220120_BAA110517_hairpin
510
444
449
472
454
471
452
Doing FC_210917_BAA1110279_hairpin
Doing FC_210820_BAA1110281_hairpin
649
506
505
501
482
481
579
395
360
367
383
384
390
319
402
408
410
416
435
440
447
460
318
298
342
320
Doing FC_210829_BAA1110281_hairp