# Data preprocessing

After preparing the data with `glm_data_prep`, load each recording's data, do a bit of cleaning and normalization, and then save each unit's data as a separate file in its own dedicated folder.

In [12]:
# imports
import sys
from pathlib import Path
import pandas as pd
from sklearn.preprocessing import Normalizer
from fcutils.path import to_yaml, from_yaml
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

sys.path.append("./")
sys.path.append(r"C:\Users\Federico\Documents\GitHub\pysical_locomotion")
from analysis.ephys.utils import get_recording_names


# folder holding the per-recording `<REC>_bouts.h5` files read by `load`
cache = Path(r"D:\Dropbox (UCL)\Rotation_vte\Locomotion\analysis\ephys\GLM\data")
# root folder of the GLM analysis: `save` creates one sub-folder per unit in here
base_dir = Path(r"D:\Dropbox (UCL)\Rotation_vte\Locomotion\analysis\ephys\GLM")

# YAML file in which `save` records one metadata entry per unit
metadatafile = base_dir / "metadata.yaml"

## Load data 
and remove rows containing NaNs.

In [13]:
def load(REC):
    """
    Load the cached bouts table of one recording and drop incomplete rows.

    Arguments:
        REC: recording name; the table is read from `<REC>_bouts.h5`
            (key "data") inside the `cache` folder.

    Returns:
        The recording's DataFrame without any row containing a NaN.
        The index is reset to a default integer index *before* the NaN rows
        are removed, so the returned index may contain gaps.
    """
    rec_data = pd.read_hdf(cache / (REC + "_bouts.h5"), key="data")

    # drop rows with nans
    return rec_data.reset_index(drop=True).dropna()

## Normalize and clean

Improve the column names.

In [14]:
def clean(rec_data, n_variables=17):
    """
    Rename the unit columns and add squared speed terms.

    The frame is modified in place and also returned.

    Arguments:
        rec_data: DataFrame whose first `n_variables` columns are the
            behavioral variables (must include `v` and `omega`) and whose
            unit columns are labelled with integers.
        n_variables: number of leading columns to treat as behavioral
            variables. Defaults to 17.

    Returns:
        rec_data: the same frame, with every integer column label `<n>`
            renamed to `unit_<n>` and with `v_squared` / `omega_squared`
            columns appended.
        units: the original integer labels of the unit columns.
        variables: names of the behavioral variables, including the two
            squared columns.
    """
    units = [c for c in rec_data.columns if isinstance(c, int)]
    variables = list(rec_data.columns[:n_variables])

    # rename unit columns (only the integer labels need an entry in the map)
    rec_data.rename(columns={unit: f"unit_{unit}" for unit in units}, inplace=True)

    # add squared variables
    for name in ("v", "omega"):
        squared_name = f"{name}_squared"
        rec_data[squared_name] = rec_data[name] ** 2
        variables.append(squared_name)

    return rec_data, units, variables

In [15]:
def normalize(rec_data, variables):
    """
    Rescale the behavioral variables and convert firing rates to p(spike | ms).

    Arguments:
        rec_data: DataFrame holding both the behavioral variables and the
            remaining (firing rate) columns.
        variables: names of the behavioral variable columns.

    Returns:
        A new DataFrame with the rescaled behavioral columns first, followed
        by every other column of `rec_data` divided by 1000.
    """
    behavior = rec_data[variables]

    # everything that is not a behavioral variable is treated as a firing
    # rate: go from firing rate to p(spike | ms)
    p_spike = rec_data.drop(variables, axis=1) / 1000

    # Normalizer works on rows, so transpose to have it rescale each
    # variable (column) instead, then transpose back
    rescaled = Normalizer().fit_transform(behavior.T).T
    behavior = pd.DataFrame(
        rescaled, columns=behavior.columns, index=behavior.index
    )

    # put everything back together
    return pd.concat([behavior, p_spike], axis=1)

## Save
Save each unit's data to a file in a dedicated folder and update the metadata.

In [16]:
def save(REC, data, units, variables=None):
    """
    Write one parquet file per unit and register each unit in the metadata.

    For every unit, a folder `<REC>_unit_<unit>` is created inside `base_dir`
    and a `data.parquet` file is stored in it, holding the behavioral
    variables plus that unit's column renamed to `p_spike`. The metadata
    YAML file gets one entry per unit, keyed `<REC>_<unit>`, with
    `glm_fitted` set to False.

    Arguments:
        REC: recording name.
        data: DataFrame with the behavioral variables and one
            `unit_<unit>` column per unit.
        units: identifiers of the units to save.
        variables: names of the behavioral variable columns. When omitted,
            the module-level `variables` is used, which is what callers
            passing only three arguments have always relied on.

    Raises:
        NameError: if `variables` is omitted and no module-level
            `variables` exists.
    """
    if variables is None:
        # Legacy three-argument calls depended on a module-level `variables`;
        # keep honouring it so they continue to work unchanged.
        try:
            variables = globals()["variables"]
        except KeyError:
            raise NameError(
                "name 'variables' is not defined: pass `variables` to save()"
            ) from None
    variables = list(variables)

    metadata = from_yaml(metadatafile) or {}
    for unit in units:
        unit_folder = base_dir / f"{REC}_unit_{unit}"
        unit_folder.mkdir(exist_ok=True)
        unit_file = unit_folder / "data.parquet"

        # selecting with a list already yields a new frame, and rename
        # returns another one, so `data` itself is never modified
        unit_column = f"unit_{unit}"
        unit_data = data[variables + [unit_column]].rename(
            columns={unit_column: "p_spike"}
        )
        unit_data.to_parquet(unit_file)

        metadata[f"{REC}_{unit}"] = dict(
            recording=REC,
            unit=unit,
            unit_data=str(unit_file),
            glm_fitted=False,
        )
    to_yaml(metadatafile, metadata)

# Run

In [17]:
# Process every recording: load -> clean -> normalize -> save.
# Recordings whose data cannot be loaded are reported and skipped.
for REC in get_recording_names():
    try:
        rec_data = load(REC)
    except FileNotFoundError:
        print(f"{REC} not found")
        continue
    except Exception as err:
        # still best-effort, but report the real cause instead of labelling
        # every failure "not found"; KeyboardInterrupt is no longer swallowed
        print(f"{REC} could not be loaded: {err!r}")
        continue

    # NOTE: `variables` must keep this name at module level, `save` reads it
    rec_data, units, variables = clean(rec_data)
    data = normalize(rec_data, variables)
    del rec_data  # release the un-normalized frame before writing to disk
    save(REC, data, units)

FC_220408_BAA1101192_hairpin not found
(366168, 724)
(333465, 724)
[573, 558, 552, 428, 376, 366, 571]
['s', 'sdot', 'v', 'dv_250ms', 'dv_500ms', 'dv_1000ms', 'omega', 'domega_250ms', 'domega_500ms', 'domega_1000ms', 'curv_0cm', 'curv_5cm', 'curv_10cm', 'curv_15cm', 'curv_20cm', 'curv_25cm', 'curv_30cm']
(667131, 2441)
(608937, 2441)
[606, 410, 139, 141, 285, 414, 233, 223, 219, 212, 478, 461, 79, 107, 505, 321, 340, 80, 235, 87, 124, 117, 100, 289]
['s', 'sdot', 'v', 'dv_250ms', 'dv_500ms', 'dv_1000ms', 'omega', 'domega_250ms', 'domega_500ms', 'domega_1000ms', 'curv_0cm', 'curv_5cm', 'curv_10cm', 'curv_15cm', 'curv_20cm', 'curv_25cm', 'curv_30cm']
(332663, 4057)
(294577, 4057)
[590, 292, 280, 425, 572, 293, 545, 543, 554, 557, 562, 565, 567, 551, 417, 397, 400, 470, 481, 467, 387, 347, 357, 71, 75, 83, 88, 99, 174, 182, 183, 185, 262, 266, 273, 312, 320, 329, 375, 17]
['s', 'sdot', 'v', 'dv_250ms', 'dv_500ms', 'dv_1000ms', 'omega', 'domega_250ms', 'domega_500ms', 'domega_1000ms', 'cur