# Data preprocessing

After the data have been prepared with `glm_data_prep`, load each recording's data, do a bit of cleaning and normalization, and then save each unit's data into a dedicated folder as a separate file.

In [1]:
# imports
import sys
from pathlib import Path
import pandas as pd
from sklearn.preprocessing import Normalizer
from fcutils.path import to_yaml, from_yaml
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

sys.path.append("./")
sys.path.append(r"C:\Users\Federico\Documents\GitHub\pysical_locomotion")
from analysis.ephys.utils import get_recording_names


# Folder holding the per-recording "<REC>_bouts.h5" files read by `load`.
cache = Path(r"D:\Dropbox (UCL)\Rotation_vte\Locomotion\analysis\ephys\GLM\data")
# Root output folder: `save` creates one "<REC>_unit_<unit>" sub-folder per unit in here.
base_dir = Path(r"D:\Dropbox (UCL)\Rotation_vte\Locomotion\analysis\ephys\GLM")

# YAML file that `save` reads and rewrites with one entry per exported unit.
metadatafile = base_dir / "metadata.yaml"

Connecting root@127.0.0.1:3306


## Load data 
and remove rows with NaNs.

In [2]:
def load(REC):
    """Read one recording's bout table from the cache, without incomplete rows.

    Args:
        REC: recording name; the table is read from
            ``cache / "<REC>_bouts.h5"`` under the HDF key ``"data"``.

    Returns:
        The stored DataFrame, re-indexed from 0 and then stripped of every
        row that contains a NaN (so the returned index may have gaps).
    """
    bouts_file = cache / (REC + "_bouts.h5")
    bouts = pd.read_hdf(bouts_file, key="data")

    # re-index first, then discard rows with missing values
    return bouts.reset_index(drop=True).dropna()

## Normalize and clean

Improve the column names (unit columns get a `unit_` prefix) and add squared copies of `v` and `omega`.

In [3]:
def clean(rec_data, n_variables=17):
    """Rename unit columns and add squared predictors, modifying ``rec_data`` in place.

    Args:
        rec_data: DataFrame whose first ``n_variables`` columns are the
            behavioral variables (including ``v`` and ``omega``) and whose
            unit columns are labelled with plain ``int`` identifiers.
        n_variables: number of leading columns to treat as behavioral
            variables. Defaults to 17, the value previously hard-coded.

    Returns:
        A tuple ``(rec_data, units, variables)``:
        ``rec_data`` is the same (mutated) DataFrame, ``units`` the list of
        original integer unit labels, and ``variables`` the behavioral column
        names followed by ``"v_squared"`` and ``"omega_squared"``.
    """
    # unit columns are recognised by their integer labels
    units = [c for c in rec_data.columns if isinstance(c, int)]
    # the variable names are taken before any column is renamed or added
    variables = list(rec_data.columns[:n_variables])

    # rename unit columns: <id> -> "unit_<id>"
    rec_data.rename(columns={u: f"unit_{u}" for u in units}, inplace=True)

    # add squared variables (bracket access: cannot be shadowed by a DataFrame attribute)
    rec_data["v_squared"] = rec_data["v"] ** 2
    rec_data["omega_squared"] = rec_data["omega"] ** 2
    variables += ["v_squared", "omega_squared"]
    return rec_data, units, variables

In [4]:
def normalize(rec_data, variables):
    """Scale the behavioral columns and rescale the remaining columns by 1/1000.

    Args:
        rec_data: DataFrame containing the ``variables`` columns plus the
            firing-rate columns (everything that is not in ``variables``).
        variables: names of the behavioral columns to normalize.

    Returns:
        A new DataFrame with the normalized behavioral columns first,
        followed by the rescaled firing-rate columns.
    """
    # split behavioral variables from units firing rates
    behavior = rec_data[variables]
    firing_rates = rec_data.drop(columns=variables)

    # Normalizer rescales each *row* to unit norm, so the table is transposed
    # on the way in and out to normalize each column of the behavioral data
    scaled = Normalizer().fit_transform(behavior.T).T
    behavior = pd.DataFrame(scaled, columns=behavior.columns, index=behavior.index)

    # go from firing rate to p(spike | ms), then put everything back together
    return pd.concat([behavior, firing_rates / 1000], axis=1)

## Save
Save each unit's data in a dedicated folder/file and update the metadata.

In [7]:
def process_unit_data(data, unit: str, predictors=None):
    """Extract the predictor columns plus one unit's column, renamed to ``p_spike``.

    Args:
        data: DataFrame holding the predictor columns and the column ``unit``.
        unit: name of the column to extract.
        predictors: names of the predictor columns to keep. When omitted, the
            module-level ``variables`` list is used (the previous behavior);
            that global must then already exist when this function is called.

    Returns:
        A new DataFrame (a copy, so ``data`` is left untouched) with the
        predictor columns followed by the ``unit`` column renamed ``p_spike``.
    """
    # fall back to the global only when no explicit column list is given
    columns = list(variables if predictors is None else predictors)
    unit_data = data[columns + [unit]].copy()
    return unit_data.rename(columns={unit: "p_spike"})


def save(REC, data, units, n_shuffles=100):
    """Write every unit's data and shuffles to disk and register it in the metadata.

    For each unit a folder ``base_dir / "<REC>_unit_<unit>"`` is created that
    holds ``data.parquet`` and a ``shuffles`` sub-folder with one
    ``shuffle_<k>.parquet`` file per shuffle. A unit whose folder already
    exists is skipped entirely: nothing is rewritten and no metadata entry is
    added or changed for it.

    Args:
        REC: recording name, used in folder names and metadata keys.
        data: DataFrame with the predictor columns, one ``unit_<unit>`` column
            per unit and one ``<unit>_shuffle_<k>`` column per unit and shuffle.
        units: identifiers of the units to export.
        n_shuffles: number of shuffle columns exported per unit. Defaults to
            100, the value previously hard-coded.
    """
    metadata = from_yaml(metadatafile) or dict()
    try:
        for unit in units:
            unit_folder = base_dir / f"{REC}_unit_{unit}"
            if unit_folder.exists():
                print(f"{unit_folder} already exists")
                continue
            unit_folder.mkdir(exist_ok=True)

            # save unit data
            unit_file = unit_folder / "data.parquet"
            process_unit_data(data, f"unit_{unit}").to_parquet(unit_file)

            # save shuffled units
            shuffles_folder = unit_folder / "shuffles"
            shuffles_folder.mkdir(exist_ok=True)

            # NOTE(review): shuffle columns are looked up WITHOUT the "unit_"
            # prefix — presumably that is how the upstream data preparation
            # names them; confirm.
            for shuffle in range(n_shuffles):
                shuffled = process_unit_data(data, f"{unit}_shuffle_{shuffle}")
                shuffled.to_parquet(shuffles_folder / f"shuffle_{shuffle}.parquet")

            # registered only once all of the unit's files have been written
            metadata[f"{REC}_{unit}"] = dict(
                recording=REC,
                folder=str(unit_folder),
                shuffles_folder=str(shuffles_folder),
                unit=unit,
                unit_data=str(unit_file),
                glm_fitted=False,
            )
    finally:
        # Persist the entries of the units completed so far even if a later
        # unit fails: their folders now exist, so a re-run would skip them and
        # they would otherwise never reach the metadata file.
        to_yaml(metadatafile, metadata)

# Run

In [8]:
# Process every recording: load -> clean -> normalize -> save one folder per unit.
for REC in get_recording_names():
    try:
        rec_data = load(REC)
    except FileNotFoundError:
        print(f"{REC} not found")
        continue
    except Exception as err:
        # Stay best-effort (skip recordings that cannot be loaded) but report
        # the real cause instead of claiming the file is missing, and let
        # KeyboardInterrupt / SystemExit propagate.
        print(f"{REC} could not be loaded: {err!r}")
        continue

    # NOTE: `variables` must remain a module-level name; `process_unit_data`
    # reads it as a global when `save` exports each unit.
    rec_data, units, variables = clean(rec_data)
    data = normalize(rec_data, variables)
    # release the un-normalized table before the (large) export step
    del rec_data
    save(REC, data, units)