In [1]:
# TODOS:
# Add warnings if expected directory or files do not exist
# Comment heavily

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import ecephys.sglx.paths as sglx_paths

In [2]:
import re
from itertools import chain
from pathlib import Path

#import ecephys.data_mgmt.paths as mgmt
#import os

In [4]:
import ecephys_analyses as ea

# Path to the example organization spec, looked up through the ecephys_analyses
# helper (presumably a data file bundled with that package -- TODO confirm).
yaml_path = ea.package_datapath('example_org_spec.yaml')

In [5]:
import yaml

# Parse the YAML organization spec into Python objects.
with open(yaml_path) as fp:
    yaml_data = yaml.safe_load(fp)
# `doc` is the name the cells below use for the parsed spec.
doc = yaml_data

In [6]:
# Pick one example session / run / gate / probe from the spec; the demo cells
# below use these names to exercise the helper functions.
session = doc['recording-sessions'][0]
run = session['SpikeGLX-runs'][2]
gate = 'g1'
probe = 'imec0'

# Getting and parsing files

In [428]:
def parse_sglx_fname(fname):
    """Parse recording identifiers from a SpikeGLX style filename.

    Parameters
    ----------
    fname: str
        The filename to parse, e.g. "my-run-name_g0_t1.imec2.lf.bin"

    Returns
    -------
    run: str
        The run name, e.g. "my-run-name".
    gate: str
        The gate identifier, e.g. "g0".
    trigger: str
        The trigger identifier, e.g. "t1".
    probe: str
        The probe identifier, e.g. "imec2"
    stream: str
        The data type identifier, "lf" or "ap"
    ftype: str
        The file type identifier, "bin" or "meta"

    Raises
    ------
    ValueError
        If `fname` does not end with a
        "_g<N>_t<N>.imec<N>.<ap|lf>.<bin|meta>" suffix.
    """
    # Every "." is escaped so only literal dot separators are accepted, and
    # \Z forces the suffix to sit at the very end of the string.
    match = re.search(
        r"_(?P<gate>g\d+)_(?P<trigger>t\d+)"
        r"\.(?P<probe>imec\d+)\.(?P<stream>ap|lf)\.(?P<ftype>bin|meta)\Z",
        fname,
    )
    if match is None:
        raise ValueError(f"Not a SpikeGLX trigger filename: {fname!r}")

    run = fname[: match.start()]  # The run name is everything before the suffix.
    return (run, *match.group("gate", "trigger", "probe", "stream", "ftype"))

In [429]:
# Sanity check: parse a representative trigger filename into its six identifiers.
fname = '3-1-2021_A_g1_t0.imec0.lf.meta'
parse_sglx_fname(fname)

('3-1-2021_A', 'g1', 't0', 'imec0', 'lf', 'meta')

In [431]:
# Folder-per-probe layout: <raw-data-root>/<session dir>/<run>_<gate>/<run>_<gate>_<probe>
probe_dir = Path(doc['raw-data-root'], session['directory'], f"{run}_{gate}", f"{run}_{gate}_{probe}")
probe_dir

PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_A_g1/3-1-2021_A_g1_imec0')

In [433]:
def get_trigger_files(probe_dir):
    """Return the SpikeGLX trigger files found directly inside a probe directory.

    Parameters
    ----------
    probe_dir: pathlib.Path
        The directory to search. It is not searched recursively.

    Returns
    -------
    list of pathlib.Path
        Regular files whose name ends with
        "_g<N>_t<N>.imec<N>.<ap|lf>.<bin|meta>", sorted by path.
    """
    # The glob is only a coarse pre-filter. It uses "imec*" rather than
    # "imec[0-9]" so that probes with multi-digit indices (imec10, ...) are
    # not silently dropped; the regex below is what decides.
    fname_pattern = re.compile(r"_g\d+_t\d+\.imec\d+\.(ap|lf)\.(bin|meta)\Z")
    matches = [
        p
        for p in probe_dir.glob("*_g*_t*.imec*.*.*")
        if p.is_file() and fname_pattern.search(p.name)
    ]
    return sorted(matches)

# Demo: list the trigger files in the example probe directory.
get_trigger_files(probe_dir)

[PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_A_g1/3-1-2021_A_g1_imec0/3-1-2021_A_g1_t0.imec0.ap.bin'),
 PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_A_g1/3-1-2021_A_g1_imec0/3-1-2021_A_g1_t0.imec0.ap.meta'),
 PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_A_g1/3-1-2021_A_g1_imec0/3-1-2021_A_g1_t0.imec0.lf.bin'),
 PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_A_g1/3-1-2021_A_g1_imec0/3-1-2021_A_g1_t0.imec0.lf.meta'),
 PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_A_g1/3-1-2021_A_g1_imec0/3-1-2021_A_g1_t1.imec0.ap.bin'),
 PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_A_g1/3-1-2021_A_g1_imec0/3-1-2021_A_g1_t1.imec0.ap.meta'),
 PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_A_g1/3-1-2021_A_g1_imec0/3-1-2021_A_g1_t1.imec0.lf.bin'),
 Po

In [434]:
# This function is probably unnecessary

def get_unique_trigger_stems(probe_dir, with_probe=False):
    """Return the sorted, de-duplicated trigger stems present in a probe directory.

    Parameters
    ----------
    probe_dir: pathlib.Path
        Directory passed on to `get_trigger_files`.
    with_probe: bool
        If true, each stem also carries the probe, e.g. "run_g0_t0.imec0";
        otherwise stems look like "run_g0_t0".

    Returns
    -------
    list of str
        The unique stems in sorted order.
    """
    unique_stems = set()
    for trigger_file in get_trigger_files(probe_dir):
        run, gate, trigger, probe, _stream, _ftype = parse_sglx_fname(trigger_file.name)
        if with_probe:
            unique_stems.add(f"{run}_{gate}_{trigger}.{probe}")
        else:
            unique_stems.add(f"{run}_{gate}_{trigger}")
    return sorted(unique_stems)


# Demo: unique stems for the example probe directory, without and with the probe suffix.
print(get_unique_trigger_stems(probe_dir))
print(get_unique_trigger_stems(probe_dir, with_probe=True))

['3-1-2021_A_g1_t0', '3-1-2021_A_g1_t1']
['3-1-2021_A_g1_t0.imec0', '3-1-2021_A_g1_t1.imec0']


In [435]:
# Directory of the example session: <raw-data-root>/<session directory>.
session_dir = Path(doc['raw-data-root'], session['directory'])

In [436]:
# Directory of the example gate inside the session directory: <run>_<gate>.
gate_dir = session_dir / f"{run}_{gate}"

### The following functions assume folder-per-probe organization

In [440]:
def get_probe_directories(gate_dir, probe_regex=r"imec\d+"):
    """Return the probe directories found directly inside a gate directory.

    Assumes folder-per-probe organization, where each probe directory is
    named "<gate directory name>_imec<N>".

    Parameters
    ----------
    gate_dir: pathlib.Path
        The gate directory to search, e.g. ".../my-run_g0".
    probe_regex: str
        Regular expression the probe part of the directory name must match,
        e.g. "imec0" to select a single probe. Defaults to any imec probe.

    Returns
    -------
    list of pathlib.Path
        Matching probe directories, sorted by path.
    """
    prefix = f"{gate_dir.name}_"
    name_pattern = re.compile(r"_g\d+_" + probe_regex + r"\Z")
    # The glob is a coarse pre-filter using "imec*" rather than "imec[0-9]",
    # so multi-digit probes (imec10, ...) are not silently dropped. The
    # fullmatch then restricts the probe part to "imec" followed by digits.
    matches = [
        p
        for p in gate_dir.glob(f"{prefix}imec*")
        if p.is_dir()
        and re.fullmatch(r"imec\d+", p.name[len(prefix):])
        and name_pattern.search(p.name)
    ]
    return sorted(matches)

# Demo: all probe directories of the example gate.
get_probe_directories(gate_dir)

[PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_A_g1/3-1-2021_A_g1_imec0'),
 PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_A_g1/3-1-2021_A_g1_imec1')]

In [441]:
# Demo: restrict the search to a single probe through `probe_regex`.
get_probe_directories(gate_dir, probe_regex="imec0")

[PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_A_g1/3-1-2021_A_g1_imec0')]

In [442]:
def get_gate_files(gate_dir):
    """Collect the trigger files of every probe directory inside a gate directory.

    Parameters
    ----------
    gate_dir: pathlib.Path
        Gate directory passed on to `get_probe_directories`.

    Returns
    -------
    list
        The results of `get_trigger_files` for each probe directory,
        concatenated in the order the probe directories are returned.
    """
    gate_files = []
    for probe_dir in get_probe_directories(gate_dir):
        gate_files.extend(get_trigger_files(probe_dir))
    return gate_files
    
# Demo: every trigger file of the example gate, across all of its probes.
get_gate_files(gate_dir)

[PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_A_g1/3-1-2021_A_g1_imec0/3-1-2021_A_g1_t0.imec0.ap.bin'),
 PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_A_g1/3-1-2021_A_g1_imec0/3-1-2021_A_g1_t0.imec0.ap.meta'),
 PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_A_g1/3-1-2021_A_g1_imec0/3-1-2021_A_g1_t0.imec0.lf.bin'),
 PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_A_g1/3-1-2021_A_g1_imec0/3-1-2021_A_g1_t0.imec0.lf.meta'),
 PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_A_g1/3-1-2021_A_g1_imec0/3-1-2021_A_g1_t1.imec0.ap.bin'),
 PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_A_g1/3-1-2021_A_g1_imec0/3-1-2021_A_g1_t1.imec0.ap.meta'),
 PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_A_g1/3-1-2021_A_g1_imec0/3-1-2021_A_g1_t1.imec0.lf.bin'),
 Po

### Functions that extend the SpikeGLX schema

In [463]:
def get_gate_directories(session_dir, run):
    """Return the gate directories of one run found directly inside a session directory.

    Parameters
    ----------
    session_dir: pathlib.Path
        The session directory to search.
    run: str
        The run name, e.g. "my-run-name". It is matched literally.

    Returns
    -------
    list of pathlib.Path
        Directories named exactly "<run>_g<N>", sorted by path.
    """
    # The glob is only a coarse pre-filter. The whole name must be
    # "<run>_g<N>": checking the "_g<N>" suffix alone would also accept
    # directories of other runs whose name merely starts with "<run>_g".
    name_pattern = re.compile(re.escape(str(run)) + r"_g\d+")
    matches = [
        p
        for p in session_dir.glob(f"{run}_g*")
        if p.is_dir() and name_pattern.fullmatch(p.name)
    ]
    return sorted(matches)

# Demo: all gate directories of the example run.
get_gate_directories(session_dir, run)

[PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_A_g0'),
 PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_A_g1')]

In [464]:
def get_run_files(session_dir, run):
    """Collect the files of every gate of one run.

    Parameters
    ----------
    session_dir: pathlib.Path
        Session directory passed on to `get_gate_directories`.
    run: str
        The run name passed on to `get_gate_directories`.

    Returns
    -------
    list
        The results of `get_gate_files` for each gate directory,
        concatenated in the order the gate directories are returned.
    """
    run_files = []
    for gate_dir in get_gate_directories(session_dir, run):
        run_files.extend(get_gate_files(gate_dir))
    return run_files

# Demo: every file of the example run, across all of its gates and probes.
get_run_files(session_dir, run)

[PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_A_g0/3-1-2021_A_g0_imec0/3-1-2021_A_g0_t0.imec0.ap.bin'),
 PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_A_g0/3-1-2021_A_g0_imec0/3-1-2021_A_g0_t0.imec0.ap.meta'),
 PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_A_g0/3-1-2021_A_g0_imec0/3-1-2021_A_g0_t0.imec0.lf.bin'),
 PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_A_g0/3-1-2021_A_g0_imec0/3-1-2021_A_g0_t0.imec0.lf.meta'),
 PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_A_g0/3-1-2021_A_g0_imec1/3-1-2021_A_g0_t0.imec1.ap.bin'),
 PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_A_g0/3-1-2021_A_g0_imec1/3-1-2021_A_g0_t0.imec1.ap.meta'),
 PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_A_g0/3-1-2021_A_g0_imec1/3-1-2021_A_g0_t0.imec1.lf.bin'),
 Po

In [28]:
def get_session_files(session_dir, runs):
    """Collect the files of several runs recorded in one session directory.

    Parameters
    ----------
    session_dir: pathlib.Path
        Session directory passed on to `get_run_files`.
    runs: iterable of str
        The run names, processed in the order given.

    Returns
    -------
    list
        The results of `get_run_files` for each run, concatenated.
    """
    session_files = []
    for run in runs:
        session_files.extend(get_run_files(session_dir, run))
    return session_files

# Demo: all files of the example session; the trailing ';' suppresses the cell output.
get_session_files(session_dir, session["SpikeGLX-runs"]);

In [50]:
# This is probably not a useful function, and mostly exists for completeness. 
def get_document_files(doc):
    """Collect the files of every recording session listed in the document.

    Parameters
    ----------
    doc: dict
        The parsed organization spec. Reads "raw-data-root" and, for each
        entry of "recording-sessions", its "directory" and "SpikeGLX-runs".

    Returns
    -------
    list
        The results of `get_session_files` for each session, concatenated
        in document order.
    """
    document_files = []
    for session_spec in doc["recording-sessions"]:
        session_dir = Path(doc["raw-data-root"]) / session_spec["directory"]
        document_files.extend(
            get_session_files(session_dir, session_spec["SpikeGLX-runs"])
        )
    return document_files


# Demo: every file referenced by the document; the trailing ';' suppresses the cell output.
get_document_files(doc);

### Getting experiment files

In [372]:
def get_experiment_files(doc, experiment_name):
    """Collect the files of every recording session of one experiment.

    Parameters
    ----------
    doc: dict
        The parsed organization spec. Reads "raw-data-root" and the
        "recording-sessions" listed under doc["experiments"][experiment_name].
    experiment_name: str
        Key of the experiment inside doc["experiments"].

    Returns
    -------
    list
        The results of `get_session_files` for each of the experiment's
        sessions, concatenated in the order they are listed.
    """
    experiment_sessions = doc["experiments"][experiment_name]['recording-sessions']
    experiment_files = []
    for session_spec in experiment_sessions:
        session_dir = Path(doc["raw-data-root"]) / session_spec["directory"]
        experiment_files.extend(
            get_session_files(session_dir, session_spec["SpikeGLX-runs"])
        )
    return experiment_files

# Demo: all files of the 'sleep-homeostasis' experiment; the trailing ';' suppresses the cell output.
get_experiment_files(doc, 'sleep-homeostasis');

In [162]:
import pandas as pd

def _slice_files_by_name(files, start, end):
    """Return the files whose trigger stem lies between `start` and `end`.

    Files must be sorted BY STEM (e.g. separated by probe) before using this
    function; otherwise pd.Index.slice_locs may be unable to resolve the
    bounds and raise an error.

    Parameters
    ----------
    files: list of pathlib.Path
        Trigger files, sorted by their "<run>_<gate>_<trigger>" stem.
    start, end: str
        Stems bounding the slice. The bounds are resolved by label with
        pd.Index.slice_locs.

    Returns
    -------
    list of pathlib.Path
        The selected, contiguous part of `files`.
    """
    trigger_stems = []
    for f in files:
        run, gate, trigger, _probe, _stream, _ftype = parse_sglx_fname(f.name)
        trigger_stems.append(f"{run}_{gate}_{trigger}")
    first, stop = pd.Index(trigger_stems).slice_locs(start, end)
    return files[first:stop]


def get_alias_files(doc, experiment_name, alias_name):
    """Return the experiment files that fall inside a named alias.

    The alias is read from doc["experiments"][experiment_name]["aliases"]
    and gives a "start_file" and an "end_file" trigger stem. The slice is
    taken separately for each probe, and the selected files are returned in
    the order they appear in the experiment's file list.

    Parameters
    ----------
    doc: dict
        The parsed organization spec.
    experiment_name: str
        Key of the experiment inside doc["experiments"].
    alias_name: str
        Key of the alias inside the experiment's "aliases".

    Returns
    -------
    list
        The files of the experiment selected by the alias.
    """
    alias = doc["experiments"][experiment_name]["aliases"][alias_name]
    experiment_files = get_experiment_files(doc, experiment_name)

    # Build the membership set once, instead of re-flattening the per-probe
    # slices into a list for every file that is tested.
    alias_files = set()
    for files in separate_files_by_probe(experiment_files).values():
        alias_files.update(
            _slice_files_by_name(files, alias["start_file"], alias["end_file"])
        )

    return [f for f in experiment_files if f in alias_files]


# Demo: files selected by one alias of the 'sleep-homeostasis' experiment; ';' suppresses the output.
get_alias_files(doc, "sleep-homeostasis", "light-period-circadian-match");

# Manipulating non-dataframe file lists

In [19]:
def filter_files(files, run=None, gate=None, trigger=None, probe=None, stream=None, ftype=None):
    """Keep only the files whose parsed identifiers equal the requested ones.

    Parameters
    ----------
    files: iterable of pathlib.Path
        Files whose names can be parsed by `parse_sglx_fname`.
    run, gate, trigger, probe, stream, ftype: str, optional
        Required value for each identifier. None (the default) places no
        constraint on that identifier.

    Returns
    -------
    list of pathlib.Path
        The files satisfying every constraint, in their original order.
    """
    wanted = (run, gate, trigger, probe, stream, ftype)

    def satisfies_all(path):
        parsed = parse_sglx_fname(path.name)
        return all(want is None or want == got for want, got in zip(wanted, parsed))

    return [f for f in files if satisfies_all(f)]

In [465]:
# Demo: keep only the imec0 'lf' stream .bin files out of every file in the document.
filter_files(get_document_files(doc), probe='imec0', stream='lf', ftype='bin')

[PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_g0/3-1-2021_g0_imec0/3-1-2021_g0_t0.imec0.lf.bin'),
 PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021-a_g0/3-1-2021-a_g0_imec0/3-1-2021-a_g0_t0.imec0.lf.bin'),
 PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_A_g0/3-1-2021_A_g0_imec0/3-1-2021_A_g0_t0.imec0.lf.bin'),
 PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_A_g1/3-1-2021_A_g1_imec0/3-1-2021_A_g1_t0.imec0.lf.bin'),
 PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_A_g1/3-1-2021_A_g1_imec0/3-1-2021_A_g1_t1.imec0.lf.bin'),
 PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_B_g0/3-1-2021_B_g0_imec0/3-1-2021_B_g0_t0.imec0.lf.bin'),
 PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_C_g0/3-1-2021_C_g0_imec0/3-1-2021_C_g0_t0.imec0.lf.bin'),
 PosixPath('

In [466]:
def remove_suffixes(files, regex=r"\.imec\d+\.(lf|ap)\.(bin|meta)"):
    """Strip the probe/stream/file-type suffix from each file's name.

    Parameters
    ----------
    files: iterable of pathlib.Path
        The paths to process.
    regex: str
        Pattern removed from each file name with re.sub. The default removes
        ".imec<N>.<lf|ap>.<bin|meta>".

    Returns
    -------
    list of pathlib.Path
        Paths with the same parent directory and the shortened name. Names
        that do not match `regex` are left unchanged.
    """
    stripped = []
    for f in files:
        short_name = re.sub(regex, '', f.name)
        stripped.append(f.parent / short_name)
    return stripped

# Demo: suffix-free paths for every file in the document (duplicates are expected).
remove_suffixes(get_document_files(doc))

[PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_g0/3-1-2021_g0_imec0/3-1-2021_g0_t0'),
 PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_g0/3-1-2021_g0_imec0/3-1-2021_g0_t0'),
 PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_g0/3-1-2021_g0_imec0/3-1-2021_g0_t0'),
 PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_g0/3-1-2021_g0_imec0/3-1-2021_g0_t0'),
 PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_g0/3-1-2021_g0_imec1/3-1-2021_g0_t0'),
 PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_g0/3-1-2021_g0_imec1/3-1-2021_g0_t0'),
 PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_g0/3-1-2021_g0_imec1/3-1-2021_g0_t0'),
 PosixPath('/Volumes/neuropixel_archive/Data/chronic/CNPIX8-Allan/3-1-2021/3-1-2021_g0/3-1-2021_g0_imec1/3-1-2021_g0_t0'),
 PosixPath('/Vol

In [352]:
# This function isn't really for files, it's just generic, and can probably be gotten rid of
def remove_duplicates(files):
    """Drop repeated items, keeping the first occurrence of each.

    Parameters
    ----------
    files: iterable
        Hashable items (e.g. paths), possibly containing repeats.

    Returns
    -------
    list
        The distinct items in order of first appearance.
    """
    seen = set()
    unique = []
    for item in files:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique

# Demo: one suffix-free path per imec0 'lf' .bin file; the trailing ';' suppresses the cell output.
remove_duplicates(remove_suffixes(filter_files(get_document_files(doc), probe='imec0', stream='lf', ftype='bin')));

In [195]:
def separate_files_by_probe(files):
    """Group files by the probe identifier parsed from their names.

    Parameters
    ----------
    files: list of pathlib.Path
        Files whose names can be parsed by `parse_sglx_fname`.

    Returns
    -------
    dict
        Maps each probe identifier (keys in sorted order) to the list of its
        files, which keep their relative order from `files`.
    """
    # Index 3 of the parsed tuple is the probe identifier.
    probe_of_file = [parse_sglx_fname(f.name)[3] for f in files]
    files_by_probe = {probe: [] for probe in sorted(set(probe_of_file))}
    for f, probe in zip(files, probe_of_file):
        files_by_probe[probe].append(f)
    return files_by_probe

# Demo: group every file in the document by probe; the trailing ';' suppresses the cell output.
separate_files_by_probe(get_document_files(doc));

# Manipulating files with dataframes

In [346]:
def files_to_frame(files):
    """Build a DataFrame with one row per file and one column per identifier.

    Parameters
    ----------
    files: list of pathlib.Path
        Files whose names can be parsed by `parse_sglx_fname`. May be empty.

    Returns
    -------
    pandas.DataFrame
        Columns "run", "gate", "trigger", "probe", "stream", "ftype" hold the
        parsed identifiers and "path" holds the file itself. An empty input
        gives an empty frame that still has all seven columns.
    """
    columns = ["run", "gate", "trigger", "probe", "stream", "ftype", "path"]
    # Building row records (rather than unpacking zip(*parses)) keeps the
    # empty case working: zip(*[]) yields nothing to unpack.
    records = [(*parse_sglx_fname(f.name), f) for f in files]
    return pd.DataFrame(records, columns=columns)

In [347]:
# Demo: tabulate the 'sleep-homeostasis' experiment files; the trailing ';' suppresses the cell output.
files_to_frame(get_experiment_files(doc, 'sleep-homeostasis'));

In [389]:
from pandas.api.types import CategoricalDtype


def sort_strings_by_integer_suffix(strings):
    """Sort strings such as "imec2" and "imec10" by their trailing integer.

    Parameters
    ----------
    strings: iterable of str
        Strings made of a non-digit prefix followed by an integer.

    Returns
    -------
    list of str
        The strings ordered by the integer value that follows the prefix.
    """

    def integer_suffix(string):
        # Splitting on the captured leading non-digit run leaves the
        # remainder of the string as the last piece.
        return int(re.split(r"(^[^\d]+)", string)[-1])

    ordered = list(strings)
    ordered.sort(key=integer_suffix)
    return ordered


def make_categorical(df, run_order="infer"):
    """Return a copy of a file frame with categorical identifiers, indexed by them.

    The input frame is left untouched.

    Parameters
    ----------
    df: pandas.DataFrame
        Frame with "run", "gate", "trigger", "probe", "stream" and "ftype"
        columns, as built by `files_to_frame`.
    run_order: "infer", list of str, or a falsy value
        "infer" (the default) makes "run" an ordered categorical whose order
        is the order of first appearance in `df`. A list makes "run" an
        ordered categorical with exactly that order. A falsy value makes
        "run" an unordered categorical.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by ("run", "gate", "trigger", "probe", "stream",
        "ftype"). "gate", "trigger" and "probe" are ordered categoricals
        sorted by their integer suffix; "stream" is an unordered categorical.
    """
    # Work on a copy: the column assignments below must not leak into the
    # caller's frame.
    df = df.copy()

    if run_order == "infer":
        run_dtype = CategoricalDtype(df["run"].unique(), ordered=True)
    elif run_order:
        run_dtype = CategoricalDtype(run_order, ordered=True)
    else:
        run_dtype = CategoricalDtype(df["run"].unique(), ordered=False)

    df["run"] = df["run"].astype(run_dtype)
    df["stream"] = df["stream"].astype(
        CategoricalDtype(df["stream"].unique(), ordered=False)
    )
    for x in ["gate", "trigger", "probe"]:
        # Order by integer suffix so that e.g. "t10" sorts after "t2".
        df[x] = df[x].astype(
            CategoricalDtype(
                sort_strings_by_integer_suffix(df[x].unique()), ordered=True
            )
        )

    return df.set_index(["run", "gate", "trigger", "probe", "stream", "ftype"])


# Demo: categorical, indexed frame for the 'sleep-homeostasis' experiment; ';' suppresses the output.
make_categorical(files_to_frame(get_experiment_files(doc, "sleep-homeostasis")));

In [390]:
def get_experiment_files_as_sorted_frame(doc, experiment):
    """Return the files of one experiment as a categorical frame with a sorted index.

    Parameters
    ----------
    doc: dict
        The parsed organization spec.
    experiment: str
        Key of the experiment inside doc["experiments"].

    Returns
    -------
    pandas.DataFrame
        The result of `make_categorical` on the experiment's files, with its
        index sorted.
    """
    # Use the requested experiment rather than a hard-coded experiment name.
    experiment_files = get_experiment_files(doc, experiment)
    return make_categorical(files_to_frame(experiment_files)).sort_index()

In [416]:
def parse_trigger_stem(stem):
    """Parse a trigger stem such as "my-run-name_g0_t1" into its identifiers.

    Parameters
    ----------
    stem: str
        The stem to parse, ending with "_g<N>_t<N>".

    Returns
    -------
    run: str
        The run name, e.g. "my-run-name".
    gate: str
        The gate identifier, e.g. "g0".
    trigger: str
        The trigger identifier, e.g. "t1".

    Raises
    ------
    ValueError
        If `stem` does not end with "_g<N>_t<N>".
    """
    match = re.search(r"_(g\d+)_(t\d+)\Z", stem)  # \Z forces match at string end.
    if match is None:
        raise ValueError(f"Not a SpikeGLX trigger stem: {stem!r}")

    gate, trigger = match.groups()
    run = stem[: match.start()]  # The run name is everything before the match.
    return (run, gate, trigger)

def get_alias_files_as_sorted_frame(doc, experiment_name, alias_name):
    """Return the rows of an experiment's sorted file frame that fall inside an alias.

    Parameters
    ----------
    doc: dict
        The parsed organization spec.
    experiment_name: str
        Key of the experiment inside doc["experiments"].
    alias_name: str
        Key of the alias inside the experiment's "aliases". The alias gives
        "start_file" and "end_file" trigger stems.

    Returns
    -------
    pandas.DataFrame
        The sorted experiment frame sliced from the parsed start stem to the
        parsed end stem.
    """
    alias = doc["experiments"][experiment_name]["aliases"][alias_name]
    sorted_frame = get_experiment_files_as_sorted_frame(doc, experiment_name)
    # (run, gate, trigger) tuples bounding the slice on the frame's index.
    first_key = parse_trigger_stem(alias['start_file'])
    last_key = parse_trigger_stem(alias['end_file'])
    return sorted_frame[first_key:last_key]

# Demo: frame rows selected by one alias of the 'sleep-homeostasis' experiment; ';' suppresses the output.
get_alias_files_as_sorted_frame(doc, 'sleep-homeostasis', 'light-period-circadian-match');

In [None]:
#Create new branch
#SGLX schema + agnostic files should be in their own file, probably in sglx_utils
#Functions that extend SGLX schema should be in a separate file, in ecephys_analyses, and should return dataframes