# Check the content of the data files

For each file listed in `goodFiles.log`, check that the date in the file name matches the date recorded inside the file.

In [None]:
# Test switch: True (the default) runs the example calls that follow each
# function; set it to False for production use.
# If doRunTests was already defined from outside (e.g., by the test script),
# the existing value is kept.

globals().setdefault("doRunTests", True)

# Check the content of ONE data file

For one file, check that the date in the name of the file corresponds to that inside the file. 

Here, I use simple, not really structured code, as a first attempt toward the solution.

# Check the content of ALL data files

For all files, check that the date in the name of the file corresponds to that inside the file.

Here, I use structured code, inspired by the previous attempt, but with functions.

In [None]:
import os
import numpy as np
from datetime import datetime


def check_csv_date(fullFname, msg=""):
    """
    Get the date of a csv file from the first timestamp it contains.

    The timestamp is read from the first column of the first row that follows
    the 4 skipped header lines. It is either a number (milliseconds, converted
    to seconds and passed to datetime.fromtimestamp) or a string formatted as
    "%Y-%m-%d %H:%M:%S.%f".

    Parameters:
        fullFname: path of the csv file.
        msg: message accumulated so far; explanations are appended to it.

    Returns:
        (date, msg). date is a naive datetime; it stays at timestamp 0
        (1970-01-01 in UTC) when the file is not a csv file or holds no data
        row, in which case msg explains why.

    Raises:
        ValueError: if the first field is neither a number nor a string in
            the expected format.
    """
    # date is set to 1970-01-01 00:00:00 if not found
    date = datetime.fromtimestamp(0, tz=None)
    if not fullFname.endswith(".csv"):
        msg += f"{fullFname} is not a csv file."
        return date, msg

    # The timestamps are always in the first column, after 3-4 lines of header
    # but it can be a string or a float (in milliseconds).
    # Only the first data row is needed; ndmin=2 keeps the result 2-D even
    # when that row is the only one, so the [0, 0] index below is always valid.
    data = np.loadtxt(
        fullFname, skiprows=4, delimiter=",", max_rows=1, dtype=str, ndmin=2
    )
    if data.size == 0:
        msg += f"{fullFname} contains no data rows."
        return date, msg

    timestamp = str(data[0, 0]).strip()
    try:
        seconds = float(timestamp) / 1000  # milliseconds -> seconds
    except ValueError:
        # not a number: the timestamp is a formatted date string
        date = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S.%f")
    else:
        date = datetime.fromtimestamp(seconds, tz=None)
    return date, msg


# Example run of check_csv_date on one Kinect csv file (only in test mode).
# The path is relative to the current working directory; the data file must exist.
if doRunTests:
    dataPath = "../dat/ReArm.lnk/ReArm_C1P02"
    fname = "Reaching/ReArm_C1P02_20210322_1_r_k.csv"
    fnameToTest = os.path.join(dataPath, fname)
    # prints the (date, msg) tuple returned by check_csv_date
    print(check_csv_date(fnameToTest))

In [None]:
def check_xdf_date(fullFname, msg=""):
    """
    Get the date of a xdf file from its companion Kinect csv file.

    The Kinect file lives in the same folder and is named after the first
    five "_"-separated tokens of the xdf file name, followed by "_k.csv".
    Its first timestamp (milliseconds) is used as the date, but only when the
    Kinect file has the same number of lines as the EuroMov-Mocap-Kinect
    stream of the xdf file.

    Parameters:
        fullFname: path of the xdf file.
        msg: message accumulated so far; explanations are appended to it.

    Returns:
        (date, msg). date is a naive datetime, or None when the file is not a
        xdf file, the Kinect file is missing, or the line counts do not match
        (msg then explains why).
    """
    if not fullFname.endswith(".xdf"):
        msg += f"{fullFname} is not a XDF file."
        return None, msg

    # get the corresponding kinect file name
    fpath = os.path.dirname(fullFname)
    fname, _extension = os.path.splitext(os.path.basename(fullFname))
    kinectFname = "_".join(fname.split("_")[:5]) + "_k.csv"
    fullFnamek = os.path.join(fpath, kinectFname)

    # check if the kinect file exists before loading the xdf file,
    # so that nothing is loaded when the comparison cannot be made
    if not os.path.isfile(fullFnamek):
        msg += f"Kinect file {fullFnamek} not found."
        return None, msg

    import pyxdf

    # load the EuroMov-Mocap-Kinect stream
    data, _header = pyxdf.load_xdf(
        filename=fullFname,
        select_streams=[{"type": "MoCap", "name": "EuroMov-Mocap-Kinect"}],
        synchronize_clocks=True,
        dejitter_timestamps=False,  # to get the raw timestamps to compare with the CSV
    )
    data_XDF = data[0]

    # get the kinect data; ndmin=2 keeps one row per line even for a
    # single-line file, so shape[0] is always the number of lines
    data_CSV = np.loadtxt(
        fullFnamek, skiprows=3, delimiter=",", dtype=float, ndmin=2
    )

    # get the number of lines in the xdf and csv file
    nbLines_XDF = len(data_XDF["time_stamps"])
    nbLines_CSV = data_CSV.shape[0]

    # date stays None when the line counts do not match
    date = None
    if nbLines_CSV == nbLines_XDF and nbLines_CSV > 0:
        timestampCSV_sec = data_CSV[:, 0] / 1000  # milliseconds -> seconds
        date = datetime.fromtimestamp(timestampCSV_sec[0], tz=None)
    else:
        msg += f"Number of lines ({nbLines_CSV}) in the CSV file {fullFnamek} does not match the number of lines ({nbLines_XDF}) in the XDF file ({fullFname})."

    return date, msg


# Example run of check_xdf_date on one xdf file (only in test mode).
# The path is relative to the current working directory; the data file must exist.
if doRunTests:
    dataPath = "../dat/ReArm.lnk/ReArm_C1P02"
    fname = "Circle/ReArm_C1P02_20210322_1_c.xdf"
    fnameToTest = os.path.join(dataPath, fname)

    # prints the (date, msg) tuple returned by check_xdf_date
    print(check_xdf_date(fnameToTest))

In [None]:
def check_cwa_date(fullFname, msg=""):
    """
    Compare the date in the header of a cwa file with the date in its name.

    The date inside the file is the "loggingEnd" entry of the cwa header.
    The date in the name is the third "_"-separated token of the base name,
    parsed as "%Y%m%d".

    Parameters:
        fullFname: path of the cwa file.
        msg: message accumulated so far; the result is appended to it.

    Returns:
        (date, nbDays, msg). nbDays is the number of whole days from the date
        in the name to the date in the header. (None, None, msg) is returned
        when the file is not a cwa file.
    """
    is_cwa = fullFname.endswith(".cwa")
    if not is_cwa:
        msg += f"{fullFname} is not a CWA file."
        return None, None, msg

    from openmovement.load import CwaData

    # the with statement only opens and closes the file;
    # the header is read from the object afterwards
    with CwaData(
        fullFname, include_gyro=False, include_temperature=True, verbose=False
    ) as cwa_data:
        pass

    # NOTE(review): the header entry used is "loggingEnd", while the messages
    # below say "started" -- confirm which one is intended
    date = datetime.fromtimestamp(cwa_data.header["loggingEnd"], tz=None)

    # date written in the file name
    dateToken = os.path.basename(fullFname).split("_")[2]
    dateFromFname = datetime.strptime(dateToken, "%Y%m%d")

    # whole days between the date in the name and the date in the header
    nbDays = (date - dateFromFname).days
    if nbDays > 0:
        msg += f"The recording started {nbDays} days after the date in the file name."
    else:
        msg += f"The recording started {-nbDays} days before the date in the file name."
    return date, nbDays, msg


# Example run of check_cwa_date on one accelerometry file (only in test mode).
# The path is relative to the current working directory; the data file must exist.
if doRunTests:
    dataPath = "../dat/ReArm.lnk/ReArm_C1P02"
    fname = "Accelerometry/ReArm_C1P02_20210324_1_ac_np.cwa"
    fnameToTest = os.path.join(dataPath, fname)

    # prints the (date, nbDays, msg) tuple returned by check_cwa_date
    print(check_cwa_date(fnameToTest))

In [None]:
def check_file_date(fullFname):
    """
    Check that the date in a file name matches the date inside the file.

    The date in the name is the third "_"-separated token of the base name
    (format "%Y%m%d"). The date inside the file is obtained according to the
    extension:
      - .csv with a sixth token "k" or "l": check_csv_date
      - .xdf: check_xdf_date
      - .cwa: check_cwa_date
      - .oxy3, .oxy4, .easy, .pdf: the content cannot be checked, so the
        date from the name is used
    For any other file the date stays at timestamp 0 and is reported as a
    mismatch.

    A message is printed only when the check is not OK. Nothing is returned.

    Parameters:
        fullFname: path of the file to check.
    """
    # separate the base name from its extension
    fname, extension = os.path.splitext(os.path.basename(fullFname))

    # split the file name into tokens
    tokens = fname.split("_")

    # create a message for this file
    msg = f"{fname}{extension}: "

    # the date is the third token: without it there is nothing to compare
    if len(tokens) < 3:
        print(msg + "No date found in the file name.")
        return

    # date is set to 1970-01-01 00:00:00 if not found
    date = datetime.fromtimestamp(0, tz=None)

    if extension == ".csv":
        # tokens[5] only exists when there are at least 6 tokens
        if len(tokens) > 5 and tokens[5] in ("k", "l"):
            date, msg = check_csv_date(fullFname, msg)
    elif extension == ".xdf":
        date, msg = check_xdf_date(fullFname, msg)
    elif extension == ".cwa":
        date, _nbDays, msg = check_cwa_date(fullFname, msg)
    elif extension in (".oxy3", ".oxy4", ".easy", ".pdf"):
        date = datetime.strptime(tokens[2], "%Y%m%d")
        msg += "Cannot check date inside {} files. ".format(extension)

    # date can be None when check_xdf_date could not determine it
    if date is not None:
        date = date.strftime("%Y%m%d")
    if date == tokens[2]:
        msg += " OK."
    else:
        msg += f"({tokens[2]}) does not match date inside the file ({date})."

    # only report the files with a problem
    if not msg.endswith("OK."):
        print(msg)

def check_all_files_date(visit_path):
    """
    Check the date of every file listed in the goodFiles.log of a visit.

    goodFiles.log is read from visit_path; each non-blank line is taken as
    the path of a file and passed to check_file_date.

    Parameters:
        visit_path: folder that contains goodFiles.log.
    """
    fullFname_goodFiles = os.path.join(visit_path, "goodFiles.log")
    # load goodFiles.log into a list, skipping blank lines
    # (an empty path has no date token to check)
    with open(fullFname_goodFiles, "r") as f:
        goodFiles = [line.strip() for line in f if line.strip()]

    for fullFname in goodFiles:
        check_file_date(fullFname)

        
# Example run of check_all_files_date on one visit (only in test mode).
# Despite its name, fullFname_goodFiles holds the folder of the visit,
# which is what check_all_files_date expects as visit_path.
if doRunTests:
    fullFname_goodFiles = "../dat/ReArm.lnk/ReArm_C1P02"
    check_all_files_date(fullFname_goodFiles)

