# Check the content of the data files

For each file in `goodFiles.txt`, check that the date in the name of the file corresponds to that inside the file.

In [None]:
dataDirectory = "dat/ReArm.lnk/ReArm_C1P02"

# Read goodFiles.txt into a list: one entry per line, with surrounding
# whitespace (including the trailing newline) removed.
goodFilesPath = "../" + dataDirectory + "/goodFiles.txt"
with open(goodFilesPath, "r") as listing:
    goodFiles = [entry.strip() for entry in listing]

print("goodFiles.txt contains {} files".format(len(goodFiles)))

In [None]:
# list all files in the data directory recursively and print the number of files
import os

# Build the lookup once: testing membership in a set is O(1) per file,
# whereas `in goodFiles` rescans the whole list for every file on disk.
knownGoodFiles = set(goodFiles)

allFiles = []
print("Searching for files in {}".format(dataDirectory))
print("Files not in 'goodFiles.txt':")
for root, dirs, files in os.walk("../" + dataDirectory):
    for name in files:
        # skip the listing itself and hidden files
        if name == "goodFiles.txt" or name.startswith("."):
            continue
        # get the absolute path to the file
        # NOTE(review): the comparison below is an exact string match, so it
        # presumably expects goodFiles.txt to hold absolute paths - confirm.
        fullPath = os.path.abspath(os.path.join(root, name))
        # report files that exist on disk but are not listed in goodFiles.txt
        if os.path.isfile(fullPath) and fullPath not in knownGoodFiles:
            print(fullPath)
        # every visited file is recorded, whether or not it is listed
        allFiles.append(fullPath)

print("allFiles contains {} files".format(len(allFiles)))

# Check the content of ONE data file

For one file, check that the date in the name of the file corresponds to that inside the file. 

Here, I use simple, loosely structured code as a first attempt at a solution.

In [None]:
import numpy as np
from datetime import datetime
import os


fullFname = goodFiles[8]  # xdf file
fullFname = goodFiles[18]  # cwa file
# fullFname = goodFiles[0] # csv file

# split the path into directory, base name and extension
fpath = os.path.dirname(fullFname)
fname = os.path.basename(fullFname)
fname, extension = os.path.splitext(fname)

# split the file name into tokens; tokens[2] is compared as a YYYYMMDD date below
tokens = fname.split("_")

# date is set to the Unix epoch (in local time) if not found
date = datetime.fromtimestamp(0, tz=None)

# create a message for this file
msg = ""
msg += f"{fname}{extension}: "


# tokens[5] requires at least 6 tokens: the former `len(tokens) > 4` guard let
# 5-token names through and raised IndexError.
if extension == ".csv" and len(tokens) > 5 and tokens[5] in ["k", "l"]:
    # The timestamps are always in column 1, after 3 lines of header
    # but it can be a string or a float (in milliseconds)
    # ndmin=2 keeps the array 2-D even when the file holds a single data row
    data = np.loadtxt(
        fullFname, skiprows=3, delimiter=",", max_rows=4, dtype=str, ndmin=2
    )
    # we only need the first timestamp
    timestamp = data[0, 0]
    try:
        timestamp = float(timestamp) / 1000  # in seconds
    except ValueError:
        # not a number: keep the string and parse it as a formatted date below
        pass
    if isinstance(timestamp, float):
        date = datetime.fromtimestamp(timestamp, tz=None)
    if isinstance(timestamp, str):
        date = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S.%f")


if extension == ".xdf":
    # There is no recording date in the xdf file...
    # We shall use the recording date from the corresponding kinect file in csv format
    import pyxdf

    # load the EuroMov-Mocap-Kinect stream
    data, header = pyxdf.load_xdf(
        filename=fullFname,
        select_streams=[{"type": "MoCap", "name": "EuroMov-Mocap-Kinect"}],
        synchronize_clocks=True,
        dejitter_timestamps=False,  # to get the raw timestamps to compare with the CSV
    )
    data_XDF = data[0]

    # get the corresponding kinect file name
    kinectFname = (
        tokens[0]
        + "_"
        + tokens[1]
        + "_"
        + tokens[2]
        + "_"
        + tokens[3]
        + "_"
        + tokens[4]
        + "_k.csv"
    )
    fullFnamek = os.path.join(fpath, kinectFname)

    # get the kinect data (ndmin=2 so that a one-row file still has rows on axis 0)
    data_CSV = np.loadtxt(
        fullFnamek, skiprows=3, delimiter=",", dtype=float, ndmin=2
    )

    # get the number of lines in the xdf and csv file
    nbLines_XDF = len(data_XDF["time_stamps"])
    nbLines_CSV = data_CSV.shape[0]

    # the CSV date is only trusted when both recordings have the same length
    if nbLines_CSV == nbLines_XDF and nbLines_CSV > 0:
        timestampCSV_sec = data_CSV[:, 0] / 1000
        date = datetime.fromtimestamp(timestampCSV_sec[0], tz=None)
    else:
        msg += f"Number of lines in the CSV file ({nbLines_CSV}) does not match the number of lines in the XDF file ({nbLines_XDF})."

if extension == ".cwa":
    from openmovement.load import CwaData

    # load the CWA file (using with statement to ensure file is closed after use)
    with CwaData(
        fullFname, include_gyro=False, include_temperature=True, verbose=False
    ) as cwa_data:
        pass

    # NOTE(review): the messages below talk about the recording *start*, but the
    # header field read here is "loggingEnd" - confirm which one is intended.
    timestamp = cwa_data.header["loggingEnd"]
    date = datetime.fromtimestamp(timestamp, tz=None)

    # number of days  between the start of the recording and the date in the file name
    delta = date - datetime.strptime(tokens[2], "%Y%m%d")
    nbDays = delta.days
    if nbDays > 0:
        msg += f"The recording started {nbDays} days after the date in the file name."
    else:
        msg += f"The recording started {-nbDays} days before the date in the file name."


# compare the date found in the file with the date token of the file name
date = date.strftime("%Y%m%d")
if date == tokens[2]:
    msg += f"({tokens[2]}) matches date inside the file ({date})."
else:
    msg += f"({tokens[2]}) does not match date inside the file ({date})."

print(msg)

# Check the content of ALL data files

For all files, check that the date in the name of the file corresponds to that inside the file.

Here, I use structured code: the previous attempt is reorganised into functions.

In [None]:
def check_csv_date(fullFname, msg=""):
    """
    Return the date of the first timestamp found in a csv file.

    The first column of the first data row (after 4 skipped header lines) is
    read. It is interpreted as milliseconds since the epoch when it parses as
    a number, otherwise as a "%Y-%m-%d %H:%M:%S.%f" formatted string.

    Parameters
    ----------
    fullFname : str
        Path to the csv file.
    msg : str
        Message accumulated so far; diagnostics are appended to it.

    Returns
    -------
    (date, msg) : (datetime, str)
        `date` is the Unix epoch (in local time) when no date could be read.

    Raises
    ------
    ValueError
        If the first field is neither a number nor a correctly formatted date.
    """
    # date is set to the Unix epoch (in local time) if not found
    date = datetime.fromtimestamp(0, tz=None)
    if not fullFname.endswith(".csv"):
        msg += f"{fullFname} is not a csv file."
        return date, msg

    # The timestamps are always in column 1, after 3-4 lines of header
    # but it can be a string or a float (in milliseconds).
    # ndmin=2 keeps the result 2-D even when only one data row is present;
    # without it data[0, 0] fails on the resulting 1-D array.
    data = np.loadtxt(
        fullFname, skiprows=4, delimiter=",", max_rows=5, dtype=str, ndmin=2
    )
    if data.size == 0:
        msg += f"{fullFname} contains no data."
        return date, msg

    # we only need the first timestamp
    firstField = data[0, 0]
    try:
        seconds = float(firstField) / 1000  # milliseconds -> seconds
    except ValueError:
        # not a number: the timestamp is stored as a formatted date string
        date = datetime.strptime(str(firstField), "%Y-%m-%d %H:%M:%S.%f")
    else:
        date = datetime.fromtimestamp(seconds, tz=None)
    return date, msg


# Quick manual check on a single entry of goodFiles
# (presumably a csv file at index 0 - confirm against goodFiles.txt).
print(check_csv_date(goodFiles[0]))

In [None]:
def check_xdf_date(fullFname, msg=""):
    """
    Return the recording date associated with a xdf file.

    The date is not read from the xdf file itself: it is taken from the first
    timestamp (milliseconds) of the kinect csv file that sits in the same
    directory and shares the first five "_"-separated tokens of the file name
    (suffix "_k.csv"). The date is only accepted when the csv file has the
    same number of lines as the EuroMov-Mocap-Kinect stream of the xdf file.

    Parameters
    ----------
    fullFname : str
        Path to the xdf file.
    msg : str
        Message accumulated so far; diagnostics are appended to it.

    Returns
    -------
    (date, msg) : (datetime or None, str)
        `date` is None when the file is not a xdf file, when the kinect file
        is missing, or when the line counts do not match.
    """
    if not fullFname.endswith(".xdf"):
        msg += f"{fullFname} is not a XDF file."
        return None, msg

    # get the corresponding kinect file name
    fpath = os.path.dirname(fullFname)
    fname, _ = os.path.splitext(os.path.basename(fullFname))
    tokens = fname.split("_")
    # slicing (instead of indexing tokens[0]..tokens[4]) cannot raise on short
    # names; such names simply lead to the "not found" message below
    kinectFname = "_".join(tokens[:5]) + "_k.csv"
    fullFnamek = os.path.join(fpath, kinectFname)

    # cheap check first: no need to load the xdf file if there is nothing to
    # compare it with
    if not os.path.isfile(fullFnamek):
        msg += f"Kinect file {fullFnamek} not found."
        return None, msg

    import pyxdf

    # load the EuroMov-Mocap-Kinect stream
    data, header = pyxdf.load_xdf(
        filename=fullFname,
        select_streams=[{"type": "MoCap", "name": "EuroMov-Mocap-Kinect"}],
        synchronize_clocks=True,
        dejitter_timestamps=False,  # to get the raw timestamps to compare with the CSV
    )
    data_XDF = data[0]

    # get the kinect data (ndmin=2 so that a one-row file still has rows on axis 0)
    data_CSV = np.loadtxt(
        fullFnamek, skiprows=3, delimiter=",", dtype=float, ndmin=2
    )

    # get the number of lines in the xdf and csv file
    nbLines_XDF = len(data_XDF["time_stamps"])
    nbLines_CSV = data_CSV.shape[0]

    # date stays None unless the two recordings agree; it was previously left
    # unbound in the mismatch case, which made the return statement fail
    date = None
    if nbLines_CSV == nbLines_XDF and nbLines_CSV > 0:
        timestampCSV_sec = data_CSV[:, 0] / 1000
        date = datetime.fromtimestamp(timestampCSV_sec[0], tz=None)
    else:
        msg += f"Number of lines ({nbLines_CSV}) in the CSV file {fullFnamek} does not match the number of lines ({nbLines_XDF}) in the XDF file ({fullFname})."

    return date, msg


# Quick manual check on a single entry of goodFiles
# (presumably a xdf file at index 8 - confirm against goodFiles.txt).
print(check_xdf_date(goodFiles[8]))

In [None]:
def check_cwa_date(fullFname, msg=""):
    """
    Return the date stored in the header of a cwa file.

    The date is built from the "loggingEnd" field of the cwa header and is
    compared with the date token (third "_"-separated token, YYYYMMDD) of the
    file name.

    Parameters
    ----------
    fullFname : str
        Path to the cwa file.
    msg : str
        Message accumulated so far; diagnostics are appended to it.

    Returns
    -------
    (date, nbDays, msg) : (datetime or None, int or None, str)
        `nbDays` is the whole number of days between the file-name date and
        the header date. `date` and `nbDays` are None for non-cwa files.

    Raises
    ------
    IndexError, ValueError
        If the file name has no third token or it is not a YYYYMMDD date.
    """
    import os

    if not fullFname.endswith(".cwa"):
        msg += f"{fullFname} is not a CWA file."
        return None, None, msg

    from openmovement.load import CwaData

    # The date token must come from THIS file's name. The function used to read
    # a global `tokens` left behind by another notebook cell, i.e. the name of
    # a different file.
    fname, _ = os.path.splitext(os.path.basename(fullFname))
    tokens = fname.split("_")

    # load the CWA file (using with statement to ensure file is closed after use)
    with CwaData(
        fullFname, include_gyro=False, include_temperature=True, verbose=False
    ) as cwa_data:
        # NOTE(review): the messages below talk about the recording *start*,
        # but the header field read here is "loggingEnd" - confirm which one
        # is intended.
        timestamp = cwa_data.header["loggingEnd"]

    date = datetime.fromtimestamp(timestamp, tz=None)

    # number of days  between the start of the recording and the date in the file name
    delta = date - datetime.strptime(tokens[2], "%Y%m%d")
    nbDays = delta.days
    if nbDays > 0:
        msg += f"The recording started {nbDays} days after the date in the file name."
    else:
        msg += f"The recording started {-nbDays} days before the date in the file name."
    return date, nbDays, msg


# Quick manual check on a single entry of goodFiles
# (presumably a cwa file at index 20 - confirm against goodFiles.txt).
print(check_cwa_date(goodFiles[20]))

In [None]:
import numpy as np
from datetime import datetime
import os


# Manual selection of one file for ad-hoc testing; only the last assignment
# takes effect.
# NOTE(review): the `for fullFname in goodFiles` loop later in this cell
# rebinds fullFname, so these assignments look like leftovers - confirm.
fullFname = goodFiles[8]  # xdf file
# fullFname = goodFiles[18] # cwa file
fullFname = goodFiles[0]  # csv file


def check_file_date(fullFname):
    """
    Print whether the date in a file name matches the date inside the file.

    The file name (without extension) is split on "_"; the third token is
    taken as the expected date (YYYYMMDD). The date inside the file is
    obtained from check_csv_date, check_xdf_date or check_cwa_date depending
    on the extension. For ".oxy3", ".oxy4" and ".easy" files the content is
    not inspected and the file-name date is used as is. For any other file
    the date stays at the Unix epoch (in local time).

    Parameters
    ----------
    fullFname : str
        Path to the file to check.

    Returns
    -------
    None
        A one-line report starting with the file name is printed.
    """
    # split the base name into name and extension
    fname = os.path.basename(fullFname)
    fname, extension = os.path.splitext(fname)

    # split the file name into tokens
    tokens = fname.split("_")

    # create a message for this file
    msg = f"{fname}{extension}: "

    # tokens[2] is needed below: report names without a date token instead of
    # failing with an IndexError
    if len(tokens) < 3:
        msg += "No date token found in the file name."
        print(msg)
        return

    # date is set to the Unix epoch (in local time) if not found
    date = datetime.fromtimestamp(0, tz=None)

    # tokens[5] requires at least 6 tokens: the former `len(tokens) > 4` guard
    # let 5-token names through and raised IndexError
    if extension == ".csv" and len(tokens) > 5 and tokens[5] in ["k", "l"]:
        date, msg = check_csv_date(fullFname, msg)
    elif extension == ".xdf":
        date, msg = check_xdf_date(fullFname, msg)
    elif extension == ".cwa":
        # the day difference is already described in msg by check_cwa_date
        date, _, msg = check_cwa_date(fullFname, msg)
    elif extension in [".oxy3", ".oxy4", ".easy"]:
        date = datetime.strptime(tokens[2], "%Y%m%d")
        msg += "Cannot check date inside 'oxy' or 'easy' files. "

    # the helpers may return None when no date could be determined
    if date is not None:
        date = date.strftime("%Y%m%d")
    if date == tokens[2]:
        msg += " OK."
    else:
        msg += f"({tokens[2]}) does not match date inside the file ({date})."

    print(msg)


# Run the check on every entry of goodFiles; check_file_date prints one
# report line per file.
for fullFname in goodFiles:
    # print(fullFname)
    check_file_date(fullFname)


# check_file_date (goodFiles[15])