# Explore all of the manifests

This notebook will:
- read all of the manifest.tsv files
- search each manifest for column names containing `filepath`
- check that each of the files listed in those columns exists and is not empty

In [None]:
import glob
import os

import pandas as pd
from datetime import datetime

In [None]:
# Record the start time so the total runtime can be reported at the end.
t_start = datetime.now()

## Locate all manifests

In [None]:
# Root directory of the dataset. Its immediate subdirectories are searched for
# manifest.tsv files, and it is prepended to every file path read from a manifest.
data_root = "/Volumes/data/datasets/AIREADI/YEAR2/"  # change this to your own path

In [None]:
# Look for a manifest.tsv exactly one directory level below data_root.
# os.path.join avoids the doubled separator that plain concatenation produces
# when data_root already ends in "/", and sorting makes the listing order
# reproducible (glob returns matches in arbitrary order).
manifest_list = sorted(glob.glob(os.path.join(data_root, "*", "manifest.tsv")))
print(f"{len(manifest_list)} manifest.tsv files found")

In [None]:
# List the path of every manifest that was found, one per line.
for m in manifest_list:
    print(m)

# Supporting functions

In [None]:
def file_exists(f):
    """Return the size of file *f* in bytes, or 0 if it is missing or empty.

    Despite the name, the result is a size rather than a boolean: any
    non-zero value means the path exists and has content.

    Args:
        f: Path of the file to check.

    Returns:
        The size in bytes reported by ``os.path.getsize``, or 0 when the
        path does not exist, cannot be stat'ed, or is an empty file.
    """
    try:
        # A single stat call: no separate exists() check, so a file that
        # disappears mid-check cannot raise between the two calls.
        return os.path.getsize(f)
    except (OSError, ValueError):
        # OSError: missing or inaccessible path. ValueError: a path the OS
        # cannot represent (e.g. embedded NUL), which os.path.exists also
        # treats as "does not exist".
        return 0

In [None]:
def read_manifest(m, verbose=False):
    """Load a tab-separated manifest and pick out its file-path columns.

    Args:
        m: Path of the manifest file to read.
        verbose: When True, print the manifest's columns and the columns
            identified as file paths.

    Returns:
        A tuple ``(dataframe, path_columns)`` where ``path_columns`` lists, in
        their original order, the column names containing ``"filepath"``.
    """
    manifest = pd.read_csv(m, sep="\t")
    if verbose:
        print(f"{m} has columns:\n{manifest.columns}")

    # Keep every column whose name mentions "filepath".
    path_columns = []
    for name in manifest.columns:
        if "filepath" in name:
            path_columns.append(name)

    if verbose:
        print(f"  file_path columns are {path_columns}")

    return manifest, path_columns

# Review column names (for harmonization)

In [None]:
# Read every manifest verbosely so its columns and the detected file-path
# columns are printed for review.
for m in manifest_list:
    print("\n", 40 * "_")  # visual separator between manifests
    df, fpcols = read_manifest(m, verbose=True)

In [None]:
# Show the manifest paths again, this time as a single Python list.
print(manifest_list)

## manifest read check

This cell reads each manifest into a pandas dataframe and checks the size of every file listed in its file-path columns.
This can take a long time and is optional. If the manifest list above printed successfully, then the manifest files have already been found.

In [None]:
verbose = False  # omit extra statements to help this run more quickly

# For every manifest, walk each file-path column and check that the referenced
# file exists and is non-empty. To check only a subset, loop over an explicit
# list of manifest paths instead of manifest_list.
for m in manifest_list:
    print("\n", 40 * "_", f" {m} ", 10 * "_")
    df, fpcols = read_manifest(m, verbose=verbose)
    for c in fpcols:
        print(c)
        n_checked = 0
        n_missing = 0  # files that do not exist or are empty
        for f in df[c].values:
            if pd.isna(f) or f == "None":
                if verbose:
                    print(f"No filepath for one item in {c}; skip file size check. {f}")
                continue
            try:
                # Manifest paths are appended to data_root as-is.
                fpath = data_root + f
            except TypeError as e:
                # A non-string entry cannot form a path. Skip it so the check
                # below never runs against a stale fpath from a previous row.
                print(f"Failed to concatenate file name {f} due to {e}")
                continue
            fsize = file_exists(fpath)
            n_checked += 1
            if fsize == 0:
                n_missing += 1
            if verbose and fsize < 10:  # pick some small size to note
                print(f"Small file report: {fsize} size of {fpath}")
        # Always report the outcome; otherwise the check is silent unless verbose.
        print(f"  {n_missing} of {n_checked} files missing or empty")
    print(f"Dataframe shape: {df.shape}")

In [None]:
# Useful settings if this notebook is used to view any of the dataframes

In [None]:
# Display settings for inspecting dataframes: never truncate cell contents,
# and show up to 500 rows before pandas abbreviates the output.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 500

In [None]:
# Record the finish time; the runtime is reported as t_finish - t_start.
t_finish = datetime.now()

In [None]:
print("Done finding files.")
# Elapsed time between the start and finish timestamps, printed as a timedelta.
print(f"Runtime: {t_finish - t_start}")  # may be 30 minutes or longer