# Functions

This notebook contains reusable functions that are used in other notebooks. Run it at the beginning of each of those notebooks using ```%%run NB_Functions```.

It contains the following functions:
- get_dir_content
- extract_date_from_path
- get_most_recent_date_in_epoch
- get_most_recent_date


In [None]:
import re
from notebookutils import mssparkutils
from datetime import datetime

In [None]:
def get_dir_content(folder_path, extension=None):
    """
    Walks a folder tree and collects the paths of all files found in it. Sub-folders are searched recursively.
    When an extension is given, only files whose path ends with that extension are kept.

    Arguments:
    folder_path : str : Path to the root folder to search.
    extension : str : Optional. File extension to filter by (e.g., '.json').

    Returns:
    List of file paths.
    """
    collected = []

    for entry in mssparkutils.fs.ls(folder_path):
        if entry.isDir:
            # Descend into the sub-folder; its files are added at this position in the listing order
            collected += get_dir_content(entry.path, extension)
            continue

        if extension is not None and not entry.path.endswith(extension):
            continue     # File does not have the requested extension

        collected.append(entry.path)

    return collected

def extract_date_from_path(file_path):
    """
    Extracts the date from a file path using regular expressions. The date needs to appear in the path
    as a /YYYY/MM/DD/ folder structure (surrounded by slashes).

    Arguments:
    file_path: path to the file to extract the date from.

    Returns:
    datetime object (midnight, no timezone) built from the first YYYY/MM/DD match in the file path.
    None if the path contains no such match, or if the matched digits are not a valid calendar date.
    """
    match = re.search(r'/(\d{4}/\d{2}/\d{2})/', file_path)     # First /YYYY/MM/DD/ segment in the path

    if match is None:
        return None

    try:
        return datetime.strptime(match.group(1), '%Y/%m/%d')
    except ValueError:
        # The digits matched the pattern but are not a real date (e.g. 2023/13/45)
        return None

def get_most_recent_date_in_epoch(files):
    """
    Extracts dates from a list of files and returns the most recent date in epoch format. Dates are extracted using the extract_date_from_path function.
    Files whose path does not contain a date are ignored.

    Arguments:
    files: list of files to extract the date from.

    Returns:
    Most recent date in epoch format (integer number of seconds).

    Raises:
    ValueError: if the list is empty or none of the file paths contains a date.
    """
    # extract_date_from_path returns None for paths without a date; max() cannot compare None with datetime, so drop them
    file_dates = [date for date in map(extract_date_from_path, files) if date is not None]

    if not file_dates:
        raise ValueError("No date in YYYY/MM/DD format found in the provided file paths")

    most_recent_date = max(file_dates)    # Get the most recent date

    # NOTE(review): the extracted datetimes carry no timezone, so timestamp() interprets them in the machine's local timezone - confirm this is intended
    return int(most_recent_date.timestamp())

def get_most_recent_date(files):
    """
    Extracts dates from a list of files and returns the most recent one. Dates are extracted using the extract_date_from_path function.
    Files whose path does not contain a date are ignored.

    Arguments:
    files: list of files to extract the date from.

    Returns:
    Most recent date as a datetime object (as produced by extract_date_from_path), not a formatted string.

    Raises:
    ValueError: if the list is empty or none of the file paths contains a date.
    """
    # extract_date_from_path returns None for paths without a date; max() cannot compare None with datetime, so drop them
    file_dates = [date for date in map(extract_date_from_path, files) if date is not None]

    if not file_dates:
        raise ValueError("No date in YYYY/MM/DD format found in the provided file paths")

    return max(file_dates)    # Get the most recent date
