In [16]:
import re
from datetime import datetime

from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import col, when

# Start a SparkSession named "FileProcessing" unless one is already running.
spark = (
    SparkSession.builder
    .appName("FileProcessing")
    .getOrCreate()
)

# Matches "<anything>/<YYYY-MM-DD><table_name>.csv". Group 1 is the date and
# group 2 the table name. The dot before "csv" is escaped so only a literal
# ".csv" extension is accepted (an unescaped "." would match any character).
_DATED_CSV_PATTERN = re.compile(r".*/(\d{4}-\d{2}-\d{2})([A-Za-z0-9_]+)\.csv$")


def _pick_latest_csv(file_paths: "list[str]") -> "str | None":
    """Return the path whose filename carries the most recent date, or None."""
    dated_paths = []
    for path in file_paths:
        match = _DATED_CSV_PATTERN.search(path)
        if match is None:
            continue
        try:
            file_date = datetime.strptime(match.group(1), "%Y-%m-%d")
        except ValueError:
            # The digits fit the pattern but are not a real calendar date
            # (e.g. month 13). Skip this file instead of failing the scan.
            continue
        dated_paths.append((file_date, path))

    if not dated_paths:
        return None

    # Compare on the date only, so files sharing the latest date resolve to
    # the first one listed.
    return max(dated_paths, key=lambda item: item[0])[1]


def get_latest_file(lakehouse_path: str) -> "DataFrame | None":
    """
    Load the most recent dated CSV file found under a Lakehouse folder.

    Files are expected to be named ``<YYYY-MM-DD><table_name>.csv``. The file
    whose name carries the latest date is read with a header row and an
    inferred schema. Files whose names do not match the pattern, or whose
    date is not a valid calendar date, are ignored.

    :param lakehouse_path: Path of the folder to scan for CSV files.
    :return: DataFrame read from the latest file, or None if the folder has
        no files, no filename matches the pattern, or any error occurs
        (the error is printed, not raised).
    """
    try:
        # Enumerate the files through the binaryFile source; only the "path"
        # column is collected.
        files_df = spark.read.format("binaryFile").load(lakehouse_path)
        file_paths = [row.path for row in files_df.select("path").collect()]

        if not file_paths:
            print(f"No files found in {lakehouse_path}")
            return None

        latest_file_path = _pick_latest_csv(file_paths)
        if latest_file_path is None:
            print(f"No valid files matching pattern found in {lakehouse_path}")
            return None

        df = spark.read.csv(latest_file_path, header=True, inferSchema=True)

        print(f"Latest file for {lakehouse_path}: {latest_file_path}")
        return df

    except Exception as e:
        # Deliberate best-effort boundary: any failure while listing or
        # reading is reported and turned into None for the caller.
        print(f"Error processing {lakehouse_path}: {e}")
        return None



StatementMeta(, 4afccb1a-ad93-4741-837b-dc44c4810f12, 18, Finished, Available, Finished)