Copy the TPC-H tab-delimited files (.tbl extension) from the /databricks-datasets/ directory into a Unity Catalog volume. Because this pipeline ingests generated incremental files in addition to the one-time historical files, it is easier to write everything to a UC volume than to read directly from /databricks-datasets.

In [None]:
# Copy the TPC-H source folders from /databricks-datasets into the `raw`
# Unity Catalog volume, skipping any folder that already exists there.
catalog_name = dbutils.widgets.get("catalog_name")
schema_name = dbutils.widgets.get("schema_name")

# Backtick-quote the widget values before splicing them into SQL so that names
# containing hyphens or other special characters parse as identifiers and
# cannot smuggle in extra SQL. An embedded backtick is escaped by doubling it.
catalog_sql = f"`{catalog_name.replace('`', '``')}`"
schema_sql = f"`{schema_name.replace('`', '``')}`"

spark.sql(f"CREATE SCHEMA IF NOT EXISTS {catalog_sql}.{schema_sql}")
spark.sql(f"CREATE VOLUME IF NOT EXISTS {catalog_sql}.{schema_sql}.raw")
spark.sql(f"CREATE VOLUME IF NOT EXISTS {catalog_sql}.{schema_sql}.config")

source_root = "/databricks-datasets/tpch/data-001/"
raw_volume_root = f"/Volumes/{catalog_name}/{schema_name}/raw"

# (source path, folder name) for every entry under the source root except the
# dataset README; trailing slashes are stripped from the names.
folders_to_copy = [
    (folder.path, folder.name.replace('/', ''))
    for folder in dbutils.fs.ls(source_root)
    if folder.name != 'README.md'
]

# Names of the entries already present in the destination volume.
folders_in_dest_name = [
    file.name.replace('/', '')
    for file in dbutils.fs.ls(f"{raw_volume_root}/")
]

for path, name in folders_to_copy:
    destination = f"{raw_volume_root}/{name}/"
    if name in folders_in_dest_name:
        # Leave existing folders untouched: we don't want to overwrite them or
        # any other files in the directory.
        # NOTE(review): this only checks that the folder exists, so a folder
        # left half-copied by an interrupted run is also skipped — confirm
        # that is acceptable.
        print(f"{destination} is already populated, moving on to the next one...")
        continue
    print(f"Copying files to {destination}...")
    dbutils.fs.cp(path, destination, recurse=True)