# Parameters passed from Pipeline

In [None]:
# Year folder of the Delta table to clean up (used below to build the table path).
dynamicyear = ""
# Month folder of the Delta table to clean up (used below to build the table path).
dynamicmonth = ""
# Storage account name; used for the abfss:// paths and the Data Lake client URL.
stgAccountName = ""
# Cut-off value: rows whose tpep_pickup_datetime is earlier than this are deleted.
retentionDate = ""
# Number of years subtracted from the current year to obtain the retention year.
retentionTimeInYears = 2

# Delete data that falls outside the retention policy from the Data Lake (2 years in our example)

In [None]:
from delta.tables import *
from pyspark.sql.functions import *

adls2_account_name = stgAccountName
adls2_container_name = "datalake"
adls2_folderyear = dynamicyear
adls2_foldermonth = dynamicmonth

# Location of the Delta table for the requested year/month folder.
delta_table_path = 'abfss://{0}@{1}.dfs.core.windows.net/{2}/{3}/'.format(
    adls2_container_name, adls2_account_name, adls2_folderyear, adls2_foldermonth
)

# Create Delta Table Object
deltaTableLake = DeltaTable.forPath(spark, delta_table_path)

# Delete data according to the company policy (2 years in our example).
# The predicate is built as a Column expression so the pipeline-supplied value
# is passed as a literal instead of being spliced into SQL text.
deleteStatement = str(retentionDate)
deltaTableLake.delete(col("tpep_pickup_datetime") < lit(deleteStatement))

# Vacuum files that are no longer referenced.
# The retention-duration check must be disabled for a zero-hour vacuum; it is
# only needed here, so it is set once right before the call.
spark.sql("SET spark.databricks.delta.retentionDurationCheck.enabled = false")
# Zero means no retention time for these files, so they are deleted immediately.
deltaTableLake.vacuum(0)



# Update the configuration file (removing entries no longer covered by retention, so they are not re-ingested)

## Install azure storage library

In [None]:
pip install azure-storage-file-datalake

## Define functions

In [174]:
import json
from collections import defaultdict
from datetime import datetime
from azure.storage.filedatalake import DataLakeServiceClient

## Functions to read the configuration file from ADLS and prune entries outside the retention period

# https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-directory-file-acl-python
def initialize_storage_account(storage_account_name, storage_account_key):
    """Create a DataLakeServiceClient for the given storage account.

    Args:
        storage_account_name: Account name, interpolated into
            ``https://<name>.dfs.core.windows.net``.
        storage_account_key: Value passed to the client as ``credential``.

    Returns:
        The constructed ``DataLakeServiceClient``.

    Raises:
        Exception: Whatever the client constructor raises; the error is
            printed and then re-raised.
    """
    account_url = f"https://{storage_account_name}.dfs.core.windows.net"
    try:
        return DataLakeServiceClient(account_url=account_url, credential=storage_account_key)
    except Exception as e:
        # Swallowing the error here used to fall through to a `return` of an
        # unassigned variable, hiding the real cause behind UnboundLocalError.
        print(e)
        raise


def download_file_from_directory(service_client, container, directory, file_name):
    """Download a JSON file through the service client and return it parsed.

    Args:
        service_client: Object exposing ``get_file_system_client``.
        container: File system (container) name.
        directory: Directory inside the container.
        file_name: Name of the file to download.

    Returns:
        The result of ``json.loads`` on the downloaded content, or ``None``
        when obtaining the download raised (the error is printed).
    """
    try:
        stream = (
            service_client.get_file_system_client(file_system=container)
            .get_directory_client(directory)
            .get_file_client(file_name)
            .download_file()
        )
    except Exception as err:
        print(err)
        return None

    # Reading and parsing happen outside the try, so their errors propagate.
    payload = stream.readall()
    return json.loads(payload)

def update_datalake_with_retention(config, cutoff_year=None, cutoff_month=None):
    """Return a copy of the configuration without entries older than the cut-off.

    An entry is dropped when its (year, month) is strictly earlier than
    (cutoff_year, cutoff_month); the cut-off month itself is kept.

    Args:
        config: Mapping with a ``"datalakeProperties"`` list whose items carry
            ``"year"`` and ``"month"`` values convertible with ``int``.
        cutoff_year: Retention year. Defaults to the notebook-level
            ``retention_year`` variable.
        cutoff_month: Retention month. Defaults to the notebook-level
            ``current_month`` variable.

    Returns:
        ``{"datalakeProperties": [...]}`` containing only the retained entries,
        in their original order. ``config`` itself is not modified.
    """
    # Fall back to the notebook globals so existing single-argument calls keep working.
    if cutoff_year is None:
        cutoff_year = retention_year
    if cutoff_month is None:
        cutoff_month = current_month
    cutoff = (int(cutoff_year), int(cutoff_month))

    retained = []
    for p_info in config["datalakeProperties"]:
        # Tuple comparison orders by year first and month second. The previous
        # test (older year AND later month) kept old entries from early months.
        if (int(p_info["year"]), int(p_info["month"])) < cutoff:
            print("Skip entry")
        else:
            retained.append(p_info)
    return {"datalakeProperties": retained}

StatementMeta(sysparkpooly5, 39, 4, Finished, Available)

## Define constants, variables and fetch secret values

In [175]:
# mssparkutils is used in this cell, so import it here rather than relying on a later cell.
from notebookutils import mssparkutils

# Constants
keyvault_ls_name = "Ls_NYCTaxi_KeyVault"
storage_key_name = "datalakeKey"
data_container = "datalake"
config_container = "config"
config_file_name = "datalake_config.json"
config_file_path = "/"

# Variables
# Take a single UTC snapshot so year, month and timestamp always agree,
# even when the cell runs across a month or year boundary.
now_utc = datetime.utcnow()
current_year = now_utc.strftime("%Y")
current_month = int(now_utc.strftime("%m"))
retention_year = int(current_year)-int(retentionTimeInYears)
current_ts = now_utc.strftime("%Y-%m-%dT%H:%M:%SZ")

# Secrets based values
# The storage key is read from Key Vault through the linked service.
storage_access_key = mssparkutils.credentials.getSecretWithLS(keyvault_ls_name, storage_key_name)
storage_acct_connection = f"DefaultEndpointsProtocol=https;AccountName={stgAccountName};AccountKey={storage_access_key};EndpointSuffix=core.windows.net"


StatementMeta(sysparkpooly5, 39, 5, Finished, Available)

## Get configuration file, and delete entries that are no longer in the retention period

In [None]:
from notebookutils import mssparkutils

service_client = initialize_storage_account(stgAccountName, storage_access_key)
config = download_file_from_directory(service_client, config_container, config_file_path, config_file_name)

# download_file_from_directory returns None when the download fails; stop here
# with a clear message before the configuration file is rewritten.
if config is None:
    raise RuntimeError(
        f"Could not download {config_file_name} from container {config_container}; "
        "the configuration file was left untouched"
    )

configDeleteFinal=update_datalake_with_retention(config)


In [None]:
# Serialise before touching storage, so a serialisation failure cannot leave
# the lake with the original moved away and no replacement written.
jsonResult = json.dumps(configDeleteFinal, indent=2, default=str)

source_config = f"abfss://{config_container}@{stgAccountName}.dfs.core.windows.net{config_file_path}{config_file_name}"
backup_config = f"abfss://{config_container}@{stgAccountName}.dfs.core.windows.net{config_file_path}{config_file_name}_{current_ts}"

# Keep the previous configuration under a timestamped name, then write the pruned one in its place.
mssparkutils.fs.mv(source_config, backup_config, overwrite=True)
mssparkutils.fs.put(source_config, jsonResult, overwrite=True)