## Install Required Python Modules


*This module installation can be moved to package installation using "requirements.txt"*

*`az synapse spark pool update` with --library-requirements requirements.txt*

*Concern: the statement takes a long time (~20 minutes) to return its status.*

In [None]:
pip install azure-storage-file-datalake

In [None]:
import json
from collections import defaultdict
from datetime import datetime
from azure.storage.filedatalake import DataLakeServiceClient

## Functions to read data from ADLS and update ACLs

# https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-directory-file-acl-python
def initialize_storage_account(storage_acct, storage_account_key):
    """Create a DataLakeServiceClient for a storage account using its account key.

    Args:
        storage_acct: Storage account name; used to build the
            ``https://<account>.dfs.core.windows.net`` endpoint URL.
        storage_account_key: Credential passed to the client.

    Returns:
        The constructed ``DataLakeServiceClient``.

    Raises:
        Exception: Whatever the client constructor raised. The error is
            printed first and then re-raised, so the caller sees the real
            cause rather than an ``UnboundLocalError`` on the return line.
    """
    try:
        service_client = DataLakeServiceClient(
            account_url=f"https://{storage_acct}.dfs.core.windows.net",
            credential=storage_account_key,
        )
    except Exception as e:
        print(e)
        # Without a client there is nothing meaningful to return.
        raise

    return service_client


def download_file_from_directory(service_client, container, directory, file_name):
    """Download a JSON file from a storage directory and return it parsed.

    Args:
        service_client: Client exposing ``get_file_system_client``.
        container: Name of the file system (container) holding the file.
        directory: Directory path inside the container.
        file_name: Name of the JSON file to download.

    Returns:
        The object decoded from the file's JSON content, or ``None`` when
        opening the download failed (the error is printed in that case).
    """
    try:
        download = (
            service_client.get_file_system_client(file_system=container)
            .get_directory_client(directory)
            .get_file_client(file_name)
            .download_file()
        )
    except Exception as err:
        print(err)
        return None

    # Reading and decoding happen outside the try, so their errors propagate.
    return json.loads(download.readall())


# https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-acl-python
def update_permission_recursively(service_client, container_name, directory_path, is_default_scope, user_type, user_id, permissions):
    """Apply one ACL entry recursively to a directory and print the resulting ACL.

    Args:
        service_client: Client exposing ``get_file_system_client``.
        container_name: File system (container) that holds the directory.
        directory_path: Directory whose ACL is updated recursively.
        is_default_scope: When truthy, the entry is prefixed with ``default:``.
        user_type: Entry type placed first in the ACL string (e.g. ``group``).
        user_id: Identifier placed second in the ACL string.
        permissions: Permission string placed last (e.g. ``r-x``).

    Any exception raised along the way is printed and swallowed.
    """
    try:
        fs_client = service_client.get_file_system_client(file_system=container_name)
        dir_client = fs_client.get_directory_client(directory_path)

        acl = (
            f'default:{user_type}:{user_id}:{permissions}'
            if is_default_scope
            else f"{user_type}:{user_id}:{permissions}"
        )
        dir_client.update_access_control_recursive(acl=acl)

        # Read the ACL back so the notebook output shows what is now in effect.
        acl_props = dir_client.get_access_control()
        print(f"Permissions for {directory_path} - {user_type}:{user_id} are:\n{acl_props['acl']}")
    except Exception as err:
        print(err)


# Assumption: Config contains all the perms needed for a given location. Incremental changes are not allowed.
# Evaluate effective permissions requested.
def evaluate_ad_acl_perms():
    """Collapse the config's ACL requests into one permission bitmask per (path, group).

    Reads the module-level ``config``, ``current_ts`` and ``data_path_prefix``.
    As side effects it stamps every ``datalakeProperties`` entry with
    ``lastUpdatedDatalake = current_ts``, adds every group name seen to
    ``ad_set`` and appends a message to ``config_check_errors`` for each
    unrecognised ACL type.

    Returns:
        A ``defaultdict(int)`` keyed by ``(partition_path, group)`` whose value
        is the OR of read=4, write=2 and execute=1.
    """
    ad_perms = defaultdict(int)
    for p_info in config["datalakeProperties"]:
        p_info["lastUpdatedDatalake"] = current_ts
        partition = f"{p_info['year']}/{p_info['month']}"
        partition_path = f"{data_path_prefix}{partition}/"

        for perm in p_info["aclPermissions"]:
            for grp in perm["groups"]:
                ad_set.add(grp)
                a_type = perm["type"]
                # Combine with bitwise OR, not addition: a permission type
                # listed twice for the same (path, group) must not push the
                # value past 7, which permissions_map cannot translate.
                if a_type == "read":
                    ad_perms[(partition_path, grp)] |= 4
                elif a_type == "write":
                    ad_perms[(partition_path, grp)] |= 2
                elif a_type == "execute":
                    ad_perms[(partition_path, grp)] |= 1
                else:
                    config_check_errors.append(f"Invalid acl type value :'{a_type}' specifed for partition '{partition}' . Acl Type must be one among ['read', 'write', 'execute']")
    return ad_perms


# Assumption: ACL grant statements are run after the data copy step is complete. Otherwise we will run into `The specified path does not exist` errors.
# We grant "r-x" on all folders (recursively from the root) so that anyone can "read and list the *Folders*".
# This statement is followed by another recursive update, this time including the "datafiles" path, which overwrites any extra permissions granted in the previous step.
# Otherwise, unless the parent folders are created separately and given default permissions, there is no access to the parent folders and access-denied errors follow.
def update_parent_folder_acls(ad_perms, ad_map):
    """Grant 'r-x' recursively on the top-level folder of every requested path.

    Args:
        ad_perms: Iterable of ``(path, group_name)`` pairs (the keys of the
            permissions mapping).
        ad_map: Mapping of group name to the id used in the ACL entry.

    Groups missing from ``ad_map`` are reported through the module-level
    ``config_check_errors`` list instead of being granted anything.
    """
    # Keep only the first path segment; the set removes repeated (folder, group) pairs.
    parent_dirs = {
        (path.lstrip('/').split('/', 1)[0], ad) for path, ad in ad_perms
    }

    for parentdir, ad in parent_dirs:
        if ad not in ad_map:
            config_check_errors.append(f"{ad} is not a valid ActiveDirectory Group.")
            continue
        update_permission_recursively(service_client, data_container, parentdir, 0, 'group', ad_map[ad], 'r-x')


def update_ad_acls(ad_perms, ad_map):
    """Apply the requested permissions recursively on every partition path.

    Args:
        ad_perms: Mapping of ``(partition_path, group_name)`` to a permission
            value that is translated through the module-level ``permissions_map``.
        ad_map: Mapping of group name to the id used in the ACL entry.

    Groups missing from ``ad_map`` are reported through the module-level
    ``config_check_errors`` list instead of being granted anything.
    """
    for key in ad_perms:
        part_path, ad_name = key
        if ad_name not in ad_map:
            config_check_errors.append(f"{ad_name} is not a valid ActiveDirectory Group.")
            continue
        update_permission_recursively(
            service_client,
            data_container,
            part_path,
            0,
            'group',
            ad_map[ad_name],
            permissions_map[ad_perms[key]],
        )


def check_config_errors():
    """Raise if any configuration errors were collected; otherwise print a completion message.

    Raises:
        ValueError: When the module-level ``config_check_errors`` list is
            non-empty; the message includes the collected errors.
    """
    if not config_check_errors:
        print("ACL Statements generation and Active Directory Check Complete.")
        return
    raise ValueError(f"Config file check failed. Errors are: {config_check_errors}")



In [None]:
from notebookutils import mssparkutils

## Define Parameters to Enable connection to Storage

In [None]:

# Read from pipeline
storage_acct = ""

# Mostly constant
# Permission value (read=4, write=2, execute=1) -> rwx string used in ACL entries.
permissions_map = {
    0: "---",
    1: "--x",
    2: "-w-",
    3: "-wx",
    4: "r--",
    5: "r-x",
    6: "rw-",
    7: "rwx",
}
# UTC timestamp of this run; stamped into the config and used in the backup file name.
current_ts = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
# Key Vault linked service and the name of the secret holding the storage key.
keyvault_ls_name = "Ls_NYCTaxi_KeyVault"
storage_key_name = "datalakeKey"
# Containers and location of the configuration file.
data_container = "datalake"
config_container = "config"
config_file_name = "datalake_config.json"
config_file_path = "/"
# Prefix prepended to every "<year>/<month>/" partition path.
data_path_prefix = ""

# Secrets based values
storage_access_key = mssparkutils.credentials.getSecretWithLS(keyvault_ls_name, storage_key_name)
storage_acct_connection = f"DefaultEndpointsProtocol=https;AccountName={storage_acct};AccountKey={storage_access_key};EndpointSuffix=core.windows.net"


## Read Config from ADLS

In [None]:
# Connect to the storage account and load the JSON configuration file.
service_client = initialize_storage_account(storage_acct, storage_access_key)
config = download_file_from_directory(service_client, config_container, config_file_path, config_file_name)
# download_file_from_directory prints the error and returns None on failure.
# Stop here with a clear message instead of failing later with a TypeError
# when the config is subscripted.
if config is None:
    raise RuntimeError(f"Could not load configuration '{config_file_name}' from container '{config_container}'.")

## Evaluate overall ACLs needed in short form

In [None]:
# Shared accumulators filled in by evaluate_ad_acl_perms and the ACL update functions.
config_check_errors = []
ad_set = set()
ad_perms = evaluate_ad_acl_perms()
print(ad_perms)
print(ad_set)
# Gather list of ADs and their ids - ids are needed for granting ACLs
# One Option - Requires APP ID - https://github.com/AzureAD/microsoft-authentication-library-for-python
# for now reading from Vault
# Look up each group on its own so ad_map is always defined and a single
# missing secret does not discard the ids of the groups that do resolve.
ad_map = {}
for ad in ad_set:
    try:
        ad_map[ad] = mssparkutils.credentials.getSecretWithLS(keyvault_ls_name, ad)
    except Exception as e:
        config_check_errors.append(f"No ID returned for Active directory name '{ad}'. error is {e}")

print(ad_map)

## Grant ACLs Recursively

In [None]:
# First grant 'r-x' on each top-level folder, then apply the requested
# permissions on every partition path.
update_parent_folder_acls(ad_perms, ad_map)
update_ad_acls(ad_perms, ad_map)
# Raises ValueError if any configuration errors were collected along the way.
check_config_errors()


## Update Config file with latest run time
- rename with timestamp and create a new file

In [None]:
# mssparkutils.fs.help()
# abfss:// URI of the live config file, and of its timestamped backup name.
source_config = f"abfss://{config_container}@{storage_acct}.dfs.core.windows.net{config_file_path}{config_file_name}"
backup_config = f"abfss://{config_container}@{storage_acct}.dfs.core.windows.net{config_file_path}{config_file_name}_{current_ts}"
# Move the existing config aside under the timestamped name...
mssparkutils.fs.mv(source_config, backup_config, overwrite=True)
# ...then write the in-memory config (whose "lastUpdatedDatalake" fields were
# set by evaluate_ad_acl_perms) back to the original location.
mssparkutils.fs.put(source_config, json.dumps(config, indent=2, default=str), overwrite=True)
