In [0]:
# Job-run path: keep the block below commented out for a manual run
# SRC_TASK_KEY = "Getting_Sharepoint_Data"

# # Try to get the values, fallback to None if not in a job or values are missing
# try:
#     token = dbutils.jobs.taskValues.get(taskKey=SRC_TASK_KEY, key="access_token", debugValue=None)
#     site_id = dbutils.jobs.taskValues.get(taskKey=SRC_TASK_KEY, key="site_id", debugValue=None)
#     drive_id = dbutils.jobs.taskValues.get(taskKey=SRC_TASK_KEY, key="drive_id", debugValue=None)
# except Exception as e:
#     print("Failed to fetch values from previous task:", e)
#     token = site_id = drive_id = None


In [0]:
%run ./01-SharepointAccess

In [0]:
%run ../Main_Config

In [0]:
# Manual-run path: resolve the SharePoint site ID, drive ID and access token
# directly. Keep these lines active for a manual run; the commented-out
# job-task lookup at the top of the notebook is the alternative source.
# NOTE(review): get_site_id / get_drive_id / get_access_token are not defined
# in this notebook -- presumably they are provided by the %run notebooks; confirm.
# The call order is left untouched in case the helpers depend on one another.
site_id = get_site_id()
drive_id = get_drive_id()
token = get_access_token()

In [0]:
import requests
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
from datetime import datetime

# Helper functions for listing SharePoint drive items and fetching their permissions
def list_drive_items(folder_id=None):
    """
    Recursively retrieves all file items (excluding folders) in the specified SharePoint drive.

    Reads the module-level ``token``, ``site_id`` and ``drive_id`` values and the
    URL fragments (``microsoft_graph_url``, ``drive_path``, ``root_path``,
    ``item_path``, ``children_path``) that are defined outside this function.

    Args:
        folder_id: ID of the folder whose children should be listed. When
            omitted (or falsy) the listing starts from the drive root endpoint.

    Returns:
        list: the file items (dicts from the response's ``value`` array) found
        under the folder, including files in nested subfolders. The list is
        always returned -- on a 404 or any other non-200 response the error is
        logged and whatever was collected so far (possibly nothing) is returned.
    """
    headers = {"Authorization": f"Bearer {token}"}
    items = []

    # Common prefix shared by the root listing and the per-folder listing.
    drive_url = microsoft_graph_url + f"{site_id}" + drive_path + f"{drive_id}"
    if folder_id:
        endpoint = drive_url + item_path + f"{folder_id}" + children_path
    else:
        endpoint = drive_url + root_path

    while endpoint:
        response = requests.get(endpoint, headers=headers)

        if response.status_code == 404:
            # Log the URL that was actually requested and keep the return type
            # a list, so the recursive items.extend(...) call and the callers
            # that iterate the result do not fail on None.
            logger.error(f"404 Not Found: Folder not found at URL '{endpoint}'")
            break

        if response.status_code != 200:
            logger.error(f"Error retrieving items: {response.status_code} - {response.text}")
            break

        data = response.json()
        for item in data.get('value', []):
            if 'folder' not in item:  # Exclude folders
                items.append(item)
            else:
                # Recursively retrieve items in subfolders
                items.extend(list_drive_items(item['id']))

        # Follow pagination; the loop ends when no next link is present.
        endpoint = data.get('@odata.nextLink', None)

    return items


def get_item_permissions(item_id):
    """
    Retrieves permissions for a specific item in the SharePoint drive.

    Reads the module-level ``token``, ``site_id`` and ``drive_id`` values and the
    URL fragments (``microsoft_graph_url``, ``drive_path``, ``item_path``,
    ``permissions_path``) that are defined outside this function.

    Args:
        item_id: ID of the drive item whose permissions are requested.

    Returns:
        list: the permission entries (dicts from the response's ``value``
        array) across all result pages. The list is always returned -- on a
        404 or any other non-200 response the error is logged and whatever was
        collected so far (possibly nothing) is returned.
    """
    headers = {"Authorization": f"Bearer {token}"}
    permissions = []
    endpoint = microsoft_graph_url + f"{site_id}" + drive_path + f"{drive_id}" + item_path + f"{item_id}" + permissions_path

    while endpoint:
        response = requests.get(endpoint, headers=headers)

        if response.status_code == 404:
            # Report the item and the URL that was actually requested, and
            # return a list so callers can iterate the result unconditionally.
            logger.error(f"404 Not Found: Item '{item_id}' not found at URL '{endpoint}'")
            break

        if response.status_code != 200:
            logger.error(f"Error retrieving permissions for item {item_id}: {response.status_code} - {response.text}")
            break

        data = response.json()
        permissions.extend(data.get('value', []))
        # Follow pagination; the loop ends when no next link is present.
        endpoint = data.get('@odata.nextLink', None)

    return permissions

def _parse_graph_datetime(value):
    """
    Parses a ``YYYY-MM-DDTHH:MM:SS[.fraction]Z`` timestamp string into a naive datetime.

    Returns None when the value is missing or empty. Fractional seconds of any
    length are accepted and truncated to microsecond precision. Any other
    format raises ValueError, as a strict ``strptime`` would.
    """
    if not value:
        return None
    base, _, fraction = value.rstrip("Z").partition(".")
    parsed = datetime.strptime(base, "%Y-%m-%dT%H:%M:%S")
    if fraction:
        # Keep at most six digits: datetime stores microseconds only.
        parsed = parsed.replace(microsecond=int(fraction[:6].ljust(6, "0")))
    return parsed


def fetch_permissions_data():
    """
    Main function to fetch permissions data for all items in the SharePoint drive.

    Lists every file via ``list_drive_items()``, fetches each file's
    permissions via ``get_item_permissions()`` and flattens them into one row
    per (file, role, user) combination.

    Returns:
        list[dict]: rows keyed by the ``COL_*`` column-name constants (defined
        outside this function). The user's email is stored under
        ``COL_USER_ID``. Timestamps are naive datetimes; a missing
        created/modified timestamp is stored as None.
    """
    # Treat a None result from the helper as "no items" instead of failing.
    items = list_drive_items() or []
    permissions_data = []

    for item in items:
        item_id = item.get('id')
        item_name = item.get('name')
        created_datetime = _parse_graph_datetime(item.get('createdDateTime'))
        last_modified_datetime = _parse_graph_datetime(item.get('lastModifiedDateTime'))
        # Treat a None result from the helper as "no permissions".
        permissions = get_item_permissions(item_id) or []

        for perm in permissions:
            roles = perm.get('roles') or []

            # Prefer the list of identities; fall back to the single grantee.
            users = perm.get("grantedToIdentitiesV2")
            if users:
                identities = [user.get('user') or {} for user in users]
            else:
                # Fallback to grantedToV2 if grantedToIdentitiesV2 is missing.
                # `or {}` guards against keys that are present with a null value.
                granted_to = perm.get('grantedToV2') or {}
                identities = [granted_to.get('user') or granted_to.get('siteUser') or {}]

            # Store one row per role and user
            for role in roles:
                for identity in identities:
                    permissions_data.append({
                        COL_FILE_ID: item_id,
                        COL_FILE_NAME: item_name,
                        COL_PERMISSION: role,
                        COL_USER_ID: identity.get('email', None),  # Using email as user_id
                        COL_USER_NAME: identity.get('displayName', None),
                        COL_LAST_MODIFIED_DATETIME: last_modified_datetime,
                        COL_CREATED_DATETIME: created_datetime,
                        COL_LOAD_DATETIME: datetime.utcnow(),
                    })

    return permissions_data

# Collect the flattened permission rows (one dict per file / role / user)
permissions_list = fetch_permissions_data()

# Schema for the DataFrame: five nullable string columns followed by
# three nullable timestamp columns, in this exact order
schema = StructType(
    [
        StructField(column_name, StringType(), True)
        for column_name in (
            COL_FILE_ID,
            COL_FILE_NAME,
            COL_PERMISSION,
            COL_USER_ID,
            COL_USER_NAME,
        )
    ]
    + [
        StructField(column_name, TimestampType(), True)
        for column_name in (
            COL_LAST_MODIFIED_DATETIME,
            COL_CREATED_DATETIME,
            COL_LOAD_DATETIME,
        )
    ]
)

# Build the DataFrame and drop every row that has a null in any column
permissions_df = spark.createDataFrame(permissions_list, schema=schema).dropna()

# Write the DataFrame to the target table, overwriting its current contents
(
    permissions_df.write
    .option("mergeSchema", "true")
    .mode("overwrite")
    .saveAsTable(user_permission_path)
)