# Transformation of Raw Data to silver layer

This notebook will read the json data generated from the application notebook. The raw data is stored in the stage folder with subfolders based on specific items captured.

In [3]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from functools import reduce

StatementMeta(, 9db2917c-231b-4693-8f6a-612bfea034b2, 5, Finished, Available)

## Activity  

The activity is the audit of events that occur in the Fabric workspace. 
To get the structure of the dataframe:
1. read the json file using spark
2. run printSchema
```
df_activity.printSchema()

root
 |-- AccessRequestMessage: string (nullable = true)
 |-- AccessRequestType: string (nullable = true)
 |-- ActionSource: string (nullable = true)
 |-- ActionSourceDetail: string (nullable = true)
 |-- Activity: string (nullable = true)
 |-- ActivityId: string (nullable = true)
 |-- AggregatedWorkspaceInformation: struct (nullable = true)
 |    |-- WorkspaceCount: long (nullable = true)
 |    |-- WorkspacesByCapacitySku: string (nullable = true)
 |    |-- WorkspacesByType: string (nullable = true)
 |-- AppId: string (nullable = true)
 |-- AppName: string (nullable = true)
 |-- AppReportId: string (nullable = true)
 |-- ArtifactAccessRequestInfo: struct (nullable = true)
 |    |-- AccessRequestAction: string (nullable = true)
 |    |-- ArtifactLocationObjectId: string (nullable = true)
 |    |-- ArtifactOwnerInformation: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- EmailAddress: string (nullable = true)
 |    |    |    |-- UserObjectId: string (nullable = true)
 |    |-- RequestId: long (nullable = true)
 |    |-- RequesterUserObjectId: string (nullable = true)
 |    |-- TenantObjectId: string (nullable = true)
 |    |-- WorkspaceName: string (nullable = true)
 |-- ArtifactId: string (nullable = true)
 |-- ArtifactKind: string (nullable = true)
 |-- ArtifactName: string (nullable = true)
 |-- ArtifactType: string (nullable = true)
 |-- AuditedArtifactInformation: struct (nullable = true)
 |    |-- AnnotatedItemType: string (nullable = true)
 |    |-- ArtifactObjectId: string (nullable = true)
 |    |-- Id: string (nullable = true)
 |    |-- Name: string (nullable = true)
 |-- CapacityId: string (nullable = true)
 |-- CapacityName: string (nullable = true)
 |-- CapacityState: string (nullable = true)
 |-- CapacityUsers: string (nullable = true)
 |-- ClientIP: string (nullable = true)
 |-- ConsumptionMethod: string (nullable = true)
 |-- CopiedReportId: string (nullable = true)
 |-- CopiedReportName: string (nullable = true)
 |-- CreationTime: string (nullable = true)
 |-- CredentialSetupMode: string (nullable = true)
 |-- CustomVisualAccessTokenResourceId: string (nullable = true)
 |-- CustomVisualAccessTokenSiteUri: string (nullable = true)
 |-- DashboardId: string (nullable = true)
 |-- DashboardName: string (nullable = true)
 |-- DataConnectivityMode: string (nullable = true)
 |-- DataflowAccessTokenRequestParameters: struct (nullable = true)
 |    |-- entityName: string (nullable = true)
 |    |-- partitionUri: string (nullable = true)
 |    |-- permissions: long (nullable = true)
 |    |-- tokenLifetimeInMinutes: long (nullable = true)
 |-- DataflowAllowNativeQueries: boolean (nullable = true)
 |-- DataflowId: string (nullable = true)
 |-- DataflowName: string (nullable = true)
 |-- DataflowRefreshScheduleType: string (nullable = true)
 |-- DataflowType: string (nullable = true)
 |-- DatasetCertificationStage: string (nullable = true)
 |-- DatasetId: string (nullable = true)
 |-- DatasetName: string (nullable = true)
 |-- Datasets: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- DatasetId: string (nullable = true)
 |    |    |-- DatasetName: string (nullable = true)
 |-- DatasourceDetails: boolean (nullable = true)
 |-- DatasourceId: string (nullable = true)
 |-- DatasourceInformations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- CredentialType: string (nullable = true)
 |    |    |-- DatasourceObjectId: string (nullable = true)
 |    |    |-- DatasourceReference: string (nullable = true)
 |    |    |-- GatewayObjectId: string (nullable = true)
 |    |    |-- SingleSignOnType: string (nullable = true)
 |-- DatasourceObjectIds: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Datasources: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- ConnectionDetails: string (nullable = true)
 |    |    |-- DatasourceType: string (nullable = true)
 |-- DeploymentPipelineId: long (nullable = true)
 |-- DeploymentPipelineObjectId: string (nullable = true)
 |-- DeploymentPipelineOperationId: string (nullable = true)
 |-- DeploymentPipelineStageOrder: long (nullable = true)
 |-- DistributionMethod: string (nullable = true)
 |-- EmbedTokenId: string (nullable = true)
 |-- EndPoint: string (nullable = true)
 |-- ExcludePersonalWorkspaces: boolean (nullable = true)
 |-- Experience: string (nullable = true)
 |-- ExportEventEndDateTimeParameter: string (nullable = true)
 |-- ExportEventStartDateTimeParameter: string (nullable = true)
 |-- ExportedArtifactInfo: struct (nullable = true)
 |    |-- ArtifactId: long (nullable = true)
 |    |-- ArtifactType: string (nullable = true)
 |    |-- ExportType: string (nullable = true)
 |-- ExternalSubscribeeInformation: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- FolderAccessRequests: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- GroupId: long (nullable = true)
 |    |    |-- GroupObjectId: string (nullable = true)
 |    |    |-- RolePermissions: string (nullable = true)
 |    |    |-- UserId: long (nullable = true)
 |    |    |-- UserObjectId: string (nullable = true)
 |-- FolderDisplayName: string (nullable = true)
 |-- FolderObjectId: string (nullable = true)
 |-- GatewayClusterDatasources: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- clusterId: string (nullable = true)
 |    |    |-- credentialDetails: struct (nullable = true)
 |    |    |    |-- credentialType: string (nullable = true)
 |    |    |    |-- encryptedConnection: string (nullable = true)
 |    |    |    |-- encryptionAlgorithm: string (nullable = true)
 |    |    |    |-- isCredentialEncrypted: boolean (nullable = true)
 |    |    |    |-- privacyLevel: string (nullable = true)
 |    |    |    |-- skipTestConnection: boolean (nullable = true)
 |    |    |    |-- useCustomOAuthApp: boolean (nullable = true)
 |    |    |-- credentialType: string (nullable = true)
 |    |    |-- datasourceName: string (nullable = true)
 |    |    |-- datasourceType: string (nullable = true)
 |    |    |-- gatewayClusterName: string (nullable = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- users: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- identifier: string (nullable = true)
 |-- GatewayClusterId: string (nullable = true)
 |-- GatewayClustersObjectIds: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- GatewayId: string (nullable = true)
 |-- GatewayMemberId: string (nullable = true)
 |-- GatewayStatus: string (nullable = true)
 |-- HasFullReportAttachment: boolean (nullable = true)
 |-- Id: string (nullable = true)
 |-- ImportDisplayName: string (nullable = true)
 |-- ImportId: string (nullable = true)
 |-- ImportSource: string (nullable = true)
 |-- ImportType: string (nullable = true)
 |-- IncludeExpressions: boolean (nullable = true)
 |-- IncludeSubartifacts: boolean (nullable = true)
 |-- InstallTeamsAnalyticsInformation: struct (nullable = true)
 |    |-- ModelId: string (nullable = true)
 |    |-- TenantId: string (nullable = true)
 |    |-- UserId: string (nullable = true)
 |-- IsSuccess: boolean (nullable = true)
 |-- IsTemplateAppFromMarketplace: boolean (nullable = true)
 |-- IsTenantAdminApi: boolean (nullable = true)
 |-- IsUpdateAppActivity: boolean (nullable = true)
 |-- ItemId: string (nullable = true)
 |-- ItemName: string (nullable = true)
 |-- ItemsCount: string (nullable = true)
 |-- LabelEventType: string (nullable = true)
 |-- LastRefreshTime: string (nullable = true)
 |-- Lineage: boolean (nullable = true)
 |-- MentionedUsersInformation: string (nullable = true)
 |-- ModelId: string (nullable = true)
 |-- ModelSettings: struct (nullable = true)
 |    |-- DirectLakeAutoSync: boolean (nullable = true)
 |-- ModelsSnapshots: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- Monikers: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- ObjectDisplayName: string (nullable = true)
 |-- ObjectId: string (nullable = true)
 |-- ObjectType: string (nullable = true)
 |-- OldSensitivityLabelId: string (nullable = true)
 |-- Operation: string (nullable = true)
 |-- OrgAppPermission: struct (nullable = true)
 |    |-- permissions: string (nullable = true)
 |    |-- recipients: string (nullable = true)
 |-- OrganizationId: string (nullable = true)
 |-- OriginalOwner: string (nullable = true)
 |-- PackageId: long (nullable = true)
 |-- PaginatedReportDataSources: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- connectionString: string (nullable = true)
 |    |    |-- credentialRetrievalType: string (nullable = true)
 |    |    |-- dMMoniker: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- provider: string (nullable = true)
 |-- PinReportToTabInformation: struct (nullable = true)
 |    |-- ChannelId: string (nullable = true)
 |    |-- ChannelName: string (nullable = true)
 |    |-- DatasetId: string (nullable = true)
 |    |-- DatasetName: string (nullable = true)
 |    |-- ReportId: string (nullable = true)
 |    |-- ReportName: string (nullable = true)
 |    |-- TabName: string (nullable = true)
 |    |-- TeamId: string (nullable = true)
 |    |-- TeamName: string (nullable = true)
 |    |-- TeamsAppId: string (nullable = true)
 |    |-- UserId: string (nullable = true)
 |-- RecordType: long (nullable = true)
 |-- RefreshEnforcementPolicy: long (nullable = true)
 |-- RefreshType: string (nullable = true)
 |-- ReportCertificationStage: string (nullable = true)
 |-- ReportId: string (nullable = true)
 |-- ReportName: string (nullable = true)
 |-- ReportType: string (nullable = true)
 |-- RequestId: string (nullable = true)
 |-- RequiredWorkspaces: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- ResultStatus: string (nullable = true)
 |-- Schedules: struct (nullable = true)
 |    |-- Days: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- RefreshFrequency: string (nullable = true)
 |    |-- Time: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- TimeZone: string (nullable = true)
 |-- SensitivityLabelEventData: struct (nullable = true)
 |    |-- ActionSource: string (nullable = true)
 |    |-- ActionSourceDetail: string (nullable = true)
 |    |-- LabelEventType: string (nullable = true)
 |    |-- OldSensitivityLabelId: string (nullable = true)
 |    |-- SensitivityLabelId: string (nullable = true)
 |-- SensitivityLabelId: string (nullable = true)
 |-- ShareLinkId: string (nullable = true)
 |-- SharingAction: string (nullable = true)
 |-- SharingInformation: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- ObjectId: string (nullable = true)
 |    |    |-- RecipientEmail: string (nullable = true)
 |    |    |-- RecipientName: string (nullable = true)
 |    |    |-- ResharePermission: string (nullable = true)
 |    |    |-- TenantObjectId: string (nullable = true)
 |    |    |-- UserPrincipalName: string (nullable = true)
 |-- SharingScope: string (nullable = true)
 |-- SubfolderId: long (nullable = true)
 |-- SubfolderName: string (nullable = true)
 |-- SubfolderObjectId: string (nullable = true)
 |-- SubscribeeInformation: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- ObjectId: string (nullable = true)
 |    |    |-- RecipientEmail: string (nullable = true)
 |    |    |-- RecipientName: string (nullable = true)
 |-- SubscriptionDetails: struct (nullable = true)
 |    |-- attachmentType: string (nullable = true)
 |    |-- isOnDemand: boolean (nullable = true)
 |    |-- subject: string (nullable = true)
 |    |-- subscriptionObjectId: string (nullable = true)
 |    |-- title: string (nullable = true)
 |-- SubscriptionSchedule: struct (nullable = true)
 |    |-- DaysOfTheMonth: string (nullable = true)
 |    |-- EndDate: string (nullable = true)
 |    |-- StartDate: string (nullable = true)
 |    |-- Time: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- TimeZone: string (nullable = true)
 |    |-- Type: string (nullable = true)
 |    |-- WeekDays: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |-- SwitchState: string (nullable = true)
 |-- TableName: string (nullable = true)
 |-- TakingOverOwner: string (nullable = true)
 |-- TargetWorkspaceId: string (nullable = true)
 |-- TemplateAppFolderObjectId: string (nullable = true)
 |-- TemplateAppIsInstalledWithAutomation: boolean (nullable = true)
 |-- TemplateAppObjectId: string (nullable = true)
 |-- TemplateAppOwnerTenantObjectId: string (nullable = true)
 |-- TemplateAppVersion: string (nullable = true)
 |-- TemplatePackageName: string (nullable = true)
 |-- TileText: string (nullable = true)
 |-- UpdateFeaturedTables: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- State: string (nullable = true)
 |    |    |-- TableName: string (nullable = true)
 |-- Upns: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- UserAgent: string (nullable = true)
 |-- UserId: string (nullable = true)
 |-- UserInformation: struct (nullable = true)
 |    |-- UsersAdded: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |-- UserKey: string (nullable = true)
 |-- UserType: long (nullable = true)
 |-- WorkSpaceName: string (nullable = true)
 |-- Workload: string (nullable = true)
 |-- WorkspaceAccessList: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- UserAccessList: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- GroupUserAccessRight: string (nullable = true)
 |    |    |    |    |-- Identifier: string (nullable = true)
 |    |    |    |    |-- PrincipalType: string (nullable = true)
 |    |    |    |    |-- UserEmailAddress: string (nullable = true)
 |    |    |-- WorkspaceId: string (nullable = true)
 |-- WorkspaceId: string (nullable = true)
 |-- WorkspacesModifiedSince: string (nullable = true)
 |-- WorkspacesSemicolonDelimitedList: string (nullable = true)

```

### Read json files for activity  

You can read all the files or some of the files depending upon the root path argument
```
root
|--stage
|  |--activity
|  |  |--Year
|  |  |  |--Month
|  |  |  |  |--Day
```
To access all folders in a path use the "*" in place of the name

In [4]:
# Load the staged activity (audit event) JSON files for 2024.
# Each "*" in the path matches any name at that level beneath the year folder.
# The "multiline" option lets a single JSON record span several lines of a file.
df_activity = (
    spark.read
    .option("multiline","true")
    .json("Files/stage/activity/2024/*/*.json")
)


StatementMeta(, 9db2917c-231b-4693-8f6a-612bfea034b2, 6, Finished, Available)

## Catalog  

The catalog is broken into two primary subfolders:
1. scans  
   a. Will contain the metadata and data of Fabric  
      - workspaces  
      - datasets  
      - dataflows  
      - datamarts  
      - KQL  
      - eventhub  
      - etc.  
2. snapshot  
    a. Contains the published applications from the Power BI workspaces  

In [5]:
# Load every catalog scan JSON file staged for 2024; the three "*" segments match
# any name at each of the three path levels beneath the year folder.
df_catalog_scans = (
    spark.read
    .option("multiline","true")
    .json("Files/stage/catalog/scans/2024/*/*/*.json")
)

StatementMeta(, 9db2917c-231b-4693-8f6a-612bfea034b2, 7, Finished, Available)

## Explode out workspaces  


In [10]:
# Turn the "workspaces" array into one row per workspace and discard the
# "datasourceInstances" column, which is not needed downstream in this notebook.
df_workspaces = (
    df_catalog_scans
    .withColumn("workspaces", explode(df_catalog_scans["workspaces"]))
    .drop(df_catalog_scans["datasourceInstances"])
)

StatementMeta(, 9db2917c-231b-4693-8f6a-612bfea034b2, 11, Finished, Available)

## Extract datasets from the workspaces root  

Explode out the array 



In [None]:
# One row per dataset: explode the "datasets" array nested in each workspace,
# then drop the parent "workspaces" struct so only the dataset column (plus any
# remaining top-level columns) is kept.
df_datasets = (
    df_workspaces
    .withColumn("datasets", explode(df_workspaces["workspaces"]["datasets"]))
    .drop("workspaces")
)


### Using Sparkler to flatten the dataset into its components

This class takes any Spark dataframe that has array or struct columns and converts each of those columns into its own table.  
To use Sparkler, pass in a Spark session and then call the `flatten` function with the dataframe to flatten.

```
class_variable = Sparkler(session)
dataframe = class_variable.flatten(dataset to flatten)
```


In [7]:
from env.utility.sparkler import Sparkler
from pyspark.sql import SparkSession

# Obtain a Spark session via the builder; getOrCreate() hands back the session
# that is already running when there is one instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Flatten")
    .getOrCreate()
)

# Sparkler instance bound to that session; used below through mySpark.flatten(...).
mySpark = Sparkler(spark)


StatementMeta(, 9db2917c-231b-4693-8f6a-612bfea034b2, 8, Finished, Available)

In [8]:
def expand_df(folder:str, df:dict, group:str):
    """
    Write every non-root dataframe in ``df`` to the silver layer as parquet.

    Each entry is saved to ``Files/silver/{folder}/{group}/{name}`` in overwrite
    mode. ``name`` is the last ``_``-separated segment of the dictionary key. When
    that segment is ``days`` or ``times`` the last two segments are concatenated
    instead (presumably so that generic names stay tied to their parent item).

    :param folder: location under ``Files/silver`` where the data is stored
    :param df: dictionary whose values are spark dataframes; the ``"root"`` key is skipped
    :param group: path segment below ``folder`` used to group sub-items together
    :return: None
    """
    for item, frame in df.items():
        if item == "root":
            continue

        parts = item.split(sep="_")
        name = parts[-1]

        if name in ['days', 'times']:
            # Join the trailing segments instead of indexing them: a key that is
            # exactly "days"/"times" has a single segment and must not raise IndexError.
            name = "".join(parts[-2:])

        frame.write.format("parquet").mode("overwrite").option("overwriteSchema","true").save(f"Files/silver/{folder}/{group}/{name}")


StatementMeta(, 9db2917c-231b-4693-8f6a-612bfea034b2, 9, Finished, Available)

## Deconstruct workspaces into their primary artifacts  

The idea here is to get the primary artifacts being captured by the catalog scans (metadata)  
We can then take each artifact and create its own spark dataframe  


In [11]:
# Names of the array-typed fields that sit directly inside the exploded
# "workspaces" struct. These are the artifacts the next cell explodes via
# df_workspaces['workspaces'][<name>].
# The schema object is inspected directly rather than parsing the printed schema
# tree, so the private _jdf handle is not needed, fields of other top-level
# structs are never picked up, and a field whose *name* merely contains "array"
# is not mistaken for an array column.
artifacts = [
    field.name
    for field in df_workspaces.schema["workspaces"].dataType.fields
    if field.dataType.typeName() == "array"
]


StatementMeta(, 9db2917c-231b-4693-8f6a-612bfea034b2, 13, Finished, Available)

In [12]:
# Silver sub-folder that receives every catalog artifact; it does not change
# between iterations, so it is set once.
folder = "catalog"

for artifact in artifacts:
    # One row per element of workspaces.<artifact>, keeping only that exploded column.
    temp = df_workspaces \
        .withColumn(artifact, explode(df_workspaces['workspaces'][artifact])) \
        .select(artifact)

    # Pre-bind so the cleanup in `finally` is valid even when flatten() raises.
    df = None

    # Persist the un-flattened artifact. "overwrite" keeps this cell re-runnable:
    # without an explicit mode the save fails once the path exists from an earlier run.
    temp.write.format("parquet").mode("overwrite").save(f"Files/silver/{folder}/{artifact}/{artifact}")

    try:
        df = mySpark.flatten(temp)

        expand_df(folder=folder, df=df, group=artifact)
    except Exception as e:
        # Best effort: an artifact that cannot be flattened must not stop the others.
        # The artifact name is included so the failure can be traced to its source.
        print(f"Failed to flatten artifact '{artifact}': {e}")
    finally:
        # Drop the references before moving on to the next artifact.
        del temp, df


StatementMeta(, 9db2917c-231b-4693-8f6a-612bfea034b2, 14, Finished, Available)

[FIELD_NOT_FOUND] No such struct field `Aria` in `Aria`.`DocumentId`.; line 1 pos 0


## Flatten the Activity into its sub components



In [14]:
# Flatten the activity dataframe with Sparkler. The result is handed to expand_df
# below as its dictionary of dataframes (presumably one entry per array/struct
# column plus a "root" entry -- confirm against the Sparkler implementation).
act = mySpark.flatten(df_activity)

StatementMeta(, 9db2917c-231b-4693-8f6a-612bfea034b2, 16, Finished, Available)

## Expand and save the sub-components to Silver 

In [15]:
# Write every non-root dataframe in `act` as parquet under
# Files/silver/activity/activity/<name>, overwriting any previous output.
expand_df(folder='activity', df=act, group="activity")

StatementMeta(, 9db2917c-231b-4693-8f6a-612bfea034b2, 17, Finished, Available)