**Install or update Purview CLI and required packages (run if needed)**

In [116]:
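# Uncomment the lines you need. %pip (rather than !pip) installs into the
# environment of the kernel that is running this notebook.
#%pip install --upgrade purviewcli
#%pip install --upgrade pandas
#%pip install --upgrade matplotlib
#%pip install --upgrade python-dotenv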

In [117]:
import json
import pandas as pd
import os
import shutil
import numpy as np

#installed from package above
from dotenv import load_dotenv

**<mark>Setup Authentication</mark>**

In [118]:
# Authentication settings are read from a .env file in the current working
# directory. Create that file and add the following lines to it:
#
#PURVIEW_NAME="Purview Account Name"
#AZURE_CLIENT_ID="Your client ID"
#AZURE_TENANT_ID="Your tenant ID"
#AZURE_CLIENT_SECRET="Your client secret"
#
# The file holds a client secret, so keep it out of source control.
# NOTE(review): presumably these variables are consumed by the `pv` CLI calls
# made later in this notebook - confirm against the purviewcli documentation.

# Load the variables from ./.env (path is relative to the kernel's working directory)
load_dotenv(dotenv_path='./.env')

**<mark>User Parameters</mark>**

In [119]:
# Search filter used for regular asset searches. It is an "and" of three
# exclusions ("not" clauses):
#   1. entries whose "size" or "fileSize" attribute equals 0
#   2. entries classified as MICROSOFT.SYSTEM.TEMP_FILE
#   3. entries whose entityType is AtlasGlossaryTerm or AtlasGlossary
# The text is written verbatim to the filter file, so it must stay valid JSON.
filter_json_str = '''
{
  "and": [
    {
      "not": {
        "or": [
          {
            "attributeName": "size",
            "operator": "eq",
            "attributeValue": 0
          },
          {
            "attributeName": "fileSize",
            "operator": "eq",
            "attributeValue": 0
          }
        ]
      }
    },
    {
      "not": {
        "classification": "MICROSOFT.SYSTEM.TEMP_FILE"
      }
    },
    {
      "not": {
        "or": [
          {
            "entityType": "AtlasGlossaryTerm"
          },
          {
            "entityType": "AtlasGlossary"
          }
        ]
      }
    }
  ]
}
'''

# Search filter used for glossary-only searches. It shares the first two
# exclusions of filter_json_str (zero "size"/"fileSize", and the
# MICROSOFT.SYSTEM.TEMP_FILE classification) but its third clause is a plain
# "or" rather than a "not": only entries whose entityType is AtlasGlossaryTerm
# or AtlasGlossary are kept.
# The text is written verbatim to the filter file, so it must stay valid JSON.
filter_bg_only_json_str = '''
{
  "and": [
    {
      "not": {
        "or": [
          {
            "attributeName": "size",
            "operator": "eq",
            "attributeValue": 0
          },
          {
            "attributeName": "fileSize",
            "operator": "eq",
            "attributeValue": 0
          }
        ]
      }
    },
    {
      "not": {
        "classification": "MICROSOFT.SYSTEM.TEMP_FILE"
      }
    },
    {
        "or": [
            {
                "entityType": "AtlasGlossaryTerm"
            },
            {
                "entityType": "AtlasGlossary"
            }
        ]
    }
  ]
}
'''


# Facets requested with every search. Seven facets are listed (assetType,
# classification, contactId, label, term, classificationCategory,
# fileExtension), each sorted by count in descending order.
# NOTE(review): some facets use "count": 0 and others "count": 10; presumably
# this caps the number of facet values returned - confirm against the Purview
# search API documentation.
# The text is written verbatim to the facet file, so it must stay valid JSON.
facet_json_str = '''
[{
    "facet": "assetType",
    "count": 0,
    "sort": {
        "count": "desc"
    }
}, {
    "facet": "classification",
    "count": 10,
    "sort": {
        "count": "desc"
    }
}, {
    "facet": "contactId",
    "count": 10,
    "sort": {
        "count": "desc"
    }
}, {
    "facet": "label",
    "count": 10,
    "sort": {
        "count": "desc"
    }
}, {
    "facet": "term",
    "count": 10,
    "sort": {
        "count": "desc"
    }
}, {
    "facet": "classificationCategory",
    "count": 0,
    "sort": {
        "count": "desc"
    }
}, {
    "facet": "fileExtension",
    "count": 0,
    "sort": {
        "count": "desc"
    }
}]
'''

# Root working directory for this notebook.
# Written as a raw string so the Windows backslashes are taken literally
# instead of being parsed as (invalid) escape sequences such as "\P".
root_working_folder = r'C:\Projects\Purview\PurviewBulk'

# THE SETTINGS BELOW CAN BE LEFT AS DEFAULTS

# Folder (under the root) for temporary files; it is removed and re-created on each run.
ephemeral_folder = 'ephemeral'

# Default file names. '{0}' is a str.format placeholder filled in by the caller.
asset_export_file_name_template = '{0}_purview_assets.csv'
asset_detail_export_file_name_template = '{0}_details_purview_assets.csv'
filter_file_name = 'filter.json'
filter_bg_only_file_name = 'filter_bg_only.json'
facet_file_name = 'facet.json'

# Folder name template for the generated JSON payloads.
# NOTE: "paylod" is misspelled, but the variable name is kept unchanged because
# other cells may refer to it.
update_paylod_folder_name_template = '{0}_updates'

# Separator placed between the values of a multi-valued field when they are
# joined into a single string.
separator_char = '|'


**Setup working folders, filters and facets**

In [120]:
# Start every run from an empty scratch folder: drop the previous one (if any)
# and create it again.
ephemeral_full_path = os.path.join(root_working_folder, ephemeral_folder)
if os.path.exists(ephemeral_full_path):
    shutil.rmtree(ephemeral_full_path, ignore_errors=False)

# makedirs (unlike mkdir) also creates root_working_folder when it is missing,
# so the notebook works on a machine where the root has not been created yet.
os.makedirs(ephemeral_full_path, exist_ok=True)

# Full paths of the JSON files handed to the `pv` CLI
filter_json_full_path = os.path.join(ephemeral_full_path, filter_file_name)
facet_json_full_path = os.path.join(ephemeral_full_path, facet_file_name)
filter_bg_only_json_full_path = os.path.join(ephemeral_full_path, filter_bg_only_file_name)

# Write each JSON text to its file; the context manager closes the handle even
# when the write fails.
for payload_path, payload_text in (
    (filter_json_full_path, filter_json_str),
    (facet_json_full_path, facet_json_str),
    (filter_bg_only_json_full_path, filter_bg_only_json_str),
):
    with open(payload_path, "w", encoding="utf-8") as payload_file:
        payload_file.write(payload_text)

In [121]:
# Empty placeholder with the expected columns; the real content is loaded later
purviewTermsDF = pd.DataFrame(
    columns=[
        'name',
        'qualifiedName',
        'classification',
        'term',
        'description',
        'entityType',
        'assetType',
        'id',
    ]
)

In [122]:
def guidToFormalName(guidList):
    """Translate term GUIDs into qualified names.

    Looks up the module-level ``purviewTermsDF`` table and returns, as a list,
    the ``qualifiedName`` of every row whose ``id`` appears in ``guidList``.
    The result follows the row order of the table, and GUIDs that are not in
    the table are silently left out.
    """
    is_requested = purviewTermsDF['id'].isin(guidList)
    return list(purviewTermsDF.loc[is_requested]['qualifiedName'])

In [123]:
def formalNameToGuid(formalName):
    """Translate a qualified term name into its GUID.

    Searches the module-level ``purviewTermsDF`` table for rows whose
    ``qualifiedName`` equals ``formalName``. The ``id`` of the row is returned
    only when exactly one row matches; with no match or several matches the
    result is an empty string.
    """
    candidates = purviewTermsDF[purviewTermsDF['qualifiedName'] == formalName]
    if len(candidates) != 1:
        return ''
    return candidates.iloc[0]['id']

In [124]:
def listToDataframe(listOfAssets):
    """Flatten a list of search-result entities into a DataFrame.

    Parameters
    ----------
    listOfAssets : iterable of dict
        Each entity must provide 'name', 'qualifiedName' and 'id'. The keys
        'term', 'classification', 'assetType', 'description' and 'entityType'
        are optional.

    Returns
    -------
    pandas.DataFrame
        One row per entity with the columns name, qualifiedName,
        classification, term, description, entityType, assetType and id.
        An empty input yields an empty frame with the same columns.

    Notes
    -----
    * Rows are collected in a list and the frame is built once.
      ``DataFrame.append`` is avoided because it was removed in pandas 2.0 and
      copied the whole frame on every call.
    * The input dictionaries are not modified; defaults for missing keys are
      applied while reading.
    """
    column_names = ['name', 'qualifiedName', 'classification', 'term',
                    'description', 'entityType', 'assetType', 'id']
    records = []

    for ent in listOfAssets:
        # Terms arrive as references carrying a guid; resolve them to names
        if 'term' in ent:
            termFormalNameList = guidToFormalName([itm['guid'] for itm in ent['term']])
        else:
            termFormalNameList = []

        records.append({
            'name': ent['name'],
            'qualifiedName': ent['qualifiedName'],
            # multi-valued fields are collapsed into one delimited string
            'classification': separator_char.join(ent.get('classification', [])),
            'term': separator_char.join(termFormalNameList),
            'description': ent.get('description', np.nan),
            'entityType': ent.get('entityType', np.nan),
            # asset types are comma separated, unlike the fields above
            'assetType': ','.join(ent.get('assetType', [])),
            'id': ent['id'],
        })

    # dtype=object keeps the column types the frame had when it was grown
    # from an empty, untyped frame
    return pd.DataFrame(records, columns=column_names, dtype=object)

In [125]:
def searchPurview(keyword, filter_file, batch_size = 100, recursive_read = False):
    """Run a `pv search query` and return the results as a DataFrame.

    Parameters
    ----------
    keyword : passed to the CLI as --keywords
    filter_file : path passed to the CLI as --filterFile
    batch_size : passed to the CLI as --limit (items requested per call)
    recursive_read : when True, keep requesting further batches until a call
        returns no items; otherwise only a single batch is read

    Returns
    -------
    tuple (row count, DataFrame) where the DataFrame is built by
    listToDataframe from all items that were read.

    The facets file is taken from the module-level facet_json_full_path.
    Requires an IPython kernel because the CLI is invoked with the `!` shell
    syntax; the {name} placeholders in that line are filled from the local
    variables of this function.
    """
    offset = 0
    all_items = []
    # -1 marks "nothing read yet" so the loop body runs at least once
    this_read_count = -1

    # `and` binds tighter than `or`: the condition is
    #   first pass  OR  (last batch was non-empty AND recursive_read)
    while ((this_read_count == -1) or (this_read_count>0) and recursive_read == True):
        search_output = !pv search query --keywords "{keyword}" --limit {batch_size} --offset {offset} --filterFile  {filter_file} --facets-file {facet_json_full_path}
        # the captured output is a list of lines; join them to get the JSON text
        search_json = json.loads(''.join(search_output))
        assets_this_read = search_json['value']
        this_read_count = len(assets_this_read)
        all_items.extend(assets_this_read)
        # advance by what was actually returned, not by batch_size
        offset += this_read_count

    dfSearchResult = listToDataframe(all_items)

    return len(dfSearchResult), dfSearchResult

In [126]:
def searchPurviewAssets(keyword, batch_size = 50, recursive_read = False):
    """Search using the regular asset filter file (filter_json_full_path).

    Thin wrapper around searchPurview; returns its (row count, DataFrame)
    tuple unchanged.
    """
    return searchPurview(
        keyword,
        filter_file=filter_json_full_path,
        batch_size=batch_size,
        recursive_read=recursive_read,
    )

In [127]:
def searchGlossaryTerms(keyword, batch_size = 50, recursive_read = False):
    """Search using the glossary-only filter file (filter_bg_only_json_full_path).

    Thin wrapper around searchPurview; returns its (row count, DataFrame)
    tuple unchanged.
    """
    return searchPurview(
        keyword,
        filter_file=filter_bg_only_json_full_path,
        batch_size=batch_size,
        recursive_read=recursive_read,
    )

In [128]:
def getColumnStructure(atlasEntity):
    """Find the column list of an entity, following its schema if necessary.

    Parameters
    ----------
    atlasEntity : dict
        Parsed response that contains an 'entities' list (only the first
        element is inspected) and optionally 'referredEntities'.

    Returns
    -------
    tuple (cols, refEnt)
        cols   - relationshipAttributes['columns'] of the entity, or None when
                 no columns could be found
        refEnt - the 'referredEntities' of the response the columns came from,
                 or None when that key is absent

    When the entity has no 'columns' relationship, a schema GUID is looked up
    in 'attachedSchema' and then 'tabular_schema' (the latter wins when both
    are present), the schema entity is read with `pv entity readBulk`, and the
    function recurses on that response. In that case both returned values come
    from the schema response, not from the original entity.

    Requires an IPython kernel because the CLI is invoked with the `!` syntax.
    """
    cols = None
    refEnt = None

    thisAsset = atlasEntity['entities'][0]
    # nothing to inspect without relationship attributes
    if 'relationshipAttributes' not in thisAsset:
        return cols, refEnt

    if 'referredEntities' in atlasEntity:
        refEnt = atlasEntity['referredEntities']

    if 'columns' in thisAsset['relationshipAttributes']:
        cols = thisAsset['relationshipAttributes']['columns']    
    else:
        schGuid = None

        # NOTE(review): attachedSchema is indexed with [0], so it is presumably
        # a list of references - confirm against the API response
        if 'attachedSchema' in thisAsset['relationshipAttributes']:
            attSch = thisAsset['relationshipAttributes']['attachedSchema']
            if len(attSch)>0:
                schGuid = attSch[0]['guid']

        # NOTE(review): tabular_schema is indexed with ['guid'] directly, so it
        # is presumably a single reference (dict); len() would raise if the
        # value were None - confirm against the API response
        if 'tabular_schema' in thisAsset['relationshipAttributes']:
            tabSch = thisAsset['relationshipAttributes']['tabular_schema']
            if len(tabSch)>0:
                schGuid = tabSch['guid']
        
        if schGuid!=None:
            # read the schema entity and look for the columns there
            schema_output = !pv entity readBulk --guid {schGuid}
            schema_json = json.loads(''.join(schema_output))
            return getColumnStructure(schema_json) 
    
    return cols, refEnt
    

In [129]:
# Fill the glossary term table used by guidToFormalName / formalNameToGuid:
# search with the keyword '*' and, because recursive_read is True, keep reading
# batches until one comes back empty.
purviewTermsCount, purviewTermsDF = searchGlossaryTerms('*', recursive_read=True)

In [130]:
# Reaching this line means every setup cell above ran without raising
print('Setup is complete!')