# Alation Data Catalog analysis

Uses Alation Data Catalog schema exports as the data set to measure how much of each schema is documented, at the table and column level.  

In [1]:
import pandas as pd
import numpy as np
import os

import plotly.graph_objs as go
import plotly.express as px

from plotly.offline import iplot

In [2]:
# Source data setup: folder that holds the catalog CSV export files.
# File paths in later cells are built with os.path.join(SOURCE_FOLDER, <file name>).
SOURCE_FOLDER = "./data_catalogs/"

## Main Processing

#### create target summary data frames

This data frame is populated with the results from each source file and is used to generate the chart. 

In [3]:
# Reset the summary data frame: an empty, typed frame with one row per
# (schema, count_type) to be filled in by the main processing cell.
# count type = tables, columns_all, columns_by_table
summary_schema_df = (
    pd.DataFrame(columns=['schema', 'count_type', 'count', 'pct_described'])
    .astype({"count": np.int64, "pct_described": np.float64})
    # keep 'schema' as a column while also using it as the index
    .set_index('schema', drop=False)
)

summary_schema_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   schema         0 non-null      object 
 1   count_type     0 non-null      object 
 2   count          0 non-null      int64  
 3   pct_described  0 non-null      float64
dtypes: float64(1), int64(1), object(2)
memory usage: 0.0+ bytes


#### get list of CSV source files

In [14]:
# Empty frame with the three catalog columns of interest; a loaded export is
# concatenated onto it in a later cell.
df_main=pd.DataFrame(columns=['key','title','description'])

df_main

Unnamed: 0,key,title,description


In [15]:
# Load one catalog export with all of its columns for exploration.
f = 'tax_rpt_schema_19891_2104_2022-03-03T08-58-29-517921.csv'
source_file = os.path.join(SOURCE_FOLDER, f)

df = pd.read_csv(source_file)

In [21]:
df.shape

(6736, 22)

# Append the loaded export to df_main.
# Not idempotent: re-running this line appends the rows of df again.
df_main = pd.concat([df_main,df])


In [20]:
df_main.shape

(6736, 22)

In [22]:
df_main

Unnamed: 0,key,title,description,tabdefaultclassification,classification_status,tag,documentation status,steward:groupprofile,steward:user,processinglocation,...,processingcapability,gdpr,is_cdc,coldefaultclassification,under retention,iac classification,intuit_classification,processing3rdparty,table_documentation,processingcategory
0,tax_rpt,Tax Reporting,<p>This is US Turbotax data schema used to sto...,,,,,,,,...,,,,,,,,,,
1,tax_rpt.agent_details,Agent Details,<p>Agent Details for Agent Proficiency</p>,,,,,,crajendran1;rpal,,...,,,,,,,,,,
2,tax_rpt.agent_details_51346,Agent Details 51346,,,,,,,,,...,,,,,,,,,,
3,tax_rpt.agent_details_proficiency,Agent Details Proficiency,<p>The agent details proficiency query is the ...,,,,,,crajendran1;rpal,,...,,,,,,,,,,
4,tax_rpt.agent_details_proficiency_51346,Agent Details Proficiency 51346,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6731,tax_rpt.testing_analytics_master_visitor.monet...,Monetization Take Total Upsell,,,,,,,,,...,,,,,,,,,,
6732,tax_rpt.testing_analytics_master_visitor.monet...,Monetization Total Bundle Detach,,,,,,,,,...,,,,,,,,,,
6733,tax_rpt.testing_analytics_master_visitor.recom...,Recommended Sku,,,,,,,,,...,,,,,,,,,,
6734,tax_rpt.agg_full_service_clickstream.pyr_doc_c...,,,,,,,,,,...,,,,,,,,,,


In [23]:
# Scratch example of a three-part key (schema.table.column) for trying out split(".").
x = 'tax_rpt.testing_analytics_master_visitor.monetization'

In [24]:
x.split(".")

['tax_rpt', 'testing_analytics_master_visitor', 'monetization']

In [29]:
# Derive the schema name: the text before the first "." in the key
# (keys look like schema, schema.table or schema.table.column).
# The vectorized .str accessor leaves a missing key as NaN, where
# apply(lambda x: x.split(".")) would raise AttributeError on the float NaN.
df_main['schema'] = df_main['key'].str.split(".", n=1).str[0]

In [30]:
df_main['schema'].head()

0    tax_rpt
1    tax_rpt
2    tax_rpt
3    tax_rpt
4    tax_rpt
Name: schema, dtype: object

In [4]:
# Sample data frame for one source file.
# Notice the key column is a concatenation of schema.table.column;
# the main processing loop breaks it out into separate columns.

f = 'tax_rpt_schema_19891_2104_2022-03-03T08-58-29-517921.csv'
source_file = os.path.join(SOURCE_FOLDER, f)

# Read only the columns the analysis needs instead of dropping a hard-coded
# list of everything else: drop() raises KeyError as soon as an export lacks
# one of the listed columns, and it keeps any extra column that is not listed.
# This is also how the main processing loop reads each file.
df = pd.read_csv(source_file, usecols=['key', 'title', 'description'])

key_values = df['key'].tolist()

df.head(10)

Unnamed: 0,key,title,description
0,tax_rpt,Tax Reporting,<p>This is US Turbotax data schema used to sto...
1,tax_rpt.agent_details,Agent Details,<p>Agent Details for Agent Proficiency</p>
2,tax_rpt.agent_details_51346,Agent Details 51346,
3,tax_rpt.agent_details_proficiency,Agent Details Proficiency,<p>The agent details proficiency query is the ...
4,tax_rpt.agent_details_proficiency_51346,Agent Details Proficiency 51346,
5,tax_rpt.agg_agent_level_call_transfer_metric,,
6,tax_rpt.agg_auth_bundle_upsell_cy_vw,,
7,tax_rpt.agg_auth_bundle_upsell_vw,,
8,tax_rpt.agg_auth_id_revenue,,
9,tax_rpt.agg_clickstream_auth_last_event,,


In [5]:
print(source_file)

./data_catalogs/tax_rpt_schema_19891_2104_2022-03-03T08-58-29-517921.csv


In [6]:
# Collect every CSV export found in SOURCE_FOLDER.
# For testing against a fixed subset, assign an explicit list of file names
# to source_files instead.
#
# Match on the ".csv" extension rather than on the last three characters
# (which would also accept a name such as "notes_csv"), and sort the result
# because os.listdir() returns entries in arbitrary order.
source_files = sorted(
    name for name in os.listdir(SOURCE_FOLDER)
    if name.endswith(".csv")
)

print(source_files)

['tax_dm_schema_19392_2104_2022-03-03T08-54-00-581234.csv', 'tax_rpt_schema_19891_2104_2022-03-03T08-58-29-517921.csv', 'tax_src_schema_19460_2104_2022-03-03T10-51-19-881043.csv']


In [7]:
def _split_catalog_key(key):
    """Return [schema, table, column] for one catalog key.

    A key is "schema", "schema.table" or "schema.table.column". Parts that
    are absent are returned as "", parts after the third are ignored, and a
    missing (non-string) key yields three empty strings.
    """
    parts = key.split(".") if isinstance(key, str) else []
    return (parts + ["", "", ""])[:3]


def _summarize_by_schema(items, count_type):
    """Build one summary record per schema found in `items`.

    `items` needs a 'schema' column and a boolean 'needs_description' column.
    Each record holds the schema name, `count_type`, the number of rows for
    that schema and the percentage of those rows that have a description,
    truncated (not rounded) to one decimal place.
    """
    records = []
    # Group first so that both the count and the described total are taken
    # per schema, even when one export holds several schemas.
    for schema, needs in items.groupby('schema')['needs_description']:
        total = len(needs)
        described = total - int(needs.sum())
        records.append({
            "schema": schema,
            "count_type": count_type,
            "count": total,
            "pct_described": int(described * 1000 / total) / 10,
        })
    return records


# Records are collected in a plain list and turned into a data frame once at
# the end. This keeps the cell idempotent (re-running it does not duplicate
# rows) and avoids growing a data frame with concat inside a loop.
summary_records = []

for f in source_files:
    # read catalog data file, keeping only the columns the analysis uses
    source_file = os.path.join(SOURCE_FOLDER, f)
    df = pd.read_csv(source_file, usecols=['key', 'title', 'description'])

    # break out key into individual elements
    key_parts = [_split_catalog_key(key) for key in df['key']]
    df['schema'] = [parts[0] for parts in key_parts]
    df['table'] = [parts[1] for parts in key_parts]
    df['column'] = [parts[2] for parts in key_parts]

    # add needs description boolean
    df['needs_description'] = df['description'].isnull()

    # ============= table level counts ===============
    # table rows have a table name but no column name
    is_column_row = df['column'] != ""
    is_table_row = ~is_column_row & (df['table'] != "")
    df_tables = df.loc[is_table_row, ['schema', 'table', 'description', 'needs_description']]
    summary_records.extend(_summarize_by_schema(df_tables, "tables"))

    # ============= all column level counts ===============
    df_columns = df.loc[is_column_row, ['schema', 'table', 'column', 'description', 'needs_description']]
    summary_records.extend(_summarize_by_schema(df_columns, "columns_all"))

summary_schema_df = pd.DataFrame(
    summary_records,
    columns=['schema', 'count_type', 'count', 'pct_described'],
).astype({"count": np.int64, "pct_described": np.float64})

# A stable sort keeps the tables / columns_all rows of a schema in the order
# they were recorded.
summary_schema_df = summary_schema_df.sort_values(by='schema', ascending=True, kind='mergesort')

summary_schema_df.head(25)

Unnamed: 0,schema,count_type,count,pct_described
0,tax_dm,tables,256,17.9
1,tax_dm,columns_all,4719,25.2
2,tax_rpt,tables,111,29.7
3,tax_rpt,columns_all,6624,20.9
4,tax_src,tables,328,35.9
5,tax_src,columns_all,15489,13.5


In [8]:
# Keep the rows where at least 5% is described, best documented first.
summary_schema_sorted = (
    summary_schema_df
    .loc[lambda frame: frame['pct_described'] >= 5]
    .sort_values(by='pct_described', ascending=False)
)

summary_schema_sorted

Unnamed: 0,schema,count_type,count,pct_described
4,tax_src,tables,328,35.9
2,tax_rpt,tables,111,29.7
1,tax_dm,columns_all,4719,25.2
3,tax_rpt,columns_all,6624,20.9
0,tax_dm,tables,256,17.9
5,tax_src,columns_all,15489,13.5


In [33]:
# Grouped bar chart: one group per schema, one bar per count type,
# bar height = percentage described.
fig = px.bar(
    summary_schema_sorted,
    x="schema",
    y="pct_described",
    color="count_type",
    barmode="group",
    title="Schema level completion",
)
fig.show()

In [10]:
# Leftover API lookup: prints the built-in help text for px.bar.
# NOTE(review): not used by the analysis; looks safe to delete, please confirm.
help(px.bar)

Help on function bar in module plotly.express._chart_types:

bar(data_frame=None, x=None, y=None, color=None, pattern_shape=None, facet_row=None, facet_col=None, facet_col_wrap=0, facet_row_spacing=None, facet_col_spacing=None, hover_name=None, hover_data=None, custom_data=None, text=None, base=None, error_x=None, error_x_minus=None, error_y=None, error_y_minus=None, animation_frame=None, animation_group=None, category_orders=None, labels=None, color_discrete_sequence=None, color_discrete_map=None, color_continuous_scale=None, pattern_shape_sequence=None, pattern_shape_map=None, range_color=None, color_continuous_midpoint=None, opacity=None, orientation=None, barmode='relative', log_x=False, log_y=False, range_x=None, range_y=None, text_auto=False, title=None, template=None, width=None, height=None)
        In a bar plot, each row of `data_frame` is represented as a rectangular
        mark.
        
    Parameters
    ----------
    data_frame: DataFrame or array-like or dict
        