In [25]:
# Variables

# BigQuery project, dataset and table that the queries below read from.
source_project = ''
source_dataset = ''
source_table = ''

# Share of events (0-1) in which an event parameter has to be found before it
# is reported as a universal event parameter, i.e. a parameter that should be
# configured for all events.
threshold = 0.7

exclude_parameters = []
# Comment out the assignment below (through the closing .split()) to not
# exclude any parameters.
exclude_parameters = '''
    page_title batch_ordering_id engaged_session_event batch_page_id page_location
    ignore_referrer gtm_container_id engagement_time_msec gtm_container_version page_referrer
    session_engaged gtm_debug_mode ga_session_number ga_session_id medium term campaign
    source gclid gclsrc gtm_environment debug_mode
'''.split()
# Kept as one ', '-separated string; the query cell splits it on ', ' again.
exclude_parameters = ', '.join(exclude_parameters)

In [26]:
from google.colab import auth
auth.authenticate_user()
print('Authenticated')
%load_ext google.colab.data_table

import gspread
from google.auth import default
creds, _ = default()

gc = gspread.authorize(creds)
try:
  sh = gc.open(source_project +':' + source_dataset + ':' + source_table)
except gspread.exceptions.SpreadsheetNotFound:
  sh = gc.create(source_project +':' + source_dataset + ':' + source_table)


Authenticated
The google.colab.data_table extension is already loaded. To reload it, use:
  %reload_ext google.colab.data_table


In [None]:
from google.cloud import bigquery

client = bigquery.Client(project=source_project)

# The table path is backtick-quoted so that wildcard tables (for example
# events_*) and other names that are not plain identifiers are accepted.
events = client.query('''
  select distinct
    event_name
  from `{sp}.{sd}.{st}` '''.format(sp=source_project, sd=source_dataset, st=source_table) ).result()

# Every result row holds a single column: the event name.
event_list = [event[0] for event in events]
event_list

In [None]:
# Dictionary of dataframes: event name -> one row per event parameter key with
# the number of non-null values counted for each value type.
event_params = {}

# The exclusion list is the same for every event, so it is built once. Empty
# entries are dropped so that an empty exclude_parameters string excludes
# nothing.
excluded_keys = [param for param in exclude_parameters.split(', ') if param]

# The table path is backtick-quoted so wildcard tables (for example events_*)
# are accepted. The event name and the excluded keys are passed as query
# parameters instead of being pasted into the SQL text, so values containing
# quotes cannot break or alter the query.
event_params_sql = '''
    select
      ep.key
      , count(case when ep.value.string_value is not null then ep.value.string_value end) as string_values
      , count(case when ep.value.int_value is not null then ep.value.int_value end) as int_values
      , count(case when ep.value.float_value is not null then ep.value.float_value end) as float_values
      , count(case when ep.value.double_value is not null then ep.value.double_value end) as double_values
    from `{sp}.{sd}.{st}`
    cross join
      unnest(event_params) ep
    where event_name = @event_name
    and ep.key not in unnest(@excluded_keys)
    group by ep.key;
    '''.format(sp=source_project, sd=source_dataset, st=source_table)

for event in event_list:
  job_config = bigquery.QueryJobConfig(
      query_parameters=[
          bigquery.ScalarQueryParameter('event_name', 'STRING', event),
          bigquery.ArrayQueryParameter('excluded_keys', 'STRING', excluded_keys),
      ]
  )
  event_params[event] = client.query(event_params_sql, job_config=job_config).to_dataframe()
  # Placeholder column; it is filled in by the analysis cell.
  event_params[event]['value_types'] = ""


# User properties are counted over the whole table, so this is a single
# dataframe rather than one dataframe per event.
user_props = client.query('''
  select
    up.key
    , count(case when up.value.string_value is not null then up.value.string_value end) as string_values
    , count(case when up.value.int_value is not null then up.value.int_value end) as int_values
    , count(case when up.value.float_value is not null then up.value.float_value end) as float_values
    , count(case when up.value.double_value is not null then up.value.double_value end) as double_values
  from `{sp}.{sd}.{st}`
  cross join
    unnest(user_properties) up
  group by up.key;
  '''.format(sp=source_project, sd=source_dataset, st=source_table) ).to_dataframe()
# Same placeholder column as the event parameter dataframes.
user_props['value_types'] = ""

user_props

In [None]:
# Count column produced by the queries -> name of the value field it counts.
VALUE_TYPE_COLUMNS = {
    'string_values': 'string_value',
    'int_values': 'int_value',
    'float_values': 'float_value',
    'double_values': 'double_value',
}


def matched_value_types(row):
  """Return the value field names whose count in `row` is greater than zero.

  `row` is one row of an event parameter / user property dataframe, i.e. it
  has the four count columns listed in VALUE_TYPE_COLUMNS.
  """
  return [value_type for column, value_type in VALUE_TYPE_COLUMNS.items() if row[column] > 0]


# warn_params is a list of "event: parameter" names that have more than one
# type of value; warn_props is the same for user property names.
warn_params = []
warn_props = []

# List parameters that are found in most or all events
param_set = set()
parameter_occurrences = {}  # parameter key -> number of events it appears in
total_events = len(event_list)
universal_params = []

# List user properties
prop_set = set()


for event in event_list:
  # Each dataframe holds one row per parameter key, so a single pass over the
  # rows is enough. Looping over the keys and, inside that, over all rows again
  # would count every parameter once per key and inflate the occurrences.
  value_types = []
  for _, row in event_params[event].iterrows():
    matched_params = matched_value_types(row)
    if len(matched_params) > 1:
      warn_params.append( event + ": " + row['key'] )
    value_types.append(', '.join(matched_params))

    param_set.add(row['key'])
    parameter_occurrences[row['key']] = parameter_occurrences.get(row['key'], 0) + 1
  event_params[event]['value_types'] = value_types

# ">=" matches the "at least <threshold>" wording used when reporting, and lets
# a threshold of 1.0 select the parameters that are present in every event.
# Sorted so the result is the same on every run.
universal_params = [
    param for param in sorted(param_set)
    if parameter_occurrences[param] / total_events >= threshold
]


# user_props is a single dataframe (one row per user property key), not a
# dictionary keyed by event name, so it is annotated directly.
user_prop_value_types = []
for _, row in user_props.iterrows():
  matched_props = matched_value_types(row)
  if len(matched_props) > 1:
    warn_props.append(row['key'])
  user_prop_value_types.append(', '.join(matched_props))

  prop_set.add(row['key'])
user_props['value_types'] = user_prop_value_types


In [None]:


# Print the results and write them to the spreadsheet: one worksheet for the
# universal parameters, one per event, and one for the user properties.
if len(universal_params) > 0:
  print( "The following event parameters appear in at least " + format(threshold, ".0%") + " of events. Consider implementing these as default custom parameters. \n" )
  print( "Please note that the parameters that are implemented by default in GA4 on all events, like page_location, are not removed from this list. \n" )
  print( "You do not need to register default parameters. \n" )
  print('\n '.join(universal_params) + '\n\n')
  try:
    worksheet = sh.worksheet("Universal params")
  except gspread.exceptions.WorksheetNotFound:
    worksheet = sh.add_worksheet(title="Universal params", rows=len(universal_params)+2, cols=1)
  else:
    # The worksheet is left over from an earlier run: empty it first, otherwise
    # append_rows below would add the parameters a second time.
    worksheet.clear()
  worksheet.update_acell('A1', 'The following event parameters appear in at least ' + format(threshold, ".0%") + ' of events. Consider implementing these as default custom parameters.')
  worksheet.append_rows([[param] for param in universal_params])

print( "These are the event parameters by event: \n" )
for event in event_list:
  print (event +'\n')
  print (event_params[event].loc[:, ['key','value_types']])
  print ('\n')
  try:
    worksheet = sh.worksheet(event)
  except gspread.exceptions.WorksheetNotFound:
    worksheet = sh.add_worksheet(title=event, rows=len(event_params[event].index)+2, cols=len(event_params[event].columns))
  worksheet.update([event_params[event].columns.values.tolist()] + event_params[event].values.tolist())



print( "These are the user properties that appear in the data: \n" )
# user_props is a single dataframe (one row per user property key), not a
# dictionary keyed by event name, so it is reported once as a whole. Nothing is
# printed or written when no user properties were found.
if not user_props.empty:
  # 'value_types' is only shown when the dataframe actually has that column.
  shown_columns = [column for column in ('key', 'value_types') if column in user_props.columns]
  print (user_props.loc[:, shown_columns])
  print ('\n')
  try:
    worksheet = sh.worksheet("user_props")
  except gspread.exceptions.WorksheetNotFound:
    worksheet = sh.add_worksheet(title="user_props", rows=len(user_props.index)+2, cols=len(user_props.columns))
  worksheet.update([user_props.columns.values.tolist()] + user_props.values.tolist())


