<a href="https://colab.research.google.com/github/CaretJuice/extract-ga4-event-params-bigquery/blob/main/extract_ga4_event_params_bigquery.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Variables
#
# Fill these in before running the rest of the notebook. Together they identify
# the table that is analysed; later cells query it as
# source_project.source_dataset.source_table, and source_project is also the
# project the BigQuery client is created for.

source_project = ''
source_dataset = ''
source_table = ''

# Share of events (as a fraction between 0 and 1, e.g. 0.5 = 50%) in which an
# event parameter must be found to be considered a universal event parameter,
# i.e. a parameter that should be configured for all events. "Events" here
# means distinct event names, not individual event rows.
threshold = 0.5

In [None]:
# Authenticate the current Colab user; this must succeed before the BigQuery
# client in the following cells is created.
from google.colab import auth
auth.authenticate_user()
print('Authenticated')
# Load Colab's data_table extension (presumably so DataFrames are rendered as
# interactive tables -- confirm against the Colab documentation).
%load_ext google.colab.data_table

In [None]:
from google.cloud import bigquery

# Fail early with a readable message instead of an obscure API error when the
# configuration cell has not been filled in.
if not (source_project and source_dataset and source_table):
  raise ValueError(
      'Set source_project, source_dataset and source_table in the Variables '
      'cell before running this cell.')

# Client bound to the project that hosts the table being analysed.
client = bigquery.Client(project=source_project)

# The table path is backtick-quoted so that wildcard tables (e.g. events_*)
# and project ids containing dashes are accepted. The result is ordered so the
# list (and everything derived from it) is stable between runs.
events = client.query('''
  select distinct
    event_name
  from `{sp}.{sd}.{st}`
  order by event_name '''.format(sp=source_project, sd=source_dataset, st=source_table) ).result()

# Names of all distinct events found in the source table.
event_list = [row['event_name'] for row in events]
event_list

In [None]:
def _count_value_types(array_column, event_name):
  """Count, per key, how often each value type is populated for one event.

  Args:
    array_column: Name of the repeated key/value column to unnest, either
      'event_params' or 'user_properties'. This is spliced into the SQL text,
      so only pass trusted literals.
    event_name: Event whose rows are analysed. It is sent as a query
      parameter rather than formatted into the SQL, so names containing
      quotes cannot break or alter the query.

  Returns:
    A DataFrame with one row per key and the columns key, string_values,
    int_values, float_values, double_values, plus an empty value_types column
    that is filled in by the analysis cell.
  """
  sql = '''
    select
      item.key
      , count(item.value.string_value) as string_values
      , count(item.value.int_value) as int_values
      , count(item.value.float_value) as float_values
      , count(item.value.double_value) as double_values
    from `{sp}.{sd}.{st}`
    cross join
      unnest({col}) item
    where event_name = @event_name
    group by item.key;
    '''.format(sp=source_project, sd=source_dataset, st=source_table, col=array_column)
  job_config = bigquery.QueryJobConfig(
      query_parameters=[
          bigquery.ScalarQueryParameter('event_name', 'STRING', event_name),
      ])
  frame = client.query(sql, job_config=job_config).to_dataframe()
  frame['value_types'] = ""
  return frame


# Dictionary of dataframes: event name -> per-parameter value type counts
event_params = {
    event: _count_value_types('event_params', event) for event in event_list
}

# Dictionary of dataframes: event name -> per-user-property value type counts
user_props = {
    event: _count_value_types('user_properties', event) for event in event_list
}



In [None]:
# Spot-check the parameter keys of one event. 'first_visit' is preferred, but
# not every dataset contains it, so fall back to the first available event
# instead of raising a KeyError (which would break "Run all").
preview_event = 'first_visit' if 'first_visit' in event_params else next(iter(event_params), None)
event_params[preview_event].key if preview_event is not None else None


In [None]:
# warn_params / warn_props are lists of "<event>: <key>" labels for event
# parameters / user properties that have more than one type of value
warn_params = []
warn_props = []

# List parameters that are found in most or all events
param_set = set()
parameter_occurrences = {}
universal_params = []

# List user properties
prop_set = set()

# Count column produced by the queries -> name of the value field it counts.
_VALUE_TYPE_COLUMNS = (
    ('string_values', 'string_value'),
    ('int_values', 'int_value'),
    ('float_values', 'float_value'),
    ('double_values', 'double_value'),
)


def _classify_value_types(frames_by_event, warnings, seen_keys):
  """Fill the value_types column of every frame and collect diagnostics.

  Each row is visited exactly once. For every row the value types with a
  non-zero count are stored as a list in the frame's value_types column.

  Args:
    frames_by_event: Dict mapping event name to a DataFrame with the columns
      key, string_values, int_values, float_values and double_values. The
      frames are modified in place.
    warnings: List that receives an "<event>: <key>" label for every key that
      carries more than one value type.
    seen_keys: Set that receives every key encountered.
  """
  for event, frame in frames_by_event.items():
    types_per_row = []
    for _, row in frame.iterrows():
      matched = [
          value_type
          for column, value_type in _VALUE_TYPE_COLUMNS
          if row[column] > 0
      ]
      if len(matched) > 1:
        warnings.append(event + ": " + row['key'])
      types_per_row.append(matched)
      seen_keys.add(row['key'])
    # Assign the whole column at once: writing a list into a single cell with
    # .loc is unreliable, whereas a list of lists becomes one list per row.
    frame['value_types'] = types_per_row


_classify_value_types(event_params, warn_params, param_set)
_classify_value_types(user_props, warn_props, prop_set)

# Number of distinct events each parameter appears in. Keys are unique within
# an event's frame (the query groups by key), so each event counts once.
total_events = len(event_list)
for event in event_list:
  for key in event_params[event]['key']:
    parameter_occurrences[key] = parameter_occurrences.get(key, 0) + 1

# ">=" matches the "at least <threshold>" wording of the report; sorting keeps
# the output stable between runs.
for param in sorted(param_set):
  if parameter_occurrences[param] / total_events >= threshold:
    universal_params.append(param)


In [None]:
# Report: value type warnings, universal parameters, and the parameters and
# user properties found for each event.

if len(warn_params) > 0:
  print( "The following event parameters have more than one value type: \n" )
  # warn_params is a list, so join it instead of concatenating it with a str.
  print('\n'.join(warn_params) + '\n\n')

print( "The following event parameters appear in at least " + format(threshold, ".0%") + " of events. Consider implementing these as default custom parameters. \n" )
print( "Please note that the parameters that are implemented by default in GA4 on all events, like page_location, are not removed from this list. \n" )
print( "You do not need to register default parameters. \n" )
print(' '.join(universal_params) + '\n\n')

print( "These are the event parameters by event: \n" )
for event in event_list:
  print (event +'\n')
  print (event_params[event])
  print ('\n')

if len(warn_props) > 0:
  print( "The following user properties have more than one value type: \n" )
  print('\n'.join(warn_props) + '\n\n')

print( "These are the user properties that appear in the data: \n" )
# user_props is a dict of dataframes: list the distinct property names first,
# then show the per-event tables for events that carry any user properties.
user_prop_names = sorted({key for frame in user_props.values() for key in frame['key']})
print(' '.join(user_prop_names) + '\n\n')
for event in event_list:
  if len(user_props[event]) > 0:
    print (event +'\n')
    print (user_props[event])
    print ('\n')


