# Reformatting data from GA4

In [1]:
import pandas as pd
from urllib.parse import urlparse, parse_qs, unquote

In [36]:
# Load the raw GA4 export as-is and preview enough rows to see where the
# actual table starts.
df = pd.read_csv(filepath_or_buffer='data-export.csv')
df.head(n=15)

Ignore Rows with Metadata

In [38]:
# The row at position 10 holds the real column headings; everything above it
# is skipped and everything below it is data.
header_row = df.iloc[10]

df_cleaned = (
    df.iloc[11:]
    # use the header row's values as the column labels
    .set_axis(header_row, axis=1)
    # renumber the remaining rows from 0
    .reset_index(drop=True)
    # the labels inherit the header row's name (10) as the axis name; clear it
    .rename_axis(None, axis=1)
)

df_cleaned.head()

Unnamed: 0,Landing page,Sessions,Users,New users,Average engagement time per session,Conversions,Total revenue
0,,37358,21215,21361,105.7983297,0,0
1,/%3Fsize=n_20_n,437,307,240,36.38215103,0,0
2,(not set),234,151,1,4.525641026,0,0
3,/%3Fsize=n_60_n,17,1,0,0.0,0,0
4,/%3Fcurrent=n_2_n&q=software%20&size=n_20_n,16,9,9,80.875,0,0


In [39]:
# Discard the engagement, conversion and revenue metrics; only the landing
# page and its traffic counts are carried forward.
unused_metrics = [
    'New users',
    'Average engagement time per session',
    'Conversions',
    'Total revenue',
]
df_cleaned = df_cleaned.drop(columns=unused_metrics)
df_cleaned.head()

Unnamed: 0,Landing page,Sessions,Users
0,,37358,21215
1,/%3Fsize=n_20_n,437,307
2,(not set),234,151
3,/%3Fsize=n_60_n,17,1
4,/%3Fcurrent=n_2_n&q=software%20&size=n_20_n,16,9


In [40]:
# drop number of users
df_cleaned = df_cleaned.drop(columns=['Users'])

# Sessions is kept on purpose: rows are exploded by session count later.
# End the cell with an expression so the resulting frame is actually displayed.
df_cleaned.head()

Unnamed: 0,Landing page,Sessions
0,,37358
1,/%3Fsize=n_20_n,437
2,(not set),234
3,/%3Fsize=n_60_n,17
4,/%3Fcurrent=n_2_n&q=software%20&size=n_20_n,16


In [53]:
# Work on an independent (deep) copy so df_cleaned is left untouched.
df_preprocessed = df_cleaned.copy()

# Expand each landing-page link into its query parameters,
# e.g. /%3Fcurrent=n_2_n&q=software%20&size=n_20_n

# Define a function to extract query params from a URL
def extract_query_params(url):
    """Return the query parameters of a landing-page URL as a dict of lists.

    The landing pages handled here carry their query separator
    percent-encoded (``/%3Fsize=n_20_n``), so ``urlparse`` alone would find
    no query string.  Only that separator is decoded; parameter names and
    values stay encoded so that ``parse_qs`` decodes them exactly once.
    Decoding the whole URL up front would turn an escaped ``%26``, ``%23`` or
    ``%2B`` inside a value into a live ``&``, ``#`` or ``+`` and thereby
    split, truncate or blank out the parameter.

    Parameters
    ----------
    url : str
        Landing-page path, e.g. ``/%3Fcurrent=n_2_n&q=software%20``.

    Returns
    -------
    dict
        Decoded parameter names mapped to lists of decoded values.  Empty
        when ``url`` is not a string (e.g. a missing value) or contains no
        query string.
    """
    if not isinstance(url, str):
        return {}

    # The query starts at the first separator, whether literal or encoded
    # (either hex case).  Ties are impossible: the tokens differ in text.
    hits = [(url.find(token), len(token)) for token in ('?', '%3F', '%3f')]
    hits = [(start, width) for start, width in hits if start != -1]
    if hits:
        start, width = min(hits)
        url = url[:start] + '?' + url[start + width:]

    return parse_qs(urlparse(url).query)

# Normalise the landing page to text first (missing values become the
# string 'nan'), then parse every page into its query-parameter dict.
landing_pages = df_preprocessed['Landing page'].astype(str)
df_preprocessed['Landing page'] = landing_pages
df_preprocessed['query_params'] = landing_pages.apply(extract_query_params)

df_preprocessed.head()

Unnamed: 0,Landing page,Sessions,query_params
0,,37358,{}
1,/%3Fsize=n_20_n,437,{'size': ['n_20_n']}
2,(not set),234,{}
3,/%3Fsize=n_60_n,17,{'size': ['n_60_n']}
4,/%3Fcurrent=n_2_n&q=software%20&size=n_20_n,16,"{'current': ['n_2_n'], 'q': ['software '], 'si..."


In [56]:
# Inspect the full parsed dict of the fifth row (head() truncates it)
df_preprocessed['query_params'].iloc[4]

{'current': ['n_2_n'], 'q': ['software '], 'size': ['n_20_n']}

In [65]:
# explode by sessions for easier analysis: one output row per session

df_preprocessed['Sessions'] = df_preprocessed['Sessions'].astype(int)

# Drop rows without a real landing page: GA4's '(not set)' placeholder and
# the 'nan' strings produced by the earlier astype(str) on missing values.
# .copy() yields an independent frame rather than a slice of the old one.
has_landing_page = ~df_preprocessed['Landing page'].isin(['(not set)', 'nan'])
df_preprocessed = df_preprocessed[has_landing_page].copy()

# Repeat every row `Sessions` times. Compared with exploding a helper column
# of list(range(n)): a row with 0 sessions contributes 0 rows (explode turns
# an empty list into one NaN row), and no per-row Python lists are built.
# Label-based selection relies on unique index labels, which hold here
# because the frame was re-indexed with reset_index(drop=True) upstream.
repeated_labels = df_preprocessed.index.repeat(df_preprocessed['Sessions'])
df_exploded = (
    df_preprocessed.loc[repeated_labels]
    .drop(columns='Sessions')
    # reset index
    .reset_index(drop=True)
)

df_exploded.head()

Unnamed: 0,Landing page,query_params
0,/%3Fsize=n_20_n,{'size': ['n_20_n']}
1,/%3Fsize=n_20_n,{'size': ['n_20_n']}
2,/%3Fsize=n_20_n,{'size': ['n_20_n']}
3,/%3Fsize=n_20_n,{'size': ['n_20_n']}
4,/%3Fsize=n_20_n,{'size': ['n_20_n']}


In [66]:
# From here on the landing page column is called visitor_url
column_names = {'Landing page': 'visitor_url'}
df_exploded = df_exploded.rename(column_names, axis='columns')
df_exploded

Unnamed: 0,visitor_url,query_params
0,/%3Fsize=n_20_n,{'size': ['n_20_n']}
1,/%3Fsize=n_20_n,{'size': ['n_20_n']}
2,/%3Fsize=n_20_n,{'size': ['n_20_n']}
3,/%3Fsize=n_20_n,{'size': ['n_20_n']}
4,/%3Fsize=n_20_n,{'size': ['n_20_n']}
...,...,...
663,/%3Fsize=n_20_n&filters%5B0%5D%5Bfield%5D=scho...,"{'size': ['n_20_n'], 'filters[0][field]': ['sc..."
664,/%3Fsize=n_20_n&filters%5B0%5D%5Bfield%5D=scho...,"{'size': ['n_20_n'], 'filters[0][field]': ['sc..."
665,/%3Fsize=n_20_n&filters%5B0%5D%5Bfield%5D=scho...,"{'size': ['n_20_n'], 'filters[0][field]': ['sc..."
666,/%3Fsize=n_20_n&filters%5B0%5D%5Bfield%5D=scho...,"{'size': ['n_20_n'], 'filters[0][field]': ['sc..."


In [67]:
# convert query params to columns: one column per parameter name, NaN where a
# URL does not carry that parameter. Cell values stay the lists that parse_qs
# returned. The frame is built in one go from the list of dicts instead of
# .apply(pd.Series), which constructs a separate Series for every row and
# returns a Series rather than a frame when there are no rows.
param_columns = pd.DataFrame(
    df_exploded['query_params'].tolist(),
    index=df_exploded.index,
)
df_exploded = pd.concat([df_exploded, param_columns], axis=1)

df_exploded.tail()

Unnamed: 0,visitor_url,query_params,size,current,q,filters[0][field],filters[0][values][0],sort-field,sort-direction,fbclid,amp;amp;size
663,/%3Fsize=n_20_n&filters%5B0%5D%5Bfield%5D=scho...,"{'size': ['n_20_n'], 'filters[0][field]': ['sc...",[n_20_n],,,[school],[London School o],,,,
664,/%3Fsize=n_20_n&filters%5B0%5D%5Bfield%5D=scho...,"{'size': ['n_20_n'], 'filters[0][field]': ['sc...",[n_20_n],,,[school],[Nanyang Polytechn],,,,
665,/%3Fsize=n_20_n&filters%5B0%5D%5Bfield%5D=scho...,"{'size': ['n_20_n'], 'filters[0][field]': ['sc...",[n_20_n],,,[school],[Nanyang Technolog],,,,
666,/%3Fsize=n_20_n&filters%5B0%5D%5Bfield%5D=scho...,"{'size': ['n_20_n'], 'filters[0][field]': ['sc...",[n_20_n],,,[school],[National Universi],,,,
667,/%3Fsize=n_60_n&filters%5B0%5D%5Bfield%5D=indu...,"{'size': ['n_60_n'], 'filters[0][field]': ['in...",[n_60_n],,,[industries],[Data Science%],,,,


In [70]:
# what is the amp;amp;size column?
# Presumably a landing-page URL contained a doubly HTML-escaped ampersand
# ("&amp;amp;size=..."), which parse_qs split on "&" - TODO confirm against
# the raw export.
# Print the counts explicitly: a bare expression that is not the last line of
# a cell is never displayed. The membership check keeps the cell re-runnable.
if 'amp;amp;size' in df_exploded.columns:
    print(df_exploded['amp;amp;size'].value_counts())

# errors='ignore' + reassignment (instead of inplace=True) so running the cell
# a second time does not raise KeyError once the column is gone.
df_exploded = df_exploded.drop(columns='amp;amp;size', errors='ignore')

In [84]:
# Show full cell contents instead of truncating long URLs with "..."
pd.options.display.max_colwidth = None

df_exploded.tail()

Unnamed: 0,visitor_url,query_params,size,current,q,filters[0][field],filters[0][values][0],sort-field,sort-direction,fbclid
663,/%3Fsize=n_20_n&filters%5B0%5D%5Bfield%5D=school&filters%5B0%5D%5Bvalues%5D%5B0%5D=London%20School%20o,"{'size': ['n_20_n'], 'filters[0][field]': ['school'], 'filters[0][values][0]': ['London School o']}",[n_20_n],,,[school],[London School o],,,
664,/%3Fsize=n_20_n&filters%5B0%5D%5Bfield%5D=school&filters%5B0%5D%5Bvalues%5D%5B0%5D=Nanyang%20Polytechn,"{'size': ['n_20_n'], 'filters[0][field]': ['school'], 'filters[0][values][0]': ['Nanyang Polytechn']}",[n_20_n],,,[school],[Nanyang Polytechn],,,
665,/%3Fsize=n_20_n&filters%5B0%5D%5Bfield%5D=school&filters%5B0%5D%5Bvalues%5D%5B0%5D=Nanyang%20Technolog,"{'size': ['n_20_n'], 'filters[0][field]': ['school'], 'filters[0][values][0]': ['Nanyang Technolog']}",[n_20_n],,,[school],[Nanyang Technolog],,,
666,/%3Fsize=n_20_n&filters%5B0%5D%5Bfield%5D=school&filters%5B0%5D%5Bvalues%5D%5B0%5D=National%20Universi,"{'size': ['n_20_n'], 'filters[0][field]': ['school'], 'filters[0][values][0]': ['National Universi']}",[n_20_n],,,[school],[National Universi],,,
667,/%3Fsize=n_60_n&filters%5B0%5D%5Bfield%5D=industries&filters%5B0%5D%5Bvalues%5D%5B0%5D=Data%20Science%,"{'size': ['n_60_n'], 'filters[0][field]': ['industries'], 'filters[0][values][0]': ['Data Science%']}",[n_60_n],,,[industries],[Data Science%],,,


In [88]:
# Write the per-session table to disk, leaving out the row index
df_exploded.to_csv(path_or_buf='data-preprocessed.csv', index=False)