From 5e447712921e09b79f38b98b91e4dc135cf152ba Mon Sep 17 00:00:00 2001 From: jpaten Date: Sun, 16 Feb 2025 14:47:47 -0800 Subject: [PATCH 1/3] feat: added landing page analytics to package, refactored (#4378) --- .../analytics_package/analytics/fields.py | 58 +++- .../analytics/sheets_elements.py | 279 +++++++++++------- 2 files changed, 215 insertions(+), 122 deletions(-) diff --git a/analytics/analytics_package/analytics/fields.py b/analytics/analytics_package/analytics/fields.py index 3b31a5272..f25e09ab3 100644 --- a/analytics/analytics_package/analytics/fields.py +++ b/analytics/analytics_package/analytics/fields.py @@ -1,7 +1,35 @@ # Metric names -METRIC_EVENT_COUNT = 'eventCount' -METRIC_TOTAL_USERS = 'totalUsers' -METRIC_PAGE_VIEW = 'screenPageViews' +METRIC_EVENT_COUNT = { + "id": "eventCount", + "alias": "Event Count", + "change_alias": "Event Count Change", +} +METRIC_TOTAL_USERS = { + "id": "totalUsers", + "alias": "Total Users", + "change_alias": "Total Users Change", +} + +METRIC_ACTIVE_USERS = { + "id": "activeUsers", + "alias": "Users", + "change_alias": "Active Users Change", +} +METRIC_PAGE_VIEWS = { + "id": "screenPageViews", + "alias": "Total Pageviews", + "change_alias": "Total Pageviews Change", +} +METRIC_SESSIONS = { + "id": "sessions", + "alias": "Sessions", + "change_alias": "Sessions Change", +} +SYNTHETIC_METRIC_CLICKS = { + "id": None, + "alias": "Total Clicks", + "change_alias": "Total Clicks Change", +} # Event Names EVENT_BUILTIN_CLICK = "click" @@ -10,18 +38,26 @@ # DIMENSIONS DIMENSION_PAGE_PATH = { - 'id': 'pagePath', - 'alias': 'page_path', + "id": "pagePath", + "alias": "Page Path", } DIMENSION_BUILTIN_URL = { - 'id': 'linkUrl', - 'alias': 'builtin_url', + "id": "linkUrl", + "alias": "URL", } DIMENSION_EVENT_NAME = { - 'id': 'eventName', - 'alias': 'event_name', + "id": "eventName", + "alias": "Event Name", } DIMENSION_CUSTOM_URL = { - 'id': 'customEvent:click_url', - 'alias': 'outbound_url', + "id": "customEvent:click_url", + "alias": "Outbound URL", +} +DIMENSION_LANDING_PAGE = { + "id": "landingPage", + "alias": "Landing Page", +} +DIMENSION_YEAR_MONTH = { + "id": "yearMonth", + "alias": "Month", } diff --git a/analytics/analytics_package/analytics/sheets_elements.py b/analytics/analytics_package/analytics/sheets_elements.py index 8a6f0f3e9..7655e1835 100644 --- a/analytics/analytics_package/analytics/sheets_elements.py +++ b/analytics/analytics_package/analytics/sheets_elements.py @@ -6,21 +6,22 @@ from urllib.parse import urlparse import datetime as dt -def get_flat_data_df(metrics, dimensions, **other_params): +def get_data_df_from_fields(metrics, dimensions, **other_params): """ - Get a df from the Analytics API with a flat structure (no multiindex). + Get a df from the Analytics API with metrics and dimensions as specified in fields.py - :param analytics_params: the parameters for the Analytics API, including authentication and property ids :param metrics: the metrics to get :param dimensions: the dimensions to get + :param dimension_index: whether to use the dimensions as the index, defaults to False + :param other_params: any other parameters to be passed to the get_data_df function, including service params :return: a DataFrame with the data from the Analytics API """ df = get_data_df( - metrics, + [metric["id"] for metric in metrics], [dimension["id"] for dimension in dimensions], - **other_params, + **other_params ) - return df.reset_index().rename(columns=get_rename_dict(dimensions)).copy() + return df.reset_index().rename(columns=get_rename_dict(dimensions+metrics)).copy() def get_rename_dict(dimensions): """Get a dictionary to rename the columns of a DataFrame.""" @@ -28,7 +29,7 @@ def get_rename_dict(dimensions): zip([dimension["id"] for dimension in dimensions], [dimension["alias"] for dimension in dimensions]) ) -def get_outbound_links_df(analytics_params): +def get_outbound_links_df(analytics_params, ignore_index=True): """ Get a DF with outbound links from the Analytics API. Merges the builtin and custom events for outbound links. analytics_params cannot currently include a dimension_filter @@ -39,7 +40,7 @@ def get_outbound_links_df(analytics_params): pd.set_option('future.no_silent_downcasting', True) assert "dimension_filter" not in analytics_params # Get the builtin "Click" event - df_builtin_links = get_flat_data_df( + df_builtin_links = get_data_df_from_fields( [METRIC_EVENT_COUNT, METRIC_TOTAL_USERS], [DIMENSION_PAGE_PATH, DIMENSION_BUILTIN_URL, DIMENSION_EVENT_NAME], dimension_filter=f"eventName=={EVENT_BUILTIN_CLICK}", @@ -48,7 +49,7 @@ def get_outbound_links_df(analytics_params): [DIMENSION_PAGE_PATH["alias"], DIMENSION_BUILTIN_URL["alias"]] ).sum().reset_index() # Get the custom "outbound_link_click" event - df_custom_links = get_flat_data_df( + df_custom_links = get_data_df_from_fields( [METRIC_EVENT_COUNT, METRIC_TOTAL_USERS], [DIMENSION_EVENT_NAME, DIMENSION_CUSTOM_URL, DIMENSION_PAGE_PATH], dimension_filter=f"eventName=={EVENT_CUSTOM_CLICK}", @@ -69,27 +70,34 @@ def get_outbound_links_df(analytics_params): # Use the builtin link, unless the link is not in the custom links, in which case use the custom link df_all_links = df_all_links.loc[ ~(df_all_links["truncated_url"].isin(df_outbound_links_fragments["truncated_url"]) & df_all_links["builtin"]) - ].sort_values(METRIC_EVENT_COUNT, ascending=False) + ].sort_values(METRIC_EVENT_COUNT["alias"], ascending=False) df_all_links["is_fragment"] = df_all_links["is_fragment"].fillna(False).astype(bool) # Use the builtin link, unless the link is a fragment, in which case use the custom link - df_all_links["complete_url"] = df_all_links["builtin_url"].where( + df_all_links["complete_url"] = df_all_links[DIMENSION_BUILTIN_URL["alias"]].where( ~df_all_links["is_fragment"], - df_all_links["outbound_url"] + df_all_links[DIMENSION_CUSTOM_URL["alias"]] ) df_all_links["hostname"] = df_all_links["complete_url"].map(lambda x: urlparse(x).hostname) df_all_links = df_all_links.drop( - columns=["builtin_url", "outbound_url", "builtin", "is_fragment"] + columns=[DIMENSION_BUILTIN_URL["alias"], DIMENSION_CUSTOM_URL["alias"], "builtin", "is_fragment"] ).rename( columns={ - DIMENSION_PAGE_PATH["alias"]: "Page Path", "complete_url": "Outbound Link", - METRIC_EVENT_COUNT: "Total Clicks", - METRIC_TOTAL_USERS: "Total Users", + METRIC_EVENT_COUNT["alias"]: SYNTHETIC_METRIC_CLICKS["alias"], "hostname": "Hostname", } - )[["Page Path", "Hostname", "Outbound Link", "Total Clicks", "Total Users"]] + )[[ + DIMENSION_PAGE_PATH["alias"], + "Hostname", + "Outbound Link", + SYNTHETIC_METRIC_CLICKS["alias"], + METRIC_TOTAL_USERS["alias"] + ]].copy() - return df_all_links.copy().reset_index(drop=True) + if not ignore_index: + return df_all_links.set_index(["Page Path", "Outbound Link", "Hostname"]) + else: + return df_all_links.reset_index(drop=True) def get_outbound_links_change(analytics_params, start_current, end_current, start_previous, end_previous): """ @@ -100,44 +108,18 @@ def get_outbound_links_change(analytics_params, start_current, end_current, star :param start_previous: the start date for the previous month :param end_previous: the end date for the previous month """ - analytics_params_month_1 = { - **analytics_params, - "start_date": start_current, - "end_date": end_current, - } - analytics_params_month_2 = { - **analytics_params, - "start_date": start_previous, - "end_date": end_previous, - } - df_current = get_outbound_links_df(analytics_params_month_1).set_index( - ["Page Path", "Outbound Link", "Hostname"] - ) - df_previous = get_outbound_links_df(analytics_params_month_2).set_index( - ["Page Path", "Outbound Link", "Hostname"] - ) - total_clicks_percent_change = get_change( - df_current["Total Clicks"], - df_previous["Total Clicks"], - start_current, - end_current, - start_previous, - end_previous - ) - total_users_percent_change = get_change( - df_current["Total Users"], - df_previous["Total Users"], - start_current, - end_current, - start_previous, - end_previous + return get_one_period_change_df( + get_outbound_links_df, + [SYNTHETIC_METRIC_CLICKS, METRIC_TOTAL_USERS], + analytics_params, + start_current, + end_current, + start_previous, + end_previous, + sort_results=[SYNTHETIC_METRIC_CLICKS, METRIC_TOTAL_USERS] ) - df_reindexed = df_current.reindex(total_clicks_percent_change.index).fillna(0) - df_reindexed["Total Clicks Percent Change"] = total_clicks_percent_change - df_reindexed["Total Users Percent Change"] = total_users_percent_change - return df_reindexed.sort_values(["Total Clicks", "Total Users"], ascending=False, kind="stable").reset_index() -def get_page_views_df(analytics_params): +def get_page_views_df(analytics_params, ignore_index=False): """ Get a DF with page views from the Analytics API. @@ -145,18 +127,14 @@ def get_page_views_df(analytics_params): :return: a DataFrame with the page views from the Analytics API """ assert "dimension_filter" not in analytics_params - df_response = get_flat_data_df( - [METRIC_EVENT_COUNT, METRIC_TOTAL_USERS, METRIC_PAGE_VIEW], + df_response = get_data_df_from_fields( + [METRIC_EVENT_COUNT, METRIC_TOTAL_USERS, METRIC_PAGE_VIEWS], [DIMENSION_PAGE_PATH, DIMENSION_EVENT_NAME], **analytics_params, dimension_filter=f"eventName=={EVENT_PAGE_VIEW}", - ).rename( - columns={ - DIMENSION_PAGE_PATH["alias"]: "Page Path", - METRIC_PAGE_VIEW: "Total Views", - METRIC_TOTAL_USERS: "Total Users", - } - )[["Page Path", "Total Views", "Total Users"]].copy() + )[[DIMENSION_PAGE_PATH["alias"], METRIC_PAGE_VIEWS["alias"], METRIC_TOTAL_USERS["alias"]]].copy() + if not ignore_index: + df_response = df_response.set_index(DIMENSION_PAGE_PATH["alias"]) return df_response def get_page_views_change(analytics_params, start_current, end_current, start_previous, end_previous): @@ -168,43 +146,18 @@ def get_page_views_change(analytics_params, start_current, end_current, start_pr :param start_previous: the start date for the previous month :param end_previous: the end date for the previous month """ - analytics_params_current = { - **analytics_params, - "start_date": start_current, - "end_date": end_current, - } - analytics_params_previous = { - **analytics_params, - "start_date": start_previous, - "end_date": end_previous, - } - df_current = get_page_views_df(analytics_params_current).set_index("Page Path") - df_previous = get_page_views_df(analytics_params_previous).set_index("Page Path") - combined_index = df_current.index.union(df_previous.index) - df_current_reindexed = df_current.reindex(combined_index).fillna(0) - df_previous_reindexed = df_previous.reindex(combined_index) - views_percent_change = get_change( - df_current_reindexed["Total Views"], - df_previous_reindexed["Total Views"], - start_current, - end_current, - start_previous, + return get_one_period_change_df( + get_page_views_df, + [METRIC_PAGE_VIEWS, METRIC_TOTAL_USERS], + analytics_params, + start_current, + end_current, + start_previous, end_previous, + sort_results=[METRIC_PAGE_VIEWS, METRIC_TOTAL_USERS] ) - users_percent_change = get_change( - df_current_reindexed["Total Users"], - df_previous_reindexed["Total Users"], - start_current, - end_current, - start_previous, - end_previous, - ) - df_reindexed = df_current.reindex(views_percent_change.index).fillna(0) - df_reindexed["Total Views Percent Change"] = views_percent_change - df_reindexed["Total Users Percent Change"] = users_percent_change - return df_reindexed.sort_values(["Total Views", "Total Users"], ascending=False, kind="stable").reset_index() -def get_change(series_current, series_previous, start_current, end_current, start_previous, end_previous, combined_index = None): +def get_one_period_change_series(series_current, series_previous, start_current, end_current, start_previous, end_previous, combined_index = None): """ Get the percent change between two serieses, accounting for different numbers of days in the month. :param series_current: the series representing the current month @@ -228,6 +181,74 @@ def get_change(series_current, series_previous, start_current, end_current, star change = ((series_current_reindexed / series_previous_reindexed) - 1).replace({np.inf: np.nan}) return change +def get_string_or_alias(string_or_alias): + """ + Get the string or alias of a metric or dimension. + :param string_or_alias: the string or alias to get + :return: the string or alias of the metric or dimension + """ + if isinstance(string_or_alias, dict) and "alias" in string_or_alias: + return string_or_alias["alias"] + elif isinstance(string_or_alias, str): + return string_or_alias + else: + raise ValueError("string_or_alias must be a string or a dictionary with an alias key") + +def get_one_period_change_df(df_function, change_metrics, analytics_params, start_current, end_current, start_previous, end_previous, sort_results=None, dimension_index=False): + """ + Get a DF with the change between two periods for the given metrics, renamed to match titles + :param metrics: the objects representing metrics to be displayed + :param metric_titles: the titles to be displayed for the metrics + :param dimensions: the objects representing dimensions to be displayed + :param analytics_params: the parameters for the Analytics API, including authentication and property ids + :param start_current: the start date for the current month in the format "YYYY-MM-DD" + :param end_current: the end date for the current month + :param start_previous: the start date for the prior month + :param end_previous: the end date for the prior month + """ + + analytics_params_current = { + **analytics_params, + "start_date": start_current, + "end_date": end_current, + } + analytics_params_previous = { + **analytics_params, + "start_date": start_previous, + "end_date": end_previous, + } + + df_current = df_function( + analytics_params_current, + ignore_index=False + ) + df_previous = df_function( + analytics_params_previous, + ignore_index=False + ) + df_changes = pd.concat( + [ + get_one_period_change_series( + df_current[metric["alias"]], df_previous[metric["alias"]], start_current, end_current, start_previous, end_previous + ) for metric in change_metrics + ], + axis=1, + ).rename( + columns={metric["alias"]: metric["change_alias"] for metric in change_metrics} + ) + df_current_with_changes = pd.concat( + [df_current.reindex(df_changes.index).fillna(0), df_changes], + axis=1 + ) + if sort_results: + df_current_with_changes = df_current_with_changes.sort_values( + [metric["alias"] for metric in sort_results], ascending=False, kind="stable" + ) + if dimension_index: + return df_current_with_changes + else: + return df_current_with_changes.reset_index() + class ADDITIONAL_DATA_BEHAVIOR(Enum): ADD = "add" REPLACE = "replace" @@ -240,17 +261,15 @@ def get_page_views_over_time_df(analytics_params, additional_data_path=None, add :param additional_data_behavior: the behavior to use when adding the additional data, defaults to None """ return get_change_over_time_df( - ["Users", "Total Pageviews"], - ["activeUsers", "screenPageViews"], - ["Month"], - "yearMonth", + [METRIC_ACTIVE_USERS, METRIC_PAGE_VIEWS], + DIMENSION_YEAR_MONTH, additional_data_path=additional_data_path, additional_data_behavior=additional_data_behavior, **analytics_params ) def get_change_over_time_df( - metric_titles, metrics, time_title, time_dimension, include_changes=True, change_title_suffix = " Change", additional_data_path=None, additional_data_behavior=None, strftime_format="%Y-%m", **other_params + metrics, time_dimension, include_changes=True, additional_data_path=None, additional_data_behavior=None, strftime_format="%Y-%m", **other_params ): """ Get a DataFrame with the change over time for the given metrics, renamed to match metric_titles @@ -266,14 +285,14 @@ def get_change_over_time_df( :param other_params: any other parameters to be passed to the get_df_over_time function, including service params """ df_api = get_df_over_time( - metric_titles, - metrics, - time_dimension, - sort_results=[time_dimension], + [metric["alias"] for metric in metrics], + [metric["id"] for metric in metrics], + time_dimension["id"], + sort_results=[time_dimension["id"]], df_processor=(lambda df: df.set_index(df.index + "01").sort_index(ascending=False)), format_table=False, **other_params - ) + ).rename({time_dimension["id"]: time_dimension["alias"]}) df_combined = pd.DataFrame() @@ -289,13 +308,51 @@ def get_change_over_time_df( df_combined = df_api if include_changes: - assert change_title_suffix is not None df_combined[ - [f"{title}{change_title_suffix}" for title in metric_titles] - ] = df_combined[metric_titles].pct_change(periods=-1).replace({np.inf: np.nan}) + [metric["change_alias"] for metric in metrics] + ] = df_combined[ + [metric["alias"] for metric in metrics] + ].pct_change(periods=-1).replace({np.inf: np.nan}) if strftime_format is not None: df_combined.index = pd.to_datetime(df_combined.index).strftime(strftime_format) - return df_combined.reset_index(names=time_title) + return df_combined.reset_index(names=time_dimension["alias"]) + + +def get_landing_page_df(analytics_params, ignore_index=True): + """ + Get a DataFrame with landing pages from the Analytics API. + + :param analytics_params: the parameters for the Analytics API, including authentication and property ids + :return: a DataFrame with the landing pages from the Analytics API + """ + df_response = get_data_df_from_fields( + [METRIC_SESSIONS], + [DIMENSION_LANDING_PAGE], + **analytics_params, + )[[DIMENSION_LANDING_PAGE["alias"], METRIC_SESSIONS["alias"]]].copy() + if not ignore_index: + df_response = df_response.set_index(DIMENSION_LANDING_PAGE["alias"]) + return df_response + +def get_landing_page_change(analytics_params, start_current, end_current, start_previous, end_previous): + """ + Get a DF with landing pages from the Analytics API and a comparison for the prior month + :param analytics_params: the parameters for the Analytics API, including authentication and property ids + :param start_current: the start date for the current month in the format "YYYY-MM-DD" + :param end_current: the end date for the current month + :param start_previous: the start date for the previous month + :param end_previous: the end date for the previous month + """ + return get_one_period_change_df( + get_landing_page_df, + [METRIC_SESSIONS], + analytics_params, + start_current, + end_current, + start_previous, + end_previous, + sort_results=[METRIC_SESSIONS] + ) \ No newline at end of file From 9c11c1e06ecd00c788661723eb5d034e7d8bbf73 Mon Sep 17 00:00:00 2001 From: jpaten Date: Sun, 16 Feb 2025 17:05:00 -0800 Subject: [PATCH 2/3] chore: refactored util functions for sheets to a different file (#4378) --- .../analytics/_sheets_utils.py | 107 +++++++++ .../analytics_package/analytics/entities.py | 100 +++++++++ .../analytics_package/analytics/fields.py | 63 ------ .../analytics/sheets_elements.py | 209 ++++++------------ 4 files changed, 272 insertions(+), 207 deletions(-) create mode 100644 analytics/analytics_package/analytics/_sheets_utils.py create mode 100644 analytics/analytics_package/analytics/entities.py delete mode 100644 analytics/analytics_package/analytics/fields.py diff --git a/analytics/analytics_package/analytics/_sheets_utils.py b/analytics/analytics_package/analytics/_sheets_utils.py new file mode 100644 index 000000000..e5c1bf194 --- /dev/null +++ b/analytics/analytics_package/analytics/_sheets_utils.py @@ -0,0 +1,107 @@ +import datetime as dt +from .charts import get_data_df, get_df_over_time +from .entities import ADDITIONAL_DATA_BEHAVIOR +import numpy as np +import pandas as pd + +def get_data_df_from_fields(metrics, dimensions, **other_params): + """ + Get a df from the Analytics API with metrics and dimensions as specified in fields.py + + :param metrics: the metrics to get + :param dimensions: the dimensions to get + :param other_params: any other parameters to be passed to the get_data_df function, including service params + :return: a DataFrame with the data from the Analytics API. + The DF has an arbitrary RangeIndex, + string columns containing dimensions with names equal to the dimension alias value, + and int columns containing metrics with names equal to the metric alias value. + """ + df = get_data_df( + [metric["id"] for metric in metrics], + [dimension["id"] for dimension in dimensions], + **other_params + ) + return df.reset_index().rename(columns=get_rename_dict(dimensions+metrics)).copy() + + +def get_rename_dict(dimensions): + """Get a dictionary to rename the columns of a DataFrame.""" + return dict( + zip([dimension["id"] for dimension in dimensions], [dimension["alias"] for dimension in dimensions]) + ) + + +def get_one_period_change_series(series_current, series_previous, start_current, end_current, start_previous, end_previous): + """ + Get the percent change between two serieses, accounting for different numbers of days in the month. + :param series_current: the series representing the current month + :param series_previous: the series representing the prior month + :param start_current: the start date for the current month in the format "YYYY-MM-DD" + :param end_current: the end date for the current month + :param start_previous: the start date for the prior month + :param end_previous: the end date for the prior month + :return: a Series with the change between the two serieses + """ + # Check that both serieses have the same index names + assert series_current.index.names == series_previous.index.names + # Reindex both serieses to have the same index + combined_index = series_current.index.union(series_previous.index) + current_length = float((dt.datetime.fromisoformat(end_current) - dt.datetime.fromisoformat(start_current)).days + 1) + previous_length = float((dt.datetime.fromisoformat(end_previous) - dt.datetime.fromisoformat(start_previous)).days + 1) + assert current_length != 0 and previous_length != 0 + series_current_reindexed = series_current.reindex(combined_index).fillna(0) + # Adjust the values from the prior series to account for the different number of days in the month + series_previous_reindexed = (series_previous.reindex(combined_index) * current_length / previous_length) + change = ((series_current_reindexed / series_previous_reindexed) - 1).replace({np.inf: np.nan}) + return change + + +def get_change_over_time_df( + metrics, time_dimension, include_changes=True, additional_data_path=None, additional_data_behavior=None, strftime_format="%Y-%m", **other_params +): + """ + Get a DataFrame with the change over time for the given metrics, renamed to match metric_titles + :param metrics: the metrics to be displayed + :param time_dimension: the time dimension to be used + :param include_changes: whether to include the percent change columns, defaults to True + :param additional_data_path: the path to a JSON file with additional data to be added to the DataFrame, defaults to None + :param additional_data_behavior: the behavior to use when adding the additional data, defaults to None + :param strftime_format: the format to use for the time dimension, defaults to "%Y-%m". None means a datetime will be returned + :param other_params: any other parameters to be passed to the get_df_over_time function, including service params + :returns: a datetime with the values of the metrics for each time dimension. + Columns are the time dimension alias (as a datetime), metric aliases (as ints), and change metric aliases (as floats) + """ + df_api = get_df_over_time( + [metric["alias"] for metric in metrics], + [metric["id"] for metric in metrics], + time_dimension["id"], + sort_results=[time_dimension["id"]], + df_processor=(lambda df: df.set_index(df.index + "01").sort_index(ascending=False)), + format_table=False, + **other_params + ).rename({time_dimension["id"]: time_dimension["alias"]}) + + df_combined = pd.DataFrame() + + if additional_data_path is not None: + assert additional_data_behavior is not None + df_saved = pd.read_json(additional_data_path) + if additional_data_behavior == ADDITIONAL_DATA_BEHAVIOR.ADD: + df_combined = df_api.add(df_saved.astype(int), fill_value=0)[::-1] + elif additional_data_behavior == ADDITIONAL_DATA_BEHAVIOR.REPLACE: + df_combined = pd.concat([df_saved, df_api], ignore_index=False) + df_combined = df_combined.loc[~df_combined.index.duplicated(keep="first")].sort_index(ascending=False) + else: + df_combined = df_api + + if include_changes: + df_combined[ + [metric["change_alias"] for metric in metrics] + ] = df_combined[ + [metric["alias"] for metric in metrics] + ].pct_change(periods=-1).replace({np.inf: np.nan}) + + if strftime_format is not None: + df_combined.index = pd.to_datetime(df_combined.index).strftime(strftime_format) + + return df_combined.reset_index(names=time_dimension["alias"]) \ No newline at end of file diff --git a/analytics/analytics_package/analytics/entities.py b/analytics/analytics_package/analytics/entities.py new file mode 100644 index 000000000..851002eec --- /dev/null +++ b/analytics/analytics_package/analytics/entities.py @@ -0,0 +1,100 @@ +# Metric names +# The number of events that occur +from enum import Enum + +# The number of events that occur +METRIC_EVENT_COUNT = { + "id": "eventCount", + "alias": "Event Count", + "change_alias": "Event Count Change", +} +# The total number of users that trigger an event +# Includes users who visit very briefly and do not interact with the site +# See https://support.google.com/analytics/answer/12253918?hl=en +METRIC_TOTAL_USERS = { + "id": "totalUsers", + "alias": "Total Users", + "change_alias": "Total Users Change", +} +# The number of active users as defined by GA4 +# See https://support.google.com/analytics/answer/12253918?hl=en +METRIC_ACTIVE_USERS = { + "id": "activeUsers", + "alias": "Users", + "change_alias": "Active Users Change", +} +# The number of page views +METRIC_PAGE_VIEWS = { + "id": "screenPageViews", + "alias": "Total Pageviews", + "change_alias": "Total Pageviews Change", +} +# The number of sessions +METRIC_SESSIONS = { + "id": "sessions", + "alias": "Sessions", + "change_alias": "Sessions Change", +} +# The total number of clicks on outbound links. Generated from other metrics, so does not have an id field +SYNTHETIC_METRIC_CLICKS = { + "id": None, + "alias": "Total Clicks", + "change_alias": "Total Clicks Change", +} + +# Event Names +# The builtin outbound link click event. Stores the clicked URL in DIMENSION_BUILTIN_URL +# Triggers under some circumstances where custom click does not, but does not include url fragments in any dimensions +EVENT_BUILTIN_CLICK = "click" +# The custom outbound link click event. Stores the clicked URL DIMENSION_CUSTOM_URL +# Includes url fragments, sometimes has a slightly different count to the built in click event +EVENT_CUSTOM_CLICK = "outbound_link_clicked" +# The builtin page view event. +EVENT_PAGE_VIEW = "page_view" + +# DIMENSIONS +# The path to the page the user is on when the event occurs. Does not include fragments or parameters +DIMENSION_PAGE_PATH = { + "id": "pagePath", + "alias": "Page Path", +} +# The url of the clicked link, only returned in EVENT_BUILTIN_CLICK. Does not include URL fragments +DIMENSION_BUILTIN_URL = { + "id": "linkUrl", + "alias": "URL", +} +# The name of the event. See GA4 docs for event names +DIMENSION_EVENT_NAME = { + "id": "eventName", + "alias": "Event Name", +} +# The url of the clicked link, only returned in EVENT_CUSTOM_CLICK. Includes URL fragments. +DIMENSION_CUSTOM_URL = { + "id": "customEvent:click_url", + "alias": "Outbound URL", +} +# The landing page for a session +DIMENSION_LANDING_PAGE = { + "id": "landingPage", + "alias": "Landing Page", +} +# The current month in the format YYYYMM +DIMENSION_YEAR_MONTH = { + "id": "yearMonth", + "alias": "Month", +} +# The hostname of the clicked link. Based on DIMENSION_CUSTOM_URL and DIMENSION_BUILTIN_URL +SYNTHETIC_DIMENSION_CLICKED_HOSTNAME = { + "id": None, + "alias": "Clicked Hostname", +} +# The complete clicked link, including hostname, parameters, fragments, and prefix. Based on DIMENSION_CUSTOM_URL and DIMENSION_BUILTIN_URL +SYNTHETIC_DIMENSION_CLICKED_LINK = { + "id": None, + "alias": "Outbound Link", +} + +# Used as arguments in get_change_over_time_df +class ADDITIONAL_DATA_BEHAVIOR(Enum): + ADD = "add" # Sum the cached data with the api data + REPLACE = "replace"# Replace the api data with the cached data \ No newline at end of file diff --git a/analytics/analytics_package/analytics/fields.py b/analytics/analytics_package/analytics/fields.py deleted file mode 100644 index f25e09ab3..000000000 --- a/analytics/analytics_package/analytics/fields.py +++ /dev/null @@ -1,63 +0,0 @@ -# Metric names -METRIC_EVENT_COUNT = { - "id": "eventCount", - "alias": "Event Count", - "change_alias": "Event Count Change", -} -METRIC_TOTAL_USERS = { - "id": "totalUsers", - "alias": "Total Users", - "change_alias": "Total Users Change", -} - -METRIC_ACTIVE_USERS = { - "id": "activeUsers", - "alias": "Users", - "change_alias": "Active Users Change", -} -METRIC_PAGE_VIEWS = { - "id": "screenPageViews", - "alias": "Total Pageviews", - "change_alias": "Total Pageviews Change", -} -METRIC_SESSIONS = { - "id": "sessions", - "alias": "Sessions", - "change_alias": "Sessions Change", -} -SYNTHETIC_METRIC_CLICKS = { - "id": None, - "alias": "Total Clicks", - "change_alias": "Total Clicks Change", -} - -# Event Names -EVENT_BUILTIN_CLICK = "click" -EVENT_CUSTOM_CLICK = "outbound_link_clicked" -EVENT_PAGE_VIEW = "page_view" - -# DIMENSIONS -DIMENSION_PAGE_PATH = { - "id": "pagePath", - "alias": "Page Path", -} -DIMENSION_BUILTIN_URL = { - "id": "linkUrl", - "alias": "URL", -} -DIMENSION_EVENT_NAME = { - "id": "eventName", - "alias": "Event Name", -} -DIMENSION_CUSTOM_URL = { - "id": "customEvent:click_url", - "alias": "Outbound URL", -} -DIMENSION_LANDING_PAGE = { - "id": "landingPage", - "alias": "Landing Page", -} -DIMENSION_YEAR_MONTH = { - "id": "yearMonth", - "alias": "Month", -} diff --git a/analytics/analytics_package/analytics/sheets_elements.py b/analytics/analytics_package/analytics/sheets_elements.py index 7655e1835..0ff992cb0 100644 --- a/analytics/analytics_package/analytics/sheets_elements.py +++ b/analytics/analytics_package/analytics/sheets_elements.py @@ -1,46 +1,24 @@ -from enum import Enum -import numpy as np import pandas as pd -from .charts import get_data_df, get_df_over_time -from .fields import * -from urllib.parse import urlparse -import datetime as dt - -def get_data_df_from_fields(metrics, dimensions, **other_params): - """ - Get a df from the Analytics API with metrics and dimensions as specified in fields.py - - :param metrics: the metrics to get - :param dimensions: the dimensions to get - :param dimension_index: whether to use the dimensions as the index, defaults to False - :param other_params: any other parameters to be passed to the get_data_df function, including service params - :return: a DataFrame with the data from the Analytics API - """ - df = get_data_df( - [metric["id"] for metric in metrics], - [dimension["id"] for dimension in dimensions], - **other_params - ) - return df.reset_index().rename(columns=get_rename_dict(dimensions+metrics)).copy() -def get_rename_dict(dimensions): - """Get a dictionary to rename the columns of a DataFrame.""" - return dict( - zip([dimension["id"] for dimension in dimensions], [dimension["alias"] for dimension in dimensions]) - ) +from ._sheets_utils import * +from .entities import * +from urllib.parse import urlparse def get_outbound_links_df(analytics_params, ignore_index=True): """ - Get a DF with outbound links from the Analytics API. Merges the builtin and custom events for outbound links. + Get a DataFrame with outbound links from the Analytics API. Merges the builtin and custom events for outbound links. analytics_params cannot currently include a dimension_filter :param analytics_params: the parameters for the Analytics API, including authentication and property ids - :return: a DataFrame with the outbound links from the Analytics API + :param ignore_index: If true, the index will be an arbitrary range index. If false, the index will be the dimensions + :return: a DataFrame with the response from the Analytics API. By default, dimensions and metrics both form columns + Dimensions: DIMENSION_PAGE_PATH, SYNTHETIC_DIMENSION_CLICKED_HOSTNAME, SYNTHETIC_DIMENSION_CLICKED_LINK + Metrics: SYNTHETIC_METRIC_CLICKS, METRIC_TOTAL_USERS """ pd.set_option('future.no_silent_downcasting', True) assert "dimension_filter" not in analytics_params # Get the builtin "Click" event - df_builtin_links = get_data_df_from_fields( + df_builtin_links =get_data_df_from_fields( [METRIC_EVENT_COUNT, METRIC_TOTAL_USERS], [DIMENSION_PAGE_PATH, DIMENSION_BUILTIN_URL, DIMENSION_EVENT_NAME], dimension_filter=f"eventName=={EVENT_BUILTIN_CLICK}", @@ -78,35 +56,46 @@ def get_outbound_links_df(analytics_params, ignore_index=True): df_all_links[DIMENSION_CUSTOM_URL["alias"]] ) df_all_links["hostname"] = df_all_links["complete_url"].map(lambda x: urlparse(x).hostname) + dimension_aliases_to_keep = [ + DIMENSION_PAGE_PATH["alias"], + SYNTHETIC_DIMENSION_CLICKED_LINK["alias"], + SYNTHETIC_DIMENSION_CLICKED_HOSTNAME["alias"], + ] + metric_aliases_to_keep = [ + SYNTHETIC_METRIC_CLICKS["alias"], + METRIC_TOTAL_USERS["alias"], + ] df_all_links = df_all_links.drop( columns=[DIMENSION_BUILTIN_URL["alias"], DIMENSION_CUSTOM_URL["alias"], "builtin", "is_fragment"] ).rename( columns={ - "complete_url": "Outbound Link", + "complete_url": SYNTHETIC_DIMENSION_CLICKED_LINK["alias"], METRIC_EVENT_COUNT["alias"]: SYNTHETIC_METRIC_CLICKS["alias"], - "hostname": "Hostname", + "hostname": SYNTHETIC_DIMENSION_CLICKED_HOSTNAME["alias"], } )[[ - DIMENSION_PAGE_PATH["alias"], - "Hostname", - "Outbound Link", - SYNTHETIC_METRIC_CLICKS["alias"], - METRIC_TOTAL_USERS["alias"] + *dimension_aliases_to_keep, *metric_aliases_to_keep ]].copy() if not ignore_index: - return df_all_links.set_index(["Page Path", "Outbound Link", "Hostname"]) + return df_all_links.set_index(dimension_aliases_to_keep) else: return df_all_links.reset_index(drop=True) def get_outbound_links_change(analytics_params, start_current, end_current, start_previous, end_previous): """ - Get a DF with outbound links from the Analytics API and a comparison for the prior month + Get a DataFrame with outbound links from the Analytics API and a comparison for the prior period + :param analytics_params: the parameters for the Analytics API, including authentication and property ids :param start_current: the start date for the current month in the format "YYYY-MM-DD" :param end_current: the end date for the current month :param start_previous: the start date for the previous month :param end_previous: the end date for the previous month + :return: a DataFrame with the outbound links from the Analytics API. + By default, dimensions and metrics both form columns. + Columns are present for both metric values and metric changes from the prior period + Dimensions: DIMENSION_PAGE_PATH, SYNTHETIC_DIMENSION_CLICKED_HOSTNAME, SYNTHETIC_DIMENSION_CLICKED_LINK + Metrics: SYNTHETIC_METRIC_CLICKS, METRIC_TOTAL_USERS """ return get_one_period_change_df( get_outbound_links_df, @@ -121,10 +110,13 @@ def get_outbound_links_change(analytics_params, start_current, end_current, star def get_page_views_df(analytics_params, ignore_index=False): """ - Get a DF with page views from the Analytics API. + Get a DataFrame with page views from the Analytics API :param analytics_params: the parameters for the Analytics API, including authentication and property ids - :return: a DataFrame with the page views from the Analytics API + :param ignore_index: If true, the index will be an arbitrary range index. If false, the index will be the dimensions + :return: a DataFrame with the response from the Analytics API. By default, dimensions and metrics both form columns + Dimensions: DIMENSION_PAGE_PATH + Metrics: METRIC_PAGE_VIEWS, METRIC_TOTAL_USERS """ assert "dimension_filter" not in analytics_params df_response = get_data_df_from_fields( @@ -139,12 +131,17 @@ def get_page_views_df(analytics_params, ignore_index=False): def get_page_views_change(analytics_params, start_current, end_current, start_previous, end_previous): """ - Get a DF with page views from the Analytics API and a comparison for the prior month + Get a DataFrame with page views from the Analytics API and a comparison for the prior month + :param analytics_params: the parameters for the Analytics API, including authentication and property ids :param start_current: the start date for the current month in the format "YYYY-MM-DD" :param end_current: the end date for the current month :param start_previous: the start date for the previous month :param end_previous: the end date for the previous month + :return: a DataFrame with the response from the Analytics API. By default, dimensions and metrics both form columns + Columns are present for both metric values and metric changes from the prior period + Dimensions: DIMENSION_PAGE_PATH + Metrics: METRIC_PAGE_VIEWS, METRIC_TOTAL_USERS """ return get_one_period_change_df( get_page_views_df, @@ -156,55 +153,21 @@ def get_page_views_change(analytics_params, start_current, end_current, start_pr end_previous, sort_results=[METRIC_PAGE_VIEWS, METRIC_TOTAL_USERS] ) - -def get_one_period_change_series(series_current, series_previous, start_current, end_current, start_previous, end_previous, combined_index = None): - """ - Get the percent change between two serieses, accounting for different numbers of days in the month. - :param series_current: the series representing the current month - :param series_previous: the series representing the prior month - :param start_current: the start date for the current month in the format "YYYY-MM-DD" - :param end_current: the end date for the current month - :param start_previous: the start date for the prior month - :param end_previous: the end date for the prior month - :return: a Series with the change between the two serieses - """ - # Check that both serieses have the same index names - assert series_current.index.names == series_previous.index.names - # Reindex both serieses to have the same index - combined_index = series_current.index.union(series_previous.index) - current_length = float((dt.datetime.fromisoformat(end_current) - dt.datetime.fromisoformat(start_current)).days + 1) - previous_length = float((dt.datetime.fromisoformat(end_previous) - dt.datetime.fromisoformat(start_previous)).days + 1) - assert current_length != 0 and previous_length != 0 - series_current_reindexed = series_current.reindex(combined_index).fillna(0) - # Adjust the values from the prior series to account for the different number of days in the month - series_previous_reindexed = (series_previous.reindex(combined_index) * current_length / previous_length) - change = ((series_current_reindexed / series_previous_reindexed) - 1).replace({np.inf: np.nan}) - return change - -def get_string_or_alias(string_or_alias): - """ - Get the string or alias of a metric or dimension. - :param string_or_alias: the string or alias to get - :return: the string or alias of the metric or dimension - """ - if isinstance(string_or_alias, dict) and "alias" in string_or_alias: - return string_or_alias["alias"] - elif isinstance(string_or_alias, str): - return string_or_alias - else: - raise ValueError("string_or_alias must be a string or a dictionary with an alias key") -def get_one_period_change_df(df_function, change_metrics, analytics_params, start_current, end_current, start_previous, end_previous, sort_results=None, dimension_index=False): +def get_one_period_change_df(df_function, change_metrics, analytics_params, start_current, end_current, start_previous, end_previous, sort_results=None, ignore_index=False): """ - Get a DF with the change between two periods for the given metrics, renamed to match titles - :param metrics: the objects representing metrics to be displayed - :param metric_titles: the titles to be displayed for the metrics - :param dimensions: the objects representing dimensions to be displayed + Get a DataFrame with the change between two periods for the given metrics, renamed to match titles + :param df_function: a function that returns a dataframe, with numerical columns matching the aliases of change_metrics + :param change_metrics: an iterable of the objects representing metrics to be displayed :param analytics_params: the parameters for the Analytics API, including authentication and property ids :param start_current: the start date for the current month in the format "YYYY-MM-DD" :param end_current: the end date for the current month :param start_previous: the start date for the prior month :param end_previous: the end date for the prior month + :param sort_results: an iterable containing the metrics to sort the results by, defaults to None + :param ignore_index: if true, the index will be an arbitrary range index. If false, the index will be the dimensions + :return: a DataFrame with the change between two periods for the given metrics, renamed to match titles + Columns are dimension aliases (as strings), metric aliases (as ints), and metric change aliases (as floats) """ analytics_params_current = { @@ -244,21 +207,22 @@ def get_one_period_change_df(df_function, change_metrics, analytics_params, star df_current_with_changes = df_current_with_changes.sort_values( [metric["alias"] for metric in sort_results], ascending=False, kind="stable" ) - if dimension_index: + if ignore_index: return df_current_with_changes else: return df_current_with_changes.reset_index() - -class ADDITIONAL_DATA_BEHAVIOR(Enum): - ADD = "add" - REPLACE = "replace" def get_page_views_over_time_df(analytics_params, additional_data_path=None, additional_data_behavior=None): """ Get a DataFrame with pageviews and total active users over time from the Analytics API. + :param analytics_params: the parameters for the Analytics API, including service params, start dates, and end dates :param additional_data_path: the path to a JSON file with additional data to be added to the DataFrame, defaults to None - :param additional_data_behavior: the behavior to use when adding the additional data, defaults to None + :param additional_data_behavior: the behavior to use when adding the additional data, as an instance of ADDITIONAL_DATA_BEHAVIOR, defaults to None + :return: a DataFrame with the pageviews and total active users over time from the Analytics API. + Columns are the dimension aliases, metrics (as ints), and change metrics (as floats) + Dimensions: DIMENSION_YEAR_MONTH (as a datetime) + Metrics: METRIC_ACTIVE_USERS, METRIC_PAGE_VIEWS """ return get_change_over_time_df( [METRIC_ACTIVE_USERS, METRIC_PAGE_VIEWS], @@ -268,64 +232,16 @@ def get_page_views_over_time_df(analytics_params, additional_data_path=None, add **analytics_params ) -def get_change_over_time_df( - metrics, time_dimension, include_changes=True, additional_data_path=None, additional_data_behavior=None, strftime_format="%Y-%m", **other_params -): - """ - Get a DataFrame with the change over time for the given metrics, renamed to match metric_titles - :param metric_titles: the titles of the metrics to be displayed - :param metrics: the metrics to be displayed - :param time_title: the title to be displayed for the time dimension - :param time_dimension: the time dimension to be displayed - :param include_changes: whether to include the percent change columns, defaults to True - :param change_title_suffix: the suffix to be added to the change columns, defaults to " Change" - :param additional_data_path: the path to a JSON file with additional data to be added to the DataFrame, defaults to None - :param additional_data_behavior: the behavior to use when adding the additional data, defaults to None - :param strftime_format: the format to use for the time dimension, defaults to "%Y-%m". None means a datetime will be returned - :param other_params: any other parameters to be passed to the get_df_over_time function, including service params - """ - df_api = get_df_over_time( - [metric["alias"] for metric in metrics], - [metric["id"] for metric in metrics], - time_dimension["id"], - sort_results=[time_dimension["id"]], - df_processor=(lambda df: df.set_index(df.index + "01").sort_index(ascending=False)), - format_table=False, - **other_params - ).rename({time_dimension["id"]: time_dimension["alias"]}) - - df_combined = pd.DataFrame() - - if additional_data_path is not None: - assert additional_data_behavior is not None - df_saved = pd.read_json(additional_data_path) - if additional_data_behavior == ADDITIONAL_DATA_BEHAVIOR.ADD: - df_combined = df_api.add(df_saved.astype(int), fill_value=0)[::-1] - elif additional_data_behavior == ADDITIONAL_DATA_BEHAVIOR.REPLACE: - df_combined = pd.concat([df_saved, df_api], ignore_index=False) - df_combined = df_combined.loc[~df_combined.index.duplicated(keep="first")].sort_index(ascending=False) - else: - df_combined = df_api - - if include_changes: - df_combined[ - [metric["change_alias"] for metric in metrics] - ] = df_combined[ - [metric["alias"] for metric in metrics] - ].pct_change(periods=-1).replace({np.inf: np.nan}) - - if strftime_format is not None: - df_combined.index = pd.to_datetime(df_combined.index).strftime(strftime_format) - - return df_combined.reset_index(names=time_dimension["alias"]) - - def get_landing_page_df(analytics_params, ignore_index=True): """ Get a DataFrame with landing pages from the Analytics API. :param analytics_params: the parameters for the Analytics API, including authentication and property ids + :param ignore_index: If true, the index will be an arbitrary range index. If false, the index will be the dimensions :return: a DataFrame with the landing pages from the Analytics API + By default, dimension and metric aliases both form columns + Dimensions: DIMENSION_LANDING_PAGE, + Metrics: METRIC_SESSIONS """ df_response = get_data_df_from_fields( [METRIC_SESSIONS], @@ -338,12 +254,17 @@ def get_landing_page_df(analytics_params, ignore_index=True): def get_landing_page_change(analytics_params, start_current, end_current, start_previous, end_previous): """ - Get a DF with landing pages from the Analytics API and a comparison for the prior month + Get a DataFrame with landing pages from the Analytics API and a comparison for the prior month :param analytics_params: the parameters for the Analytics API, including authentication and property ids :param start_current: the start date for the current month in the format "YYYY-MM-DD" :param end_current: the end date for the current month :param start_previous: the start date for the previous month :param end_previous: the end date for the previous month + :return: a DataFrame with the landing pages from the Analytics API. + By default, dimensions and metrics both form columns + Columns are present for both metric values and metric changes from the prior period + Dimensions: DIMENSION_LANDING_PAGE + Metrics: METRIC_SESSIONS """ return get_one_period_change_df( get_landing_page_df, From 9127f48bde98979f79e8c37b9a5a4f0342d733f5 Mon Sep 17 00:00:00 2001 From: jpaten Date: Sun, 16 Feb 2025 17:07:18 -0800 Subject: [PATCH 3/3] chore: bumped setup.py (#4378) --- analytics/analytics_package/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analytics/analytics_package/setup.py b/analytics/analytics_package/setup.py index 7dbbcc622..be39c427d 100644 --- a/analytics/analytics_package/setup.py +++ b/analytics/analytics_package/setup.py @@ -2,7 +2,7 @@ setup( name="analytics", - version="3.4.1", + version="4.0.0", packages=["analytics"], install_requires=["matplotlib", "pandas", "numpy", "google-auth-oauthlib", "google-api-python-client", "gspread", "gspread-formatting"], ) \ No newline at end of file