<a href="https://colab.research.google.com/github/Adlucent/ga4-return-prediction/blob/main/2_Data_Cleaning_%26_Encoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 2. Data Cleaning & Encoding

In this notebook:<br>
1. Define GCP Variables & Import Libraries<br>
2. Load dataset saved at the end of Notebook 1<br>
3. Data Cleaning<br>
  - Update data types<br>
  - Clean column values (remove special characters, etc.)<br>
4. Data Encoding<br>
5. Write table to BQ

## Define GCP Variables

In [None]:
################################################################################
######################### CHANGE BQ PROJECT NAME BELOW #########################
################################################################################

project_name = 'adl-analytics'  # GCP project id passed to pandas_gbq when querying
region = "US"  # GCP project region

# Build the fully-qualified table id from `project_name` so that changing the
# project above is enough to repoint the notebook at your own copy of the table.
dataset_name = 'return_prediction_ga4'
source_table_name = 'return_prediction_ga4_1b'
table_name = f'{project_name}.{dataset_name}.{source_table_name}'

## Import Libraries

In [None]:
# If your notebook does not have pandas_gbq you can install it here:
# ! pip install pandas_gbq

In [None]:
# Google credentials
from google.colab import auth
auth.authenticate_user()

# # BigQuery Magics
# '''BigQuery magics are used to run BigQuery SQL queries in a python environment.
# These queries can also be run in the BigQuery UI '''

# from google.cloud import bigquery
# from google.cloud.bigquery import magics, Client, QueryJobConfig

# magics.context.project = project_name #update project name
# client = bigquery.Client(project=magics.context.project)

# Interface between Jupyter and BigQuery
import pandas_gbq

# data processing libraries
import pandas as pd
import numpy as np

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

# suppress notebook warnings
import warnings
warnings.filterwarnings('ignore')

# dataframe formatting
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 50)
pd.set_option("display.float_format", lambda x: "%.3f" % x)

# Load data

In [None]:
# Query every column of the table configured in `table_name`.
sql = f"""
SELECT *
FROM `{table_name}`;
"""

# Download the result into a DataFrame through the BigQuery Storage API.
read_options = dict(project_id=project_name, location=region, use_bqstorage_api=True)
data = pandas_gbq.read_gbq(sql, **read_options)

Downloading: 100%|[32m██████████[0m|


In [None]:
# Work on a separate (deep) copy so `data` stays untouched and the table does
# not have to be downloaded again if we need to start over.
df = data.copy(deep=True)

In [None]:
df.head()

Unnamed: 0,event_date,event_name,event_params_engagement_time_msec,event_params_ga_session_id,event_params_ga_session_number,event_params_page_title,event_params_session_engaged,user_pseudo_id,user_ltv_revenue,device_category,device_mobile_brand_name,device_mobile_model_name,device_web_info_browser,geo_country,traffic_source_medium,ecommerce_total_item_quantity,ecommerce_purchase_revenue_in_usd,total_return_item_quantity,ecommerce_tax_value_in_usd,ecommerce_unique_items,ecommerce_transaction_id,item_category,item_price_in_usd,item_quantity,item_revenue_in_usd,item_refund_in_usd,item_refund_quantity,item_promotion_name
0,20201105,scroll,0,9520224276,15,Home,0.0,6430802.072348309,0.0,desktop,Microsoft,Edge,Edge,United States,organic,0,0.0,0,0.0,0,,,0.0,0,0.0,0.0,0,
1,20201116,view_promotion,0,1338488416,15,Home,0.0,637884255.3997903,0.0,desktop,Microsoft,Edge,Edge,France,organic,0,0.0,0,0.0,1,,,0.0,0,0.0,0.0,0,Reach New Heights
2,20201128,view_item,18850,7416885767,21,Home,1.0,87116489.53071336,0.0,desktop,Microsoft,Edge,Edge,Spain,Other,0,0.0,0,0.0,12,,Home/Sale/,0.0,0,0.0,0.0,0,
3,20201128,view_item,30247,7416885767,21,Home,1.0,87116489.53071336,0.0,desktop,Microsoft,Edge,Edge,Spain,Other,0,0.0,0,0.0,12,,Home/Sale/,0.0,0,0.0,0.0,0,
4,20201128,view_item,30247,7416885767,21,Home,1.0,87116489.53071336,0.0,desktop,Microsoft,Edge,Edge,Spain,Other,0,0.0,0,0.0,12,,Home/Sale/,0.0,0,0.0,0.0,0,


In [None]:
df.shape

(2002010, 28)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2002010 entries, 0 to 2002009
Data columns (total 28 columns):
 #   Column                             Dtype  
---  ------                             -----  
 0   event_date                         object 
 1   event_name                         object 
 2   event_params_engagement_time_msec  Int64  
 3   event_params_ga_session_id         object 
 4   event_params_ga_session_number     Int64  
 5   event_params_page_title            object 
 6   event_params_session_engaged       float64
 7   user_pseudo_id                     object 
 8   user_ltv_revenue                   float64
 9   device_category                    object 
 10  device_mobile_brand_name           object 
 11  device_mobile_model_name           object 
 12  device_web_info_browser            object 
 13  geo_country                        object 
 14  traffic_source_medium              object 
 15  ecommerce_total_item_quantity      Int64  
 16  ecommerce_purchase

# Data cleaning

## ID columns
The ID columns in the dataset should be of type object, as they should be treated as categorical values even if they could be interpreted as numeric.

In [None]:
# Identifier columns: kept as object dtype and treated as categorical labels.
id_cols = [
    'event_params_ga_session_id',
    'user_pseudo_id',
    'ecommerce_transaction_id',
]

In [None]:
# Treat the '(not set)' placeholder as a missing value in every ID column.
not_set_to_null = {'(not set)': None}
for id_col in id_cols:
    df[id_col] = df[id_col].replace(not_set_to_null)

In [None]:
df[id_cols].dtypes

event_params_ga_session_id    object
user_pseudo_id                object
ecommerce_transaction_id      object
dtype: object

In [None]:
df[id_cols].isna().sum()

event_params_ga_session_id          0
user_pseudo_id                      0
ecommerce_transaction_id      1987843
dtype: int64

## Datetime columns
These columns include the `event_date` column.

In [None]:
# Columns that hold dates; used below to check dtype, nulls and date range.
date_cols = ['event_date']

In [None]:
# Parse `event_date` into pandas datetimes. The values look like `YYYYMMDD`
# (e.g. 20201105) in the preview above, so the format is given explicitly.
# The unit-less `astype('datetime64')` used previously is rejected by
# pandas >= 2.0, whereas `pd.to_datetime` works across versions.
df['event_date'] = pd.to_datetime(df['event_date'], format='%Y%m%d')

In [None]:
df[date_cols].dtypes

event_date    datetime64[ns]
dtype: object

In [None]:
df[date_cols].isna().sum()

event_date    0
dtype: int64

In [None]:
df[date_cols].describe()

Unnamed: 0,event_date
count,2002010
unique,92
top,2020-12-11 00:00:00
freq,70430
first,2020-11-01 00:00:00
last,2021-01-31 00:00:00


## Boolean columns
Ultimately, we'd like to use the `item_promotion_name` and `event_params_session_engaged` features as boolean values.

In [None]:
# Features that are converted to True/False flags in the cells below.
bool_cols = [
    'item_promotion_name',
    'event_params_session_engaged',
]

In [None]:
# Turn the promotion name into a "has promotion" flag.
# None, NaN and empty strings all mean "no promotion". A plain truthiness test
# would count NaN as True, so missing values are ruled out explicitly.
# Re-running the cell on the already-boolean column leaves it unchanged.
df['item_promotion_name'] = df['item_promotion_name'].apply(
    lambda name: bool(name) and bool(pd.notna(name))
)

In [None]:
# Cast the session-engaged indicator to bool (any non-zero value becomes True).
df['event_params_session_engaged'] = df['event_params_session_engaged'].astype(bool)

In [None]:
df[bool_cols].dtypes

item_promotion_name             bool
event_params_session_engaged    bool
dtype: object

In [None]:
df[bool_cols].isna().sum()

item_promotion_name             0
event_params_session_engaged    0
dtype: int64

In [None]:
# Show the True/False split of each boolean feature, one blank line apart.
for bool_col in bool_cols:
    print(bool_col, df[bool_col].value_counts(), '', sep='\n')

item_promotion_name
False    1959115
True       42895
Name: item_promotion_name, dtype: int64

event_params_session_engaged
True     1967857
False      34153
Name: event_params_session_engaged, dtype: int64



## Numeric columns
Ensure that all numeric columns are properly formatted and have missing values filled in.

In [None]:
# Integer-valued features (durations, counters and quantities).
int_cols = [
    'event_params_engagement_time_msec',
    'event_params_ga_session_number',
    'ecommerce_total_item_quantity',
    'total_return_item_quantity',
    'ecommerce_unique_items',
    'item_quantity',
    'item_refund_quantity',
]

In [None]:
# Float-valued monetary features.
float_cols = [
    'user_ltv_revenue',
    'ecommerce_purchase_revenue_in_usd',
    'ecommerce_tax_value_in_usd',
    'item_price_in_usd',
    'item_revenue_in_usd',
    'item_refund_in_usd',
]

In [None]:
# All numeric features: the integer columns followed by the float columns.
num_cols = [*int_cols, *float_cols]

For all numeric columns, it is safe to assume that if the value is missing, it should be replaced with 0. <br>
We did this previously, so these should already be in the correct datatype and not have any missing values.

In [None]:
df[num_cols].isna().sum()

event_params_engagement_time_msec    0
event_params_ga_session_number       0
ecommerce_total_item_quantity        0
total_return_item_quantity           0
ecommerce_unique_items               0
item_quantity                        0
item_refund_quantity                 0
user_ltv_revenue                     0
ecommerce_purchase_revenue_in_usd    0
ecommerce_tax_value_in_usd           0
item_price_in_usd                    0
item_revenue_in_usd                  0
item_refund_in_usd                   0
dtype: int64

In [None]:
df[num_cols].dtypes

event_params_engagement_time_msec      Int64
event_params_ga_session_number         Int64
ecommerce_total_item_quantity          Int64
total_return_item_quantity             Int64
ecommerce_unique_items                 Int64
item_quantity                          Int64
item_refund_quantity                   Int64
user_ltv_revenue                     float64
ecommerce_purchase_revenue_in_usd    float64
ecommerce_tax_value_in_usd           float64
item_price_in_usd                    float64
item_revenue_in_usd                  float64
item_refund_in_usd                   float64
dtype: object

In [None]:
df[num_cols].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
event_params_engagement_time_msec,2002010.0,10536.342,42128.63,0.0,271.0,4365.0,11406.0,36399379.0
event_params_ga_session_number,2002010.0,3.625,4.303,1.0,1.0,2.0,5.0,154.0
ecommerce_total_item_quantity,2002010.0,0.49,3.013,0.0,0.0,0.0,0.0,400.0
total_return_item_quantity,2002010.0,0.013,0.641,0.0,0.0,0.0,0.0,145.0
ecommerce_unique_items,2002010.0,7.808,5.598,0.0,0.0,12.0,12.0,30.0
item_quantity,2002010.0,0.071,0.671,0.0,0.0,0.0,0.0,160.0
item_refund_quantity,2002010.0,0.002,0.116,0.0,0.0,0.0,0.0,50.0
user_ltv_revenue,2002010.0,21.607,72.833,0.0,0.0,0.0,0.0,1530.0
ecommerce_purchase_revenue_in_usd,2002010.0,0.81,13.481,0.0,0.0,0.0,0.0,1530.0
ecommerce_tax_value_in_usd,2002010.0,0.065,1.105,0.0,0.0,0.0,0.0,140.0


## Categorical columns
Correct any formatting issues and standardize any missing values.

In [None]:
# Free-text / categorical features that are inspected and cleaned below.
cat_cols = [
    'event_name',
    'event_params_page_title',
    'device_category',
    'device_mobile_brand_name',
    'device_mobile_model_name',
    'device_web_info_browser',
    'geo_country',
    'traffic_source_medium',
    'item_category',
]

In [None]:
# Preview up to 50 distinct values of each categorical column.
for cat_col in cat_cols:
    print(cat_col, df[cat_col].unique()[:50], '', sep='\n')

event_name
['scroll' 'view_promotion' 'view_item' 'user_engagement' 'session_start'
 'begin_checkout' 'page_view' 'add_to_cart' 'select_item' 'purchase'
 'add_payment_info' 'add_shipping_info' 'click' 'view_search_results'
 'select_promotion' 'first_visit']

event_params_page_title
['Home' 'Socks | Apparel | Google Merchandise Store'
 'Apparel | Google Merchandise Store'
 'The Google Merchandise Store - Log In' 'Sale | Google Merchandise Store'
 'Shop by Brand | Google Merchandise Store' 'Lifestyle'
 'Checkout Your Information' 'Payment Method' 'Shopping Cart'
 "Men's / Unisex | Apparel | Google Merchandise Store"
 'Womens | Apparel | Google Merchandise Store'
 'Google Unisex Pride Eco-Tee Black' 'Store search results'
 'Stickers | Stationery | Google Merchandise Store'
 'Checkout Confirmation' 'New | Google Merchandise Store'
 'Google | Shop by Brand | Google Merchandise Store' 'Google Totepak'
 'Drinkware | Lifestyle | Google Merchandise Store'
 'Small Goods | Lifestyle | Google Merc

Missing values were already standardized upstream, so no further null replacement is done here. The nulls that remain (counted below) are kept as they are and treated as their own level during encoding.

In [None]:
df[cat_cols].isna().sum()

event_name                       0
event_params_page_title       1514
device_category                  0
device_mobile_brand_name         0
device_mobile_model_name         0
device_web_info_browser          0
geo_country                  13754
traffic_source_medium       732408
item_category               646422
dtype: int64

In [None]:
df[cat_cols].dtypes

event_name                  object
event_params_page_title     object
device_category             object
device_mobile_brand_name    object
device_mobile_model_name    object
device_web_info_browser     object
geo_country                 object
traffic_source_medium       object
item_category               object
dtype: object

### Parse `event_params_page_title` and `item_category`

In [None]:
def _strip_store_suffix(title):
    """Remove every ' | Google Merchandise Store' occurrence from a page title; None is returned as-is."""
    if title is None:
        return title
    return title.replace(' | Google Merchandise Store', '')


df['event_params_page_title'] = df['event_params_page_title'].apply(_strip_store_suffix)

In [None]:
def _strip_home_prefix(category):
    """Remove 'Home/' from a category path and trim leading/trailing slashes; None is returned as-is."""
    if category is None:
        return category
    without_home = category.replace('Home/', '')
    return without_home.strip('/')


df['item_category'] = df['item_category'].apply(_strip_home_prefix)

In [None]:
# Words that mark a page title as a product/brand page rather than a site
# section; titles containing any of them get no parent page (see the
# `event_params_parent_page` derivation, which intersects title words with this list).
# The duplicate 'Android' entry has been removed.
exclude_words = ['#IamRemarkable',
                 'Android',
                 'BLM',
                 'Google',
                 'Friends',
                 'Supernatural',
                 'Unisex',
                 'YouTube',
                 'Tote',
                 'Sticker',
                 'Food',
                 'Straw/',
                 'Tee']

In [None]:
def _parent_page(title):
    """Derive the parent page from a cleaned page title.

    - 'A | B ...' titles return the second ' | '-separated segment.
    - Titles without ' | ' that contain any word from `exclude_words` return None.
    - Every other title (including None) is returned unchanged.
    """
    if title is None:
        return title
    segments = title.split(' | ')
    if len(segments) > 1:
        return segments[1]
    if set(title.split(' ')).intersection(exclude_words):
        return None
    return title


df['event_params_parent_page'] = df['event_params_page_title'].apply(_parent_page)

In [None]:
# Fold the "Kids' Apparel" parent page into the general 'Apparel' page.
parent_page_renames = {"Kids' Apparel": 'Apparel'}
df['event_params_parent_page'] = df['event_params_parent_page'].replace(parent_page_renames)

In [None]:
df['event_params_parent_page'].unique()

array(['Home', 'Apparel', None, 'Sale', 'Shop by Brand', 'Lifestyle',
       'Checkout Your Information', 'Payment Method', 'Shopping Cart',
       'Store search results', 'Stationery', 'Checkout Confirmation',
       'New', 'Campus Collection', 'Eco-Friendly', 'Checkout Review',
       'Black Lives Matter', 'Your Wishlist',
       'Frequently Asked Questions', 'Page Unavailable', 'Return Policy',
       'Shipping Information', 'Drinkware', 'Accessories', 'Gift Cards',
       'Bags', 'Office', 'Electronics', 'Terms of Use'], dtype=object)

In [None]:
def _child_page(title):
    """Return the first ' | '-separated segment of a multi-segment title, otherwise None."""
    if title is None:
        return None
    segments = title.split(' | ')
    if len(segments) > 1:
        return segments[0]
    return None


df['event_params_child_page'] = df['event_params_page_title'].apply(_child_page)

In [None]:
df['event_params_child_page'].unique()

array([None, 'Socks', "Men's / Unisex", 'Womens', 'Stickers', 'Google',
       'Drinkware', 'Small Goods', 'Notebooks', 'Hats', 'YouTube', 'Bags',
       'Kids', 'Android', 'Google Cloud', 'Writing', 'Water Bottles',
       'Infant', '#IamRemarkable', "Men's T-Shirts", 'Mugs & Tumblers',
       'Youth', 'More Bags', 'Toddler', 'Backpacks', 'Shopping & Totes',
       "Women's T-Shirts", 'Audio'], dtype=object)

In [None]:
# Substring substitutions applied, in this order, to every item category.
_item_category_renames = [
    ("Kid's-", ''),
    ('Kids-', ''),
    ('Notebooks & Journals', 'Notebooks'),
    ('Shopping and Totes', 'Shopping & Totes'),
    ("Women's", 'Womens'),
    ('Writing Instruments', 'Writing'),
]


def _normalize_item_category(category):
    """Apply the rename table above to a category path; None is returned as-is."""
    if category is None:
        return category
    for old_text, new_text in _item_category_renames:
        category = category.replace(old_text, new_text)
    return category


df['item_category'] = df['item_category'].apply(_normalize_item_category)

In [None]:
def _first_pass_parent(category):
    """First-pass parent guess from a '/'-separated category path.

    - Two segments (and not the literal "Men's / Unisex"): the first segment, stripped.
    - Three segments: everything after the first segment, re-joined with '/'.
    - Anything else (including None): returned unchanged.
    """
    if category is None:
        return category
    segments = category.split('/')
    if len(segments) == 2 and category != "Men's / Unisex":
        return segments[0].strip()
    if len(segments) == 3:
        return '/'.join(segments[1:])
    return category


df['item_parent_category'] = df['item_category'].apply(_first_pass_parent)

In [None]:
df['item_parent_category'].unique()

array([None, 'Sale', "Men's / Unisex", 'Shop by Brand', 'Apparel',
       'Lifestyle', 'Womens', 'Stationery', 'New', 'Google',
       'Campus Collection', 'Office', 'Clearance', 'Eco-Friendly', 'Bags',
       'Accessories', 'Small Goods', 'Drinkware', 'Uncategorized Items',
       'Kids', 'Electronics Accessories', 'Notebooks', 'Writing',
       'Water Bottles', 'Infant', 'Fun', 'Black Lives Matter',
       "Men's T-Shirts", 'Hats', 'Android', 'Mugs & Tumblers', 'Youth',
       'More Bags', 'Toddler', 'YouTube', 'Gift Cards', 'Backpacks',
       'Shopping & Totes'], dtype=object)

In [None]:
def _first_pass_child(category):
    """First-pass child guess from a '/'-separated category path.

    - Two segments (and not the literal "Men's / Unisex"): the second segment.
    - Three segments: everything after the first segment, re-joined with '/'.
    - Anything else (including None): returned unchanged.
    """
    if category is None:
        return category
    segments = category.split('/')
    if len(segments) == 2 and category != "Men's / Unisex":
        return segments[1]
    if len(segments) == 3:
        return '/'.join(segments[1:])
    return category


df['item_child_category'] = df['item_category'].apply(_first_pass_child)

In [None]:
df['item_child_category'].unique()

array([None, 'Sale', "Men's / Unisex", 'Shop by Brand', 'Apparel',
       'Lifestyle', 'Womens', 'Stickers', 'New', 'Google',
       'Campus Collection', 'Small Goods', 'Office', 'Notebooks', 'Socks',
       'Drinkware', 'Hats', 'Stationery', 'YouTube', 'Clearance', 'Bags',
       'Eco-Friendly', 'Kids', 'Accessories', 'Uncategorized Items',
       'Google Cloud', 'Android', 'Electronics Accessories', 'Writing',
       'Water Bottles', 'Infant', 'Fun', 'Black Lives Matter',
       "Men's T-Shirts", '#IamRemarkable', 'Mugs & Tumblers', 'Youth',
       'More Bags', 'Toddler', 'Gift Cards', 'Backpacks',
       'Shopping & Totes'], dtype=object)

**Determine parent categories, child categories, and child subcategories**

In [None]:
# Starting assumption: every distinct parent page is a parent category.
# Pages that are really child categories are removed in the next cell.
parent_categories = df['event_params_parent_page'].unique().tolist()

In [None]:
# Pages that appear as a "parent" in page titles but are really child categories.
not_parent_pages = {
    'Accessories',
    'Bags',
    'Black Lives Matter',
    'Campus Collection',
    'Drinkware',
    'Eco-Friendly',
    'Office',
}

# Filter instead of calling list.remove() repeatedly: the cell can be re-run
# safely and does not raise ValueError if one of the pages is absent.
parent_categories = [page for page in parent_categories if page not in not_parent_pages]
parent_categories

['Home',
 'Apparel',
 None,
 'Sale',
 'Shop by Brand',
 'Lifestyle',
 'Checkout Your Information',
 'Payment Method',
 'Shopping Cart',
 'Store search results',
 'Stationery',
 'Checkout Confirmation',
 'New',
 'Checkout Review',
 'Your Wishlist',
 'Frequently Asked Questions',
 'Page Unavailable',
 'Return Policy',
 'Shipping Information',
 'Gift Cards',
 'Electronics',
 'Terms of Use']

In [None]:
# A child page whose parent page is not a recognised parent category is assumed
# to be a child subcategory.
has_known_parent = df['event_params_parent_page'].isin(parent_categories)
child_subcategories = df.loc[~has_known_parent, 'event_params_child_page'].drop_duplicates().tolist()
# Subcategories added by hand.
child_subcategories.extend(["Men's T-Shirts", 'Infant', 'Youth', 'Toddler', "Women's T-Shirts"])
child_subcategories

[None,
 'Water Bottles',
 'Mugs & Tumblers',
 'More Bags',
 'Backpacks',
 'Shopping & Totes',
 "Men's T-Shirts",
 'Infant',
 'Youth',
 'Toddler',
 "Women's T-Shirts"]

In [None]:
# Every child page that is not a child subcategory is assumed to be a child category.
# Built as an ordered filter (first appearance in the data) rather than a set
# difference, so the resulting list is the same on every run.
child_categories = [
    page
    for page in df['event_params_child_page'].unique()
    if page not in child_subcategories
]
child_categories

['Drinkware',
 'Android',
 'YouTube',
 'Small Goods',
 'Womens',
 "Men's / Unisex",
 'Kids',
 'Google',
 'Writing',
 'Bags',
 'Notebooks',
 'Google Cloud',
 'Hats',
 '#IamRemarkable',
 'Stickers',
 'Socks',
 'Audio']

Seeing the values that are in `item_category`, fill out more parent, child, and child subcategories

In [None]:
# uncategorized "parent" categories
set(df['item_parent_category'].unique()).difference(parent_categories).difference(child_categories).difference(child_subcategories)

{'Accessories',
 'Black Lives Matter',
 'Campus Collection',
 'Clearance',
 'Eco-Friendly',
 'Electronics Accessories',
 'Fun',
 'Office',
 'Uncategorized Items'}

In [None]:
# 'Clearance' only shows up in item categories; treat it as a parent category.
parent_categories.append('Clearance')

In [None]:
# Remaining uncategorized item "parents" that are treated as child categories.
child_categories.extend([
    'Accessories',
    'Black Lives Matter',
    'Campus Collection',
    'Eco-Friendly',
    'Fun',
    'Uncategorized Items',
    'Office',
])

In [None]:
# 'Electronics Accessories' is treated as a child subcategory.
child_subcategories.append('Electronics Accessories')

In [None]:
# uncategorized "child" categories, None
set(df['item_child_category'].unique()).difference(parent_categories).difference(child_categories).difference(child_subcategories)

set()

Now that the parent categories, child categories, and child subcategories are determined, correctly parse the information in the `item_category` column.

In [None]:
def _split_item_category(category):
    """Break a category path into the list of labels used for category lookup.

    - None stays None.
    - The literal "Men's / Unisex" is kept whole as a one-element list.
    - Paths with fewer than three '/'-separated segments are split on '/'.
    - Longer paths drop the first segment and keep the rest, re-joined, as one label.
    """
    if category is None:
        return category
    if category == "Men's / Unisex":
        return [category]
    segments = category.split('/')
    if len(segments) < 3:
        return segments
    return ['/'.join(segments[1:])]


df['split_item_category'] = df['item_category'].apply(_split_item_category)

In [None]:
# Parent category = the first label of the split path that is a known parent
# category, or None when there is none. Taking the first match in path order
# (rather than element [0] of a set intersection) keeps the result the same on
# every run when more than one label matches.
df['item_parent_category'] = df['split_item_category'].apply(
    lambda labels: next((label for label in labels if label in parent_categories), None)
    if labels is not None
    else None
)

In [None]:
# Child category = the first label of the split path that is a known child
# category, or None when there is none. Taking the first match in path order
# (rather than element [0] of a set intersection) keeps the result the same on
# every run when more than one label matches.
df['item_child_category'] = df['split_item_category'].apply(
    lambda labels: next((label for label in labels if label in child_categories), None)
    if labels is not None
    else None
)

In [None]:
# Child subcategory = the first label of the split path that is a known child
# subcategory, or None when there is none. Taking the first match in path order
# (rather than element [0] of a set intersection) keeps the result the same on
# every run when more than one label matches.
df['item_child_subcategory'] = df['split_item_category'].apply(
    lambda labels: next((label for label in labels if label in child_subcategories), None)
    if labels is not None
    else None
)

In [None]:
df[['item_parent_category', 'item_child_category', 'item_child_subcategory']].drop_duplicates().sort_values(['item_child_category', 'item_parent_category'])[:25]


Unnamed: 0,item_parent_category,item_child_category,item_child_subcategory
56389,Shop by Brand,#IamRemarkable,
709,,Accessories,
4086,Shop by Brand,Android,
59092,,Android,
563,Lifestyle,Bags,
613,,Bags,
47648,,Black Lives Matter,
83,,Campus Collection,
293,Lifestyle,Drinkware,
974,,Drinkware,


In [None]:
df[['item_parent_category', 'item_child_category', 'item_child_subcategory']].sort_values(['item_child_category', 'item_parent_category']).drop_duplicates()[25:]


Unnamed: 0,item_parent_category,item_child_category,item_child_subcategory
208,Apparel,Socks,
80,Stationery,Stickers,
1181,,Uncategorized Items,
60,Apparel,Womens,
59,,Womens,
12526,Stationery,Writing,
12564,,Writing,
486,Shop by Brand,YouTube,
195990,,YouTube,
21,Apparel,,


In [None]:
# Fix child categories that are missing their upstream parent category.
# "Men's / Unisex" is a child category, so it is matched on `item_child_category`
# like the others; the previous filter on `item_child_subcategory` never matched.
parent_by_child_category = {
    'Accessories': 'Apparel',
    'Black Lives Matter': 'Collections',
    'Campus Collection': 'Collections',
    'Eco-Friendly': 'Lifestyle',
    'Fun': 'Lifestyle',
    "Men's / Unisex": 'Apparel',
    'Office': 'Lifestyle',
    'Uncategorized Items': 'Lifestyle',
}

for child_category, parent_category in parent_by_child_category.items():
    df.loc[df['item_child_category'] == child_category, 'item_parent_category'] = parent_category

In [None]:
# Fill in the upstream categories of each child subcategory:
# subcategory -> (parent category, child category).
upstream_by_subcategory = {
    "Men's T-Shirts": ('Apparel', "Men's / Unisex"),
    'Mugs & Tumblers': ('Lifestyle', 'Drinkware'),
    'Backpacks': ('Lifestyle', 'Bags'),
    'Water Bottles': ('Lifestyle', 'Drinkware'),
    'More Bags': ('Lifestyle', 'Bags'),
    'Electronics Accessories': ('Apparel', 'Accessories'),
    'Shopping & Totes': ('Lifestyle', 'Bags'),
    'Infant': ('Apparel', 'Kids'),
    'Youth': ('Apparel', 'Kids'),
    'Toddler': ('Apparel', 'Kids'),
}

for subcategory, (parent_category, child_category) in upstream_by_subcategory.items():
    in_subcategory = df['item_child_subcategory'] == subcategory
    df.loc[in_subcategory, 'item_parent_category'] = parent_category
    df.loc[in_subcategory, 'item_child_category'] = child_category

In [None]:
# Distinct child categories that still have no parent category.
parent_is_missing = df['item_parent_category'].isna()
child_is_known = df['item_child_category'].notna()
missing_categorizatons = (
    df.loc[parent_is_missing & child_is_known, ['item_parent_category', 'item_child_category']]
    .drop_duplicates()
)
missing_categorizatons

Unnamed: 0,item_parent_category,item_child_category
6,,Men's / Unisex
59,,Womens
82,,Google
613,,Bags
890,,Small Goods
974,,Drinkware
1687,,Kids
10537,,Notebooks
12564,,Writing
58701,,Hats


In [None]:
# Distinct (parent, child) pairs where both levels are known; used as the lookup
# for filling in missing parents.
parent_is_known = df['item_parent_category'].notna()
child_is_known = df['item_child_category'].notna()
correct_categorizations = (
    df.loc[parent_is_known & child_is_known, ['item_parent_category', 'item_child_category']]
    .drop_duplicates()
)


In [None]:
# For each child category that lacks a parent, copy the first parent recorded for
# that child in `correct_categorizations` onto every row with that child.
# Children with no known parent are skipped and reported, instead of failing
# with an IndexError on an empty lookup.
unresolved_children = []
for child_category in missing_categorizatons['item_child_category']:
    known_parents = correct_categorizations.loc[
        correct_categorizations['item_child_category'] == child_category,
        'item_parent_category',
    ]
    if known_parents.empty:
        unresolved_children.append(child_category)
        continue
    df.loc[df['item_child_category'] == child_category, 'item_parent_category'] = known_parents.iloc[0]

if unresolved_children:
    print(f'No parent category found for: {unresolved_children}')


In [None]:
df[['item_parent_category', 'item_child_category', 'item_child_subcategory']].drop_duplicates().sort_values(['item_child_category', 'item_parent_category'])


Unnamed: 0,item_parent_category,item_child_category,item_child_subcategory
56389,Shop by Brand,#IamRemarkable,
709,Apparel,Accessories,
5585,Apparel,Accessories,Electronics Accessories
4086,Shop by Brand,Android,
563,Lifestyle,Bags,
169873,Lifestyle,Bags,More Bags
634822,Lifestyle,Bags,Backpacks
935769,Lifestyle,Bags,Shopping & Totes
47648,Collections,Black Lives Matter,
83,Collections,Campus Collection,


# Encode data
In order to use non-numeric features in machine learning, they have to be encoded to numeric values. <br>
This includes any categorical or boolean features.

## Boolean features

In [None]:
# Collect every column whose dtype is bool.
bool_cols = df.select_dtypes(include='bool').columns
bool_cols

Index(['event_params_session_engaged', 'item_promotion_name'], dtype='object')

In [None]:
# Encode the boolean features as integers (False -> 0, True -> 1) and preview them.
bool_as_int = df[bool_cols].astype(int)
df[bool_cols] = bool_as_int
df[bool_cols].head()

Unnamed: 0,event_params_session_engaged,item_promotion_name
0,0,0
1,0,1
2,1,0
3,1,0
4,1,0


## Categorical features
I'll use one hot encoding for categorical features here.

In [None]:
# Collect every object-dtype column as a candidate categorical feature.
cat_cols = df.select_dtypes(include='object').columns
cat_cols

Index(['event_name', 'event_params_ga_session_id', 'event_params_page_title',
       'user_pseudo_id', 'device_category', 'device_mobile_brand_name',
       'device_mobile_model_name', 'device_web_info_browser', 'geo_country',
       'traffic_source_medium', 'ecommerce_transaction_id', 'item_category',
       'event_params_parent_page', 'event_params_child_page',
       'item_parent_category', 'item_child_category', 'split_item_category',
       'item_child_subcategory'],
      dtype='object')

In [None]:
# Share of rows per level for each categorical feature that will be encoded.
# Columns whose name contains one of these fragments (IDs, raw page titles and
# raw/split item categories) are skipped.
excluded_name_fragments = ('_id', '_page_title', 'item_category')

value_counts = {}
for cat_col in cat_cols:
    if any(fragment in cat_col for fragment in excluded_name_fragments):
        continue
    level_shares = df[cat_col].value_counts(normalize=True)
    value_counts[cat_col] = level_shares
    print(cat_col)
    print(level_shares)
    print()

event_name
view_item             0.453
add_to_cart           0.150
page_view             0.114
user_engagement       0.105
scroll                0.056
select_item           0.053
begin_checkout        0.030
view_promotion        0.008
purchase              0.008
session_start         0.008
add_shipping_info     0.006
add_payment_info      0.006
first_visit           0.002
view_search_results   0.001
select_promotion      0.000
click                 0.000
Name: event_name, dtype: float64

device_category
desktop   0.574
mobile    0.403
tablet    0.023
Name: device_category, dtype: float64

device_mobile_brand_name
Apple       0.423
Google      0.353
Samsung     0.079
Other       0.077
Xiaomi      0.022
Microsoft   0.021
Huawei      0.013
Mozilla     0.012
Name: device_mobile_brand_name, dtype: float64

device_mobile_model_name
Chrome       0.284
iPhone       0.214
Other        0.211
Safari       0.193
ChromeBook   0.052
Edge         0.019
iPad         0.013
Firefox      0.012
Pixel 4 XL

In [None]:
# For every categorical column, record how each original level is relabelled
# before one-hot encoding (original level -> encoded label).
cat_dict = {}

# Minimum share of rows a level needs in order to keep its own label; rarer
# levels are pooled into 'Other'. Columns not listed here use the default share.
min_share_by_col = {'geo_country': 0.02, 'item_child_category': 0.05}
default_min_share = 0.01

# Characters removed from kept labels: apostrophe, hyphen, slash, ampersand, space.
label_cleanup = str.maketrans('', '', "'-/& ")

for key, value in value_counts.items():
    if len(value) <= 6:
        # 6 or fewer distinct levels: keep every level, only removing spaces.
        cat_dict[key] = {
            level: (level.replace(' ', '') if level is not None else level)
            for level in value.index
        }
    else:
        # More than 6 distinct levels: keep the levels that reach the minimum
        # share and pool the rest into 'Other'.
        # None is never relabelled, so missing values stay their own level.
        min_share = min_share_by_col.get(key, default_min_share)
        keep_levels = value[value >= min_share]
        drop_levels = value[value < min_share]

        cat_dict[key] = {
            level: (level.translate(label_cleanup) if level is not None else level)
            for level in keep_levels.index
        }
        cat_dict[key].update({
            level: ('Other' if level is not None else level)
            for level in drop_levels.index
        })

In [None]:
# Relabel each categorical column with its level mapping from `cat_dict`.
for cat_col, level_mapping in cat_dict.items():
    df[cat_col] = df[cat_col].replace(level_mapping)

In [None]:
# Refresh the per-level shares after relabelling and print the raw row counts.
skipped_name_fragments = ('_id', '_page_title', 'item_category')

for cat_col in cat_cols:
    if any(fragment in cat_col for fragment in skipped_name_fragments):
        continue
    value_counts[cat_col] = df[cat_col].value_counts(normalize=True)
    print(cat_col)
    print(df[cat_col].value_counts())
    print()

event_name
view_item          906923
add_to_cart        299477
page_view          228328
user_engagement    209469
scroll             112920
select_item        106009
Other               78159
begin_checkout      60725
Name: event_name, dtype: int64

device_category
desktop    1148852
mobile      807227
tablet       45931
Name: device_category, dtype: int64

device_mobile_brand_name
Apple        847013
Google       706425
Samsung      157394
Other        154113
Xiaomi        43507
Microsoft     42176
Huawei        26702
Mozilla       24680
Name: device_mobile_brand_name, dtype: int64

device_mobile_model_name
Chrome        569218
iPhone        428185
Other         426524
Safari        385527
ChromeBook    103773
Edge           37084
iPad           27019
Firefox        24680
Name: device_mobile_model_name, dtype: int64

device_web_info_browser
Chrome            1371104
Safari             472330
Other               62902
Edge                37678
Firefox             34308
AndroidWebview 

In [None]:
# One-hot encode every column that has a level lookup in cat_dict.
one_hot_dict = {}

for col in cat_dict.keys():
    dummies = pd.get_dummies(df[col], prefix=col)
    levels = df[col].unique()  # computed once, used for both checks below

    # Drop one indicator column to avoid multicollinearity issues:
    #   - if the column contains None, drop nothing (pd.get_dummies already
    #     ignores None, so it acts as the left-out level)
    #   - otherwise drop 'Other' when it exists
    #   - otherwise drop the smallest category (last entry of value_counts)
    if dummies.shape[1] > 1 and None not in levels:
        if 'Other' in levels:
            drop_level = 'Other'
        else:
            drop_level = value_counts[col].index[-1].replace(' ', '')
        print(drop_level)
        dummies = dummies.drop(columns=f'{col}_{drop_level}')

    one_hot_dict[col] = dummies

Other
tablet
Other
Other
Other


In [None]:
# Names of the columns still stored with object dtype.
cat_cols = df.select_dtypes(include=['object']).columns
cat_cols

Index(['event_name', 'event_params_ga_session_id', 'event_params_page_title',
       'user_pseudo_id', 'device_category', 'device_mobile_brand_name',
       'device_mobile_model_name', 'device_web_info_browser', 'geo_country',
       'traffic_source_medium', 'ecommerce_transaction_id', 'item_category',
       'event_params_parent_page', 'event_params_child_page',
       'item_parent_category', 'item_child_category', 'split_item_category',
       'item_child_subcategory'],
      dtype='object')

In [None]:
# Columns carried over as-is (not one-hot encoded), in output order.
df_non_categorical = df.loc[:, [
    'event_date',
    'event_params_engagement_time_msec',
    'event_params_ga_session_id',
    'event_params_ga_session_number',
    'event_params_session_engaged',
    'user_pseudo_id',
    'user_ltv_revenue',
    'ecommerce_total_item_quantity',
    'total_return_item_quantity',
    'ecommerce_purchase_revenue_in_usd',
    'ecommerce_tax_value_in_usd',
    'ecommerce_unique_items',
    'ecommerce_transaction_id',
    'item_price_in_usd',
    'item_promotion_name',
    'item_quantity',
    'item_refund_in_usd',
    'item_refund_quantity',
    'item_revenue_in_usd',
]]

In [None]:
# Number of columns kept without one-hot encoding. Derived from the frame
# itself so the count cannot drift from the selection that built it.
len(df_non_categorical.columns)

19

In [None]:
# Place the untouched columns and every one-hot block side by side.
df_final = pd.concat([df_non_categorical, *one_hot_dict.values()], axis=1)

In [None]:
# Display df_final: the non-categorical columns joined with the one-hot encoded columns.
df_final

Unnamed: 0,event_date,event_params_engagement_time_msec,event_params_ga_session_id,event_params_ga_session_number,event_params_session_engaged,user_pseudo_id,user_ltv_revenue,ecommerce_total_item_quantity,total_return_item_quantity,ecommerce_purchase_revenue_in_usd,ecommerce_tax_value_in_usd,ecommerce_unique_items,ecommerce_transaction_id,item_price_in_usd,item_promotion_name,item_quantity,item_refund_in_usd,item_refund_quantity,item_revenue_in_usd,event_name_add_to_cart,event_name_begin_checkout,event_name_page_view,event_name_scroll,event_name_select_item,event_name_user_engagement,event_name_view_item,device_category_desktop,device_category_mobile,device_mobile_brand_name_Apple,device_mobile_brand_name_Google,device_mobile_brand_name_Huawei,device_mobile_brand_name_Microsoft,device_mobile_brand_name_Mozilla,device_mobile_brand_name_Samsung,device_mobile_brand_name_Xiaomi,device_mobile_model_name_Chrome,device_mobile_model_name_ChromeBook,device_mobile_model_name_Edge,device_mobile_model_name_Firefox,device_mobile_model_name_Safari,device_mobile_model_name_iPad,device_mobile_model_name_iPhone,device_web_info_browser_AndroidWebview,device_web_info_browser_Chrome,device_web_info_browser_Edge,device_web_info_browser_Firefox,device_web_info_browser_Safari,geo_country_Canada,geo_country_France,geo_country_India,geo_country_Other,geo_country_Spain,geo_country_UnitedKingdom,geo_country_UnitedStates,traffic_source_medium_Other,traffic_source_medium_cpc,traffic_source_medium_organic,traffic_source_medium_referral,event_params_parent_page_Apparel,event_params_parent_page_CampusCollection,event_params_parent_page_CheckoutConfirmation,event_params_parent_page_CheckoutYourInformation,event_params_parent_page_EcoFriendly,event_params_parent_page_Home,event_params_parent_page_Lifestyle,event_params_parent_page_New,event_params_parent_page_Other,event_params_parent_page_PaymentMethod,event_params_parent_page_Sale,event_params_parent_page_ShopbyBrand,event_params_parent_page_Shopp
ingCart,event_params_parent_page_Stationery,event_params_child_page_Bags,event_params_child_page_Drinkware,event_params_child_page_Google,event_params_child_page_Hats,event_params_child_page_Kids,event_params_child_page_MensUnisex,event_params_child_page_Notebooks,event_params_child_page_Other,event_params_child_page_SmallGoods,event_params_child_page_Socks,event_params_child_page_Stickers,event_params_child_page_Womens,event_params_child_page_Writing,event_params_child_page_YouTube,item_parent_category_Apparel,item_parent_category_Collections,item_parent_category_Lifestyle,item_parent_category_New,item_parent_category_Other,item_parent_category_Sale,item_parent_category_ShopbyBrand,item_parent_category_Stationery,item_child_category_Bags,item_child_category_CampusCollection,item_child_category_Drinkware,item_child_category_Google,item_child_category_Kids,item_child_category_MensUnisex,item_child_category_Other,item_child_category_SmallGoods,item_child_category_Womens,item_child_subcategory_Backpacks,item_child_subcategory_ElectronicsAccessories,item_child_subcategory_Infant,item_child_subcategory_MensTShirts,item_child_subcategory_MugsTumblers,item_child_subcategory_Other,item_child_subcategory_WaterBottles
0,2020-11-05,0,9520224276,15,0,6430802.0723483089,0.000,0,0,0.000,0.000,0,,0.000,0,0,0.000,0,0.000,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2020-11-16,0,1338488416,15,0,637884255.3997903180,0.000,0,0,0.000,0.000,1,,0.000,1,0,0.000,0,0.000,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2020-11-28,18850,7416885767,21,1,87116489.5307133653,0.000,0,0,0.000,0.000,12,,0.000,0,0,0.000,0,0.000,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2020-11-28,30247,7416885767,21,1,87116489.5307133653,0.000,0,0,0.000,0.000,12,,0.000,0,0,0.000,0,0.000,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2020-11-28,30247,7416885767,21,1,87116489.5307133653,0.000,0,0,0.000,0.000,12,,0.000,0,0,0.000,0,0.000,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2002005,2021-01-12,5235,1044126687,14,1,27333015.4039048206,105.000,0,0,0.000,0.000,0,,0.000,0,0,0.000,0,0.000,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2002006,2021-01-12,572,1044126687,14,1,27333015.4039048206,105.000,0,0,0.000,0.000,0,,0.000,0,0,0.000,0,0.000,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2002007,2021-01-12,463,1044126687,14,1,27333015.4039048206,105.000,0,0,0.000,0.000,0,,0.000,0,0,0.000,0,0.000,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2002008,2021-01-12,6,1044126687,14,1,27333015.4039048206,105.000,0,0,0.000,0.000,0,,0.000,0,0,0.000,0,0.000,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# Write Data Back to BigQuery

In [None]:
# Create an explicit table schema to prevent type errors when uploading data.
# Each entry is a {'name': <column>, 'type': <BigQuery type>} dict.
schema = []

for col, dtype in df_final.dtypes.items():
    if dtype == object:
        typ = 'STRING'
    elif dtype == bool:
        typ = 'BOOLEAN'
    elif dtype == 'datetime64[ns]':
        typ = 'DATETIME'
    elif dtype in (int, 'Int64', 'uint8'):
        typ = 'INTEGER'
    elif dtype == float:
        typ = 'FLOAT'
    else:
        # No mapping for this dtype. Previously `typ` kept the value from the
        # preceding column (or was undefined on the first one), so the column
        # was declared with the wrong type. Leave it out of the schema instead:
        # pandas-gbq infers the type of any column missing from table_schema.
        print(f'{col}: no BigQuery type mapped for dtype {dtype}; type will be inferred on upload')
        continue

    schema.append({'name': col, 'type': typ})

In [None]:
# Inspect the generated list of {'name', 'type'} field definitions before uploading.
schema

[{'name': 'event_date', 'type': 'DATETIME'},
 {'name': 'event_params_engagement_time_msec', 'type': 'INTEGER'},
 {'name': 'event_params_ga_session_id', 'type': 'STRING'},
 {'name': 'event_params_ga_session_number', 'type': 'INTEGER'},
 {'name': 'event_params_session_engaged', 'type': 'INTEGER'},
 {'name': 'user_pseudo_id', 'type': 'STRING'},
 {'name': 'user_ltv_revenue', 'type': 'FLOAT'},
 {'name': 'ecommerce_total_item_quantity', 'type': 'INTEGER'},
 {'name': 'total_return_item_quantity', 'type': 'INTEGER'},
 {'name': 'ecommerce_purchase_revenue_in_usd', 'type': 'FLOAT'},
 {'name': 'ecommerce_tax_value_in_usd', 'type': 'FLOAT'},
 {'name': 'ecommerce_unique_items', 'type': 'INTEGER'},
 {'name': 'ecommerce_transaction_id', 'type': 'STRING'},
 {'name': 'item_price_in_usd', 'type': 'FLOAT'},
 {'name': 'item_promotion_name', 'type': 'INTEGER'},
 {'name': 'item_quantity', 'type': 'INTEGER'},
 {'name': 'item_refund_in_usd', 'type': 'FLOAT'},
 {'name': 'item_refund_quantity', 'type': 'INTEGER

In [None]:
# Upload the encoded frame to BigQuery, replacing the destination table if it
# already exists. Calls pandas_gbq directly because DataFrame.to_gbq is
# deprecated in recent pandas releases and only forwards to this function.
pandas_gbq.to_gbq(
    df_final,
    f'{project_name}.return_prediction_ga4.return_prediction_ga4_1c',
    project_id=project_name,
    if_exists='replace',
    location=region,
    chunksize=100_000,
    table_schema=schema,
)

100%|██████████| 1/1 [00:00<00:00, 4495.50it/s]
