# Dataset Cleaning 

The main objectives of this notebook are: 
- Import dataset 
- Get familiar with the data 
- Determine if there are data quality issues 
- Resolve any data quality issues 

In [169]:
import polars as pl 
import polars.selectors as cs 
from google.cloud import bigquery
import numpy as np 

In [170]:
pl.Config.set_tbl_width_chars(200)

polars.config.Config

In [171]:
# Create client object
client = bigquery.Client()

In [172]:
QUERY = ("SELECT `date`, `visitStartTime`, `fullVisitorId`, `geoNetwork`.`subContinent`, `geoNetwork`.`country`,"
        "`geoNetwork`.`city`,`device`.`browser`, `device`.`operatingSystem`,`device`.`deviceCategory`,"
        "`trafficSource`.`source`, `h`.`item`.`transactionId`,`hp`.`v2ProductName`,`hp`.`v2ProductCategory` "
        "FROM `bigquery-public-data.google_analytics_sample.ga_sessions_20170731`, UNNEST(hits) AS h,"
        "UNNEST(h.product) AS hp;")

In [173]:
run_query = client.query(QUERY)

In [174]:
df = pl.from_arrow(run_query.to_arrow())

In [175]:
df.head()

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,i64,str,str,str,str,str,str,str,str,str,str,str
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Lunch Bag""","""Home/Bags/More Bags/"""
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Electronics Accessory Pouch""","""Home/Bags/More Bags/"""
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Sport Bag""","""Home/Bags/More Bags/"""
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Slim Utility Travel Bag""","""Home/Bags/More Bags/"""
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Suitcase Organizer Cubes""","""Home/Bags/More Bags/"""


In [176]:
df.shape

(49512, 13)

In [177]:
df.columns

['date',
 'visitStartTime',
 'fullVisitorId',
 'subContinent',
 'country',
 'city',
 'browser',
 'operatingSystem',
 'deviceCategory',
 'source',
 'transactionId',
 'v2ProductName',
 'v2ProductCategory']

In [178]:
df.sample()

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,i64,str,str,str,str,str,str,str,str,str,str,str
"""20170731""",1501541120,"""11577526465501235""","""Northern America""","""United States""","""not available in demo dataset""","""Chrome""","""Windows""","""desktop""","""(direct)""",,"""Android Men's Vintage Tank""","""Home/Apparel/Men's/Men's-T-Shi…"


In [179]:
df.describe()

statistic,date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,str,f64,str,str,str,str,str,str,str,str,str,str,str
"""count""","""49512""",49512.0,"""49512""","""49512""","""49512""","""49512""","""49512""","""49512""","""49512""","""49512""","""462""","""49512""","""49512"""
"""null_count""","""0""",0.0,"""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""49050""","""0""","""0"""
"""mean""",,1501500000.0,,,,,,,,,,,
"""std""",,21502.167394,,,,,,,,,,,
"""min""","""20170731""",1501500000.0,"""0002457163364254438""","""Australasia""","""Algeria""","""(not set)""","""Amazon Silk""","""(not set)""","""desktop""","""(direct)""","""ORD201707311786""","""1 oz Hand Sanitizer""","""${escCatTitle}"""
"""25%""",,1501500000.0,,,,,,,,,,,
"""50%""",,1501500000.0,,,,,,,,,,,
"""75%""",,1501500000.0,,,,,,,,,,,
"""max""","""20170731""",1501600000.0,"""9997362993085245352""","""Western Europe""","""Vietnam""","""not available in demo dataset""","""YaBrowser""","""iOS""","""tablet""","""youtube.com""","""ORD201707312663""","""YouTube Youth Short Sleeve Tee…","""Wearables/Men's T-Shirts/"""


In [180]:
df.select(pl.all().is_null().sum())

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,49050,0,0


In [181]:
df.filter(pl.col('transactionId')!='null')

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,i64,str,str,str,str,str,str,str,str,str,str,str
"""20170731""",1501525374,"""186231215995844689""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707311786""","""Google Men's Bike Short Sleeve…","""Apparel"""
"""20170731""",1501525374,"""186231215995844689""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707311786""","""Google Men's Bike Short Sleeve…","""Apparel"""
"""20170731""",1501549028,"""0345672699449577691""","""Northern America""","""United States""","""Kirkland""","""Chrome""","""Chrome OS""","""desktop""","""(direct)""","""ORD201707312107""","""Google Blackout Cap""","""Headgear"""
"""20170731""",1501549028,"""0345672699449577691""","""Northern America""","""United States""","""Kirkland""","""Chrome""","""Chrome OS""","""desktop""","""(direct)""","""ORD201707312107""","""Google Women's Vintage Hero Te…","""Apparel"""
"""20170731""",1501549028,"""0345672699449577691""","""Northern America""","""United States""","""Kirkland""","""Chrome""","""Chrome OS""","""desktop""","""(direct)""","""ORD201707312107""","""Google Men's Long & Lean Tee C…","""Apparel"""
…,…,…,…,…,…,…,…,…,…,…,…,…
"""20170731""",1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Google 25 oz Red Stainless Ste…","""Drinkware"""
"""20170731""",1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Android 17oz Stainless Steel S…","""Drinkware"""
"""20170731""",1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Android Men's Short Sleeve Her…","""Apparel"""
"""20170731""",1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Android Men's Take Charge Shor…","""Apparel"""


In [182]:
df_cleaned = df.with_columns(pl.col('date').str.strptime(pl.Date, format="%Y%m%d"))
df_cleaned.head()

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
date,i64,str,str,str,str,str,str,str,str,str,str,str
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Lunch Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Electronics Accessory Pouch""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Sport Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Slim Utility Travel Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Suitcase Organizer Cubes""","""Home/Bags/More Bags/"""


In [183]:
df_cleaned = df_cleaned.with_columns(pl.when(pl.col('transactionId') != 'null').then(1).otherwise(0).alias("transactionId"))
df_cleaned

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
date,i64,str,str,str,str,str,str,str,str,i32,str,str
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",0,"""Google Lunch Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",0,"""Electronics Accessory Pouch""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",0,"""Sport Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",0,"""Google Slim Utility Travel Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",0,"""Suitcase Organizer Cubes""","""Home/Bags/More Bags/"""
…,…,…,…,…,…,…,…,…,…,…,…,…
2017-07-31,1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""",1,"""Android 17oz Stainless Steel S…","""Drinkware"""
2017-07-31,1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""",1,"""Android Men's Short Sleeve Her…","""Apparel"""
2017-07-31,1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""",1,"""Android Men's Take Charge Shor…","""Apparel"""
2017-07-31,1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""",1,"""Google Women's Scoop Neck Tee …","""Apparel"""


In [230]:
encoding = df_cleaned.filter(pl.col('country') == "United States").select(
    pl.col('date'),pl.col('fullVisitorId'),pl.col('country'),pl.col('transactionId'),pl.col('v2ProductName')
)
encoding

date,fullVisitorId,country,transactionId,v2ProductName
date,str,str,i32,str
2017-07-31,"""9308310352918219134""","""United States""",0,"""Google Snapback Hat Black"""
2017-07-31,"""9308310352918219134""","""United States""",0,"""Android Wool Heather Cap Heath…"
2017-07-31,"""9308310352918219134""","""United States""",0,"""Google Blackout Cap"""
2017-07-31,"""9308310352918219134""","""United States""",0,"""Google 5-Panel Snapback Cap"""
2017-07-31,"""9308310352918219134""","""United States""",0,"""Android 5-Panel Low Cap"""
…,…,…,…,…
2017-07-31,"""7483600664917507409""","""United States""",1,"""Android 17oz Stainless Steel S…"
2017-07-31,"""7483600664917507409""","""United States""",1,"""Android Men's Short Sleeve Her…"
2017-07-31,"""7483600664917507409""","""United States""",1,"""Android Men's Take Charge Shor…"
2017-07-31,"""7483600664917507409""","""United States""",1,"""Google Women's Scoop Neck Tee …"


In [186]:
# encoding = encoding.with_columns(pl.col('transactionId').mean().alias('transactionId'))

In [231]:
canada = df_cleaned.filter(pl.col('country') == "Canada").select(
    pl.col('date'),pl.col('fullVisitorId'),pl.col('country'),pl.col('transactionId'),pl.col('v2ProductName')
)
# canada = canada.with_columns(pl.col('transactionId').mean().alias('transactionId'))
canada

date,fullVisitorId,country,transactionId,v2ProductName
date,str,str,i32,str
2017-07-31,"""6933896823600086395""","""Canada""",0,"""Gift Card- $100.00"""
2017-07-31,"""6933896823600086395""","""Canada""",0,"""Gift Card - $25.00"""
2017-07-31,"""6933896823600086395""","""Canada""",0,"""Gift Card - $250.00"""
2017-07-31,"""6933896823600086395""","""Canada""",0,"""Gift Card - $50.00"""
2017-07-31,"""7700062233308774111""","""Canada""",0,"""Google Men's Long Sleeve Ragla…"
…,…,…,…,…
2017-07-31,"""7444499703471460562""","""Canada""",0,"""Google Women's Quilted Insulat…"
2017-07-31,"""7444499703471460562""","""Canada""",0,"""Google Women's Quilted Insulat…"
2017-07-31,"""7444499703471460562""","""Canada""",0,"""Google Women's 1/4 Zip Jacket …"
2017-07-31,"""7444499703471460562""","""Canada""",0,"""Google Women's Performance Ful…"


In [232]:
new_df = encoding.vstack(canada)
new_df

date,fullVisitorId,country,transactionId,v2ProductName
date,str,str,i32,str
2017-07-31,"""9308310352918219134""","""United States""",0,"""Google Snapback Hat Black"""
2017-07-31,"""9308310352918219134""","""United States""",0,"""Android Wool Heather Cap Heath…"
2017-07-31,"""9308310352918219134""","""United States""",0,"""Google Blackout Cap"""
2017-07-31,"""9308310352918219134""","""United States""",0,"""Google 5-Panel Snapback Cap"""
2017-07-31,"""9308310352918219134""","""United States""",0,"""Android 5-Panel Low Cap"""
…,…,…,…,…
2017-07-31,"""7444499703471460562""","""Canada""",0,"""Google Women's Quilted Insulat…"
2017-07-31,"""7444499703471460562""","""Canada""",0,"""Google Women's Quilted Insulat…"
2017-07-31,"""7444499703471460562""","""Canada""",0,"""Google Women's 1/4 Zip Jacket …"
2017-07-31,"""7444499703471460562""","""Canada""",0,"""Google Women's Performance Ful…"


In [233]:
new_df = new_df.with_columns(pl.when(pl.col('country') != 'United States').then(1).otherwise(0).alias("country"))
new_df

date,fullVisitorId,country,transactionId,v2ProductName
date,str,i32,i32,str
2017-07-31,"""9308310352918219134""",0,0,"""Google Snapback Hat Black"""
2017-07-31,"""9308310352918219134""",0,0,"""Android Wool Heather Cap Heath…"
2017-07-31,"""9308310352918219134""",0,0,"""Google Blackout Cap"""
2017-07-31,"""9308310352918219134""",0,0,"""Google 5-Panel Snapback Cap"""
2017-07-31,"""9308310352918219134""",0,0,"""Android 5-Panel Low Cap"""
…,…,…,…,…
2017-07-31,"""7444499703471460562""",1,0,"""Google Women's Quilted Insulat…"
2017-07-31,"""7444499703471460562""",1,0,"""Google Women's Quilted Insulat…"
2017-07-31,"""7444499703471460562""",1,0,"""Google Women's 1/4 Zip Jacket …"
2017-07-31,"""7444499703471460562""",1,0,"""Google Women's Performance Ful…"


In [234]:
new_df.select(pl.col('v2ProductName')).unique()

v2ProductName
str
"""Google Toddler Short Sleeve Te…"
"""Waze Baby on Board Window Deca…"
"""Android RFID Journal"""
"""Nest® Learning Thermostat 3rd …"
"""Google Onesie Red/Graphite"""
…
"""Google Vintage Henley Grey/Bla…"
"""Google Lunch Bag"""
"""Android Men's Take Charge Shor…"
"""Google Leather Journal"""


In [235]:
new_df.select(pl.col('v2ProductName'))

v2ProductName
str
"""Google Snapback Hat Black"""
"""Android Wool Heather Cap Heath…"
"""Google Blackout Cap"""
"""Google 5-Panel Snapback Cap"""
"""Android 5-Panel Low Cap"""
…
"""Google Women's Quilted Insulat…"
"""Google Women's Quilted Insulat…"
"""Google Women's 1/4 Zip Jacket …"
"""Google Women's Performance Ful…"


In [216]:
from sklearn.preprocessing import LabelEncoder

In [257]:
le = LabelEncoder()
le.fit(new_df.select(pl.col('v2ProductName')))

  y = column_or_1d(y, warn=True)


In [258]:
encoded_products = le.transform(new_df.select(pl.col('v2ProductName')))
encoded_products

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


array([122,  37,  66, ..., 136, 145, 166], dtype=int64)

In [259]:
new_df= new_df.with_columns(pl.Series('v2ProductName', encoded_products))
new_df

date,fullVisitorId,country,transactionId,v2ProductName
date,str,i32,i32,i64
2017-07-31,"""9308310352918219134""",0,0,122
2017-07-31,"""9308310352918219134""",0,0,37
2017-07-31,"""9308310352918219134""",0,0,66
2017-07-31,"""9308310352918219134""",0,0,62
2017-07-31,"""9308310352918219134""",0,0,9
…,…,…,…,…
2017-07-31,"""7444499703471460562""",1,0,146
2017-07-31,"""7444499703471460562""",1,0,147
2017-07-31,"""7444499703471460562""",1,0,136
2017-07-31,"""7444499703471460562""",1,0,145


In [260]:
new_df

date,fullVisitorId,country,transactionId,v2ProductName
date,str,i32,i32,i64
2017-07-31,"""9308310352918219134""",0,0,122
2017-07-31,"""9308310352918219134""",0,0,37
2017-07-31,"""9308310352918219134""",0,0,66
2017-07-31,"""9308310352918219134""",0,0,62
2017-07-31,"""9308310352918219134""",0,0,9
…,…,…,…,…
2017-07-31,"""7444499703471460562""",1,0,146
2017-07-31,"""7444499703471460562""",1,0,147
2017-07-31,"""7444499703471460562""",1,0,136
2017-07-31,"""7444499703471460562""",1,0,145


In [229]:
new_df = new_df.with_columns(pl.DataFrame(transformed)).alias('v2ProductName')
new_df

ShapeError: unable to add a column of length 49512 to a DataFrame of height 32696

In [205]:
new_df.write_csv("../data/cleaned_google_analytics.csv")

In [None]:
# Need to split the data into subsets & perform the target encoding... k fold target encoding
# num_samples = len(new_df) // 8
# num_samples
# new_df = new_df.with_columns(pl.DataFrame({'kfold':np.repeat(np.arange(1, 9), num_samples)}))
# new_df

4087

### Data Pre-processing Pipeline

In [202]:
def preprocessing_pipeline(df: pl.DataFrame) -> pl.DataFrame:
    """
    Args:
        df : pl.DataFrame

    Returns:
        pl.DataFrames: pre-processed dataframe
    """

    df = df.with_columns(pl.col('date').str.strptime(pl.Date, format="%Y%m%d"))
    df.write_csv("../data/cleaned_google_analytics.csv")
    return df 

In [204]:
preprocessing_pipeline(df)

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
date,i64,str,str,str,str,str,str,str,str,str,str,str
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Lunch Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Electronics Accessory Pouch""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Sport Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Slim Utility Travel Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Suitcase Organizer Cubes""","""Home/Bags/More Bags/"""
…,…,…,…,…,…,…,…,…,…,…,…,…
2017-07-31,1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Android 17oz Stainless Steel S…","""Drinkware"""
2017-07-31,1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Android Men's Short Sleeve Her…","""Apparel"""
2017-07-31,1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Android Men's Take Charge Shor…","""Apparel"""
2017-07-31,1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Google Women's Scoop Neck Tee …","""Apparel"""


# Conclusions 
- 