# Dataset Cleaning 

The main objectives of this notebook are: 
- Import dataset 
- Get familiar with the data 
- Determine if there are data quality issues 
- Resolve any data quality issues 

In [64]:
import polars as pl 
import polars.selectors as cs 
from google.cloud import bigquery

In [65]:
pl.Config.set_tbl_width_chars(200)

polars.config.Config

In [82]:
# Create client object
client = bigquery.Client()

In [83]:
QUERY = ("SELECT `date`, `visitId`,`visitNumber`,`visitStartTime`,`visitorId`,`userId`,"
         "`trafficSource`.`referralPath`,`trafficSource`.`source`,`device`.`browser`,"
         "`device`.`operatingSystem`,`device`.`isMobile`,`device`.`deviceCategory`,"
         "`geoNetwork`.`continent`,`geoNetwork`.`subContinent`,`geoNetwork`.`country`,"
         "`geoNetwork`.`region`,`geoNetwork`.`metro`,`geoNetwork`.`city`,`totals`.`visits`,"
         "`totals`.`pageviews`,`totals`.`timeOnSite`,`totals`.`transactions`,`totals`.`transactionRevenue`,"
         "`totals`.`totalTransactionRevenue`,"
         "FROM`bigquery-public-data.google_analytics_sample.ga_sessions_20170801`;")

In [84]:
run_query = client.query(QUERY)

In [85]:
df = pl.from_arrow(run_query.to_arrow())



In [86]:
df.head()

date,visitId,visitNumber,visitStartTime,visitorId,userId,referralPath,source,browser,operatingSystem,isMobile,deviceCategory,continent,subContinent,country,region,metro,city,visits,pageviews,timeOnSite,transactions,transactionRevenue,totalTransactionRevenue
str,i64,i64,i64,i64,str,str,str,str,str,bool,str,str,str,str,str,str,str,i64,i64,i64,i64,i64,i64
"""20170801""",1501591568,1,1501591568,,,,"""(direct)""","""Chrome""","""Windows""",False,"""desktop""","""Europe""","""Southern Europe""","""Greece""","""not available in demo dataset""","""not available in demo dataset""","""not available in demo dataset""",1,1,,,,
"""20170801""",1501589647,2,1501589647,,,"""/analytics/web/""","""analytics.google.com""","""Chrome""","""Windows""",False,"""desktop""","""Asia""","""Southern Asia""","""India""","""Maharashtra""","""(not set)""","""Mumbai""",1,1,,,,
"""20170801""",1501616621,1,1501616621,,,"""/analytics/web/""","""analytics.google.com""","""Chrome""","""Windows""",False,"""desktop""","""Europe""","""Northern Europe""","""United Kingdom""","""not available in demo dataset""","""not available in demo dataset""","""not available in demo dataset""",1,1,,,,
"""20170801""",1501601200,1,1501601200,,,"""/analytics/web/""","""analytics.google.com""","""Firefox""","""Windows""",False,"""desktop""","""Americas""","""Northern America""","""United States""","""Texas""","""Dallas-Ft. Worth TX""","""Dallas""",1,1,,,,
"""20170801""",1501615525,1,1501615525,,,"""/analytics/web/""","""adwords.google.com""","""Chrome""","""Windows""",False,"""desktop""","""Americas""","""Northern America""","""United States""","""not available in demo dataset""","""not available in demo dataset""","""not available in demo dataset""",1,1,,,,


In [71]:
df.shape

(2556, 24)

In [72]:
df.columns

['date',
 'visitId',
 'visitNumber',
 'visitStartTime',
 'visitorId',
 'userId',
 'referralPath',
 'source',
 'browser',
 'operatingSystem',
 'isMobile',
 'deviceCategory',
 'continent',
 'subContinent',
 'country',
 'region',
 'metro',
 'city',
 'visits',
 'pageviews',
 'timeOnSite',
 'transactions',
 'transactionRevenue',
 'totalTransactionRevenue']

In [73]:
df.sample()

date,visitId,visitNumber,visitStartTime,visitorId,userId,referralPath,source,browser,operatingSystem,isMobile,deviceCategory,continent,subContinent,country,region,metro,city,visits,pageviews,timeOnSite,transactions,transactionRevenue,totalTransactionRevenue
str,i64,i64,i64,i64,str,str,str,str,str,bool,str,str,str,str,str,str,str,i64,i64,i64,i64,i64,i64
"""20170801""",1501606548,9,1501606548,,,,"""(direct)""","""Chrome""","""Windows""",False,"""desktop""","""Americas""","""Northern America""","""United States""","""not available in demo dataset""","""not available in demo dataset""","""not available in demo dataset""",1,2,5,,,


In [74]:
df.describe()

statistic,date,visitId,visitNumber,visitStartTime,visitorId,userId,referralPath,source,browser,operatingSystem,isMobile,deviceCategory,continent,subContinent,country,region,metro,city,visits,pageviews,timeOnSite,transactions,transactionRevenue,totalTransactionRevenue
str,str,f64,f64,f64,f64,str,str,str,str,str,f64,str,str,str,str,str,str,str,f64,f64,f64,f64,f64,f64
"""count""","""2556""",2556.0,2556.0,2556.0,0.0,"""0""","""649""","""2556""","""2556""","""2556""",2556.0,"""2556""","""2556""","""2556""","""2556""","""2556""","""2556""","""2556""",2556.0,2556.0,1310.0,43.0,43.0,43.0
"""null_count""","""0""",0.0,0.0,0.0,2556.0,"""2556""","""1907""","""0""","""0""","""0""",0.0,"""0""","""0""","""0""","""0""","""0""","""0""","""0""",0.0,0.0,1246.0,2513.0,2513.0,2513.0
"""mean""",,1501600000.0,2.661189,1501600000.0,,,,,,,0.318858,,,,,,,,1.0,4.279734,325.051908,1.046512,193140000.0,206740000.0
"""std""",,22804.553787,12.0222,22804.234297,,,,,,,,,,,,,,,0.0,7.966343,592.411724,0.213083,478700000.0,478520000.0
"""min""","""20170801""",1501600000.0,1.0,1501600000.0,,,"""/""","""(direct)""","""Android Browser""","""(not set)""",0.0,"""desktop""","""(not set)""","""(not set)""","""(not set)""","""(not set)""","""(not set)""","""(not set)""",1.0,1.0,1.0,1.0,1990000.0,2990000.0
"""25%""",,1501600000.0,1.0,1501600000.0,,,,,,,,,,,,,,,1.0,1.0,32.0,1.0,29580000.0,34790000.0
"""50%""",,1501600000.0,1.0,1501600000.0,,,,,,,,,,,,,,,1.0,2.0,97.0,1.0,45670000.0,50790000.0
"""75%""",,1501600000.0,2.0,1501600000.0,,,,,,,,,,,,,,,1.0,4.0,337.0,1.0,157140000.0,173900000.0
"""max""","""20170801""",1501700000.0,326.0,1501700000.0,,,"""/yt/about/press/""","""youtube.com""","""YaBrowser""","""iOS""",1.0,"""tablet""","""Oceania""","""Western Europe""","""Vietnam""","""not available in demo dataset""","""not available in demo dataset""","""not available in demo dataset""",1.0,155.0,5455.0,2.0,2933600000.0,2935600000.0


In [75]:
df.filter(pl.col('transactionRevenue') > 0)

date,visitId,visitNumber,visitStartTime,visitorId,userId,referralPath,source,browser,operatingSystem,isMobile,deviceCategory,continent,subContinent,country,region,metro,city,visits,pageviews,timeOnSite,transactions,transactionRevenue,totalTransactionRevenue
str,i64,i64,i64,i64,str,str,str,str,str,bool,str,str,str,str,str,str,str,i64,i64,i64,i64,i64,i64
"""20170801""",1501621191,4,1501621191,,,,"""(direct)""","""Chrome""","""Macintosh""",false,"""desktop""","""Americas""","""Northern America""","""United States""","""California""","""San Francisco-Oakland-San Jose…","""Mountain View""",1,5,56,1,35290000,40290000
"""20170801""",1501612542,6,1501612542,,,"""/""","""(direct)""","""Chrome""","""Macintosh""",false,"""desktop""","""Americas""","""Northern America""","""United States""","""California""","""San Francisco-Oakland-San Jose…","""Mountain View""",1,11,212,1,27430000,30430000
"""20170801""",1501599317,1,1501599317,,,,"""(direct)""","""Chrome""","""Macintosh""",false,"""desktop""","""Americas""","""Northern America""","""United States""","""not available in demo dataset""","""not available in demo dataset""","""not available in demo dataset""",1,14,559,1,1990000,2990000
"""20170801""",1501620679,3,1501620679,,,,"""(direct)""","""Safari""","""Macintosh""",false,"""desktop""","""Americas""","""Northern America""","""United States""","""not available in demo dataset""","""not available in demo dataset""","""not available in demo dataset""",1,14,237,1,13290000,22290000
"""20170801""",1501595478,3,1501595478,,,"""/mail/ca/u/0/""","""mail.google.com""","""Chrome""","""Macintosh""",false,"""desktop""","""Americas""","""Northern America""","""United States""","""New York""","""New York NY""","""New York""",1,16,1474,1,11960000,17960000
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""20170801""",1501628356,8,1501628356,,,"""/""","""(direct)""","""Chrome""","""Macintosh""",false,"""desktop""","""Americas""","""Northern America""","""United States""","""Illinois""","""Chicago IL""","""Chicago""",1,48,1827,2,126680000,400960000
"""20170801""",1501647149,1,1501647149,,,,"""(direct)""","""Chrome""","""Windows""",false,"""desktop""","""Americas""","""Northern America""","""United States""","""not available in demo dataset""","""not available in demo dataset""","""not available in demo dataset""",1,58,1685,1,4990000,6990000
"""20170801""",1501608078,8,1501608078,,,"""/""","""(direct)""","""Chrome""","""Macintosh""",false,"""desktop""","""Americas""","""Northern America""","""United States""","""California""","""San Francisco-Oakland-San Jose…","""San Francisco""",1,55,1572,1,2933610000,2935610000
"""20170801""",1501627131,16,1501627131,,,,"""(direct)""","""Chrome""","""Windows""",false,"""desktop""","""Americas""","""Northern America""","""United States""","""New York""","""New York NY""","""New York""",1,112,4444,1,400210000,406210000


In [76]:
df.select(pl.all().is_null().sum())

date,visitId,visitNumber,visitStartTime,visitorId,userId,referralPath,source,browser,operatingSystem,isMobile,deviceCategory,continent,subContinent,country,region,metro,city,visits,pageviews,timeOnSite,transactions,transactionRevenue,totalTransactionRevenue
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,2556,2556,1907,0,0,0,0,0,0,0,0,0,0,0,0,0,1246,2513,2513,2513


In [77]:
df.head()

date,visitId,visitNumber,visitStartTime,visitorId,userId,referralPath,source,browser,operatingSystem,isMobile,deviceCategory,continent,subContinent,country,region,metro,city,visits,pageviews,timeOnSite,transactions,transactionRevenue,totalTransactionRevenue
str,i64,i64,i64,i64,str,str,str,str,str,bool,str,str,str,str,str,str,str,i64,i64,i64,i64,i64,i64
"""20170801""",1501591568,1,1501591568,,,,"""(direct)""","""Chrome""","""Windows""",False,"""desktop""","""Europe""","""Southern Europe""","""Greece""","""not available in demo dataset""","""not available in demo dataset""","""not available in demo dataset""",1,1,,,,
"""20170801""",1501589647,2,1501589647,,,"""/analytics/web/""","""analytics.google.com""","""Chrome""","""Windows""",False,"""desktop""","""Asia""","""Southern Asia""","""India""","""Maharashtra""","""(not set)""","""Mumbai""",1,1,,,,
"""20170801""",1501616621,1,1501616621,,,"""/analytics/web/""","""analytics.google.com""","""Chrome""","""Windows""",False,"""desktop""","""Europe""","""Northern Europe""","""United Kingdom""","""not available in demo dataset""","""not available in demo dataset""","""not available in demo dataset""",1,1,,,,
"""20170801""",1501601200,1,1501601200,,,"""/analytics/web/""","""analytics.google.com""","""Firefox""","""Windows""",False,"""desktop""","""Americas""","""Northern America""","""United States""","""Texas""","""Dallas-Ft. Worth TX""","""Dallas""",1,1,,,,
"""20170801""",1501615525,1,1501615525,,,"""/analytics/web/""","""adwords.google.com""","""Chrome""","""Windows""",False,"""desktop""","""Americas""","""Northern America""","""United States""","""not available in demo dataset""","""not available in demo dataset""","""not available in demo dataset""",1,1,,,,


In [None]:
df_cleaned = df.drop(pl.col('visitorId','userId'))
df_cleaned.head()

date,visitId,visitNumber,visitStartTime,referralPath,source,browser,operatingSystem,isMobile,deviceCategory,continent,subContinent,country,region,metro,city,visits,pageviews,timeOnSite,transactions,transactionRevenue,totalTransactionRevenue
str,i64,i64,i64,str,str,str,str,bool,str,str,str,str,str,str,str,i64,i64,i64,i64,i64,i64
"""20170801""",1501591568,1,1501591568,,"""(direct)""","""Chrome""","""Windows""",False,"""desktop""","""Europe""","""Southern Europe""","""Greece""","""not available in demo dataset""","""not available in demo dataset""","""not available in demo dataset""",1,1,,,,
"""20170801""",1501589647,2,1501589647,"""/analytics/web/""","""analytics.google.com""","""Chrome""","""Windows""",False,"""desktop""","""Asia""","""Southern Asia""","""India""","""Maharashtra""","""(not set)""","""Mumbai""",1,1,,,,
"""20170801""",1501616621,1,1501616621,"""/analytics/web/""","""analytics.google.com""","""Chrome""","""Windows""",False,"""desktop""","""Europe""","""Northern Europe""","""United Kingdom""","""not available in demo dataset""","""not available in demo dataset""","""not available in demo dataset""",1,1,,,,
"""20170801""",1501601200,1,1501601200,"""/analytics/web/""","""analytics.google.com""","""Firefox""","""Windows""",False,"""desktop""","""Americas""","""Northern America""","""United States""","""Texas""","""Dallas-Ft. Worth TX""","""Dallas""",1,1,,,,
"""20170801""",1501615525,1,1501615525,"""/analytics/web/""","""adwords.google.com""","""Chrome""","""Windows""",False,"""desktop""","""Americas""","""Northern America""","""United States""","""not available in demo dataset""","""not available in demo dataset""","""not available in demo dataset""",1,1,,,,


In [None]:
df_cleaned = df.with_columns(pl.col('date').str.strptime(pl.Date, format="%Y%m%d"))
df_cleaned.head()

date,visitId,visitNumber,visitStartTime,referralPath,source,browser,operatingSystem,isMobile,deviceCategory,continent,subContinent,country,region,metro,city,visits,pageviews,timeOnSite,transactions,transactionRevenue,totalTransactionRevenue
date,i64,i64,i64,str,str,str,str,bool,str,str,str,str,str,str,str,i64,i64,i64,i64,i64,i64
2017-08-01,1501591568,1,1501591568,,"""(direct)""","""Chrome""","""Windows""",False,"""desktop""","""Europe""","""Southern Europe""","""Greece""","""not available in demo dataset""","""not available in demo dataset""","""not available in demo dataset""",1,1,,,,
2017-08-01,1501589647,2,1501589647,"""/analytics/web/""","""analytics.google.com""","""Chrome""","""Windows""",False,"""desktop""","""Asia""","""Southern Asia""","""India""","""Maharashtra""","""(not set)""","""Mumbai""",1,1,,,,
2017-08-01,1501616621,1,1501616621,"""/analytics/web/""","""analytics.google.com""","""Chrome""","""Windows""",False,"""desktop""","""Europe""","""Northern Europe""","""United Kingdom""","""not available in demo dataset""","""not available in demo dataset""","""not available in demo dataset""",1,1,,,,
2017-08-01,1501601200,1,1501601200,"""/analytics/web/""","""analytics.google.com""","""Firefox""","""Windows""",False,"""desktop""","""Americas""","""Northern America""","""United States""","""Texas""","""Dallas-Ft. Worth TX""","""Dallas""",1,1,,,,
2017-08-01,1501615525,1,1501615525,"""/analytics/web/""","""adwords.google.com""","""Chrome""","""Windows""",False,"""desktop""","""Americas""","""Northern America""","""United States""","""not available in demo dataset""","""not available in demo dataset""","""not available in demo dataset""",1,1,,,,


### Data Pre-processing Pipeline

In [None]:
def preprocessing_pipeline(df: pl.DataFrame) -> pl.DataFrame:
    """
    Args:
        df : pl.DataFrame

    Returns:
        pl.DataFrames: pre-processed dataframe
    """

    df = df.drop(pl.col('visitorId','userId'))
    df = df.with_columns(pl.col('date').str.strptime(pl.Date, format="%Y%m%d"))
    return df 

In [81]:
preprocessing_pipeline(df)

ColumnNotFoundError: "userId" not found

Resolved plan until failure:

	---> FAILED HERE RESOLVING THIS_NODE <---
DF ["date", "visitId", "visitNumber", "visitStartTime"]; PROJECT */22 COLUMNS; SELECTION: None

# Conclusions 
- 