# Dataset Cleaning 

The main objectives of this notebook are: 
- Import dataset 
- Get familiar with the data 
- Determine if there are data quality issues 
- Resolve any data quality issues 

In [103]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [104]:
import polars as pl 
from google.cloud import bigquery
import numpy as np 
import sys 
import os 


# Manually add path to read from another folder 
path2add = os.path.normpath(
    os.path.abspath(os.path.join(os.path.dirname("__file__"), os.path.pardir, "utils"))
)
if not (path2add in sys.path):
    sys.path.append(path2add)

from preprocessing import *

In [105]:
pl.Config.set_tbl_width_chars(200)

polars.config.Config

In [106]:
# df = pl.read_csv("../data/google_analytics_data.csv",ignore_errors=True)

In [107]:
# Create client object
client = bigquery.Client()

In [108]:
QUERY = ("SELECT `date`, `visitStartTime`, `fullVisitorId`, `geoNetwork`.`subContinent`, `geoNetwork`.`country`,"
        "`geoNetwork`.`city`,`device`.`browser`, `device`.`operatingSystem`,`device`.`deviceCategory`,"
        "`trafficSource`.`source`, `h`.`item`.`transactionId`,`hp`.`v2ProductName`,`hp`.`v2ProductCategory` "
        "FROM `bigquery-public-data.google_analytics_sample.ga_sessions_20170731`, UNNEST(hits) AS h,"
        "UNNEST(h.product) AS hp;")

In [109]:
run_query = client.query(QUERY)

In [110]:
df = pl.from_arrow(run_query.to_arrow())

In [111]:
df.head()

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,i64,str,str,str,str,str,str,str,str,str,str,str
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Lunch Bag""","""Home/Bags/More Bags/"""
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Electronics Accessory Pouch""","""Home/Bags/More Bags/"""
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Sport Bag""","""Home/Bags/More Bags/"""
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Slim Utility Travel Bag""","""Home/Bags/More Bags/"""
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Suitcase Organizer Cubes""","""Home/Bags/More Bags/"""


In [112]:
df.shape

(49512, 13)

In [113]:
df.columns

['date',
 'visitStartTime',
 'fullVisitorId',
 'subContinent',
 'country',
 'city',
 'browser',
 'operatingSystem',
 'deviceCategory',
 'source',
 'transactionId',
 'v2ProductName',
 'v2ProductCategory']

In [114]:
df.sample()

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,i64,str,str,str,str,str,str,str,str,str,str,str
"""20170731""",1501533234,"""2370141417178282776""","""Northern America""","""United States""","""Sunnyvale""","""Chrome""","""Macintosh""","""desktop""","""google""",,"""Google Men's Zip Hoodie""","""Home/Apparel/Men's/Men's-Outer…"


In [115]:
df.describe()

statistic,date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,str,f64,str,str,str,str,str,str,str,str,str,str,str
"""count""","""49512""",49512.0,"""49512""","""49512""","""49512""","""49512""","""49512""","""49512""","""49512""","""49512""","""462""","""49512""","""49512"""
"""null_count""","""0""",0.0,"""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""49050""","""0""","""0"""
"""mean""",,1501500000.0,,,,,,,,,,,
"""std""",,21502.167394,,,,,,,,,,,
"""min""","""20170731""",1501500000.0,"""0002457163364254438""","""Australasia""","""Algeria""","""(not set)""","""Amazon Silk""","""(not set)""","""desktop""","""(direct)""","""ORD201707311786""","""1 oz Hand Sanitizer""","""${escCatTitle}"""
"""25%""",,1501500000.0,,,,,,,,,,,
"""50%""",,1501500000.0,,,,,,,,,,,
"""75%""",,1501500000.0,,,,,,,,,,,
"""max""","""20170731""",1501600000.0,"""9997362993085245352""","""Western Europe""","""Vietnam""","""not available in demo dataset""","""YaBrowser""","""iOS""","""tablet""","""youtube.com""","""ORD201707312663""","""YouTube Youth Short Sleeve Tee…","""Wearables/Men's T-Shirts/"""


In [116]:
df.select(pl.all().is_null().sum())

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,49050,0,0


In [117]:
df.filter(pl.col('transactionId') != 'null')

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,i64,str,str,str,str,str,str,str,str,str,str,str
"""20170731""",1501525374,"""186231215995844689""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707311786""","""Google Men's Bike Short Sleeve…","""Apparel"""
"""20170731""",1501525374,"""186231215995844689""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707311786""","""Google Men's Bike Short Sleeve…","""Apparel"""
"""20170731""",1501549028,"""0345672699449577691""","""Northern America""","""United States""","""Kirkland""","""Chrome""","""Chrome OS""","""desktop""","""(direct)""","""ORD201707312107""","""Google Blackout Cap""","""Headgear"""
"""20170731""",1501549028,"""0345672699449577691""","""Northern America""","""United States""","""Kirkland""","""Chrome""","""Chrome OS""","""desktop""","""(direct)""","""ORD201707312107""","""Google Women's Vintage Hero Te…","""Apparel"""
"""20170731""",1501549028,"""0345672699449577691""","""Northern America""","""United States""","""Kirkland""","""Chrome""","""Chrome OS""","""desktop""","""(direct)""","""ORD201707312107""","""Google Men's Long & Lean Tee C…","""Apparel"""
…,…,…,…,…,…,…,…,…,…,…,…,…
"""20170731""",1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Google 25 oz Red Stainless Ste…","""Drinkware"""
"""20170731""",1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Android 17oz Stainless Steel S…","""Drinkware"""
"""20170731""",1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Android Men's Short Sleeve Her…","""Apparel"""
"""20170731""",1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Android Men's Take Charge Shor…","""Apparel"""


In [118]:
# df = df.with_columns(pl.col('date').str.strptime(pl.Date, format="%Y%m%d"))
# df.head()

In [119]:
df = df.filter(~pl.col('v2ProductCategory').str.contains('origCatName'))
df = df.with_columns(pl.col("fullVisitorId").cast(str))
df.write_csv("../data/google_analytics_data.csv")

# Encoding the df

In [120]:
# Daily Visitors 
df.group_by(pl.col('date')).agg(pl.col('fullVisitorId').unique().len()).sort("date")

date,fullVisitorId
str,u32
"""20170731""",1641


In [121]:
# Daily Purchases 
df.group_by(pl.col('date')).agg((pl.col("transactionId") != "null").len()).sort("date")

date,transactionId
str,u32
"""20170731""",49441


In [122]:
encoded_df = target_encoding(df)
encoded_df.write_csv('../data/encoded_df.csv')
encoded_df

fullVisitorId,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductCategory
str,f64,f64,f64,f64,f64,f64,i32,f64
"""2219384770970157334""",0.0,0.005886,0.001795,0.00545,0.01104,0.003369,0,0.0
"""2219384770970157334""",0.0,0.005886,0.001795,0.00545,0.01104,0.003369,0,0.0
"""2219384770970157334""",0.0,0.005886,0.001795,0.00545,0.01104,0.003369,0,0.0
"""2219384770970157334""",0.0,0.005886,0.001795,0.00545,0.01104,0.003369,0,0.0
"""2219384770970157334""",0.0,0.005886,0.001795,0.00545,0.01104,0.003369,0,0.0
…,…,…,…,…,…,…,…,…
"""7483600664917507409""",0.013646,0.012644,0.011229,0.016248,0.01104,0.019489,1,0.254777
"""7483600664917507409""",0.013646,0.012644,0.011229,0.016248,0.01104,0.019489,1,0.262
"""7483600664917507409""",0.013646,0.012644,0.011229,0.016248,0.01104,0.019489,1,0.262
"""7483600664917507409""",0.013646,0.012644,0.011229,0.016248,0.01104,0.019489,1,0.262


In [123]:
# encoded_df = label_encoding(df)
# encoded_df

In [124]:
user_profiles_ = user_profiles(encoded_df)
user_profiles_.head()

fullVisitorId,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductCategory
str,list[f64],list[f64],list[f64],list[f64],list[f64],list[f64],list[i32],list[f64]
"""6283868298710663989""","[0.0, 0.0, … 0.0]","[0.005886, 0.005886, … 0.005886]","[0.011229, 0.011229, … 0.011229]","[0.00545, 0.00545, … 0.00545]","[0.01104, 0.01104, … 0.01104]","[0.0, 0.0, … 0.0]","[0, 0, … 0]","[0.0, 0.0, … 0.0]"
"""3109676882483240775""","[0.0, 0.0, … 0.0]","[0.0, 0.0, … 0.0]","[0.011229, 0.011229, … 0.011229]","[0.016248, 0.016248, … 0.016248]","[0.01104, 0.01104, … 0.01104]","[0.003369, 0.003369, … 0.003369]","[0, 0, … 0]","[0.0, 0.0, … 0.0]"
"""1975233809594236859""","[0.013646, 0.013646, 0.013646]","[0.018963, 0.018963, 0.018963]","[0.011229, 0.011229, 0.011229]","[0.017348, 0.017348, 0.017348]","[0.01104, 0.01104, 0.01104]","[0.003369, 0.003369, 0.003369]","[0, 0, 0]","[0.0, 0.0, 0.0]"
"""4697729988920670456""","[0.0, 0.0, … 0.0]","[0.005886, 0.005886, … 0.005886]","[0.0, 0.0, … 0.0]","[0.0, 0.0, … 0.0]","[0.003792, 0.003792, … 0.003792]","[0.019489, 0.019489, … 0.019489]","[0, 0, … 0]","[0.0, 0.0, … 0.0]"
"""9726418589753226132""","[0.0, 0.0, … 0.0]","[0.0, 0.0, … 0.0]","[0.011229, 0.011229, … 0.011229]","[0.00545, 0.00545, … 0.00545]","[0.01104, 0.01104, … 0.01104]","[0.003369, 0.003369, … 0.003369]","[0, 0, … 0]","[0.0, 0.0, … 0.0]"


In [125]:
purchasing_visitors=encoded_df.filter(pl.col('transactionId') != 0).select(pl.col('fullVisitorId')).unique()
purchasing_visitors.write_csv("../data/purchasing_visitors.csv")

In [127]:
recommendation_eval_pipeline("2981325201816358942", user_profiles_, encoded_df, df)

0.2

In [128]:
recommendation_pipeline("2981325201816358942", user_profiles_, encoded_df, df)

['Waze', 'Apparel', 'Apparel', 'Office', 'Apparel']

### Data Pre-processing Pipeline

In [129]:
def preprocessing_pipeline(df: pl.DataFrame, active_user: str) -> pl.DataFrame:
    """
    Args:
        df : pl.DataFrame

    Returns:
        pl.DataFrames: pre-processed dataframe
    """
    df = df.filter(~pl.col('v2ProductCategory').str.contains('origCatName'))
    df = df.with_columns(pl.col("fullVisitorId").cast(str))
    df = df.with_columns(pl.col('date').str.strptime(pl.Date, format="%Y%m%d"))
    encoded_df_ = target_encoding(df)
    
    user_profiles_ = user_profiles(encoded_df_)

    rec = recommendation_pipeline(active_user, user_profiles_, encoded_df, df)
    
    return rec

In [130]:
preprocessing_pipeline(df,"2981325201816358942")

['Waze', 'Apparel', 'Apparel', 'Bottles', 'Drinkware']

# Conclusions 
- 