# Dataset Cleaning 

The main objectives of this notebook are:
- Import the dataset from BigQuery
- Get familiar with the data
- Determine whether there are data quality issues
- Resolve any data quality issues
- Encode the features and evaluate the recommendation pipeline

In [167]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [168]:
import polars as pl 
from google.cloud import bigquery
import numpy as np 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
import sys 
import os 


# Make the sibling ``utils`` folder importable from this notebook.
# "__file__" is deliberately a quoted literal (a notebook has no __file__), so
# dirname() returns "" and the path is resolved against the kernel's current
# working directory.
_notebook_dir = os.path.dirname("__file__")
path2add = os.path.normpath(
    os.path.abspath(os.path.join(_notebook_dir, os.pardir, "utils"))
)
if path2add not in sys.path:
    sys.path.append(path2add)

# NOTE(review): the wildcard import hides which helpers come from
# `preprocessing`; consider importing the used names explicitly.
from preprocessing import *

In [169]:
pl.Config.set_tbl_width_chars(200)

polars.config.Config

In [170]:
# Create the BigQuery client. No project or credentials are passed, so the
# library resolves both from the environment (Application Default
# Credentials); authentication must be configured before running this cell.
client = bigquery.Client()

In [171]:
# Session-level fields (date, visit start, visitor ID, geo, device, traffic
# source) plus the hit's transaction ID and the product's name and category,
# read from the 2017-07-31 table of the public Google Analytics sample.
# The two UNNEST calls flatten the repeated `hits` and `product` fields, so
# the result has one row per product per hit.
QUERY = ("SELECT `date`, `visitStartTime`, `fullVisitorId`, `geoNetwork`.`subContinent`, `geoNetwork`.`country`,"
        "`geoNetwork`.`city`,`device`.`browser`, `device`.`operatingSystem`,`device`.`deviceCategory`,"
        "`trafficSource`.`source`, `h`.`item`.`transactionId`,`hp`.`v2ProductName`,`hp`.`v2ProductCategory` "
        "FROM `bigquery-public-data.google_analytics_sample.ga_sessions_20170731`, UNNEST(hits) AS h,"
        "UNNEST(h.product) AS hp;")

In [172]:
import pandas as pd

In [173]:
# Submit the query to BigQuery; this returns a job handle, not the rows.
run_query = client.query(QUERY)

In [174]:
# Fetch the job's result as an Arrow table and wrap it in a polars DataFrame.
df = pl.from_arrow(run_query.to_arrow())

In [175]:
df.head()

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,i64,str,str,str,str,str,str,str,str,str,str,str
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Lunch Bag""","""Home/Bags/More Bags/"""
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Electronics Accessory Pouch""","""Home/Bags/More Bags/"""
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Sport Bag""","""Home/Bags/More Bags/"""
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Slim Utility Travel Bag""","""Home/Bags/More Bags/"""
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Suitcase Organizer Cubes""","""Home/Bags/More Bags/"""


In [176]:
df.shape

(49512, 13)

In [177]:
df.columns

['date',
 'visitStartTime',
 'fullVisitorId',
 'subContinent',
 'country',
 'city',
 'browser',
 'operatingSystem',
 'deviceCategory',
 'source',
 'transactionId',
 'v2ProductName',
 'v2ProductCategory']

In [178]:
df.sample()

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,i64,str,str,str,str,str,str,str,str,str,str,str
"""20170731""",1501504429,"""2173067697636247085""","""Western Europe""","""Austria""","""not available in demo dataset""","""Chrome""","""Android""","""mobile""","""google""",,"""YouTube Twill Cap""","""Home/Shop by Brand/YouTube/"""


In [179]:
df.describe()

statistic,date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,str,f64,str,str,str,str,str,str,str,str,str,str,str
"""count""","""49512""",49512.0,"""49512""","""49512""","""49512""","""49512""","""49512""","""49512""","""49512""","""49512""","""462""","""49512""","""49512"""
"""null_count""","""0""",0.0,"""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""49050""","""0""","""0"""
"""mean""",,1501500000.0,,,,,,,,,,,
"""std""",,21502.167394,,,,,,,,,,,
"""min""","""20170731""",1501500000.0,"""0002457163364254438""","""Australasia""","""Algeria""","""(not set)""","""Amazon Silk""","""(not set)""","""desktop""","""(direct)""","""ORD201707311786""","""1 oz Hand Sanitizer""","""${escCatTitle}"""
"""25%""",,1501500000.0,,,,,,,,,,,
"""50%""",,1501500000.0,,,,,,,,,,,
"""75%""",,1501500000.0,,,,,,,,,,,
"""max""","""20170731""",1501600000.0,"""9997362993085245352""","""Western Europe""","""Vietnam""","""not available in demo dataset""","""YaBrowser""","""iOS""","""tablet""","""youtube.com""","""ORD201707312663""","""YouTube Youth Short Sleeve Tee…","""Wearables/Men's T-Shirts/"""


In [180]:
# Number of null values in each column.
df.null_count()

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,49050,0,0


In [181]:
# Rows that belong to a transaction. Missing IDs are real nulls, not the
# string 'null', so nullness is tested explicitly; the previous `!= 'null'`
# comparison only worked because a comparison with null yields null and
# `filter` drops such rows.
df.filter(pl.col('transactionId').is_not_null())

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,i64,str,str,str,str,str,str,str,str,str,str,str
"""20170731""",1501525374,"""186231215995844689""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707311786""","""Google Men's Bike Short Sleeve…","""Apparel"""
"""20170731""",1501525374,"""186231215995844689""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707311786""","""Google Men's Bike Short Sleeve…","""Apparel"""
"""20170731""",1501549028,"""0345672699449577691""","""Northern America""","""United States""","""Kirkland""","""Chrome""","""Chrome OS""","""desktop""","""(direct)""","""ORD201707312107""","""Google Blackout Cap""","""Headgear"""
"""20170731""",1501549028,"""0345672699449577691""","""Northern America""","""United States""","""Kirkland""","""Chrome""","""Chrome OS""","""desktop""","""(direct)""","""ORD201707312107""","""Google Women's Vintage Hero Te…","""Apparel"""
"""20170731""",1501549028,"""0345672699449577691""","""Northern America""","""United States""","""Kirkland""","""Chrome""","""Chrome OS""","""desktop""","""(direct)""","""ORD201707312107""","""Google Men's Long & Lean Tee C…","""Apparel"""
…,…,…,…,…,…,…,…,…,…,…,…,…
"""20170731""",1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Google 25 oz Red Stainless Ste…","""Drinkware"""
"""20170731""",1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Android 17oz Stainless Steel S…","""Drinkware"""
"""20170731""",1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Android Men's Short Sleeve Her…","""Apparel"""
"""20170731""",1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Android Men's Take Charge Shor…","""Apparel"""


In [182]:
# Parse the YYYYMMDD string column into a proper Date column.
parsed_date = pl.col('date').str.to_date(format="%Y%m%d")
df_cleaned = df.with_columns(parsed_date)
df_cleaned.head()

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
date,i64,str,str,str,str,str,str,str,str,str,str,str
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Lunch Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Electronics Accessory Pouch""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Sport Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Slim Utility Travel Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Suitcase Organizer Cubes""","""Home/Bags/More Bags/"""


In [183]:
# Replace the order ID with a purchase flag: 1 when the row carries a
# transaction ID, 0 when it is null. Nullness is tested explicitly because
# missing IDs are real nulls, not the string 'null'.
# NOTE: `transactionId` is overwritten in place, so re-running this cell
# without first re-running the date-parsing cell that creates `df_cleaned`
# would flag every row as 1.
df_cleaned = df_cleaned.with_columns(
    pl.when(pl.col('transactionId').is_not_null())
    .then(1)
    .otherwise(0)
    .alias("transactionId")
)

# Encoding the DataFrame

In [184]:
# Encode the frame with `target_encoding` (not defined in this notebook;
# presumably provided by the wildcard import from `preprocessing`), persist
# the result as CSV, and display it.
# NOTE(review): the raw `df` is passed here, not the `df_cleaned` frame built
# above — confirm this is intended. Judging by the displayed output, the
# helper appears to derive the 0/1 `transactionId` flag itself.
encoded_df = target_encoding(df)
encoded_df.write_csv('../data/encoded_df.csv')
encoded_df

fullVisitorId,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductCategory
str,f64,f64,f64,f64,f64,f64,i32,f64
"""2219384770970157334""",0.0,0.006208,0.001795,0.005962,0.011631,0.003665,0,0.0
"""2219384770970157334""",0.0,0.006208,0.001795,0.005962,0.011631,0.003665,0,0.0
"""2219384770970157334""",0.0,0.006208,0.001795,0.005962,0.011631,0.003665,0,0.0
"""2219384770970157334""",0.0,0.006208,0.001795,0.005962,0.011631,0.003665,0,0.0
"""2219384770970157334""",0.0,0.006208,0.001795,0.005962,0.011631,0.003665,0,0.0
…,…,…,…,…,…,…,…,…
"""7483600664917507409""",0.014312,0.013889,0.011785,0.01692,0.011631,0.020215,1,0.254777
"""7483600664917507409""",0.014312,0.013889,0.011785,0.01692,0.011631,0.020215,1,0.262
"""7483600664917507409""",0.014312,0.013889,0.011785,0.01692,0.011631,0.020215,1,0.262
"""7483600664917507409""",0.014312,0.013889,0.011785,0.01692,0.011631,0.020215,1,0.262


In [185]:
# encoded_df = label_encoding(df)
# encoded_df

In [186]:
# One row per visitor; every encoded feature is collected into a list holding
# that visitor's values across all of their rows.
profile_features = [
    'country', 'city', 'browser', 'operatingSystem',
    'deviceCategory', 'source', 'transactionId', 'v2ProductCategory',
]
user_profiles = encoded_df.group_by(pl.col('fullVisitorId')).agg(
    [pl.col(feature) for feature in profile_features]
)
user_profiles

fullVisitorId,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductCategory
str,list[f64],list[f64],list[f64],list[f64],list[f64],list[f64],list[i32],list[f64]
"""856273082667959194""","[0.014312, 0.014312, … 0.014312]","[0.014785, 0.014785, … 0.014785]","[0.011785, 0.011785, … 0.011785]","[0.01692, 0.01692, … 0.01692]","[0.011631, 0.011631, … 0.011631]","[0.020215, 0.020215, … 0.020215]","[0, 0, … 0]","[0.0, 0.0, … 0.0]"
"""7051248816727707528""","[0.014312, 0.014312, … 0.014312]","[0.006208, 0.006208, … 0.006208]","[0.011785, 0.011785, … 0.011785]","[0.006482, 0.006482, … 0.006482]","[0.003792, 0.003792, … 0.003792]","[0.020215, 0.020215, … 0.020215]","[0, 0, … 0]","[0.0, 0.0, … 0.0]"
"""7445235885559107095""","[0.014312, 0.014312, … 0.014312]","[0.040404, 0.040404, … 0.040404]","[0.003759, 0.003759, … 0.003759]","[0.005962, 0.005962, … 0.005962]","[0.011631, 0.011631, … 0.011631]","[0.003665, 0.003665, … 0.003665]","[0, 0, … 1]","[0.0, 0.0, … 0.25]"
"""0418173267353770376""",[0.0],[0.0],[0.011785],[0.01692],[0.011631],[0.020215],[0],[0.0]
"""3781974561839956163""","[0.0, 0.0, … 0.0]","[0.006208, 0.006208, … 0.006208]","[0.011785, 0.011785, … 0.011785]","[0.005962, 0.005962, … 0.005962]","[0.011631, 0.011631, … 0.011631]","[0.003665, 0.003665, … 0.003665]","[0, 0, … 0]","[0.0, 0.0, … 0.0]"
…,…,…,…,…,…,…,…,…
"""0560109743223224342""","[0.0, 0.0, … 0.0]","[0.0, 0.0, … 0.0]","[0.011785, 0.011785, … 0.011785]","[0.006482, 0.006482, … 0.006482]","[0.003792, 0.003792, … 0.003792]","[0.020215, 0.020215, … 0.020215]","[0, 0, … 0]","[0.0, 0.0, … 0.0]"
"""2307682084695168667""","[0.0, 0.0, … 0.0]","[0.006208, 0.006208, … 0.006208]","[0.011785, 0.011785, … 0.011785]","[0.005962, 0.005962, … 0.005962]","[0.011631, 0.011631, … 0.011631]","[0.003665, 0.003665, … 0.003665]","[0, 0, … 0]","[0.0, 0.0, … 0.0]"
"""5382147745779575950""","[0.014312, 0.014312, … 0.014312]","[0.013889, 0.013889, … 0.013889]","[0.000797, 0.000797, … 0.000797]","[0.000566, 0.000566, … 0.000566]","[0.003792, 0.003792, … 0.003792]","[0.003665, 0.003665, … 0.003665]","[0, 0, … 0]","[0.0, 0.0, … 0.0]"
"""0226924999574625231""","[0.014312, 0.014312, … 0.014312]","[0.003949, 0.003949, … 0.003949]","[0.011785, 0.011785, … 0.011785]","[0.000566, 0.000566, … 0.000566]","[0.003792, 0.003792, … 0.003792]","[0.020215, 0.020215, … 0.020215]","[0, 0, … 0]","[0.0, 0.0, … 0.0]"


In [239]:
# Distinct visitors with at least one row flagged as a transaction.
purchasing_visitors = (
    encoded_df
    .filter(pl.col('transactionId') != 0)
    .select(pl.col('fullVisitorId'))
    .unique()
)

# Run the evaluation pipeline once per purchasing visitor and average the
# per-visitor scores.
all_recs = [
    recommendation_eval_pipeline(visitor_id, user_profiles, encoded_df, df)
    for visitor_id in purchasing_visitors["fullVisitorId"].to_list()
]
np.mean(all_recs)

0.21583333333333332

In [241]:
purchasing_visitors

fullVisitorId
str
"""8530723892127582602"""
"""4950411203281265700"""
"""7188768914363923661"""
"""527426783375720325"""
"""7463172420271311409"""
…
"""4952396078130707023"""
"""5687667730920600613"""
"""0280589015308186459"""
"""186231215995844689"""


In [242]:
# Spot-check the evaluation pipeline for a single visitor.
# NOTE(review): the visitor ID is hard-coded — confirm it is present in the
# current query result before relying on this value.
recommendation_eval_pipeline("2766162501835274072", user_profiles, encoded_df, df)

0.25

In [None]:
# NOTE(review): this writes the raw query result `df`, not `df_cleaned`, even
# though the file is named "cleaned" — confirm which frame should be persisted.
df.write_csv("../data/cleaned_google_analytics.csv")

### Data Pre-processing Pipeline

In [None]:
def preprocessing_pipeline(df: pl.DataFrame) -> pl.DataFrame:
    """Parse dates, encode, build per-visitor profiles, and produce recommendations.

    The helpers `encoding`, `calculate_similarity` and `recommendation` are not
    defined in this notebook; presumably they come from the wildcard import of
    the local `preprocessing` module (TODO confirm).

    Side effect: the date-parsed frame is written to
    "../data/cleaned_google_analytics.csv".

    Args:
        df: frame with a string `date` column in YYYYMMDD form; the encoded
            frame derived from it must contain `fullVisitorId`, `country`,
            `browser`, `transactionId` and `v2ProductName`.

    Returns:
        Whatever `recommendation` returns for the date-parsed frame and the
        similarity items sorted in ascending order of their value
        (annotated as a pl.DataFrame; not verifiable from this notebook).
    """
    dated_df = df.with_columns(pl.col('date').str.strptime(pl.Date, format="%Y%m%d"))
    encoded = encoding(dated_df)

    # Collect each visitor's encoded values into one list per feature.
    profiles = encoded.group_by(pl.col('fullVisitorId')).agg(
        pl.col('country'),
        pl.col('browser'),
        pl.col('transactionId'),
        pl.col('v2ProductName'),
    )

    # Numeric views without the visitor ID, as passed to the similarity helper.
    profile_matrix = profiles.drop('fullVisitorId').to_numpy()
    activity_matrix = encoded.drop('fullVisitorId').to_numpy()

    similarity_by_key = calculate_similarity(profiles, profile_matrix, activity_matrix)
    # A list of (key, similarity) pairs, lowest similarity first.
    ranked_pairs = sorted(similarity_by_key.items(), key=lambda pair: pair[1])

    dated_df.write_csv("../data/cleaned_google_analytics.csv")
    return recommendation(dated_df, ranked_pairs)

In [None]:
# preprocessing_pipeline(df)

# Conclusions 
- TODO: summarise the data-quality findings and the recommendation evaluation results.