# Dataset Cleaning 

The main objectives of this notebook are: 
- Import dataset 
- Get familiar with the data 
- Determine if there are data quality issues 
- Resolve any data quality issues 

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import polars as pl 
from google.cloud import bigquery
import numpy as np 
import sys 
import os 


# Manually add path to read from another folder 
path2add = os.path.normpath(
    os.path.abspath(os.path.join(os.path.dirname("__file__"), os.path.pardir, "utils"))
)
if not (path2add in sys.path):
    sys.path.append(path2add)

from preprocessing import *

In [4]:
pl.Config.set_tbl_width_chars(200)

polars.config.Config

In [5]:
# Create client object
client = bigquery.Client()

In [6]:
QUERY = ("SELECT `date`, `visitStartTime`, `fullVisitorId`, `geoNetwork`.`subContinent`, `geoNetwork`.`country`,"
        "`geoNetwork`.`city`,`device`.`browser`, `device`.`operatingSystem`,`device`.`deviceCategory`,"
        "`trafficSource`.`source`, `h`.`item`.`transactionId`,`hp`.`v2ProductName`,`hp`.`v2ProductCategory` "
        "FROM `bigquery-public-data.google_analytics_sample.ga_sessions_20170731`, UNNEST(hits) AS h,"
        "UNNEST(h.product) AS hp;")

In [7]:
run_query = client.query(QUERY)

In [8]:
df = pl.from_arrow(run_query.to_arrow())

In [9]:
df.head()

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,i64,str,str,str,str,str,str,str,str,str,str,str
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Lunch Bag""","""Home/Bags/More Bags/"""
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Electronics Accessory Pouch""","""Home/Bags/More Bags/"""
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Sport Bag""","""Home/Bags/More Bags/"""
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Slim Utility Travel Bag""","""Home/Bags/More Bags/"""
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Suitcase Organizer Cubes""","""Home/Bags/More Bags/"""


In [10]:
df.shape

(49512, 13)

In [11]:
df.columns

['date',
 'visitStartTime',
 'fullVisitorId',
 'subContinent',
 'country',
 'city',
 'browser',
 'operatingSystem',
 'deviceCategory',
 'source',
 'transactionId',
 'v2ProductName',
 'v2ProductCategory']

In [12]:
df.sample()

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,i64,str,str,str,str,str,str,str,str,str,str,str
"""20170731""",1501528632,"""526271526505583481""","""Northern Europe""","""United Kingdom""","""London""","""Chrome""","""Windows""","""desktop""","""google""",,"""YouTube Men's Vintage Henley""","""Home/Shop by Brand/YouTube/"""


In [13]:
df.describe()

statistic,date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,str,f64,str,str,str,str,str,str,str,str,str,str,str
"""count""","""49512""",49512.0,"""49512""","""49512""","""49512""","""49512""","""49512""","""49512""","""49512""","""49512""","""462""","""49512""","""49512"""
"""null_count""","""0""",0.0,"""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""49050""","""0""","""0"""
"""mean""",,1501500000.0,,,,,,,,,,,
"""std""",,21502.167394,,,,,,,,,,,
"""min""","""20170731""",1501500000.0,"""0002457163364254438""","""Australasia""","""Algeria""","""(not set)""","""Amazon Silk""","""(not set)""","""desktop""","""(direct)""","""ORD201707311786""","""1 oz Hand Sanitizer""","""${escCatTitle}"""
"""25%""",,1501500000.0,,,,,,,,,,,
"""50%""",,1501500000.0,,,,,,,,,,,
"""75%""",,1501500000.0,,,,,,,,,,,
"""max""","""20170731""",1501600000.0,"""9997362993085245352""","""Western Europe""","""Vietnam""","""not available in demo dataset""","""YaBrowser""","""iOS""","""tablet""","""youtube.com""","""ORD201707312663""","""YouTube Youth Short Sleeve Tee…","""Wearables/Men's T-Shirts/"""


In [14]:
df.select(pl.all().is_null().sum())

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,49050,0,0


In [15]:
df.filter(pl.col('transactionId') != 'null')

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,i64,str,str,str,str,str,str,str,str,str,str,str
"""20170731""",1501525374,"""186231215995844689""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707311786""","""Google Men's Bike Short Sleeve…","""Apparel"""
"""20170731""",1501525374,"""186231215995844689""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707311786""","""Google Men's Bike Short Sleeve…","""Apparel"""
"""20170731""",1501549028,"""0345672699449577691""","""Northern America""","""United States""","""Kirkland""","""Chrome""","""Chrome OS""","""desktop""","""(direct)""","""ORD201707312107""","""Google Blackout Cap""","""Headgear"""
"""20170731""",1501549028,"""0345672699449577691""","""Northern America""","""United States""","""Kirkland""","""Chrome""","""Chrome OS""","""desktop""","""(direct)""","""ORD201707312107""","""Google Women's Vintage Hero Te…","""Apparel"""
"""20170731""",1501549028,"""0345672699449577691""","""Northern America""","""United States""","""Kirkland""","""Chrome""","""Chrome OS""","""desktop""","""(direct)""","""ORD201707312107""","""Google Men's Long & Lean Tee C…","""Apparel"""
…,…,…,…,…,…,…,…,…,…,…,…,…
"""20170731""",1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Google 25 oz Red Stainless Ste…","""Drinkware"""
"""20170731""",1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Android 17oz Stainless Steel S…","""Drinkware"""
"""20170731""",1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Android Men's Short Sleeve Her…","""Apparel"""
"""20170731""",1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Android Men's Take Charge Shor…","""Apparel"""


In [16]:
df_cleaned = df.with_columns(pl.col('date').str.strptime(pl.Date, format="%Y%m%d"))
df_cleaned.head()

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
date,i64,str,str,str,str,str,str,str,str,str,str,str
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Lunch Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Electronics Accessory Pouch""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Sport Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Slim Utility Travel Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Suitcase Organizer Cubes""","""Home/Bags/More Bags/"""


In [17]:
df = df.filter(~pl.col('v2ProductCategory').str.contains('origCatName'))

In [18]:
df.write_csv('../data/data.csv')

# Encoding the df

In [19]:
entire_data = pl.read_csv("../data/google_analytics_data.csv",ignore_errors=True)

In [20]:
daily_visitors=entire_data.group_by(pl.col('date')).agg(pl.col('fullVisitorId').unique().count()).sort("date")
daily_visitors

date,fullVisitorId
i64,u32
20160801,837
20160802,921
20160803,1084
20160804,1252
20160805,1049
…,…
20170728,1425
20170729,1032
20170730,1174
20170731,1534


In [21]:
daily_purchases=entire_data.group_by(pl.col('date')).agg((pl.col("transactionId") != "null").count()).sort("date")
daily_purchases

date,transactionId
i64,u32
20160801,226
20160802,124
20160803,0
20160804,78
20160805,376
…,…
20170728,330
20170729,150
20170730,194
20170731,462


In [22]:
encoded_df = target_encoding(entire_data)
encoded_df.write_csv('../data/encoded_df.csv')
encoded_df

fullVisitorId,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductCategory
i64,f64,f64,f64,f64,f64,f64,i32,f64
741694360311983889,0.00003,0.003105,0.004717,0.002803,0.005017,0.00231,0,0.0
751716124732042493,0.005934,0.01018,0.001468,0.00104,0.001015,0.00231,0,0.0
3875753008535006311,0.005934,0.003105,0.000491,0.00104,0.001015,0.000824,0,0.0
,0.00003,0.000514,0.004717,0.006595,0.005017,0.00231,0,0.0
1321377067218362907,0.005934,0.003105,0.004717,0.001074,0.001015,0.00231,0,0.0
…,…,…,…,…,…,…,…,…
4499955707548721756,0.005934,0.011592,0.004717,0.00104,0.001015,0.00231,0,0.0
7699714430086919615,0.005934,0.004581,0.001468,0.00104,0.001015,0.00231,0,0.0
606143131872088967,0.000144,0.0,0.004717,0.001074,0.001015,0.005561,0,0.0
1935665364061088509,0.000121,0.0,0.004717,0.006595,0.005017,0.00231,0,0.0


In [23]:
# encoded_df = label_encoding(df)
# encoded_df

In [24]:
user_profiles = encoded_df.group_by(pl.col('fullVisitorId')).agg(
    pl.col('country'), pl.col('city'), 
    pl.col('browser'),pl.col('operatingSystem'),pl.col('deviceCategory'),
    pl.col('source'),pl.col('transactionId'),pl.col('v2ProductCategory')
)
user_profiles

fullVisitorId,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductCategory
i64,list[f64],list[f64],list[f64],list[f64],list[f64],list[f64],list[i32],list[f64]
7513999583179206180,"[0.000139, 0.000139, … 0.000139]","[0.000143, 0.000143, … 0.000143]","[0.004717, 0.004717, … 0.004717]","[0.008721, 0.008721, … 0.008721]","[0.005017, 0.005017, … 0.005017]","[0.005561, 0.005561, … 0.005561]","[0, 0, … 0]","[0.0, 0.0, … 0.0]"
58179579207013070,"[0.001917, 0.001917, … 0.001917]","[0.003105, 0.003105, … 0.003105]","[0.004717, 0.004717, … 0.004717]","[0.002803, 0.002803, … 0.002803]","[0.005017, 0.005017, … 0.005017]","[0.00231, 0.00231, … 0.00231]","[0, 0, … 0]","[0.0, 0.0, … 0.0]"
8906589127135298127,"[0.005934, 0.005934, … 0.005934]","[0.003105, 0.003105, … 0.003105]","[0.001468, 0.001468, … 0.001468]","[0.00104, 0.00104, … 0.00104]","[0.001015, 0.001015, … 0.001015]","[0.005561, 0.005561, … 0.00231]","[0, 0, … 0]","[0.0, 0.0, … 0.0]"
5880261448096819927,"[0.005934, 0.005934, … 0.005934]","[0.003105, 0.003105, … 0.003105]","[0.004717, 0.004717, … 0.004717]","[0.002803, 0.002803, … 0.002803]","[0.005017, 0.005017, … 0.005017]","[0.005561, 0.005561, … 0.005561]","[0, 0, … 0]","[0.0, 0.0, … 0.0]"
3582635577865598343,"[0.000331, 0.000331, … 0.000331]","[0.003105, 0.003105, … 0.003105]","[0.004717, 0.004717, … 0.004717]","[0.002803, 0.002803, … 0.002803]","[0.005017, 0.005017, … 0.005017]","[0.00231, 0.00231, … 0.00231]","[0, 0, … 0]","[0.016398, 0.016398, … 0.016398]"
…,…,…,…,…,…,…,…,…
8635504048844388696,"[0.000128, 0.000128, … 0.000128]","[0.003105, 0.003105, … 0.003105]","[0.001468, 0.001468, … 0.001468]","[0.00104, 0.00104, … 0.00104]","[0.001015, 0.001015, … 0.001015]","[0.005561, 0.005561, … 0.005561]","[0, 0, … 0]","[0.0, 0.0, … 0.0]"
6359330005069703105,"[0.005934, 0.005934, … 0.005934]","[0.004581, 0.004581, … 0.004581]","[0.004717, 0.004717, … 0.004717]","[0.004433, 0.004433, … 0.004433]","[0.005017, 0.005017, … 0.005017]","[0.005561, 0.005561, … 0.005561]","[0, 1, … 0]","[0.237429, 0.242536, … 0.23826]"
2768671868007492883,"[0.000349, 0.000349, … 0.000349]","[0.003401, 0.003401, … 0.003401]","[0.004717, 0.004717, … 0.004717]","[0.002803, 0.002803, … 0.002803]","[0.005017, 0.005017, … 0.005017]","[0.005561, 0.005561, … 0.005561]","[0, 0, … 0]","[0.0, 0.0, … 0.0]"
535780506994537509,"[0.000312, 0.000312, … 0.000312]","[0.003105, 0.003105, … 0.003105]","[0.002017, 0.002017, … 0.002017]","[0.006595, 0.006595, … 0.006595]","[0.005017, 0.005017, … 0.005017]","[0.005561, 0.005561, … 0.005561]","[0, 0, … 0]","[0.0, 0.0, … 0.0]"


In [25]:
# rec_stream = pl.DataFrame(recommendation_eval_pipeline("2703137619338184529",user_profiles, encoded_df, df))
# rec_stream.write_csv("../data/recommendations.csv")

In [26]:
purchasing_visitors=encoded_df.filter(pl.col('transactionId') != 0).select(pl.col('fullVisitorId')).unique()
purchasing_visitors.write_csv("../data/purchasing_visitors.csv")

In [27]:
entire_data = entire_data.with_columns(pl.col("fullVisitorId").cast(str))

In [28]:
user_profiles = user_profiles.with_columns(pl.col("fullVisitorId").cast(str))
user_profiles

fullVisitorId,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductCategory
str,list[f64],list[f64],list[f64],list[f64],list[f64],list[f64],list[i32],list[f64]
"""7513999583179206180""","[0.000139, 0.000139, … 0.000139]","[0.000143, 0.000143, … 0.000143]","[0.004717, 0.004717, … 0.004717]","[0.008721, 0.008721, … 0.008721]","[0.005017, 0.005017, … 0.005017]","[0.005561, 0.005561, … 0.005561]","[0, 0, … 0]","[0.0, 0.0, … 0.0]"
"""58179579207013070""","[0.001917, 0.001917, … 0.001917]","[0.003105, 0.003105, … 0.003105]","[0.004717, 0.004717, … 0.004717]","[0.002803, 0.002803, … 0.002803]","[0.005017, 0.005017, … 0.005017]","[0.00231, 0.00231, … 0.00231]","[0, 0, … 0]","[0.0, 0.0, … 0.0]"
"""8906589127135298127""","[0.005934, 0.005934, … 0.005934]","[0.003105, 0.003105, … 0.003105]","[0.001468, 0.001468, … 0.001468]","[0.00104, 0.00104, … 0.00104]","[0.001015, 0.001015, … 0.001015]","[0.005561, 0.005561, … 0.00231]","[0, 0, … 0]","[0.0, 0.0, … 0.0]"
"""5880261448096819927""","[0.005934, 0.005934, … 0.005934]","[0.003105, 0.003105, … 0.003105]","[0.004717, 0.004717, … 0.004717]","[0.002803, 0.002803, … 0.002803]","[0.005017, 0.005017, … 0.005017]","[0.005561, 0.005561, … 0.005561]","[0, 0, … 0]","[0.0, 0.0, … 0.0]"
"""3582635577865598343""","[0.000331, 0.000331, … 0.000331]","[0.003105, 0.003105, … 0.003105]","[0.004717, 0.004717, … 0.004717]","[0.002803, 0.002803, … 0.002803]","[0.005017, 0.005017, … 0.005017]","[0.00231, 0.00231, … 0.00231]","[0, 0, … 0]","[0.016398, 0.016398, … 0.016398]"
…,…,…,…,…,…,…,…,…
"""8635504048844388696""","[0.000128, 0.000128, … 0.000128]","[0.003105, 0.003105, … 0.003105]","[0.001468, 0.001468, … 0.001468]","[0.00104, 0.00104, … 0.00104]","[0.001015, 0.001015, … 0.001015]","[0.005561, 0.005561, … 0.005561]","[0, 0, … 0]","[0.0, 0.0, … 0.0]"
"""6359330005069703105""","[0.005934, 0.005934, … 0.005934]","[0.004581, 0.004581, … 0.004581]","[0.004717, 0.004717, … 0.004717]","[0.004433, 0.004433, … 0.004433]","[0.005017, 0.005017, … 0.005017]","[0.005561, 0.005561, … 0.005561]","[0, 1, … 0]","[0.237429, 0.242536, … 0.23826]"
"""2768671868007492883""","[0.000349, 0.000349, … 0.000349]","[0.003401, 0.003401, … 0.003401]","[0.004717, 0.004717, … 0.004717]","[0.002803, 0.002803, … 0.002803]","[0.005017, 0.005017, … 0.005017]","[0.005561, 0.005561, … 0.005561]","[0, 0, … 0]","[0.0, 0.0, … 0.0]"
"""535780506994537509""","[0.000312, 0.000312, … 0.000312]","[0.003105, 0.003105, … 0.003105]","[0.002017, 0.002017, … 0.002017]","[0.006595, 0.006595, … 0.006595]","[0.005017, 0.005017, … 0.005017]","[0.005561, 0.005561, … 0.005561]","[0, 0, … 0]","[0.0, 0.0, … 0.0]"


In [29]:
encoded_df = encoded_df.with_columns(pl.col("fullVisitorId").cast(str))

In [30]:
purchasing_visitors=encoded_df.filter(pl.col('transactionId') != 0).select(pl.col('fullVisitorId')).unique()
all_recs = []
for i in purchasing_visitors["fullVisitorId"].to_list():
    prec = recommendation_eval_pipeline(i, user_profiles, encoded_df, entire_data)
    all_recs.append(prec)
np.mean(all_recs)

KeyboardInterrupt: 

In [31]:
recommendation_eval_pipeline("2357073894289283929", user_profiles, encoded_df, entire_data)

KeyboardInterrupt: 

In [None]:
np.savetxt("../data/model_accuracy.csv",all_recs)

In [None]:
df.write_csv("../data/cleaned_google_analytics.csv")

In [None]:
recommendation_pipeline("2703137619338184529", user_profiles, encoded_df, entire_data)

['Waze', 'Headgear', 'Drinkware', 'Bottles', 'Drinkware']

In [None]:
recommendation_eval_pipeline("2703137619338184529", user_profiles, encoded_df, entire_data)

ComputeError: cannot compare string with numeric type (i64)

### Data Pre-processing Pipeline

In [None]:
def preprocessing_pipeline(df: pl.DataFrame) -> pl.DataFrame:
    """
    Args:
        df : pl.DataFrame

    Returns:
        pl.DataFrames: pre-processed dataframe
    """
    df = df.with_columns(pl.col('date').str.strptime(pl.Date, format="%Y%m%d"))
    encoded_df = encoding(df)
    
    user_profiles = encoded_df.group_by(pl.col('fullVisitorId')).agg(
        pl.col('country'), pl.col('browser'),
        pl.col('transactionId'), pl.col('v2ProductName')
    )
    user_profiles_no_id = user_profiles.drop('fullVisitorId')
    user_profiles_no_id = user_profiles_no_id.to_numpy()
    active_users = encoded_df.drop('fullVisitorId')
    active_users = active_users.to_numpy()

    similarities = calculate_similarity(user_profiles, user_profiles_no_id, active_users)
    sorted_dict = sorted(similarities.items(), key=lambda x: x[1])

    
    df.write_csv("../data/cleaned_google_analytics.csv")
    return recommendation(df,sorted_dict) 

In [None]:
# preprocessing_pipeline(df)

# Conclusions 
- 