# Dataset Cleaning 

The main objectives of this notebook are: 
- Import dataset 
- Get familiar with the data 
- Determine if there are data quality issues 
- Resolve any data quality issues 

In [424]:
import polars as pl 
import polars.selectors as cs 
from google.cloud import bigquery
import numpy as np 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder

In [425]:
pl.Config.set_tbl_width_chars(200)

polars.config.Config

In [426]:
# Create client object
client = bigquery.Client()

In [427]:
QUERY = ("SELECT `date`, `visitStartTime`, `fullVisitorId`, `geoNetwork`.`subContinent`, `geoNetwork`.`country`,"
        "`geoNetwork`.`city`,`device`.`browser`, `device`.`operatingSystem`,`device`.`deviceCategory`,"
        "`trafficSource`.`source`, `h`.`item`.`transactionId`,`hp`.`v2ProductName`,`hp`.`v2ProductCategory` "
        "FROM `bigquery-public-data.google_analytics_sample.ga_sessions_20170731`, UNNEST(hits) AS h,"
        "UNNEST(h.product) AS hp;")

In [428]:
run_query = client.query(QUERY)

In [429]:
df = pl.from_arrow(run_query.to_arrow())



In [430]:
df.head()

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,i64,str,str,str,str,str,str,str,str,str,str,str
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Lunch Bag""","""Home/Bags/More Bags/"""
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Electronics Accessory Pouch""","""Home/Bags/More Bags/"""
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Sport Bag""","""Home/Bags/More Bags/"""
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Slim Utility Travel Bag""","""Home/Bags/More Bags/"""
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Suitcase Organizer Cubes""","""Home/Bags/More Bags/"""


In [431]:
df.shape

(49512, 13)

In [432]:
df.columns

['date',
 'visitStartTime',
 'fullVisitorId',
 'subContinent',
 'country',
 'city',
 'browser',
 'operatingSystem',
 'deviceCategory',
 'source',
 'transactionId',
 'v2ProductName',
 'v2ProductCategory']

In [433]:
df.sample()

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,i64,str,str,str,str,str,str,str,str,str,str,str
"""20170731""",1501545607,"""0468885122926227782""","""Northern America""","""United States""","""Los Angeles""","""Chrome""","""Android""","""mobile""","""google""",,"""YouTube Twill Cap""","""Home/Shop by Brand/YouTube/"""


In [434]:
df.describe()

statistic,date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,str,f64,str,str,str,str,str,str,str,str,str,str,str
"""count""","""49512""",49512.0,"""49512""","""49512""","""49512""","""49512""","""49512""","""49512""","""49512""","""49512""","""462""","""49512""","""49512"""
"""null_count""","""0""",0.0,"""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""49050""","""0""","""0"""
"""mean""",,1501500000.0,,,,,,,,,,,
"""std""",,21502.167394,,,,,,,,,,,
"""min""","""20170731""",1501500000.0,"""0002457163364254438""","""Australasia""","""Algeria""","""(not set)""","""Amazon Silk""","""(not set)""","""desktop""","""(direct)""","""ORD201707311786""","""1 oz Hand Sanitizer""","""${escCatTitle}"""
"""25%""",,1501500000.0,,,,,,,,,,,
"""50%""",,1501500000.0,,,,,,,,,,,
"""75%""",,1501500000.0,,,,,,,,,,,
"""max""","""20170731""",1501600000.0,"""9997362993085245352""","""Western Europe""","""Vietnam""","""not available in demo dataset""","""YaBrowser""","""iOS""","""tablet""","""youtube.com""","""ORD201707312663""","""YouTube Youth Short Sleeve Tee…","""Wearables/Men's T-Shirts/"""


In [435]:
df.select(pl.all().is_null().sum())

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,49050,0,0


In [436]:
df.filter(pl.col('transactionId')!='null')

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,i64,str,str,str,str,str,str,str,str,str,str,str
"""20170731""",1501525374,"""186231215995844689""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707311786""","""Google Men's Bike Short Sleeve…","""Apparel"""
"""20170731""",1501525374,"""186231215995844689""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707311786""","""Google Men's Bike Short Sleeve…","""Apparel"""
"""20170731""",1501549028,"""0345672699449577691""","""Northern America""","""United States""","""Kirkland""","""Chrome""","""Chrome OS""","""desktop""","""(direct)""","""ORD201707312107""","""Google Blackout Cap""","""Headgear"""
"""20170731""",1501549028,"""0345672699449577691""","""Northern America""","""United States""","""Kirkland""","""Chrome""","""Chrome OS""","""desktop""","""(direct)""","""ORD201707312107""","""Google Women's Vintage Hero Te…","""Apparel"""
"""20170731""",1501549028,"""0345672699449577691""","""Northern America""","""United States""","""Kirkland""","""Chrome""","""Chrome OS""","""desktop""","""(direct)""","""ORD201707312107""","""Google Men's Long & Lean Tee C…","""Apparel"""
…,…,…,…,…,…,…,…,…,…,…,…,…
"""20170731""",1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Google 25 oz Red Stainless Ste…","""Drinkware"""
"""20170731""",1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Android 17oz Stainless Steel S…","""Drinkware"""
"""20170731""",1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Android Men's Short Sleeve Her…","""Apparel"""
"""20170731""",1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Android Men's Take Charge Shor…","""Apparel"""


In [437]:
df_cleaned = df.with_columns(pl.col('date').str.strptime(pl.Date, format="%Y%m%d"))
df_cleaned.head()

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
date,i64,str,str,str,str,str,str,str,str,str,str,str
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Lunch Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Electronics Accessory Pouch""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Sport Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Slim Utility Travel Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Suitcase Organizer Cubes""","""Home/Bags/More Bags/"""


In [438]:
df_cleaned = df_cleaned.with_columns(pl.when(pl.col('transactionId') != 'null').then(1).otherwise(0).alias("transactionId"))
df_cleaned

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
date,i64,str,str,str,str,str,str,str,str,i32,str,str
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",0,"""Google Lunch Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",0,"""Electronics Accessory Pouch""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",0,"""Sport Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",0,"""Google Slim Utility Travel Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",0,"""Suitcase Organizer Cubes""","""Home/Bags/More Bags/"""
…,…,…,…,…,…,…,…,…,…,…,…,…
2017-07-31,1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""",1,"""Android 17oz Stainless Steel S…","""Drinkware"""
2017-07-31,1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""",1,"""Android Men's Short Sleeve Her…","""Apparel"""
2017-07-31,1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""",1,"""Android Men's Take Charge Shor…","""Apparel"""
2017-07-31,1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""",1,"""Google Women's Scoop Neck Tee …","""Apparel"""


In [439]:
encoding = df_cleaned.filter(pl.col('country') == "United States").select(
    pl.col('fullVisitorId'),pl.col('country'), pl.col('browser'),
    pl.col('transactionId'),pl.col('v2ProductName')
)
encoding

fullVisitorId,country,browser,transactionId,v2ProductName
str,str,str,i32,str
"""9308310352918219134""","""United States""","""Chrome""",0,"""Google Snapback Hat Black"""
"""9308310352918219134""","""United States""","""Chrome""",0,"""Android Wool Heather Cap Heath…"
"""9308310352918219134""","""United States""","""Chrome""",0,"""Google Blackout Cap"""
"""9308310352918219134""","""United States""","""Chrome""",0,"""Google 5-Panel Snapback Cap"""
"""9308310352918219134""","""United States""","""Chrome""",0,"""Android 5-Panel Low Cap"""
…,…,…,…,…
"""7483600664917507409""","""United States""","""Chrome""",1,"""Android 17oz Stainless Steel S…"
"""7483600664917507409""","""United States""","""Chrome""",1,"""Android Men's Short Sleeve Her…"
"""7483600664917507409""","""United States""","""Chrome""",1,"""Android Men's Take Charge Shor…"
"""7483600664917507409""","""United States""","""Chrome""",1,"""Google Women's Scoop Neck Tee …"


In [440]:
canada = df_cleaned.filter(pl.col('country') == "Canada").select(
    pl.col('fullVisitorId'),pl.col('country'), pl.col('browser'),
    pl.col('transactionId'),pl.col('v2ProductName')
)
# canada = canada.with_columns(pl.col('transactionId').mean().alias('transactionId'))
canada

fullVisitorId,country,browser,transactionId,v2ProductName
str,str,str,i32,str
"""6933896823600086395""","""Canada""","""Chrome""",0,"""Gift Card- $100.00"""
"""6933896823600086395""","""Canada""","""Chrome""",0,"""Gift Card - $25.00"""
"""6933896823600086395""","""Canada""","""Chrome""",0,"""Gift Card - $250.00"""
"""6933896823600086395""","""Canada""","""Chrome""",0,"""Gift Card - $50.00"""
"""7700062233308774111""","""Canada""","""Chrome""",0,"""Google Men's Long Sleeve Ragla…"
…,…,…,…,…
"""7444499703471460562""","""Canada""","""Chrome""",0,"""Google Women's Quilted Insulat…"
"""7444499703471460562""","""Canada""","""Chrome""",0,"""Google Women's Quilted Insulat…"
"""7444499703471460562""","""Canada""","""Chrome""",0,"""Google Women's 1/4 Zip Jacket …"
"""7444499703471460562""","""Canada""","""Chrome""",0,"""Google Women's Performance Ful…"


In [441]:
new_df = encoding.vstack(canada)
new_df

fullVisitorId,country,browser,transactionId,v2ProductName
str,str,str,i32,str
"""9308310352918219134""","""United States""","""Chrome""",0,"""Google Snapback Hat Black"""
"""9308310352918219134""","""United States""","""Chrome""",0,"""Android Wool Heather Cap Heath…"
"""9308310352918219134""","""United States""","""Chrome""",0,"""Google Blackout Cap"""
"""9308310352918219134""","""United States""","""Chrome""",0,"""Google 5-Panel Snapback Cap"""
"""9308310352918219134""","""United States""","""Chrome""",0,"""Android 5-Panel Low Cap"""
…,…,…,…,…
"""7444499703471460562""","""Canada""","""Chrome""",0,"""Google Women's Quilted Insulat…"
"""7444499703471460562""","""Canada""","""Chrome""",0,"""Google Women's Quilted Insulat…"
"""7444499703471460562""","""Canada""","""Chrome""",0,"""Google Women's 1/4 Zip Jacket …"
"""7444499703471460562""","""Canada""","""Chrome""",0,"""Google Women's Performance Ful…"


In [442]:
new_df = new_df.with_columns(pl.when(pl.col('country') != 'United States').then(1).otherwise(0).alias("country"))
new_df

fullVisitorId,country,browser,transactionId,v2ProductName
str,i32,str,i32,str
"""9308310352918219134""",0,"""Chrome""",0,"""Google Snapback Hat Black"""
"""9308310352918219134""",0,"""Chrome""",0,"""Android Wool Heather Cap Heath…"
"""9308310352918219134""",0,"""Chrome""",0,"""Google Blackout Cap"""
"""9308310352918219134""",0,"""Chrome""",0,"""Google 5-Panel Snapback Cap"""
"""9308310352918219134""",0,"""Chrome""",0,"""Android 5-Panel Low Cap"""
…,…,…,…,…
"""7444499703471460562""",1,"""Chrome""",0,"""Google Women's Quilted Insulat…"
"""7444499703471460562""",1,"""Chrome""",0,"""Google Women's Quilted Insulat…"
"""7444499703471460562""",1,"""Chrome""",0,"""Google Women's 1/4 Zip Jacket …"
"""7444499703471460562""",1,"""Chrome""",0,"""Google Women's Performance Ful…"


In [443]:
new_df.select(pl.col('v2ProductName')).unique()

v2ProductName
str
"""Android Women's Long Sleeve Bl…"
"""Colored Pencil Set"""
"""Android RFID Journal"""
"""Google Laptop Backpack"""
"""YouTube Youth Short Sleeve Tee…"
…
"""Google Infant Short Sleeve Tee…"
"""Gift Card - $250.00"""
"""Google Women's Long Sleeve Ble…"
"""Waze Men's Typography Short Sl…"


In [444]:
le = LabelEncoder()
le.fit(new_df.select(pl.col('v2ProductName')))

  y = column_or_1d(y, warn=True)


In [445]:
encoded_products = le.transform(new_df.select(pl.col('v2ProductName')))
encoded_products

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


array([122,  37,  66, ..., 136, 145, 166])

In [446]:
new_df= new_df.with_columns(pl.Series('v2ProductName', encoded_products))
new_df

fullVisitorId,country,browser,transactionId,v2ProductName
str,i32,str,i32,i32
"""9308310352918219134""",0,"""Chrome""",0,122
"""9308310352918219134""",0,"""Chrome""",0,37
"""9308310352918219134""",0,"""Chrome""",0,66
"""9308310352918219134""",0,"""Chrome""",0,62
"""9308310352918219134""",0,"""Chrome""",0,9
…,…,…,…,…
"""7444499703471460562""",1,"""Chrome""",0,146
"""7444499703471460562""",1,"""Chrome""",0,147
"""7444499703471460562""",1,"""Chrome""",0,136
"""7444499703471460562""",1,"""Chrome""",0,145


In [447]:
le.fit(df.select(pl.col('browser')))

  y = column_or_1d(y, warn=True)


In [448]:
encoded_products = le.transform(new_df.select(pl.col('browser')))
encoded_products

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


array([2, 2, 2, ..., 2, 2, 2])

In [449]:
product_vector= new_df.with_columns(pl.Series('browser', encoded_products))
product_vector

fullVisitorId,country,browser,transactionId,v2ProductName
str,i32,i32,i32,i32
"""9308310352918219134""",0,2,0,122
"""9308310352918219134""",0,2,0,37
"""9308310352918219134""",0,2,0,66
"""9308310352918219134""",0,2,0,62
"""9308310352918219134""",0,2,0,9
…,…,…,…,…
"""7444499703471460562""",1,2,0,146
"""7444499703471460562""",1,2,0,147
"""7444499703471460562""",1,2,0,136
"""7444499703471460562""",1,2,0,145


In [450]:
user_profiles = product_vector.group_by(pl.col('fullVisitorId')).agg(
    pl.col('country'), pl.col('browser'),
    pl.col('transactionId'), pl.col('v2ProductName')
)
user_profiles

fullVisitorId,country,browser,transactionId,v2ProductName
str,list[i32],list[i32],list[i32],list[i32]
"""4428761147737419762""","[1, 1, … 1]","[2, 2, … 2]","[0, 0, … 0]","[200, 90, … 120]"
"""4550925942700572899""","[0, 0, … 0]","[10, 10, … 10]","[0, 0, … 0]","[216, 2, … 215]"
"""0599917515908848884""","[0, 0, … 0]","[2, 2, … 2]","[0, 0, … 0]","[170, 169, … 172]"
"""2632632793650115212""","[0, 0, … 0]","[10, 10, … 10]","[0, 0, … 0]","[37, 122, … 17]"
"""5704384357665807794""","[0, 0, … 0]","[10, 10, … 10]","[0, 0, … 0]","[7, 28, … 57]"
…,…,…,…,…
"""2305866137091677651""","[1, 1, … 1]","[2, 2, … 2]","[0, 0, … 0]","[95, 16, … 17]"
"""2941670814960377669""","[0, 0, … 0]","[4, 4, … 4]","[0, 0, … 0]","[48, 74, … 185]"
"""4242697711413375486""","[1, 1, … 1]","[2, 2, … 2]","[0, 0, … 0]","[95, 16, … 113]"
"""0737779921228462840""","[0, 0, … 0]","[10, 10, … 10]","[0, 0, … 0]","[95, 132, … 109]"


In [451]:
user_profiles_without_id = user_profiles.drop('fullVisitorId')
user_profiles_without_id

country,browser,transactionId,v2ProductName
list[i32],list[i32],list[i32],list[i32]
"[1, 1, … 1]","[2, 2, … 2]","[0, 0, … 0]","[200, 90, … 120]"
"[0, 0, … 0]","[10, 10, … 10]","[0, 0, … 0]","[216, 2, … 215]"
"[0, 0, … 0]","[2, 2, … 2]","[0, 0, … 0]","[170, 169, … 172]"
"[0, 0, … 0]","[10, 10, … 10]","[0, 0, … 0]","[37, 122, … 17]"
"[0, 0, … 0]","[10, 10, … 10]","[0, 0, … 0]","[7, 28, … 57]"
…,…,…,…
"[1, 1, … 1]","[2, 2, … 2]","[0, 0, … 0]","[95, 16, … 17]"
"[0, 0, … 0]","[4, 4, … 4]","[0, 0, … 0]","[48, 74, … 185]"
"[1, 1, … 1]","[2, 2, … 2]","[0, 0, … 0]","[95, 16, … 113]"
"[0, 0, … 0]","[10, 10, … 10]","[0, 0, … 0]","[95, 132, … 109]"


In [452]:
user_profiles_without_id = user_profiles_without_id.to_numpy()
user_profiles_without_id

array([[array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
        array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]),
        array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
        array([200,  90,  48, 131,  44,  67, 195, 121, 198, 199,  84, 120])],
       [array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
        array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10]),
        array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
        array([216,   2, 224, 217, 222, 223, 221, 220, 219, 226, 227, 215])],
       [array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
        array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]),
        array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
        array([170, 169, 171, 228, 172, 170, 169, 171, 228, 172, 170, 169, 171,
               228, 172])                                                      ],
       ...,
       [array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
               1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [453]:
product_vector = product_vector.drop('fullVisitorId')
product_vector

country,browser,transactionId,v2ProductName
i32,i32,i32,i32
0,2,0,122
0,2,0,37
0,2,0,66
0,2,0,62
0,2,0,9
…,…,…,…
1,2,0,146
1,2,0,147
1,2,0,136
1,2,0,145


In [454]:
product_vector = product_vector.to_numpy()
product_vector

array([[  0,   2,   0, 122],
       [  0,   2,   0,  37],
       [  0,   2,   0,  66],
       ...,
       [  1,   2,   0, 136],
       [  1,   2,   0, 145],
       [  1,   2,   0, 166]])

In [455]:
user_profile_avg_similarity = []
for i in range(0,len(user_profiles_without_id)):
    test_user_profile = np.vstack(user_profiles_without_id[i]).T
    avg_similarity = cosine_similarity(test_user_profile, [product_vector[-1]])
    avg_similarity = avg_similarity.mean()
    user_profile_avg_similarity.append(avg_similarity)
print(user_profile_avg_similarity)

[0.9998638618417205, 0.9334705172925682, 0.9999807424512039, 0.9288505188429513, 0.7692918475160947, 0.9999615576732535, 0.9762760255937826, 0.9334705172925682, 0.9999775686207473, 0.9443423726853656, 0.9997803132660462, 0.9374178189119502, 0.9999828339598629, 0.9712231268136773, 0.9705956262609119, 0.9992401010380814, 0.997794242202746, 0.9095116340847582, 0.9260569743029926, 0.9976826527529697, 0.9555000665432053, 0.9856015616088614, 0.9961464144471645, 0.9988169819047158, 0.9919061449429062, 0.9995164644929263, 0.9637696560237927, 0.9334705172925682, 0.9334705172925682, 0.9985000876502246, 0.999798368780159, 0.9949294556018465, 0.9334705172925682, 0.9987856716911532, 0.9904089236389223, 0.9374178189119502, 0.9998052021220544, 0.9762760255937826, 0.9629680918405787, 0.979125056933101, 0.9993967623258488, 0.9762760255937826, 0.9997905814752636, 0.9977023750381417, 0.9334705172925682, 0.9962991355812697, 0.9999801505007438, 0.9874477719741955, 0.999964839698852, 0.9975240781792994, 0.9

In [456]:
user_profiles

fullVisitorId,country,browser,transactionId,v2ProductName
str,list[i32],list[i32],list[i32],list[i32]
"""4428761147737419762""","[1, 1, … 1]","[2, 2, … 2]","[0, 0, … 0]","[200, 90, … 120]"
"""4550925942700572899""","[0, 0, … 0]","[10, 10, … 10]","[0, 0, … 0]","[216, 2, … 215]"
"""0599917515908848884""","[0, 0, … 0]","[2, 2, … 2]","[0, 0, … 0]","[170, 169, … 172]"
"""2632632793650115212""","[0, 0, … 0]","[10, 10, … 10]","[0, 0, … 0]","[37, 122, … 17]"
"""5704384357665807794""","[0, 0, … 0]","[10, 10, … 10]","[0, 0, … 0]","[7, 28, … 57]"
…,…,…,…,…
"""2305866137091677651""","[1, 1, … 1]","[2, 2, … 2]","[0, 0, … 0]","[95, 16, … 17]"
"""2941670814960377669""","[0, 0, … 0]","[4, 4, … 4]","[0, 0, … 0]","[48, 74, … 185]"
"""4242697711413375486""","[1, 1, … 1]","[2, 2, … 2]","[0, 0, … 0]","[95, 16, … 113]"
"""0737779921228462840""","[0, 0, … 0]","[10, 10, … 10]","[0, 0, … 0]","[95, 132, … 109]"


In [457]:
for i in range(0,len(user_profiles)):
    print(user_profiles[i].select(pl.col('fullVisitorId')))

shape: (1, 1)
┌─────────────────────┐
│ fullVisitorId       │
│ ---                 │
│ str                 │
╞═════════════════════╡
│ 4428761147737419762 │
└─────────────────────┘
shape: (1, 1)
┌─────────────────────┐
│ fullVisitorId       │
│ ---                 │
│ str                 │
╞═════════════════════╡
│ 4550925942700572899 │
└─────────────────────┘
shape: (1, 1)
┌─────────────────────┐
│ fullVisitorId       │
│ ---                 │
│ str                 │
╞═════════════════════╡
│ 0599917515908848884 │
└─────────────────────┘
shape: (1, 1)
┌─────────────────────┐
│ fullVisitorId       │
│ ---                 │
│ str                 │
╞═════════════════════╡
│ 2632632793650115212 │
└─────────────────────┘
shape: (1, 1)
┌─────────────────────┐
│ fullVisitorId       │
│ ---                 │
│ str                 │
╞═════════════════════╡
│ 5704384357665807794 │
└─────────────────────┘
shape: (1, 1)
┌─────────────────────┐
│ fullVisitorId       │
│ ---                 │
│ st

In [475]:
user_profile_avg_similarity_dict = {}
for i in range(0,len(user_profiles)):
    test_user_profile = np.vstack(user_profiles_without_id[i]).T
    avg_similarity = cosine_similarity(test_user_profile, [product_vector[-1]])
    avg_similarity = avg_similarity.mean()
    user_profile_avg_similarity_dict[f'{user_profiles['fullVisitorId'][i]}'] = avg_similarity
print(user_profile_avg_similarity_dict)

{'4428761147737419762': 0.9998638618417205, '4550925942700572899': 0.9334705172925682, '0599917515908848884': 0.9999807424512039, '2632632793650115212': 0.9288505188429513, '5704384357665807794': 0.7692918475160947, '0815047945706399620': 0.9999615576732535, '2010276049338642648': 0.9762760255937826, '5598259219731501481': 0.9334705172925682, '8240279401720743464': 0.9999775686207473, '9081814713194703635': 0.9443423726853656, '4040924747453387633': 0.9997803132660462, '0530654477551060890': 0.9374178189119502, '7700062233308774111': 0.9999828339598629, '1871001793558067681': 0.9712231268136773, '5905418516094702044': 0.9705956262609119, '5077259965444145629': 0.9992401010380814, '7023319623362041693': 0.997794242202746, '0158674618686560126': 0.9095116340847582, '646303703490756790': 0.9260569743029926, '3852713398226430344': 0.9976826527529697, '5878462867565067758': 0.9555000665432053, '9412801824032373062': 0.9856015616088614, '8844738260383034790': 0.9961464144471645, '39782740996

In [479]:
sorted_dict = sorted(user_profile_avg_similarity_dict.items(), key=lambda x: x[1])

In [484]:
sorted_dict[-1]

('7958854055537008406', 0.9999965150667732)

In [483]:
sorted_dict[-1][-0]

'7958854055537008406'

In [488]:
df.filter((pl.col('fullVisitorId') == sorted_dict[-1][-0]))

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,i64,str,str,str,str,str,str,str,str,str,str,str
"""20170731""",1501506815,"""7958854055537008406""","""Northern America""","""Canada""","""Toronto""","""Chrome""","""Windows""","""desktop""","""(direct)""",,"""Waze Pack of 9 Decal Set""","""(not set)"""
"""20170731""",1501506815,"""7958854055537008406""","""Northern America""","""Canada""","""Toronto""","""Chrome""","""Windows""","""desktop""","""(direct)""",,"""Waze Baby on Board Window Deca…","""(not set)"""
"""20170731""",1501506815,"""7958854055537008406""","""Northern America""","""Canada""","""Toronto""","""Chrome""","""Windows""","""desktop""","""(direct)""",,"""Waze Mood Happy Window Decal""","""(not set)"""
"""20170731""",1501506815,"""7958854055537008406""","""Northern America""","""Canada""","""Toronto""","""Chrome""","""Windows""","""desktop""","""(direct)""",,"""Waze Mood Original Window Deca…","""(not set)"""
"""20170731""",1501506815,"""7958854055537008406""","""Northern America""","""Canada""","""Toronto""","""Chrome""","""Windows""","""desktop""","""(direct)""",,"""Waze Mood Ninja Window Decal""","""(not set)"""
…,…,…,…,…,…,…,…,…,…,…,…,…
"""20170731""",1501506815,"""7958854055537008406""","""Northern America""","""Canada""","""Toronto""","""Chrome""","""Windows""","""desktop""","""(direct)""",,"""Waze Dress Socks""","""(not set)"""
"""20170731""",1501506815,"""7958854055537008406""","""Northern America""","""Canada""","""Toronto""","""Chrome""","""Windows""","""desktop""","""(direct)""",,"""Waze Women's Short Sleeve Tee""","""(not set)"""
"""20170731""",1501506815,"""7958854055537008406""","""Northern America""","""Canada""","""Toronto""","""Chrome""","""Windows""","""desktop""","""(direct)""",,"""Waze Men's Short Sleeve Tee""","""(not set)"""
"""20170731""",1501506815,"""7958854055537008406""","""Northern America""","""Canada""","""Toronto""","""Chrome""","""Windows""","""desktop""","""(direct)""",,"""Waze Women's Typography Short …","""(not set)"""


In [487]:
df.filter((pl.col('fullVisitorId') == sorted_dict[-1][-0]) & (pl.col('transactionId') != 'null'))

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,i64,str,str,str,str,str,str,str,str,str,str,str


In [465]:
new_df.write_csv("../data/cleaned_google_analytics.csv")

In [466]:
# Need to split the data into subsets & perform the target encoding... k fold target encoding
# num_samples = len(new_df) // 8
# num_samples
# new_df = new_df.with_columns(pl.DataFrame({'kfold':np.repeat(np.arange(1, 9), num_samples)}))
# new_df

### Data Pre-processing Pipeline

In [467]:
def preprocessing_pipeline(df: pl.DataFrame) -> pl.DataFrame:
    """
    Args:
        df : pl.DataFrame

    Returns:
        pl.DataFrames: pre-processed dataframe
    """

    df = df.with_columns(pl.col('date').str.strptime(pl.Date, format="%Y%m%d"))
    df.write_csv("../data/cleaned_google_analytics.csv")
    return df 

In [468]:
preprocessing_pipeline(df)

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
date,i64,str,str,str,str,str,str,str,str,str,str,str
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Lunch Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Electronics Accessory Pouch""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Sport Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Slim Utility Travel Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Suitcase Organizer Cubes""","""Home/Bags/More Bags/"""
…,…,…,…,…,…,…,…,…,…,…,…,…
2017-07-31,1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Android 17oz Stainless Steel S…","""Drinkware"""
2017-07-31,1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Android Men's Short Sleeve Her…","""Apparel"""
2017-07-31,1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Android Men's Take Charge Shor…","""Apparel"""
2017-07-31,1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Google Women's Scoop Neck Tee …","""Apparel"""


# Conclusions 
- 