# Dataset Cleaning 

The main objectives of this notebook are: 
- Import dataset 
- Get familiar with the data 
- Determine if there are data quality issues 
- Resolve any data quality issues 

In [420]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [421]:
import polars as pl 
from google.cloud import bigquery
import numpy as np 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
import sys 
import os 


# Manually add path to read from another folder 
path2add = os.path.normpath(
    os.path.abspath(os.path.join(os.path.dirname("__file__"), os.path.pardir, "utils"))
)
if not (path2add in sys.path):
    sys.path.append(path2add)

from preprocessing import *

In [422]:
pl.Config.set_tbl_width_chars(200)

polars.config.Config

In [423]:
# Create client object
client = bigquery.Client()

In [424]:
QUERY = ("SELECT `date`, `visitStartTime`, `fullVisitorId`, `geoNetwork`.`subContinent`, `geoNetwork`.`country`,"
        "`geoNetwork`.`city`,`device`.`browser`, `device`.`operatingSystem`,`device`.`deviceCategory`,"
        "`trafficSource`.`source`, `h`.`item`.`transactionId`,`hp`.`v2ProductName`,`hp`.`v2ProductCategory` "
        "FROM `bigquery-public-data.google_analytics_sample.ga_sessions_20170731`, UNNEST(hits) AS h,"
        "UNNEST(h.product) AS hp;")

In [425]:
run_query = client.query(QUERY)

In [426]:
df = pl.from_arrow(run_query.to_arrow())

In [427]:
df.head()

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,i64,str,str,str,str,str,str,str,str,str,str,str
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Lunch Bag""","""Home/Bags/More Bags/"""
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Electronics Accessory Pouch""","""Home/Bags/More Bags/"""
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Sport Bag""","""Home/Bags/More Bags/"""
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Slim Utility Travel Bag""","""Home/Bags/More Bags/"""
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Suitcase Organizer Cubes""","""Home/Bags/More Bags/"""


In [428]:
df.shape

(49512, 13)

In [429]:
df.columns

['date',
 'visitStartTime',
 'fullVisitorId',
 'subContinent',
 'country',
 'city',
 'browser',
 'operatingSystem',
 'deviceCategory',
 'source',
 'transactionId',
 'v2ProductName',
 'v2ProductCategory']

In [430]:
df.sample()

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,i64,str,str,str,str,str,str,str,str,str,str,str
"""20170731""",1501556238,"""7420300501523012460""","""Northern America""","""United States""","""New York""","""Chrome""","""Windows""","""desktop""","""google""",,"""Google 4400mAh Power Bank""","""Home/Electronics/"""


In [431]:
df.describe()

statistic,date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,str,f64,str,str,str,str,str,str,str,str,str,str,str
"""count""","""49512""",49512.0,"""49512""","""49512""","""49512""","""49512""","""49512""","""49512""","""49512""","""49512""","""462""","""49512""","""49512"""
"""null_count""","""0""",0.0,"""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""49050""","""0""","""0"""
"""mean""",,1501500000.0,,,,,,,,,,,
"""std""",,21502.167394,,,,,,,,,,,
"""min""","""20170731""",1501500000.0,"""0002457163364254438""","""Australasia""","""Algeria""","""(not set)""","""Amazon Silk""","""(not set)""","""desktop""","""(direct)""","""ORD201707311786""","""1 oz Hand Sanitizer""","""${escCatTitle}"""
"""25%""",,1501500000.0,,,,,,,,,,,
"""50%""",,1501500000.0,,,,,,,,,,,
"""75%""",,1501500000.0,,,,,,,,,,,
"""max""","""20170731""",1501600000.0,"""9997362993085245352""","""Western Europe""","""Vietnam""","""not available in demo dataset""","""YaBrowser""","""iOS""","""tablet""","""youtube.com""","""ORD201707312663""","""YouTube Youth Short Sleeve Tee…","""Wearables/Men's T-Shirts/"""


In [432]:
df.select(pl.all().is_null().sum())

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,49050,0,0


In [433]:
df.filter(pl.col('transactionId') != 'null')

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,i64,str,str,str,str,str,str,str,str,str,str,str
"""20170731""",1501525374,"""186231215995844689""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707311786""","""Google Men's Bike Short Sleeve…","""Apparel"""
"""20170731""",1501525374,"""186231215995844689""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707311786""","""Google Men's Bike Short Sleeve…","""Apparel"""
"""20170731""",1501549028,"""0345672699449577691""","""Northern America""","""United States""","""Kirkland""","""Chrome""","""Chrome OS""","""desktop""","""(direct)""","""ORD201707312107""","""Google Blackout Cap""","""Headgear"""
"""20170731""",1501549028,"""0345672699449577691""","""Northern America""","""United States""","""Kirkland""","""Chrome""","""Chrome OS""","""desktop""","""(direct)""","""ORD201707312107""","""Google Women's Vintage Hero Te…","""Apparel"""
"""20170731""",1501549028,"""0345672699449577691""","""Northern America""","""United States""","""Kirkland""","""Chrome""","""Chrome OS""","""desktop""","""(direct)""","""ORD201707312107""","""Google Men's Long & Lean Tee C…","""Apparel"""
…,…,…,…,…,…,…,…,…,…,…,…,…
"""20170731""",1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Google 25 oz Red Stainless Ste…","""Drinkware"""
"""20170731""",1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Android 17oz Stainless Steel S…","""Drinkware"""
"""20170731""",1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Android Men's Short Sleeve Her…","""Apparel"""
"""20170731""",1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Android Men's Take Charge Shor…","""Apparel"""


In [434]:
df_cleaned = df.with_columns(pl.col('date').str.strptime(pl.Date, format="%Y%m%d"))
df_cleaned.head()

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
date,i64,str,str,str,str,str,str,str,str,str,str,str
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Lunch Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Electronics Accessory Pouch""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Sport Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Slim Utility Travel Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Suitcase Organizer Cubes""","""Home/Bags/More Bags/"""


In [435]:
df_cleaned = df_cleaned.with_columns(pl.when(pl.col('transactionId') != 'null').then(1).otherwise(0).alias("transactionId"))
df_cleaned

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
date,i64,str,str,str,str,str,str,str,str,i32,str,str
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",0,"""Google Lunch Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",0,"""Electronics Accessory Pouch""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",0,"""Sport Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",0,"""Google Slim Utility Travel Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",0,"""Suitcase Organizer Cubes""","""Home/Bags/More Bags/"""
…,…,…,…,…,…,…,…,…,…,…,…,…
2017-07-31,1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""",1,"""Android 17oz Stainless Steel S…","""Drinkware"""
2017-07-31,1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""",1,"""Android Men's Short Sleeve Her…","""Apparel"""
2017-07-31,1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""",1,"""Android Men's Take Charge Shor…","""Apparel"""
2017-07-31,1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""",1,"""Google Women's Scoop Neck Tee …","""Apparel"""


# Encoding the df

In [436]:
encoded_df = encoding(df)
encoded_df

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


fullVisitorId,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName
str,i32,i32,i32,i32,i32,i32,i32,i32
"""2219384770970157334""",79,67,5,6,0,8,0,92
"""2219384770970157334""",79,67,5,6,0,8,0,49
"""2219384770970157334""",79,67,5,6,0,8,0,202
"""2219384770970157334""",79,67,5,6,0,8,0,126
"""2219384770970157334""",79,67,5,6,0,8,0,203
…,…,…,…,…,…,…,…,…
"""7483600664917507409""",94,37,2,4,0,0,1,8
"""7483600664917507409""",94,37,2,4,0,0,1,21
"""7483600664917507409""",94,37,2,4,0,0,1,23
"""7483600664917507409""",94,37,2,4,0,0,1,156


In [437]:
user_profiles = encoded_df.group_by(pl.col('fullVisitorId')).agg(
    pl.col('country'), pl.col('city'), 
    pl.col('browser'),pl.col('operatingSystem'),pl.col('deviceCategory'),
    pl.col('source'),pl.col('transactionId'),pl.col('v2ProductName')
)
user_profiles

fullVisitorId,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName
str,list[i32],list[i32],list[i32],list[i32],list[i32],list[i32],list[i32],list[i32]
"""3109676882483240775""","[3, 3, … 3]","[62, 62, … 62]","[2, 2, … 2]","[4, 4, … 4]","[0, 0, … 0]","[8, 8, … 8]","[0, 0, … 0]","[224, 2, … 223]"
"""3712894779177944818""","[96, 96, … 96]","[67, 67, … 67]","[2, 2, … 2]","[6, 6, … 6]","[0, 0, … 0]","[8, 8, … 8]","[0, 0, … 0]","[16, 28, … 235]"
"""3809164675881912585""","[94, 94, … 94]","[67, 67, … 67]","[2, 2, … 2]","[1, 1, … 1]","[1, 1, … 1]","[8, 8, … 8]","[0, 0, … 0]","[97, 137, … 23]"
"""4218367025544931420""","[94, 94, … 94]","[41, 41, … 41]","[2, 2, … 2]","[4, 4, … 4]","[0, 0, … 0]","[0, 0, … 0]","[0, 0, … 1]","[220, 198, … 203]"
"""7387044852520288742""","[62, 62, … 62]","[67, 67, … 67]","[2, 2, … 2]","[6, 6, … 6]","[0, 0, … 0]","[8, 8, … 8]","[0, 0, … 0]","[160, 185, … 160]"
…,…,…,…,…,…,…,…,…
"""3861214969019187548""","[65, 65, … 65]","[67, 67, … 67]","[4, 4, … 4]","[6, 6, … 6]","[0, 0, … 0]","[20, 20, … 20]","[0, 0, … 0]","[224, 2, … 223]"
"""5857801864608667779""","[43, 43, … 43]","[67, 67, … 67]","[10, 10, … 10]","[8, 8, … 8]","[1, 1, … 1]","[8, 8, … 8]","[0, 0, … 0]","[97, 17, … 228]"
"""8805708485521378179""","[33, 33, … 33]","[67, 67, … 67]","[10, 10, … 10]","[8, 8, … 8]","[1, 1, … 1]","[8, 8, … 8]","[0, 0, … 0]","[224, 2, … 223]"
"""6951857409958028911""","[86, 86, … 86]","[0, 0, … 0]","[10, 10, … 10]","[4, 4, … 4]","[0, 0, … 0]","[8, 8, … 8]","[0, 0, … 0]","[222, 85, … 206]"


In [438]:
user_profiles = user_profiles.filter(pl.col('fullVisitorId') != '0345672699449577691')

In [439]:
user_profiles_no_id = user_profiles.drop('fullVisitorId')
user_profiles_no_id = user_profiles_no_id.to_numpy()
user_profiles_no_id

array([[array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]),
        array([62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62]),
        array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]), ...,
        array([8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]),
        array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
        array([224,   2, 233, 225, 230, 231, 229, 228, 227, 235, 236, 223])],
       [array([96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96,
               96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96,
               96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96])   ,
        array([67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67,
               67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67,
               67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67])   ,
        array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
               2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

In [440]:
active_users = encoded_df.drop('fullVisitorId')
active_users = active_users.to_numpy()
active_users

array([[ 79,  67,   5, ...,   8,   0,  92],
       [ 79,  67,   5, ...,   8,   0,  49],
       [ 79,  67,   5, ...,   8,   0, 202],
       ...,
       [ 94,  37,   2, ...,   0,   1,  23],
       [ 94,  37,   2, ...,   0,   1, 156],
       [ 94,  37,   2, ...,   0,   0,  95]])

In [441]:
test_users = [2,	9,	0,	1,	0,	0,	1,	26]
test_users

[2, 9, 0, 1, 0, 0, 1, 26]

In [442]:
active_users[-1]

array([94, 37,  2,  4,  0,  0,  0, 95])

In [443]:
similarities = calculate_similarity(user_profiles, user_profiles_no_id, test_users)
similarities

{'3109676882483240775': 0.9413165370134315,
 '3712894779177944818': 0.7880224804247149,
 '3809164675881912585': 0.6762142500162333,
 '4218367025544931420': 0.7979005248545431,
 '7387044852520288742': 0.9658445100882688,
 '4819435223377641926': 0.8568708272094696,
 '6892430949290548362': 0.8541517624201568,
 '5628291610198127489': 0.6146060611703222,
 '7236916608806837697': 0.6131596808708618,
 '1451541665479340132': 0.9380339328865497,
 '1980746318225505280': 0.9139475599159841,
 '1748019695344643605': 0.9438453496904584,
 '05926379368303205': 0.944301684015339,
 '4183484508617227714': 0.6075691562024567,
 '0358807689553966707': 0.7527889833106697,
 '6761935814319672319': 0.6863808799707382,
 '9512976956663565169': 0.6628679544890365,
 '2632632793650115212': 0.6720091575937106,
 '4834703919260758766': 0.6582752074829709,
 '8180153759760508529': 0.5324383239163978,
 '6622164556723271867': 0.7675171743464242,
 '7439148476123852114': 0.9436225114824839,
 '879853187466107431': 0.8826409249

In [444]:
sorted_dict = sorted(similarities.items(), key=lambda x: x[1])
sorted_dict[-5:]

[('24486799598615247', 0.9910547429548338),
 ('8319892827536314987', 0.9940268445351574),
 ('9762695135266003658', 0.9969107467745082),
 ('7958854055537008406', 0.9983318428069594),
 ('3998764704348759017', 0.9988188806470808)]

In [445]:
actual_bought = df.filter((pl.col('fullVisitorId') == '0345672699449577691') & (pl.col('transactionId') != 'null')).select(pl.col('v2ProductName')).to_series().to_list()
actual_bought

['Google Blackout Cap',
 "Google Women's Vintage Hero Tee Black",
 "Google Men's Long & Lean Tee Charcoal",
 "Google Men's Vintage Badge Tee Black",
 'Google Blackout Cap',
 "Google Women's Vintage Hero Tee Black",
 "Google Men's Long & Lean Tee Charcoal",
 "Google Men's Vintage Badge Tee Black"]

In [473]:
for i in range(0,len(sorted_dict)):
    if len(df.filter((pl.col('fullVisitorId') == sorted_dict[i][0]) & ((pl.col('transactionId') != "null")))) > 0:
        print(df.filter((pl.col('fullVisitorId') == sorted_dict[i][0]) & ((pl.col('transactionId') != "null"))))
        

shape: (2, 13)
┌──────────┬────────────────┬─────────────────────┬──────────────────┬───┬────────┬─────────────────┬────────────────────────┬───────────────────┐
│ date     ┆ visitStartTime ┆ fullVisitorId       ┆ subContinent     ┆ … ┆ source ┆ transactionId   ┆ v2ProductName          ┆ v2ProductCategory │
│ ---      ┆ ---            ┆ ---                 ┆ ---              ┆   ┆ ---    ┆ ---             ┆ ---                    ┆ ---               │
│ str      ┆ i64            ┆ str                 ┆ str              ┆   ┆ str    ┆ str             ┆ str                    ┆ str               │
╞══════════╪════════════════╪═════════════════════╪══════════════════╪═══╪════════╪═════════════════╪════════════════════════╪═══════════════════╡
│ 20170731 ┆ 1501520591     ┆ 5451348672261739783 ┆ Northern America ┆ … ┆ google ┆ ORD201707312269 ┆ Android Rise 14 oz Mug ┆ Drinkware         │
│ 20170731 ┆ 1501520591     ┆ 5451348672261739783 ┆ Northern America ┆ … ┆ google ┆ ORD201707312269 ┆ A

In [None]:
index = -1
recs = [] 

while len(recs) < 5:
    recommendations = df.filter((pl.col('fullVisitorId') == sorted_dict[index][0]) & ((pl.col('transactionId') != "null")))
    if len(recommendations) > 0:
        print(recommendations)
        recommendations = recommendations.select(pl.col('v2ProductName')).unique().item(0,'v2ProductName')
        recs.append(recommendations)
        
    index -= 1

shape: (2, 13)
┌──────────┬────────────────┬─────────────────────┬──────────────────┬───┬──────────┬─────────────────┬─────────────────────────────────┬───────────────────┐
│ date     ┆ visitStartTime ┆ fullVisitorId       ┆ subContinent     ┆ … ┆ source   ┆ transactionId   ┆ v2ProductName                   ┆ v2ProductCategory │
│ ---      ┆ ---            ┆ ---                 ┆ ---              ┆   ┆ ---      ┆ ---             ┆ ---                             ┆ ---               │
│ str      ┆ i64            ┆ str                 ┆ str              ┆   ┆ str      ┆ str             ┆ str                             ┆ str               │
╞══════════╪════════════════╪═════════════════════╪══════════════════╪═══╪══════════╪═════════════════╪═════════════════════════════════╪═══════════════════╡
│ 20170731 ┆ 1501540907     ┆ 8149979272084433494 ┆ Northern America ┆ … ┆ (direct) ┆ ORD201707312352 ┆ Waze Mood Original Window Deca… ┆ Waze              │
│ 20170731 ┆ 1501540907     ┆ 8149979

In [451]:
df.filter((pl.col('fullVisitorId') == sorted_dict[-2][0]))

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,i64,str,str,str,str,str,str,str,str,str,str,str
"""20170731""",1501506815,"""7958854055537008406""","""Northern America""","""Canada""","""Toronto""","""Chrome""","""Windows""","""desktop""","""(direct)""",,"""Waze Pack of 9 Decal Set""","""(not set)"""
"""20170731""",1501506815,"""7958854055537008406""","""Northern America""","""Canada""","""Toronto""","""Chrome""","""Windows""","""desktop""","""(direct)""",,"""Waze Baby on Board Window Deca…","""(not set)"""
"""20170731""",1501506815,"""7958854055537008406""","""Northern America""","""Canada""","""Toronto""","""Chrome""","""Windows""","""desktop""","""(direct)""",,"""Waze Mood Happy Window Decal""","""(not set)"""
"""20170731""",1501506815,"""7958854055537008406""","""Northern America""","""Canada""","""Toronto""","""Chrome""","""Windows""","""desktop""","""(direct)""",,"""Waze Mood Original Window Deca…","""(not set)"""
"""20170731""",1501506815,"""7958854055537008406""","""Northern America""","""Canada""","""Toronto""","""Chrome""","""Windows""","""desktop""","""(direct)""",,"""Waze Mood Ninja Window Decal""","""(not set)"""
…,…,…,…,…,…,…,…,…,…,…,…,…
"""20170731""",1501506815,"""7958854055537008406""","""Northern America""","""Canada""","""Toronto""","""Chrome""","""Windows""","""desktop""","""(direct)""",,"""Waze Dress Socks""","""(not set)"""
"""20170731""",1501506815,"""7958854055537008406""","""Northern America""","""Canada""","""Toronto""","""Chrome""","""Windows""","""desktop""","""(direct)""",,"""Waze Women's Short Sleeve Tee""","""(not set)"""
"""20170731""",1501506815,"""7958854055537008406""","""Northern America""","""Canada""","""Toronto""","""Chrome""","""Windows""","""desktop""","""(direct)""",,"""Waze Men's Short Sleeve Tee""","""(not set)"""
"""20170731""",1501506815,"""7958854055537008406""","""Northern America""","""Canada""","""Toronto""","""Chrome""","""Windows""","""desktop""","""(direct)""",,"""Waze Women's Typography Short …","""(not set)"""


In [446]:
recommendation(df,sorted_dict)

shape: (0, 13)
┌──────┬────────────────┬───────────────┬──────────────┬───┬────────┬───────────────┬───────────────┬───────────────────┐
│ date ┆ visitStartTime ┆ fullVisitorId ┆ subContinent ┆ … ┆ source ┆ transactionId ┆ v2ProductName ┆ v2ProductCategory │
│ ---  ┆ ---            ┆ ---           ┆ ---          ┆   ┆ ---    ┆ ---           ┆ ---           ┆ ---               │
│ str  ┆ i64            ┆ str           ┆ str          ┆   ┆ str    ┆ str           ┆ str           ┆ str               │
╞══════╪════════════════╪═══════════════╪══════════════╪═══╪════════╪═══════════════╪═══════════════╪═══════════════════╡
└──────┴────────────────┴───────────────┴──────────────┴───┴────────┴───────────────┴───────────────┴───────────────────┘
shape: (0, 13)
┌──────┬────────────────┬───────────────┬──────────────┬───┬────────┬───────────────┬───────────────┬───────────────────┐
│ date ┆ visitStartTime ┆ fullVisitorId ┆ subContinent ┆ … ┆ source ┆ transactionId ┆ v2ProductName ┆ v2ProductCateg

[shape: (0, 13)
 ┌──────┬────────────────┬───────────────┬──────────────┬───┬────────┬───────────────┬───────────────┬───────────────────┐
 │ date ┆ visitStartTime ┆ fullVisitorId ┆ subContinent ┆ … ┆ source ┆ transactionId ┆ v2ProductName ┆ v2ProductCategory │
 │ ---  ┆ ---            ┆ ---           ┆ ---          ┆   ┆ ---    ┆ ---           ┆ ---           ┆ ---               │
 │ str  ┆ i64            ┆ str           ┆ str          ┆   ┆ str    ┆ str           ┆ str           ┆ str               │
 ╞══════╪════════════════╪═══════════════╪══════════════╪═══╪════════╪═══════════════╪═══════════════╪═══════════════════╡
 └──────┴────────────────┴───────────────┴──────────────┴───┴────────┴───────────────┴───────────────┴───────────────────┘,
 shape: (0, 13)
 ┌──────┬────────────────┬───────────────┬──────────────┬───┬────────┬───────────────┬───────────────┬───────────────────┐
 │ date ┆ visitStartTime ┆ fullVisitorId ┆ subContinent ┆ … ┆ source ┆ transactionId ┆ v2ProductName ┆ v2P

In [447]:
rec = recommendation(df,sorted_dict)

shape: (0, 13)
┌──────┬────────────────┬───────────────┬──────────────┬───┬────────┬───────────────┬───────────────┬───────────────────┐
│ date ┆ visitStartTime ┆ fullVisitorId ┆ subContinent ┆ … ┆ source ┆ transactionId ┆ v2ProductName ┆ v2ProductCategory │
│ ---  ┆ ---            ┆ ---           ┆ ---          ┆   ┆ ---    ┆ ---           ┆ ---           ┆ ---               │
│ str  ┆ i64            ┆ str           ┆ str          ┆   ┆ str    ┆ str           ┆ str           ┆ str               │
╞══════╪════════════════╪═══════════════╪══════════════╪═══╪════════╪═══════════════╪═══════════════╪═══════════════════╡
└──────┴────────────────┴───────────────┴──────────────┴───┴────────┴───────────────┴───────────────┴───────────────────┘
shape: (0, 13)
┌──────┬────────────────┬───────────────┬──────────────┬───┬────────┬───────────────┬───────────────┬───────────────────┐
│ date ┆ visitStartTime ┆ fullVisitorId ┆ subContinent ┆ … ┆ source ┆ transactionId ┆ v2ProductName ┆ v2ProductCateg

In [448]:
actual_bought=set(actual_bought)
rec=set(rec)

TypeError: unhashable type: 'DataFrame'

In [None]:
actual_bought

{'Google Blackout Cap',
 "Google Men's Long & Lean Tee Charcoal",
 "Google Men's Vintage Badge Tee Black",
 "Google Women's Vintage Hero Tee Black"}

In [None]:
rec

{'Google Car Clip Phone Holder',
 'Google Infant Short Sleeve Tee Red',
 'Google Onesie Red/Graphite',
 'Google Sunglasses',
 'Google Twill Cap'}

In [None]:
set(actual_bought & rec)

set()

In [None]:
df.select(pl.col('v2ProductName'))

v2ProductName
str
"""Google Lunch Bag"""
"""Electronics Accessory Pouch"""
"""Sport Bag"""
"""Google Slim Utility Travel Bag"""
"""Suitcase Organizer Cubes"""
…
"""Android 17oz Stainless Steel S…"
"""Android Men's Short Sleeve Her…"
"""Android Men's Take Charge Shor…"
"""Google Women's Scoop Neck Tee …"


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,2))

In [None]:
tfidf = vectorizer.fit_transform(df['v2ProductName'])
tfidf

<49512x818 sparse matrix of type '<class 'numpy.float64'>'
	with 442458 stored elements in Compressed Sparse Row format>

In [None]:
recall(actual_bought, rec, 3)

TypeError: 'set' object is not subscriptable

In [None]:
# for k in range(1, 9):
    # print(f"Recall@{k} = {recall(actual_bought, rec, k)}")

{'Leatherette Journal'}
Recall@1 = 0
{"Google Women's Vintage Hero Tee Black", 'Leatherette Journal'}
Recall@2 = 0
{"Google Women's Vintage Hero Tee Black", 'Leatherette Journal', 'Suitcase Organizer Cubes'}
Recall@3 = 0
{"Google Women's Vintage Hero Tee Black", 'Leatherette Journal', 'Suitcase Organizer Cubes'}
Recall@4 = 0
{"Google Women's Vintage Hero Tee Black", 'Leatherette Journal', 'Suitcase Organizer Cubes'}
Recall@5 = 0
{"Google Women's Vintage Hero Tee Black", 'Leatherette Journal', 'Suitcase Organizer Cubes'}
Recall@6 = 0
{"Google Women's Vintage Hero Tee Black", 'Leatherette Journal', 'Suitcase Organizer Cubes'}
Recall@7 = 0
{"Google Women's Vintage Hero Tee Black", 'Leatherette Journal', 'Suitcase Organizer Cubes'}
Recall@8 = 0


In [None]:
df.write_csv("../data/cleaned_google_analytics.csv")

### Data Pre-processing Pipeline

In [None]:
def preprocessing_pipeline(df: pl.DataFrame) -> pl.DataFrame:
    """
    Args:
        df : pl.DataFrame

    Returns:
        pl.DataFrames: pre-processed dataframe
    """

    df = df.with_columns(pl.col('date').str.strptime(pl.Date, format="%Y%m%d"))
    encoded_df = encoding(df)
    
    user_profiles = encoded_df.group_by(pl.col('fullVisitorId')).agg(
        pl.col('country'), pl.col('browser'),
        pl.col('transactionId'), pl.col('v2ProductName')
    )
    user_profiles_no_id = user_profiles.drop('fullVisitorId')
    user_profiles_no_id = user_profiles_no_id.to_numpy()
    active_users = encoded_df.drop('fullVisitorId')
    active_users = active_users.to_numpy()

    similarities = calculate_similarity(user_profiles, user_profiles_no_id, active_users)
    sorted_dict = sorted(similarities.items(), key=lambda x: x[1])

    
    df.write_csv("../data/cleaned_google_analytics.csv")
    return recommendation(df,sorted_dict) 

In [None]:
preprocessing_pipeline(df)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


ValueError: Found array with dim 3. check_pairwise_arrays expected <= 2.

# Conclusions 
- 