# Dataset Cleaning 

The main objectives of this notebook are: 
- Import dataset 
- Get familiar with the data 
- Determine if there are data quality issues 
- Resolve any data quality issues 

In [119]:
import polars as pl 
import polars.selectors as cs 
from google.cloud import bigquery
import numpy as np 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
import sys 
import os 


# Manually add path to read from another folder 
path2add = os.path.normpath(
    os.path.abspath(os.path.join(os.path.dirname("__file__"), os.path.pardir, "utils"))
)
if not (path2add in sys.path):
    sys.path.append(path2add)

from preprocessing import *

In [120]:
pl.Config.set_tbl_width_chars(200)

polars.config.Config

In [121]:
# Create client object
client = bigquery.Client()

In [122]:
QUERY = ("SELECT `date`, `visitStartTime`, `fullVisitorId`, `geoNetwork`.`subContinent`, `geoNetwork`.`country`,"
        "`geoNetwork`.`city`,`device`.`browser`, `device`.`operatingSystem`,`device`.`deviceCategory`,"
        "`trafficSource`.`source`, `h`.`item`.`transactionId`,`hp`.`v2ProductName`,`hp`.`v2ProductCategory` "
        "FROM `bigquery-public-data.google_analytics_sample.ga_sessions_20170731`, UNNEST(hits) AS h,"
        "UNNEST(h.product) AS hp;")

In [123]:
run_query = client.query(QUERY)

In [124]:
df = pl.from_arrow(run_query.to_arrow())



In [125]:
df.head()

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,i64,str,str,str,str,str,str,str,str,str,str,str
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Lunch Bag""","""Home/Bags/More Bags/"""
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Electronics Accessory Pouch""","""Home/Bags/More Bags/"""
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Sport Bag""","""Home/Bags/More Bags/"""
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Slim Utility Travel Bag""","""Home/Bags/More Bags/"""
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Suitcase Organizer Cubes""","""Home/Bags/More Bags/"""


In [126]:
df.shape

(49512, 13)

In [127]:
df.columns

['date',
 'visitStartTime',
 'fullVisitorId',
 'subContinent',
 'country',
 'city',
 'browser',
 'operatingSystem',
 'deviceCategory',
 'source',
 'transactionId',
 'v2ProductName',
 'v2ProductCategory']

In [128]:
df.sample()

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,i64,str,str,str,str,str,str,str,str,str,str,str
"""20170731""",1501515689,"""4175687200071793448""","""Northern Europe""","""Ireland""","""Dublin""","""Chrome""","""Windows""","""desktop""","""google""",,"""Google Men's 100% Cotton Short…","""Home/Shop by Brand/Google/"""


In [129]:
df.describe()

statistic,date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,str,f64,str,str,str,str,str,str,str,str,str,str,str
"""count""","""49512""",49512.0,"""49512""","""49512""","""49512""","""49512""","""49512""","""49512""","""49512""","""49512""","""462""","""49512""","""49512"""
"""null_count""","""0""",0.0,"""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""49050""","""0""","""0"""
"""mean""",,1501500000.0,,,,,,,,,,,
"""std""",,21502.167394,,,,,,,,,,,
"""min""","""20170731""",1501500000.0,"""0002457163364254438""","""Australasia""","""Algeria""","""(not set)""","""Amazon Silk""","""(not set)""","""desktop""","""(direct)""","""ORD201707311786""","""1 oz Hand Sanitizer""","""${escCatTitle}"""
"""25%""",,1501500000.0,,,,,,,,,,,
"""50%""",,1501500000.0,,,,,,,,,,,
"""75%""",,1501500000.0,,,,,,,,,,,
"""max""","""20170731""",1501600000.0,"""9997362993085245352""","""Western Europe""","""Vietnam""","""not available in demo dataset""","""YaBrowser""","""iOS""","""tablet""","""youtube.com""","""ORD201707312663""","""YouTube Youth Short Sleeve Tee…","""Wearables/Men's T-Shirts/"""


In [130]:
df.select(pl.all().is_null().sum())

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,49050,0,0


In [131]:
df.filter(pl.col('transactionId') != 'null')

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,i64,str,str,str,str,str,str,str,str,str,str,str
"""20170731""",1501525374,"""186231215995844689""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707311786""","""Google Men's Bike Short Sleeve…","""Apparel"""
"""20170731""",1501525374,"""186231215995844689""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707311786""","""Google Men's Bike Short Sleeve…","""Apparel"""
"""20170731""",1501549028,"""0345672699449577691""","""Northern America""","""United States""","""Kirkland""","""Chrome""","""Chrome OS""","""desktop""","""(direct)""","""ORD201707312107""","""Google Blackout Cap""","""Headgear"""
"""20170731""",1501549028,"""0345672699449577691""","""Northern America""","""United States""","""Kirkland""","""Chrome""","""Chrome OS""","""desktop""","""(direct)""","""ORD201707312107""","""Google Women's Vintage Hero Te…","""Apparel"""
"""20170731""",1501549028,"""0345672699449577691""","""Northern America""","""United States""","""Kirkland""","""Chrome""","""Chrome OS""","""desktop""","""(direct)""","""ORD201707312107""","""Google Men's Long & Lean Tee C…","""Apparel"""
…,…,…,…,…,…,…,…,…,…,…,…,…
"""20170731""",1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Google 25 oz Red Stainless Ste…","""Drinkware"""
"""20170731""",1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Android 17oz Stainless Steel S…","""Drinkware"""
"""20170731""",1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Android Men's Short Sleeve Her…","""Apparel"""
"""20170731""",1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Android Men's Take Charge Shor…","""Apparel"""


In [132]:
df_cleaned = df.with_columns(pl.col('date').str.strptime(pl.Date, format="%Y%m%d"))
df_cleaned.head()

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
date,i64,str,str,str,str,str,str,str,str,str,str,str
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Lunch Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Electronics Accessory Pouch""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Sport Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Slim Utility Travel Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Suitcase Organizer Cubes""","""Home/Bags/More Bags/"""


In [133]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

In [None]:
df_cleaned = df_cleaned.with_columns(pl.when(pl.col('transactionId') != 'null').then(1).otherwise(0).alias("transactionId"))
df_cleaned

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
date,i64,str,str,str,str,str,str,str,str,i32,str,str
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",0,"""Google Lunch Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",0,"""Electronics Accessory Pouch""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",0,"""Sport Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",0,"""Google Slim Utility Travel Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",0,"""Suitcase Organizer Cubes""","""Home/Bags/More Bags/"""
…,…,…,…,…,…,…,…,…,…,…,…,…
2017-07-31,1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""",1,"""Android 17oz Stainless Steel S…","""Drinkware"""
2017-07-31,1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""",1,"""Android Men's Short Sleeve Her…","""Apparel"""
2017-07-31,1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""",1,"""Android Men's Take Charge Shor…","""Apparel"""
2017-07-31,1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""",1,"""Google Women's Scoop Neck Tee …","""Apparel"""


# Encoding the entire df  

In [None]:
test_encoding = df_cleaned.select(pl.col('fullVisitorId'),pl.col('country'), 
                                  pl.col('browser'),pl.col('transactionId'),pl.col('v2ProductName'))
test_encoding

fullVisitorId,country,browser,transactionId,v2ProductName
str,str,str,i32,str
"""2219384770970157334""","""Slovakia""","""Firefox""",0,"""Google Lunch Bag"""
"""2219384770970157334""","""Slovakia""","""Firefox""",0,"""Electronics Accessory Pouch"""
"""2219384770970157334""","""Slovakia""","""Firefox""",0,"""Sport Bag"""
"""2219384770970157334""","""Slovakia""","""Firefox""",0,"""Google Slim Utility Travel Bag"""
"""2219384770970157334""","""Slovakia""","""Firefox""",0,"""Suitcase Organizer Cubes"""
…,…,…,…,…
"""7483600664917507409""","""United States""","""Chrome""",1,"""Android 17oz Stainless Steel S…"
"""7483600664917507409""","""United States""","""Chrome""",1,"""Android Men's Short Sleeve Her…"
"""7483600664917507409""","""United States""","""Chrome""",1,"""Android Men's Take Charge Shor…"
"""7483600664917507409""","""United States""","""Chrome""",1,"""Google Women's Scoop Neck Tee …"


In [None]:
test_encoding = test_encoding.filter(pl.col('transactionId') == 1)
test_encoding

fullVisitorId,country,browser,transactionId,v2ProductName
str,str,str,i32,str
"""186231215995844689""","""United States""","""Chrome""",1,"""Google Men's Bike Short Sleeve…"
"""186231215995844689""","""United States""","""Chrome""",1,"""Google Men's Bike Short Sleeve…"
"""0345672699449577691""","""United States""","""Chrome""",1,"""Google Blackout Cap"""
"""0345672699449577691""","""United States""","""Chrome""",1,"""Google Women's Vintage Hero Te…"
"""0345672699449577691""","""United States""","""Chrome""",1,"""Google Men's Long & Lean Tee C…"
…,…,…,…,…
"""7483600664917507409""","""United States""","""Chrome""",1,"""Google 25 oz Red Stainless Ste…"
"""7483600664917507409""","""United States""","""Chrome""",1,"""Android 17oz Stainless Steel S…"
"""7483600664917507409""","""United States""","""Chrome""",1,"""Android Men's Short Sleeve Her…"
"""7483600664917507409""","""United States""","""Chrome""",1,"""Android Men's Take Charge Shor…"


In [None]:
le = LabelEncoder()
le.fit(test_encoding.select(pl.col('country')))

  y = column_or_1d(y, warn=True)


In [None]:
encoded_countries = le.transform(test_encoding.select(pl.col('country')))
encoded_countries

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1,
       2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

In [None]:
test_encoding = test_encoding.with_columns(pl.Series('country', encoded_countries))
test_encoding

fullVisitorId,country,browser,transactionId,v2ProductName
str,i32,str,i32,str
"""186231215995844689""",2,"""Chrome""",1,"""Google Men's Bike Short Sleeve…"
"""186231215995844689""",2,"""Chrome""",1,"""Google Men's Bike Short Sleeve…"
"""0345672699449577691""",2,"""Chrome""",1,"""Google Blackout Cap"""
"""0345672699449577691""",2,"""Chrome""",1,"""Google Women's Vintage Hero Te…"
"""0345672699449577691""",2,"""Chrome""",1,"""Google Men's Long & Lean Tee C…"
…,…,…,…,…
"""7483600664917507409""",2,"""Chrome""",1,"""Google 25 oz Red Stainless Ste…"
"""7483600664917507409""",2,"""Chrome""",1,"""Android 17oz Stainless Steel S…"
"""7483600664917507409""",2,"""Chrome""",1,"""Android Men's Short Sleeve Her…"
"""7483600664917507409""",2,"""Chrome""",1,"""Android Men's Take Charge Shor…"


In [None]:
le.fit(test_encoding.select(pl.col('v2ProductName')))
encoded_products = le.transform(test_encoding.select(pl.col('v2ProductName')))
test_encoding = test_encoding.with_columns(pl.Series('v2ProductName', encoded_products))
test_encoding

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


fullVisitorId,country,browser,transactionId,v2ProductName
str,i32,str,i32,i32
"""186231215995844689""",2,"""Chrome""",1,44
"""186231215995844689""",2,"""Chrome""",1,44
"""0345672699449577691""",2,"""Chrome""",1,26
"""0345672699449577691""",2,"""Chrome""",1,75
"""0345672699449577691""",2,"""Chrome""",1,46
…,…,…,…,…
"""7483600664917507409""",2,"""Chrome""",1,23
"""7483600664917507409""",2,"""Chrome""",1,5
"""7483600664917507409""",2,"""Chrome""",1,10
"""7483600664917507409""",2,"""Chrome""",1,11


In [None]:
le.fit(test_encoding.select(pl.col('browser')))
encoded_browser = le.transform(test_encoding.select(pl.col('browser')))
test_encoding = test_encoding.with_columns(pl.Series('browser', encoded_browser))
test_encoding

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


fullVisitorId,country,browser,transactionId,v2ProductName
str,i32,i32,i32,i32
"""186231215995844689""",2,0,1,44
"""186231215995844689""",2,0,1,44
"""0345672699449577691""",2,0,1,26
"""0345672699449577691""",2,0,1,75
"""0345672699449577691""",2,0,1,46
…,…,…,…,…
"""7483600664917507409""",2,0,1,23
"""7483600664917507409""",2,0,1,5
"""7483600664917507409""",2,0,1,10
"""7483600664917507409""",2,0,1,11


In [None]:
user_profiles = test_encoding.group_by(pl.col('fullVisitorId')).agg(
    pl.col('country'), pl.col('browser'),
    pl.col('transactionId'), pl.col('v2ProductName')
)
user_profiles

fullVisitorId,country,browser,transactionId,v2ProductName
str,list[i32],list[i32],list[i32],list[i32]
"""9687976620185337812""","[0, 0, … 0]","[0, 0, … 0]","[1, 1, … 1]","[77, 82, … 82]"
"""1921325710867532207""","[2, 2, … 2]","[0, 0, … 0]","[1, 1, … 1]","[38, 69, … 97]"
"""1681975491896946073""","[2, 2, … 2]","[0, 0, … 0]","[1, 1, … 1]","[39, 41, … 39]"
"""442359446750687328""","[2, 2, … 2]","[0, 0, … 0]","[1, 1, … 1]","[14, 55, … 55]"
"""7455554400123317161""","[2, 2]","[0, 0]","[1, 1]","[28, 28]"
…,…,…,…,…
"""7463172420271311409""","[2, 2, … 2]","[0, 0, … 0]","[1, 1, … 1]","[27, 3, … 41]"
"""4408662176448539530""","[2, 2, … 2]","[0, 0, … 0]","[1, 1, … 1]","[79, 1, … 1]"
"""5139927098906360724""","[2, 2, … 2]","[0, 0, … 0]","[1, 1, … 1]","[43, 53, … 54]"
"""4952396078130707023""","[2, 2]","[0, 0]","[1, 1]","[34, 34]"


In [None]:
user_profiles_no_id = user_profiles.drop('fullVisitorId')
user_profiles_no_id = user_profiles_no_id.to_numpy()
active_users = df.drop('fullVisitorId')
active_users = df.to_numpy()
active_users

array(['20170731', 1501537217, '7483600664917507409', 'Northern America',
       'United States', 'Mountain View', 'Chrome', 'Macintosh', 'desktop',
       '(direct)', None,
       "Google Men's 100% Cotton Short Sleeve Hero Tee Navy", '(not set)'],
      dtype=object)

In [None]:
user_profile_avg_similarity_dict_test = {}
for i in range(0,len(user_profiles)):
    test_user_profile = np.vstack(user_profiles_no_id[i]).T
    avg_similarity = cosine_similarity(test_user_profile, [active_users[-1]])
    avg_similarity = avg_similarity.mean()
    user_profile_avg_similarity_dict_test[f'{user_profiles['fullVisitorId'][i]}'] = avg_similarity
print(user_profile_avg_similarity_dict_test)

ValueError: could not convert string to float: 'Northern America'

In [None]:
sorted_dict = sorted(user_profile_avg_similarity_dict_test.items(), key=lambda x: x[1])
sorted_dict[-5:]

[('8321566838784998459', 0.999988213949983),
 ('0842088345978971117', 0.9999895880848787),
 ('1573919593896895308', 0.9999930220712488),
 ('813207140579582070', 0.9999985949965018),
 ('4064135835973062555', 0.9999989174633006)]

In [None]:
recommendations = df.filter((pl.col('fullVisitorId') == sorted_dict[-1][0]) & ((pl.col('transactionId') != "null")))
recommendations = recommendations.select(pl.col('v2ProductName')).unique()
recommendations

v2ProductName
str
"""Google Women's Lightweight Mic…"


In [None]:
df

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,i64,str,str,str,str,str,str,str,str,str,str,str
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Lunch Bag""","""Home/Bags/More Bags/"""
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Electronics Accessory Pouch""","""Home/Bags/More Bags/"""
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Sport Bag""","""Home/Bags/More Bags/"""
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Slim Utility Travel Bag""","""Home/Bags/More Bags/"""
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Suitcase Organizer Cubes""","""Home/Bags/More Bags/"""
…,…,…,…,…,…,…,…,…,…,…,…,…
"""20170731""",1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Android 17oz Stainless Steel S…","""Drinkware"""
"""20170731""",1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Android Men's Short Sleeve Her…","""Apparel"""
"""20170731""",1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Android Men's Take Charge Shor…","""Apparel"""
"""20170731""",1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Google Women's Scoop Neck Tee …","""Apparel"""


In [None]:
df = df.select(pl.col('fullVisitorId'),pl.col('country'), 
            pl.col('browser'),pl.col('transactionId'),
            pl.col('v2ProductName'))

df = df.with_columns(pl.when(pl.col('transactionId') != 'null').then(1).otherwise(0).alias("transactionId"))

In [None]:
df

fullVisitorId,country,browser,transactionId,v2ProductName
str,str,str,i32,str
"""2219384770970157334""","""Slovakia""","""Firefox""",0,"""Google Lunch Bag"""
"""2219384770970157334""","""Slovakia""","""Firefox""",0,"""Electronics Accessory Pouch"""
"""2219384770970157334""","""Slovakia""","""Firefox""",0,"""Sport Bag"""
"""2219384770970157334""","""Slovakia""","""Firefox""",0,"""Google Slim Utility Travel Bag"""
"""2219384770970157334""","""Slovakia""","""Firefox""",0,"""Suitcase Organizer Cubes"""
…,…,…,…,…
"""7483600664917507409""","""United States""","""Chrome""",1,"""Android 17oz Stainless Steel S…"
"""7483600664917507409""","""United States""","""Chrome""",1,"""Android Men's Short Sleeve Her…"
"""7483600664917507409""","""United States""","""Chrome""",1,"""Android Men's Take Charge Shor…"
"""7483600664917507409""","""United States""","""Chrome""",1,"""Google Women's Scoop Neck Tee …"


In [None]:
purchases_df = df.filter(pl.col('transactionId') == 1)
purchases_df

fullVisitorId,country,browser,transactionId,v2ProductName
str,str,str,i32,str
"""186231215995844689""","""United States""","""Chrome""",1,"""Google Men's Bike Short Sleeve…"
"""186231215995844689""","""United States""","""Chrome""",1,"""Google Men's Bike Short Sleeve…"
"""0345672699449577691""","""United States""","""Chrome""",1,"""Google Blackout Cap"""
"""0345672699449577691""","""United States""","""Chrome""",1,"""Google Women's Vintage Hero Te…"
"""0345672699449577691""","""United States""","""Chrome""",1,"""Google Men's Long & Lean Tee C…"
…,…,…,…,…
"""7483600664917507409""","""United States""","""Chrome""",1,"""Google 25 oz Red Stainless Ste…"
"""7483600664917507409""","""United States""","""Chrome""",1,"""Android 17oz Stainless Steel S…"
"""7483600664917507409""","""United States""","""Chrome""",1,"""Android Men's Short Sleeve Her…"
"""7483600664917507409""","""United States""","""Chrome""",1,"""Android Men's Take Charge Shor…"


In [None]:
encoder = LabelEncoder()

for i in purchases_df.columns:
    if i == 'transactionId':
        continue
    encoder.fit(purchases_df.select(pl.col(f'{i}')))
    encoded_data = encoder.transform(purchases_df.select(pl.col(f'{i}')))
    purchases_df = purchases_df.with_columns(pl.Series(f'{i}', encoded_data))
    

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [None]:
purchases_df

fullVisitorId,country,browser,transactionId,v2ProductName
i32,i32,i32,i32,i32
9,2,0,1,44
9,2,0,1,44
3,2,0,1,26
3,2,0,1,75
3,2,0,1,46
…,…,…,…,…
46,2,0,1,23
46,2,0,1,5
46,2,0,1,10
46,2,0,1,11


In [134]:
encoding(df)

ComputeError: cannot compare string with numeric type (i32)

In [None]:
# Need to split the data into subsets & perform the target encoding... k fold target encoding
# num_samples = len(new_df) // 8
# num_samples
# new_df = new_df.with_columns(pl.DataFrame({'kfold':np.repeat(np.arange(1, 9), num_samples)}))
# new_df

In [None]:
df.write_csv("../data/cleaned_google_analytics.csv")

### Data Pre-processing Pipeline

In [None]:
def preprocessing_pipeline(df: pl.DataFrame) -> pl.DataFrame:
    """
    Args:
        df : pl.DataFrame

    Returns:
        pl.DataFrames: pre-processed dataframe
    """

    df = df.with_columns(pl.col('date').str.strptime(pl.Date, format="%Y%m%d"))
    df.write_csv("../data/cleaned_google_analytics.csv")
    return df 

In [None]:
preprocessing_pipeline(df)

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
date,i64,str,str,str,str,str,str,str,str,str,str,str
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Lunch Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Electronics Accessory Pouch""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Sport Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Slim Utility Travel Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Suitcase Organizer Cubes""","""Home/Bags/More Bags/"""
…,…,…,…,…,…,…,…,…,…,…,…,…
2017-07-31,1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Android 17oz Stainless Steel S…","""Drinkware"""
2017-07-31,1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Android Men's Short Sleeve Her…","""Apparel"""
2017-07-31,1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Android Men's Take Charge Shor…","""Apparel"""
2017-07-31,1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Google Women's Scoop Neck Tee …","""Apparel"""


# Conclusions 
- 