# Dataset Cleaning 

The main objectives of this notebook are: 
- Import dataset 
- Get familiar with the data 
- Determine if there are data quality issues 
- Resolve any data quality issues 

In [487]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [488]:
import polars as pl 
from google.cloud import bigquery
import numpy as np 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
import sys 
import os 


# Make the sibling `utils` folder importable so `preprocessing` can be found.
# The folder is resolved relative to the current working directory, so the kernel
# must be started from this notebook's directory. `abspath` already returns a
# normalised path.
path2add = os.path.abspath(os.path.join(os.path.pardir, "utils"))
if path2add not in sys.path:
    sys.path.append(path2add)

from preprocessing import *

In [489]:
pl.Config.set_tbl_width_chars(200)

polars.config.Config

In [490]:
# Create client object
client = bigquery.Client()

In [491]:
# Reads the 2017-07-31 Google Analytics sample table and flattens its nested records:
# UNNEST(hits) and UNNEST(h.product) yield one row per product per hit, carrying the
# visit, geography, device, traffic-source, transaction-id and product fields.
QUERY = ("SELECT `date`, `visitStartTime`, `fullVisitorId`, `geoNetwork`.`subContinent`, `geoNetwork`.`country`,"
        "`geoNetwork`.`city`,`device`.`browser`, `device`.`operatingSystem`,`device`.`deviceCategory`,"
        "`trafficSource`.`source`, `h`.`item`.`transactionId`,`hp`.`v2ProductName`,`hp`.`v2ProductCategory` "
        "FROM `bigquery-public-data.google_analytics_sample.ga_sessions_20170731`, UNNEST(hits) AS h,"
        "UNNEST(h.product) AS hp;")

In [492]:
run_query = client.query(QUERY)

In [493]:
df = pl.from_arrow(run_query.to_arrow())

In [494]:
df.head()

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,i64,str,str,str,str,str,str,str,str,str,str,str
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Lunch Bag""","""Home/Bags/More Bags/"""
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Electronics Accessory Pouch""","""Home/Bags/More Bags/"""
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Sport Bag""","""Home/Bags/More Bags/"""
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Slim Utility Travel Bag""","""Home/Bags/More Bags/"""
"""20170731""",1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Suitcase Organizer Cubes""","""Home/Bags/More Bags/"""


In [495]:
df.shape

(49512, 13)

In [496]:
df.columns

['date',
 'visitStartTime',
 'fullVisitorId',
 'subContinent',
 'country',
 'city',
 'browser',
 'operatingSystem',
 'deviceCategory',
 'source',
 'transactionId',
 'v2ProductName',
 'v2ProductCategory']

In [497]:
df.sample()

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,i64,str,str,str,str,str,str,str,str,str,str,str
"""20170731""",1501541115,"""4786020913442860213""","""Northern America""","""United States""","""not available in demo dataset""","""Safari""","""iOS""","""tablet""","""(direct)""",,"""UpCycled Handlebar Bag""","""Home/Bags/"""


In [498]:
df.describe()

statistic,date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,str,f64,str,str,str,str,str,str,str,str,str,str,str
"""count""","""49512""",49512.0,"""49512""","""49512""","""49512""","""49512""","""49512""","""49512""","""49512""","""49512""","""462""","""49512""","""49512"""
"""null_count""","""0""",0.0,"""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""49050""","""0""","""0"""
"""mean""",,1501500000.0,,,,,,,,,,,
"""std""",,21502.167394,,,,,,,,,,,
"""min""","""20170731""",1501500000.0,"""0002457163364254438""","""Australasia""","""Algeria""","""(not set)""","""Amazon Silk""","""(not set)""","""desktop""","""(direct)""","""ORD201707311786""","""1 oz Hand Sanitizer""","""${escCatTitle}"""
"""25%""",,1501500000.0,,,,,,,,,,,
"""50%""",,1501500000.0,,,,,,,,,,,
"""75%""",,1501500000.0,,,,,,,,,,,
"""max""","""20170731""",1501600000.0,"""9997362993085245352""","""Western Europe""","""Vietnam""","""not available in demo dataset""","""YaBrowser""","""iOS""","""tablet""","""youtube.com""","""ORD201707312663""","""YouTube Youth Short Sleeve Tee…","""Wearables/Men's T-Shirts/"""


In [499]:
df.select(pl.all().is_null().sum())

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,49050,0,0


In [500]:
# Keep only the rows that carry a transaction id. Missing ids are real nulls, not
# the string 'null', so test for null explicitly.
df.filter(pl.col('transactionId').is_not_null())

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,i64,str,str,str,str,str,str,str,str,str,str,str
"""20170731""",1501525374,"""186231215995844689""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707311786""","""Google Men's Bike Short Sleeve…","""Apparel"""
"""20170731""",1501525374,"""186231215995844689""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707311786""","""Google Men's Bike Short Sleeve…","""Apparel"""
"""20170731""",1501549028,"""0345672699449577691""","""Northern America""","""United States""","""Kirkland""","""Chrome""","""Chrome OS""","""desktop""","""(direct)""","""ORD201707312107""","""Google Blackout Cap""","""Headgear"""
"""20170731""",1501549028,"""0345672699449577691""","""Northern America""","""United States""","""Kirkland""","""Chrome""","""Chrome OS""","""desktop""","""(direct)""","""ORD201707312107""","""Google Women's Vintage Hero Te…","""Apparel"""
"""20170731""",1501549028,"""0345672699449577691""","""Northern America""","""United States""","""Kirkland""","""Chrome""","""Chrome OS""","""desktop""","""(direct)""","""ORD201707312107""","""Google Men's Long & Lean Tee C…","""Apparel"""
…,…,…,…,…,…,…,…,…,…,…,…,…
"""20170731""",1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Google 25 oz Red Stainless Ste…","""Drinkware"""
"""20170731""",1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Android 17oz Stainless Steel S…","""Drinkware"""
"""20170731""",1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Android Men's Short Sleeve Her…","""Apparel"""
"""20170731""",1501537217,"""7483600664917507409""","""Northern America""","""United States""","""Mountain View""","""Chrome""","""Macintosh""","""desktop""","""(direct)""","""ORD201707312663""","""Android Men's Take Charge Shor…","""Apparel"""


In [501]:
# Turn the "YYYYMMDD" strings in `date` into a proper Date column.
df_cleaned = df.with_columns(
    pl.col('date').str.to_date(format="%Y%m%d")
)
df_cleaned.head()

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
date,i64,str,str,str,str,str,str,str,str,str,str,str
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Lunch Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Electronics Accessory Pouch""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Sport Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Google Slim Utility Travel Bag""","""Home/Bags/More Bags/"""
2017-07-31,1501520683,"""2219384770970157334""","""Eastern Europe""","""Slovakia""","""not available in demo dataset""","""Firefox""","""Windows""","""desktop""","""google""",,"""Suitcase Organizer Cubes""","""Home/Bags/More Bags/"""


In [502]:
# Binary purchase flag: 1 when the row has a transaction id, 0 when it is null.
# The flag is computed from the raw `df` (same rows, same order as `df_cleaned`)
# rather than from the column being overwritten, so re-running this cell always
# produces the same result.
df_cleaned = df_cleaned.with_columns(
    df.get_column('transactionId').is_not_null().cast(pl.Int32).alias("transactionId")
)

In [503]:
# Mean of the 0/1 purchase flag for each product name.
category_means = (
    df_cleaned
    .group_by('v2ProductName')
    .agg(pl.col('transactionId').mean())
)

In [504]:
# List the column names, one per line.
for column_name in df_cleaned.columns:
    print(column_name)

date
visitStartTime
fullVisitorId
subContinent
country
city
browser
operatingSystem
deviceCategory
source
transactionId
v2ProductName
v2ProductCategory


In [505]:
df_encoded = df_cleaned

In [506]:
for i in df_encoded.columns:
    if i == 'date' or i == 'visitStartTime' or i == 'fullVisitorId' or i == 'transactionId':
        continue 
    print(i) 
    category_means = df_encoded.group_by(pl.col(f'{i}')).agg(
        pl.col('transactionId').mean()
    )
    df_encoded = df_encoded.join(category_means, on=f'{i}', how='left')
    df_encoded = df_encoded.with_columns(pl.col("transactionId_right").alias(f"{i}")).drop("transactionId_right")


subContinent
country
city
browser
operatingSystem
deviceCategory
source
v2ProductName
v2ProductCategory


In [507]:
df_encoded

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
date,i64,str,f64,f64,f64,f64,f64,f64,f64,i32,f64,f64
2017-07-31,1501520683,"""2219384770970157334""",0.0,0.0,0.006208,0.001795,0.005962,0.011631,0.003665,0,0.0,0.0
2017-07-31,1501520683,"""2219384770970157334""",0.0,0.0,0.006208,0.001795,0.005962,0.011631,0.003665,0,0.0,0.0
2017-07-31,1501520683,"""2219384770970157334""",0.0,0.0,0.006208,0.001795,0.005962,0.011631,0.003665,0,0.0,0.0
2017-07-31,1501520683,"""2219384770970157334""",0.0,0.0,0.006208,0.001795,0.005962,0.011631,0.003665,0,0.008,0.0
2017-07-31,1501520683,"""2219384770970157334""",0.0,0.0,0.006208,0.001795,0.005962,0.011631,0.003665,0,0.00738,0.0
…,…,…,…,…,…,…,…,…,…,…,…,…
2017-07-31,1501537217,"""7483600664917507409""",0.013947,0.014312,0.013889,0.011785,0.01692,0.011631,0.020215,1,0.007491,0.254777
2017-07-31,1501537217,"""7483600664917507409""",0.013947,0.014312,0.013889,0.011785,0.01692,0.011631,0.020215,1,0.004301,0.262
2017-07-31,1501537217,"""7483600664917507409""",0.013947,0.014312,0.013889,0.011785,0.01692,0.011631,0.020215,1,0.054054,0.262
2017-07-31,1501537217,"""7483600664917507409""",0.013947,0.014312,0.013889,0.011785,0.01692,0.011631,0.020215,1,0.04,0.262


# Encoding the df

In [508]:
encoded_df = target_encoding(df)
encoded_df

fullVisitorId,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName
str,f64,f64,f64,f64,f64,f64,i32,f64
"""2219384770970157334""",0.0,0.006208,0.001795,0.005962,0.011631,0.003665,0,0.0
"""2219384770970157334""",0.0,0.006208,0.001795,0.005962,0.011631,0.003665,0,0.0
"""2219384770970157334""",0.0,0.006208,0.001795,0.005962,0.011631,0.003665,0,0.0
"""2219384770970157334""",0.0,0.006208,0.001795,0.005962,0.011631,0.003665,0,0.008
"""2219384770970157334""",0.0,0.006208,0.001795,0.005962,0.011631,0.003665,0,0.00738
…,…,…,…,…,…,…,…,…
"""7483600664917507409""",0.014312,0.013889,0.011785,0.01692,0.011631,0.020215,1,0.007491
"""7483600664917507409""",0.014312,0.013889,0.011785,0.01692,0.011631,0.020215,1,0.004301
"""7483600664917507409""",0.014312,0.013889,0.011785,0.01692,0.011631,0.020215,1,0.054054
"""7483600664917507409""",0.014312,0.013889,0.011785,0.01692,0.011631,0.020215,1,0.04


In [509]:
# encoded_df = label_encoding(df)
# encoded_df

In [510]:
# One row per visitor; each feature column becomes the list of that visitor's
# encoded values across all of their rows.
profile_features = [
    'country', 'city', 'browser', 'operatingSystem',
    'deviceCategory', 'source', 'transactionId', 'v2ProductName',
]
user_profiles = encoded_df.group_by('fullVisitorId').agg(
    [pl.col(feature) for feature in profile_features]
)
user_profiles

fullVisitorId,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName
str,list[f64],list[f64],list[f64],list[f64],list[f64],list[f64],list[i32],list[f64]
"""6684428411656994293""","[0.014312, 0.014312, … 0.014312]","[0.006208, 0.006208, … 0.006208]","[0.011785, 0.011785, … 0.011785]","[0.006482, 0.006482, … 0.006482]","[0.003792, 0.003792, … 0.003792]","[0.003665, 0.003665, … 0.003665]","[0, 0, … 1]","[0.00382, 0.005525, … 0.00102]"
"""9592374441860245919""","[0.014312, 0.014312, … 0.014312]","[0.006208, 0.006208, … 0.006208]","[0.011785, 0.011785, … 0.011785]","[0.01692, 0.01692, … 0.01692]","[0.011631, 0.011631, … 0.011631]","[0.020215, 0.020215, … 0.020215]","[0, 0, … 0]","[0.00382, 0.0, … 0.230769]"
"""0280589015308186459""","[0.014312, 0.014312, … 0.014312]","[0.078788, 0.078788, … 0.078788]","[0.011785, 0.011785, … 0.011785]","[0.01692, 0.01692, … 0.01692]","[0.011631, 0.011631, … 0.011631]","[0.020215, 0.020215, … 0.020215]","[0, 0, … 1]","[0.009804, 0.0, … 0.095238]"
"""141510936377234435""","[0.0, 0.0, … 0.0]","[0.0, 0.0, … 0.0]","[0.011785, 0.011785, … 0.011785]","[0.005962, 0.005962, … 0.005962]","[0.011631, 0.011631, … 0.011631]","[0.003665, 0.003665, … 0.003665]","[0, 0, … 0]","[0.0, 0.009434, … 0.00907]"
"""4580381703402544047""","[0.014312, 0.014312, … 0.014312]","[0.013889, 0.013889, … 0.013889]","[0.011785, 0.011785, … 0.011785]","[0.01692, 0.01692, … 0.01692]","[0.011631, 0.011631, … 0.011631]","[0.003665, 0.003665, … 0.003665]","[0, 0, … 0]","[0.007663, 0.021739, … 0.02381]"
…,…,…,…,…,…,…,…,…
"""6466453014205648891""","[0.0, 0.0, … 0.0]","[0.0, 0.0, … 0.0]","[0.000797, 0.000797, … 0.000797]","[0.000566, 0.000566, … 0.000566]","[0.003792, 0.003792, … 0.003792]","[0.003665, 0.003665, … 0.003665]","[0, 0, … 0]","[0.00382, 0.0, … 0.0]"
"""7599785965042605635""","[0.0, 0.0, 0.0]","[0.006208, 0.006208, 0.006208]","[0.011785, 0.011785, 0.011785]","[0.006482, 0.006482, 0.006482]","[0.003792, 0.003792, 0.003792]","[0.020215, 0.020215, 0.020215]","[0, 0, 0]","[0.00382, 0.0, 0.012987]"
"""7839060813395976488""","[0.014312, 0.014312, … 0.014312]","[0.009153, 0.009153, … 0.009153]","[0.011785, 0.011785, … 0.011785]","[0.005962, 0.005962, … 0.005962]","[0.011631, 0.011631, … 0.011631]","[0.0, 0.0, … 0.0]","[0, 0, … 0]","[0.0, 0.0, … 0.0]"
"""7958854055537008406""","[0.003591, 0.003591, … 0.003591]","[0.0, 0.0, … 0.0]","[0.011785, 0.011785, … 0.011785]","[0.005962, 0.005962, … 0.005962]","[0.011631, 0.011631, … 0.011631]","[0.020215, 0.020215, … 0.020215]","[0, 0, … 0]","[0.0, 0.0, … 0.0]"


In [None]:
# Drop the visitor that is later selected as `test_users` from the profile table
# (presumably so the test visitor is not compared with their own profile).
user_profiles = user_profiles.filter(
    pl.col('fullVisitorId').ne('327480040583975209')
)

In [512]:
# Visitor profiles as a NumPy array, without the id column.
user_profiles_no_id = user_profiles.drop('fullVisitorId').to_numpy()
user_profiles_no_id

array([[array([0.01431195, 0.01431195, 0.01431195, 0.01431195, 0.01431195,
               0.01431195, 0.01431195, 0.01431195, 0.01431195, 0.01431195,
               0.01431195, 0.01431195, 0.01431195, 0.01431195, 0.01431195,
               0.01431195, 0.01431195, 0.01431195, 0.01431195, 0.01431195,
               0.01431195, 0.01431195, 0.01431195, 0.01431195, 0.01431195,
               0.01431195, 0.01431195, 0.01431195, 0.01431195, 0.01431195,
               0.01431195, 0.01431195, 0.01431195, 0.01431195, 0.01431195,
               0.01431195, 0.01431195, 0.01431195, 0.01431195, 0.01431195,
               0.01431195, 0.01431195, 0.01431195, 0.01431195, 0.01431195,
               0.01431195, 0.01431195, 0.01431195, 0.01431195, 0.01431195,
               0.01431195, 0.01431195, 0.01431195, 0.01431195, 0.01431195,
               0.01431195, 0.01431195, 0.01431195, 0.01431195, 0.01431195,
               0.01431195, 0.01431195, 0.01431195, 0.01431195, 0.01431195,
               0.01431195

In [513]:
# Row-level encoded features as a NumPy array, without the id column.
active_users = encoded_df.drop('fullVisitorId').to_numpy()
active_users

array([[0.        , 0.00620763, 0.00179533, ..., 0.00366492, 0.        ,
        0.        ],
       [0.        , 0.00620763, 0.00179533, ..., 0.00366492, 0.        ,
        0.        ],
       [0.        , 0.00620763, 0.00179533, ..., 0.00366492, 0.        ,
        0.        ],
       ...,
       [0.01431195, 0.01388889, 0.01178473, ..., 0.02021548, 1.        ,
        0.05405405],
       [0.01431195, 0.01388889, 0.01178473, ..., 0.02021548, 1.        ,
        0.04      ],
       [0.01431195, 0.01388889, 0.01178473, ..., 0.02021548, 0.        ,
        0.05181347]])

In [533]:
# Last encoded row of the test visitor as a plain list: the id first, then the features.
test_visitor_rows = encoded_df.filter(pl.col('fullVisitorId') == '327480040583975209')
test_users = list(test_visitor_rows.row(-1))
test_users

['327480040583975209',
 0.0,
 0.0,
 0.0017953321364452424,
 0.0059616381544841885,
 0.011630818887559475,
 0.0,
 0,
 0.008791208791208791]

In [515]:
similarities = calculate_similarity(user_profiles, user_profiles_no_id, test_users)
similarities

{'6684428411656994293': 0.40050657049958216,
 '9592374441860245919': 0.36533043871039006,
 '0280589015308186459': 0.46366573013238743,
 '141510936377234435': 0.2517919498096569,
 '4580381703402544047': 0.45127566931836066,
 '1613325791840871416': 0.6849115263510296,
 '6898267827792251004': 0.2583821472083936,
 '1725357188078545604': 0.4433912656616324,
 '527144094637192790': 0.20396723434784259,
 '9133494466022174769': 0.26810378711048516,
 '5292294760335216459': 0.46340735125257443,
 '2626535900570185226': 0.2089910441729416,
 '5046604766860733596': 0.13586080102434694,
 '4819435223377641926': 0.1053745004991688,
 '5814587779009501825': 0.2089910441729416,
 '3978274099697910696': 0.27884391805338615,
 '5995194327047604452': 0.20519650108376997,
 '893012663667461238': 0.25372768561759274,
 '7029957300776763943': 0.2089910441729416,
 '4767330263280425166': 0.2089910441729416,
 '4819740319213617590': 0.24761332663763955,
 '6319276542426499571': 0.3793105237338588,
 '7395695273680845703':

In [516]:
# (visitor id, similarity) pairs in ascending order of similarity, so the most
# similar visitors are the last entries. Despite the name, this is a list of tuples.
sorted_dict = sorted(
    similarities.items(),
    key=lambda id_and_score: id_and_score[1],
)
sorted_dict[-5:]

[('9050516262502639530', 0.8974812395355899),
 ('2969460869770139627', 0.904213283199848),
 ('2822863863340859098', 0.906042781313841),
 ('4750798256681672234', 0.9262239661737786),
 ('2932521422004466324', 0.9415537437772193)]

In [517]:
# Product names from this visitor's rows that carry a transaction id. Missing ids
# are real nulls, so test for null instead of comparing with the string 'null'.
# NOTE(review): this visitor id is not the one used to build `test_users`
# ('327480040583975209') — confirm that evaluating against a different visitor
# is intentional.
actual_bought = (
    df.filter(
        (pl.col('fullVisitorId') == '0345672699449577691')
        & pl.col('transactionId').is_not_null()
    )
    .select(pl.col('v2ProductName'))
    .to_series()
    .to_list()
)
actual_bought

['Google Blackout Cap',
 "Google Women's Vintage Hero Tee Black",
 "Google Men's Long & Lean Tee Charcoal",
 "Google Men's Vintage Badge Tee Black",
 'Google Blackout Cap',
 "Google Women's Vintage Hero Tee Black",
 "Google Men's Long & Lean Tee Charcoal",
 "Google Men's Vintage Badge Tee Black"]

In [518]:
rec = recommendation(df,sorted_dict)
rec

['Google 22 oz Water Bottle',
 'Google 25 oz Red Stainless Steel Bottle',
 "Google Men's Bike Short Sleeve Tee Charcoal",
 "Google Women's Scoop Neck Tee Black",
 "Android Men's Long & Lean Badge Tee Charcoal"]

In [519]:
precision(actual_bought, rec, 3)

0.0

In [520]:
# Persist the frame as CSV under ../data.
# NOTE(review): `df` is the raw query result (the date-parsed frame is `df_cleaned`),
# yet the file name says "cleaned" — confirm which frame is meant to be written.
df.write_csv("../data/cleaned_google_analytics.csv")

### Data Pre-processing Pipeline

In [521]:
def preprocessing_pipeline(
    df: pl.DataFrame,
    output_path: str = "../data/cleaned_google_analytics.csv",
) -> list:
    """
    Run the notebook's steps end to end: parse dates, encode the frame, build
    per-visitor profiles, score similarities, save the date-parsed frame and
    return product recommendations.

    Args:
        df : pl.DataFrame
            Raw frame whose `date` column holds "%Y%m%d" strings.
        output_path : str
            CSV destination for the date-parsed frame. Defaults to the path
            that used to be hard-coded in this function.

    Returns:
        list: the value returned by `recommendation` (the notebook cell that
        calls it displays a list of product names), not a dataframe.
    """
    df = df.with_columns(pl.col('date').str.strptime(pl.Date, format="%Y%m%d"))
    # NOTE(review): the notebook cells call `target_encoding`; confirm that
    # `encoding` is still provided by the `preprocessing` module.
    encoded_df = encoding(df)

    # One row per visitor, each feature collected into a list of encoded values.
    user_profiles = encoded_df.group_by(pl.col('fullVisitorId')).agg(
        pl.col('country'), pl.col('browser'),
        pl.col('transactionId'), pl.col('v2ProductName')
    )
    user_profiles_no_id = user_profiles.drop('fullVisitorId').to_numpy()
    active_users = encoded_df.drop('fullVisitorId').to_numpy()

    # NOTE(review): the notebook passes a single visitor row (`test_users`) as the
    # third argument, whereas every encoded row is passed here — confirm what
    # `calculate_similarity` expects.
    similarities = calculate_similarity(user_profiles, user_profiles_no_id, active_users)
    # Ascending by similarity: the most similar visitors end up last.
    ranked_similarities = sorted(similarities.items(), key=lambda item: item[1])

    df.write_csv(output_path)
    return recommendation(df, ranked_similarities)

In [522]:
# preprocessing_pipeline(df)

# Conclusions
- TODO: summarise the data-quality findings and the recommendation results.