# Exploratory Data Analysis

The main objectives of this notebook are: 
- Explore the dataset through univariate & multivariate analysis 
- Investigate distribution of data 
- Extract relevant insights to determine steps for feature engineering

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import polars as pl
import plotly.express as px
import sys 
import os 

# Manually add path to read from another folder 
path2add = os.path.normpath(
    os.path.abspath(os.path.join(os.path.dirname("__file__"), os.path.pardir, "utils"))
)
if not (path2add in sys.path):
    sys.path.append(path2add)

from eda import *

''

In [58]:
# df = pl.read_csv("../data/cleaned_google_analytics.csv",ignore_errors=True)
# df = pl.read_csv("../data/google_analytics_data.csv",ignore_errors=True)
df = pl.read_csv("../data/entire_google_analytics_data.csv", ignore_errors=True)

In [59]:
df.head()

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
i64,i64,i64,str,str,str,str,str,str,str,str,str,str
20170320,1490013196,7.41694360311984e+17,"""Southern Europe""","""Spain""","""not available in demo dataset""","""Chrome""","""Windows""","""desktop""","""google""",,"""YouTube Hard Cover Journal""","""Home/Shop by Brand/YouTube/"""
20170320,1490075224,7.517161247320425e+17,"""Northern America""","""United States""","""New York""","""Safari""","""iOS""","""mobile""","""google""",,"""Google Alpine Style Backpack""","""Home/Bags/"""
20170320,1490069949,3.875753008535006e+18,"""Northern America""","""United States""","""not available in demo dataset""","""Safari (in-app)""","""iOS""","""mobile""","""m.facebook.com""",,"""YouTube Custom Decals""","""Home/Shop by Brand/YouTube/"""
20170320,1490030549,,"""Southern Europe""","""Spain""","""Barcelona""","""Chrome""","""Macintosh""","""desktop""","""google""",,"""Suitcase Organizer Cubes""","""Home/Bags/"""
20170210,1486780166,1.321377067218363e+18,"""Northern America""","""United States""","""not available in demo dataset""","""Chrome""","""Android""","""mobile""","""google""",,"""YouTube RFID Journal""","""Home/Shop by Brand/YouTube/"""


In [60]:
df.shape

(18605964, 13)

In [61]:
df.columns

['date',
 'visitStartTime',
 'fullVisitorId',
 'subContinent',
 'country',
 'city',
 'browser',
 'operatingSystem',
 'deviceCategory',
 'source',
 'transactionId',
 'v2ProductName',
 'v2ProductCategory']

In [62]:
df.dtypes

[Int64,
 Int64,
 Int64,
 String,
 String,
 String,
 String,
 String,
 String,
 String,
 String,
 String,
 String]

In [63]:
df.describe()

statistic,date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
str,f64,f64,f64,str,str,str,str,str,str,str,str,str,str
"""count""",18605964.0,18605964.0,17307736.0,"""18605964""","""18605964""","""18605964""","""18605964""","""18605964""","""18605964""","""18605964""","""73899""","""18605964""","""18605964"""
"""null_count""",0.0,0.0,1298228.0,"""0""","""0""","""0""","""0""","""0""","""0""","""0""","""18532065""","""0""","""0"""
"""mean""",20165000.0,1482700000.0,4.1259e+18,,,,,,,,,,
"""std""",4692.864838,9722900.0,2.8348e+18,,,,,,,,,,
"""min""",20160801.0,1470000000.0,5104000000000.0,"""(not set)""","""(not set)""","""'s Hertogenbosch""","""(not set)""","""(not set)""","""desktop""","""(direct)""","""ORD2016080112""","""1 oz Hand Sanitizer""","""${escCatTitle}"""
"""25%""",20160912.0,1473700000.0,1.4003e+18,,,,,,,,,,
"""50%""",20161205.0,1480900000.0,4.0154e+18,,,,,,,,,,
"""75%""",20170402.0,1491100000.0,6.6052e+18,,,,,,,,,,
"""max""",20170801.0,1501700000.0,9.2234e+18,"""Western Europe""","""Åland Islands""","""not available in demo dataset""","""osee2unifiedRelease""","""iOS""","""tablet""","""youtube.com""","""ORD201708012585""","""YouTube Youth Short Sleeve Tee…","""YouTube/"""


In [64]:
df.select(pl.all().is_null().sum())

date,visitStartTime,fullVisitorId,subContinent,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductName,v2ProductCategory
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,1298228,0,0,0,0,0,0,0,18532065,0,0


In [65]:
# Unique visitors 
print("Unique visitors:")
print(df.select(pl.col('fullVisitorId').n_unique()))

Unique visitors:
shape: (1, 1)
┌───────────────┐
│ fullVisitorId │
│ ---           │
│ u32           │
╞═══════════════╡
│ 334649        │
└───────────────┘


In [66]:
print_popular_stats(df, "city")

city,count
str,u32
"""not available in demo dataset""",9878431
"""Mountain View""",1478940
"""New York""",967686
"""San Francisco""",628764
"""Sunnyvale""",490302
"""(not set)""",423665
"""San Jose""",341064
"""Los Angeles""",275088
"""Chicago""",260635
"""London""",177440


In [67]:
# Total transactions 
print("Number of transactions: ",df.select(pl.col("transactionId").is_not_null().sum()))

Number of transactions:  shape: (1, 1)
┌───────────────┐
│ transactionId │
│ ---           │
│ u32           │
╞═══════════════╡
│ 73899         │
└───────────────┘


In [87]:
categories = df.filter(pl.col('transactionId') != 'null').select(pl.col('v2ProductCategory'))
categories = categories.select(pl.col("v2ProductCategory").value_counts(sort=True)).unnest("v2ProductCategory")
fig = px.pie(categories[:10], values="count", names="v2ProductCategory", title="Top 10 Sold Categories")
fig.show()

In [69]:
pie_categories = df.filter(pl.col('transactionId') != 'null').select(pl.col('v2ProductCategory'))
pie_categories = pie_categories.select(pl.col("v2ProductCategory").value_counts(sort=True)).unnest("v2ProductCategory")
pie_categories.write_csv("../utils/pie_categories.csv")

In [70]:
pie_products=df.filter(pl.col('transactionId') != 'null').select(pl.col('v2ProductName'))
pie_products=pie_products.select(pl.col("v2ProductName").value_counts(sort=True)).unnest("v2ProductName")
pie_products.write_csv("../utils/pie_products.csv")

In [71]:
viewed_products = df.select(pl.col("v2ProductName").value_counts(sort=True)).unnest("v2ProductName")
viewed_categories = df.select(pl.col("v2ProductCategory").value_counts(sort=True)).unnest("v2ProductCategory")
viewed_products.write_csv("../utils/viewed_products.csv")
viewed_categories.write_csv("../utils/viewed_categories.csv")

In [72]:
fig = px.bar(print_popular_stats(df, "country"), x='country', y='count', title='User Country Origin')
fig.show()

In [73]:
fig = px.bar(print_popular_stats(df, "source"), x='source', y='count', title='Most Popular Sources')
fig.show()

In [74]:
fig = px.bar(print_popular_stats(df, "deviceCategory"), x='deviceCategory', y='count', title='Most Popular User Devices')
fig.show()

In [75]:
fig = px.bar(print_popular_stats(df, "operatingSystem"), x='operatingSystem', y='count', title='Most Popular User Devices')
fig.show()

In [76]:
daily_visitors=df.group_by(pl.col('date')).agg(pl.col('fullVisitorId').unique().len()).sort("date")
daily_visitors = daily_visitors.rename({"fullVisitorId":"Visitor Count"})
daily_visitors

date,Visitor Count
i64,u32
20160801,837
20160802,921
20160803,1084
20160804,1252
20160805,1049
…,…
20170728,1425
20170729,1032
20170730,1174
20170731,1534


In [77]:
daily_visitors = daily_visitors.with_columns(
    pl.col("date")
    # Convert to string to parse as date
    .cast(pl.Utf8) 
    .str.strptime(pl.Date, format="%Y%m%d")  
    # Format as day-month-year
    .dt.strftime("%d-%m-%Y") 
    .alias("date")
)
daily_visitors

date,Visitor Count
str,u32
"""01-08-2016""",837
"""02-08-2016""",921
"""03-08-2016""",1084
"""04-08-2016""",1252
"""05-08-2016""",1049
…,…
"""28-07-2017""",1425
"""29-07-2017""",1032
"""30-07-2017""",1174
"""31-07-2017""",1534


In [78]:
fig = px.bar(daily_visitors, x='date', y='Visitor Count', title='Daily Visitors')
fig.show()

In [79]:
daily_visitors.write_csv("../utils/daily_visitors.csv")

In [80]:
daily_purchases=df.filter(pl.col('transactionId') != 'null').group_by(
    pl.col('date')).agg(pl.len()).sort("date")
daily_purchases

date,len
i64,u32
20160801,226
20160802,124
20160804,78
20160805,376
20160806,148
…,…
20170728,318
20170729,148
20170730,194
20170731,440


In [81]:
daily_purchases.write_csv("../utils/daily_purchases.csv")

In [82]:
popular_products = df.filter(pl.col('transactionId') != 'null').group_by(
    pl.col('date','v2ProductName')).agg(pl.len()).sort("date","len", descending=[False, True])
popular_products

date,v2ProductName,len
i64,str,u32
20160801,"""Google Sunglasses""",22
20160801,"""Gift Card - $25.00""",20
20160801,"""22 oz Mini Mountain Bottle""",12
20160801,"""Engraved Ceramic Google Mug""",10
20160801,"""Google Men's 100% Cotton Short…",8
…,…,…
20170801,"""Google Leather Perforated Jour…",2
20170801,"""Waze Baby on Board Window Deca…",2
20170801,"""Seat Pack Organizer""",2
20170801,"""Google Tote Bag""",2


In [83]:
encoded_df = pl.read_csv('../data/encoded_df.csv',ignore_errors=True)
encoded_df.head()

fullVisitorId,country,city,browser,operatingSystem,deviceCategory,source,transactionId,v2ProductCategory
i64,f64,f64,f64,f64,f64,f64,i64,f64
2219384770970157334,0.0,0.005886,0.001795,0.00545,0.01104,0.003369,0,0.0
2219384770970157334,0.0,0.005886,0.001795,0.00545,0.01104,0.003369,0,0.0
2219384770970157334,0.0,0.005886,0.001795,0.00545,0.01104,0.003369,0,0.0
2219384770970157334,0.0,0.005886,0.001795,0.00545,0.01104,0.003369,0,0.0
2219384770970157334,0.0,0.005886,0.001795,0.00545,0.01104,0.003369,0,0.0


In [84]:
fig = px.imshow(encoded_df['country':].corr(), x=encoded_df.columns[1:], y=encoded_df.columns[1:], 
                    text_auto='.2f', title="Feature Correlations")
fig.show()