# General Dataset Analyse

## Import

In [17]:
import os
import pyspark
import pyspark.sql.functions as f
import plotly.express as px
import plotly.graph_objects as go


## Read

In [18]:
#Pyspark
spark = pyspark.sql.SparkSession.builder.appName("app1").getOrCreate()
# sdf = spark.read.csv("data/*.csv", header=True, inferSchema=True)
sdf = spark.read.csv("data/2019-Nov.csv", header=True, inferSchema=True)
# sdf = spark.read.csv("data/test_data.csv", header=True, inferSchema=True)
sdf.show()

+--------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+
|          event_time|event_type|product_id|        category_id|       category_code|   brand| price|  user_id|        user_session|
+--------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+
|2019-11-01 00:00:...|      view|   1003461|2053013555631882655|electronics.smart...|  xiaomi|489.07|520088904|4d3b30da-a5e4-49d...|
|2019-11-01 00:00:...|      view|   5000088|2053013566100866035|appliances.sewing...|  janome|293.65|530496790|8e5f4f83-366c-4f7...|
|2019-11-01 00:00:...|      view|  17302664|2053013553853497655|                null|   creed| 28.31|561587266|755422e7-9040-477...|
|2019-11-01 00:00:...|      view|   3601530|2053013563810775923|appliances.kitche...|      lg|712.87|518085591|3bfb58cd-7892-48c...|
|2019-11-01 00:00:...|      view|   1004775|2053013555631882655|elect

## Prepare


In [19]:
sdf = sdf.withColumn("event_time", sdf["event_time"].cast(pyspark.sql.types.TimestampType()))

In [20]:
sdf.printSchema()

root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)



## Description

### event_time
- Timestamp in UTC
- no None-Values

In [21]:
sdf.select("event_time").show(5)
sdf.describe("event_time").show()

+-------------------+
|         event_time|
+-------------------+
|2019-11-01 01:00:00|
|2019-11-01 01:00:00|
|2019-11-01 01:00:01|
|2019-11-01 01:00:01|
|2019-11-01 01:00:01|
+-------------------+
only showing top 5 rows

+-------+
|summary|
+-------+
|  count|
|   mean|
| stddev|
|    min|
|    max|
+-------+



### event_type
- View
- Cart
- Purchase

In [None]:
print("Description:")
sdf.describe("event_type").show()

print("Unique Values:")
sdf.select("event_type").distinct().show()

print("Value Distribution:")
sdf_event_type_dist = sdf.groupBy("event_type").count()
sdf_event_type_dist.show()

Description:
+-------+----------+
|summary|event_type|
+-------+----------+
|  count|  67501979|
|   mean|      null|
| stddev|      null|
|    min|      cart|
|    max|      view|
+-------+----------+

Unique Values:
+----------+
|event_type|
+----------+
|  purchase|
|      view|
|      cart|
+----------+

Value Distribution:


In [None]:
# Plot Event Types
df = sdf_event_type_dist.toPandas()
fig = px.pie(df, values='count', names='event_type', title='Distribution of Customer Actions')
fig.show()

## Analyse

In [None]:
# prep for plot
sdf_time_dist = sdf.select("event_time", "event_type")
sdf_time_dist = sdf_time_dist.withColumn("year", f.year("event_time"))
sdf_time_dist = sdf_time_dist.withColumn("month", f.month("event_time"))
sdf_time_dist = sdf_time_dist.withColumn("weekofyear", f.weekofyear("event_time"))
sdf_time_dist = sdf_time_dist.withColumn("dayofyear", f.dayofyear("event_time"))
sdf_time_dist = sdf_time_dist.withColumn("dayofweek", f.dayofweek("event_time"))
sdf_time_dist = sdf_time_dist.withColumn("dayofmonth", f.dayofmonth("event_time"))
sdf_time_dist.show()

In [None]:
sdf_time_dist_month = sdf_time_dist.groupBy("event_type", "month", "dayofmonth").count()
sdf_time_dist_month = sdf_time_dist_month.withColumnRenamed("count", "cnt")
sdf_time_dist_month = sdf_time_dist_month.sort("event_type", "month", "dayofmonth")
sdf_time_dist_month.show()

In [None]:
# Timestamp Distribution (per event_type) over every day of month

df = sdf_time_dist_month.toPandas()

fig = go.Figure()
fig.update_layout(
    template="simple_white",
    xaxis=dict(title_text="Day of Month"),
    yaxis=dict(title_text="Count"),
    barmode="stack",
)
colors = ["#2A66DE", "#FFC32B", "#20aa1b"]

for r, c in zip(df.event_type.unique(), colors):
    plot_df = df[df.event_type == r]
    fig.add_trace(
        go.Bar(x=[plot_df.dayofmonth, plot_df.month], y=plot_df.cnt, name=r, marker_color=c),
    )
fig


In [None]:
sdf_time_dist.show()

In [None]:
sdf_time_dist_week = sdf_time_dist.groupBy("event_type", "weekofyear", "dayofweek").count()
sdf_time_dist_week = sdf_time_dist_week.withColumnRenamed("count", "cnt")
sdf_time_dist_week = sdf_time_dist_week.sort("event_type", "weekofyear", "dayofweek")
sdf_time_dist_week.show()

In [None]:
# Timestamp Distribution (per event_type) over every day of week

df = sdf_time_dist_week.toPandas()

fig = go.Figure()
fig.update_layout(
    template="simple_white",
    xaxis=dict(title_text="Day of Week"),
    yaxis=dict(title_text="Count"),
    barmode="stack",
)
colors = ["#2A66DE", "#FFC32B", "#20aa1b"]

for r, c in zip(df.event_type.unique(), colors):
    plot_df = df[df.event_type == r]
    fig.add_trace(
        go.Bar(x=[plot_df.dayofweek, plot_df.weekofyear], y=plot_df.cnt, name=r, marker_color=c),
    )
fig