# General Dataset Analyse

## Import

In [8]:
import os
import pyspark
import pyspark.sql.functions as f


## Read

In [9]:
#Pyspark
spark = pyspark.sql.SparkSession.builder.appName("app1").getOrCreate()
# sdf = spark.read.csv("data/*.csv", header=True, inferSchema=True)
# sdf = spark.read.csv("data/2019-Nov.csv", header=True, inferSchema=True)
sdf = spark.read.csv("data/test_data.csv", header=True, inferSchema=True)
sdf.show()

+-----------------------+----+--------+-------------------+----------------------+--------+------+---------+------------------------------------+
|2019-11-01 00:00:00 UTC|view| 1003461|2053013555631882655|electronics.smartphone|  xiaomi|489.07|520088904|4d3b30da-a5e4-49df-b1a8-ba5943f1dd33|
+-----------------------+----+--------+-------------------+----------------------+--------+------+---------+------------------------------------+
|   2019-11-01 00:00:...|view| 5000088|2053013566100866035|  appliances.sewing...|  janome|293.65|530496790|                8e5f4f83-366c-4f7...|
|   2019-11-01 00:00:...|view|17302664|2053013553853497655|                  null|   creed| 28.31|561587266|                755422e7-9040-477...|
|   2019-11-01 00:00:...|view| 3601530|2053013563810775923|  appliances.kitche...|      lg|712.87|518085591|                3bfb58cd-7892-48c...|
|   2019-11-01 00:00:...|view| 1004775|2053013555631882655|  electronics.smart...|  xiaomi|183.27|558856683|                

## Prepare


In [10]:
sdf = sdf.withColumn("event_time", sdf["event_time"].cast(pyspark.sql.types.TimestampType()))

AnalysisException: Cannot resolve column name "event_time" among (2019-11-01 00:00:00 UTC, view, 1003461, 2053013555631882655, electronics.smartphone, xiaomi, 489.07, 520088904, 4d3b30da-a5e4-49df-b1a8-ba5943f1dd33)

In [None]:
sdf.printSchema()

## Description

### event_time

In [None]:
sdf.describe("event_time").show()

### event_type

In [None]:
sdf.describe("event_type").show()

## Analyse


In [None]:
# Event Type Distribution
sdf_1910_event_type_dist = sdf.groupBy("event_type").count()
sdf_1910_event_type_dist.show()

In [None]:
# Plot Event Types
import plotly.express as px
df = sdf_1910_event_type_dist.toPandas()
fig = px.pie(df, values='count', names='event_type', title='Distribution of Customer Actions')
fig.show()

In [None]:
# prep for plot
sdf_time_dist = sdf.select("event_time", "event_type")
sdf_time_dist = sdf_time_dist.withColumn("year", f.year("event_time"))
sdf_time_dist = sdf_time_dist.withColumn("month", f.month("event_time"))
sdf_time_dist = sdf_time_dist.withColumn("weekofyear", f.weekofyear("event_time"))
sdf_time_dist = sdf_time_dist.withColumn("dayofyear", f.dayofyear("event_time"))
sdf_time_dist = sdf_time_dist.withColumn("dayofweek", f.dayofweek("event_time"))
sdf_time_dist = sdf_time_dist.withColumn("dayofmonth", f.dayofmonth("event_time"))
sdf_time_dist.show()

In [None]:
sdf_time_dist_month = sdf_time_dist.groupBy("event_type", "month", "dayofmonth").count()
sdf_time_dist_month = sdf_time_dist_month.withColumnRenamed("count", "cnt")
sdf_time_dist_month = sdf_time_dist_month.sort("event_type", "month", "dayofmonth")
sdf_time_dist_month.show()

In [None]:
# Timestamp Distribution (per event_type) over all

df = sdf_time_dist_month.toPandas()

import plotly.graph_objects as go

fig = go.Figure()

fig.update_layout(
    template="simple_white",
    xaxis=dict(title_text="Month"),
    yaxis=dict(title_text="Count"),
    barmode="stack",
)

colors = ["#2A66DE", "#FFC32B", "#20aa1b"]

for r, c in zip(df.event_type.unique(), colors):
    plot_df = df[df.event_type == r]
    fig.add_trace(
        go.Bar(x=[plot_df.dayofmonth, plot_df.month], y=plot_df.cnt, name=r, marker_color=c),
    )

fig