In [30]:
import os
import pyspark
import pandas as pd
import pyspark.sql.functions as f
import plotly.express as px
import plotly.graph_objects as go
import time

In [31]:
# Create (or reuse) the Spark session that every later cell works with.
spark = (
    pyspark.sql.SparkSession.builder
    .appName("app1")
    .getOrCreate()
)
# Alternative inputs, kept for reference (the active read happens in the next cell):
# sdf = spark.read.csv("data/*.csv", header=True, inferSchema=True)
# sdf_201911 = spark.read.csv("data/2019-Nov.csv", header=True, inferSchema=True)
# sdf_201910 = spark.read.csv("data/2019-Oct.csv", header=True, inferSchema=True)
# sdf = spark.read.csv("data/test_data.csv", header=True, inferSchema=True)

In [32]:
# Load the event log from the test file and preview the first rows.
# To work on the two monthly files instead, combine them with:
# sdf = sdf_201910.union(sdf_201911)
sdf = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("data/test_data.csv")
)
sdf.show()

+--------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+
|          event_time|event_type|product_id|        category_id|       category_code|   brand| price|  user_id|        user_session|
+--------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+
|2019-11-01 00:00:...|      view|   1003461|2053013555631882655|electronics.smart...|  xiaomi|489.07|520088904|4d3b30da-a5e4-49d...|
|2019-11-01 00:00:...|      view|   5000088|2053013566100866035|appliances.sewing...|  janome|293.65|530496790|8e5f4f83-366c-4f7...|
|2019-11-01 00:00:...|      view|  17302664|2053013553853497655|                null|   creed| 28.31|561587266|755422e7-9040-477...|
|2019-11-01 00:00:...|      view|   3601530|2053013563810775923|appliances.kitche...|      lg|712.87|518085591|3bfb58cd-7892-48c...|
|2019-11-01 00:00:...|      view|   1004775|2053013555631882655|elect

## Preparation

Prepare and enhance data for analysis and modelling.

In [33]:
# Datatypes: parse the timestamp and store the ID columns as strings
sdf = (
    sdf
    .withColumn("event_time", f.col("event_time").cast(pyspark.sql.types.TimestampType()))
    .withColumn("category_id", f.col("category_id").cast(pyspark.sql.types.StringType()))
    .withColumn("product_id", f.col("product_id").cast(pyspark.sql.types.StringType()))
    .withColumn("user_id", f.col("user_id").cast(pyspark.sql.types.StringType()))
)

# Feature Splitting: keep only the part of category_code before the first '.'
sdf = sdf.withColumn("category_class", f.substring_index(f.col("category_code"), ".", 1))

# Calendar features derived from the event timestamp
sdf = (
    sdf
    .withColumn("year", f.year("event_time"))
    .withColumn("month", f.month("event_time"))
    .withColumn("weekofyear", f.weekofyear("event_time"))
    .withColumn("dayofyear", f.dayofyear("event_time"))
    .withColumn("dayofweek", f.dayofweek("event_time"))
    .withColumn("dayofmonth", f.dayofmonth("event_time"))
)

# None Handling: a string fill value only replaces nulls in string columns,
# so numeric and timestamp columns keep their nulls
sdf = sdf.fillna("not defined")

sdf.printSchema()

root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = false)
 |-- product_id: string (nullable = false)
 |-- category_id: string (nullable = false)
 |-- category_code: string (nullable = false)
 |-- brand: string (nullable = false)
 |-- price: double (nullable = true)
 |-- user_id: string (nullable = false)
 |-- user_session: string (nullable = false)
 |-- category_class: string (nullable = false)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- weekofyear: integer (nullable = true)
 |-- dayofyear: integer (nullable = true)
 |-- dayofweek: integer (nullable = true)
 |-- dayofmonth: integer (nullable = true)



## Dataframe Creation

Create several dataframes at different aggregation levels to answer different questions/tasks.

In [34]:
# raw: event-level frame that all aggregations below start from
sdf_raw = sdf
sdf_raw.show(n=20, truncate=True)

+-------------------+----------+----------+-------------------+--------------------+-----------+------+---------+--------------------+--------------+----+-----+----------+---------+---------+----------+
|         event_time|event_type|product_id|        category_id|       category_code|      brand| price|  user_id|        user_session|category_class|year|month|weekofyear|dayofyear|dayofweek|dayofmonth|
+-------------------+----------+----------+-------------------+--------------------+-----------+------+---------+--------------------+--------------+----+-----+----------+---------+---------+----------+
|2019-11-01 01:00:00|      view|   1003461|2053013555631882655|electronics.smart...|     xiaomi|489.07|520088904|4d3b30da-a5e4-49d...|   electronics|2019|   11|        44|      305|        6|         1|
|2019-11-01 01:00:00|      view|   5000088|2053013566100866035|appliances.sewing...|     janome|293.65|530496790|8e5f4f83-366c-4f7...|    appliances|2019|   11|        44|      305|       

In [35]:
# Number of unique users per month: one row per (user, month) pair, then a row count per month

sdf_user_id_by_month = (
    sdf_raw.select("user_id", "month")
    .dropDuplicates()
    .groupBy("month")
    .count()
)

In [36]:
# Data Prep for user graph

# Build the plotting frame in one chain so no column is assigned on a filtered
# slice (avoids pandas' SettingWithCopyWarning).
# Month 12 is dropped; presumably these are a few rows that spill into December
# after the timestamp conversion -- TODO confirm.
# Sorting happens while `month` is still numeric so the bars appear in calendar
# order; the cast to str afterwards makes plotly treat the axis as categorical.
pdf_usr_id_by_mnth = (
    sdf_user_id_by_month.toPandas()
    .loc[lambda pdf: pdf["month"] != 12]
    .sort_values("month")
    .assign(month=lambda pdf: pdf["month"].astype(str))
)
px.bar(pdf_usr_id_by_mnth, x="month", y="count", title="Unique users each month")

In October, 3.02 million people visited the site; in November, the total user count grew to 3.696 million,
a difference of around 700,000 users.

In [37]:
# Per user: sum of the distinct month numbers the user was active in.
# The result column is named "sum(month)".
sdf_user_id_both_months = (
    sdf_raw.select("user_id", "month")
    .dropDuplicates()
    .groupBy("user_id")
    .sum("month")
)

In [38]:
# Count users that were active in October (10) AND November (11).
# Comparing the summed month numbers with 21 breaks as soon as a user also has
# rows in another month (e.g. 10 + 11 + 12 = 33), so those users would be
# missed. Instead, restrict to the two months of interest and require that
# both of them occur for the user.
months_required = [10, 11]
cnt = (
    sdf_raw.where(f.col("month").isin(months_required))
    .select("user_id", "month")
    .distinct()
    .groupBy("user_id")
    .count()
    .where(f.col("count") == len(months_required))
    .count()
)
print("Amount of users that visited the site in both months:", cnt)

Amount of users that visited the site in both months: 0


Not every user visits the site in only one of the two months. A total of 1,400,979 users
visited the page in both October and November. (This figure does not match the output above, which comes from the small `data/test_data.csv` sample.)

### user_session

In [39]:
# One row per (session, month) pair ...
sdf_usr_ses = sdf_raw.select("user_session", "month").dropDuplicates()

# ... and the number of such pairs per month, i.e. unique sessions each month
sdf_ses_by_mnth = sdf_usr_ses.groupBy("month").count()

In [40]:
# Data prep for the session graph.
# Use the per-month aggregate (`sdf_ses_by_mnth`), not the un-aggregated
# session list, and plot the session frame itself. The earlier version reused
# the *user* frame for both the month column and the chart, so the
# "Unique sessions" plot showed the user counts again.
# Month 12 is dropped, mirroring the user graph. Sorting happens while `month`
# is still numeric so the bars appear in calendar order.
pdf_usr_ses_by_mnth = (
    sdf_ses_by_mnth.toPandas()
    .loc[lambda pdf: pdf["month"] != 12]
    .sort_values("month")
    .assign(month=lambda pdf: pdf["month"].astype(str))
)
px.bar(pdf_usr_ses_by_mnth, x="month", y="count", title="Unique sessions each month")

Overview:
- How many sessions does a user have on average?
- How many products does a user buy / view / put in cart on average?
- How many interactions does a user have within one session on average?
- Average sessions per (week)day => boxplot?


#### How many sessions does a user have on average?

In [41]:
# Number of distinct sessions per user, followed by summary statistics of that number
sdf_agg_user = (
    sdf_raw.select("user_id", "user_session")
    .dropDuplicates()
    .groupBy("user_id")
    .count()
)
sdf_agg_user.describe("count").show()

+-------+-----+
|summary|count|
+-------+-----+
|  count|  126|
|   mean|  1.0|
| stddev|  0.0|
|    min|    1|
|    max|    1|
+-------+-----+



In [42]:
# Distinct sessions per (user, month). Built once and reused for both months
# instead of re-declaring the identical aggregation before each report.
sdf_agg_user_mnth = (
    sdf_raw.select("user_id", "user_session", "month")
    .distinct()
    .groupBy("user_id", "month")
    .count()
)

# Print the summary statistics of sessions-per-user for each month of interest
for month_number, month_name in ((10, "October"), (11, "November")):
    print(f"Statistical distribution of Sessions per User in {month_name}")
    sdf_agg_user_mnth.where(sdf_agg_user_mnth.month == month_number).select("count").describe().show()

Statistical distribution of Sessions per User in October
+-------+-----+
|summary|count|
+-------+-----+
|  count|    0|
|   mean| null|
| stddev| null|
|    min| null|
|    max| null|
+-------+-----+

Statistical distribution of Sessions per User in November
+-------+-----+
|summary|count|
+-------+-----+
|  count|  126|
|   mean|  1.0|
| stddev|  0.0|
|    min|    1|
|    max|    1|
+-------+-----+


Finished operation in 6.7s


#### How many products does a user buy / view / put in cart on average?

In [43]:
# Total number of events per event type ...
sdf_amount_events = sdf_raw.groupBy("event_type").count()
# ... and the number of distinct users to normalise by
amount_usr = sdf_raw.select("user_id").dropDuplicates().count()
pdf_avrg_events_usr = sdf_amount_events.toPandas()

# From here on the "count" column holds events per user, not a raw count
pdf_avrg_events_usr["count"] = pdf_avrg_events_usr["count"] / amount_usr
print(pdf_avrg_events_usr)

  event_type     count
0   purchase  0.015873
1       view  1.698413
2       cart  0.007937


On average, a user views XX items, puts XX items in their cart, and buys XX items.


In [44]:
# Number of events (of any type) per user
sdf_agg_user_type = sdf_raw.groupBy("user_id").count()
# Single-element list holding the row with the highest event count
most_active_user = sdf_agg_user_type.orderBy(f.col("count").desc()).take(1)

print(f"The most active user has the ID {most_active_user[0][0]} with a total of {most_active_user[0][1]} interactions.")

The most active user has the ID 566255262 with a total of 6 interactions.


#### How many interactions does a user have within one session on average? - done

In [45]:
# Number of events per (session, event type) pair
sdf_amount_interactions_per_sess = (
    sdf_raw.select("user_session", "event_type")
    .groupBy("user_session", "event_type")
    .count()
)
# sdf_amount_interactions_per_sess.show()

# Each mean is taken only over sessions that contain at least one event of the
# given type, because sessions without that type have no row in the aggregate.
for heading, event_name in (
    ("Average amount of views per session:", "view"),
    ("Average amount of 'Add to Cart' per session:", "cart"),
    ("Average amount of purchases per session:", "purchase"),
):
    print(heading)
    sdf_amount_interactions_per_sess.where(f.col("event_type") == event_name).agg({"count": "mean"}).show()

Average amount of views per session:
+----------+
|avg(count)|
+----------+
|     1.712|
+----------+

Average amount of 'Add to Cart' per session:
+----------+
|avg(count)|
+----------+
|       1.0|
+----------+

Average amount of purchases per session:
+----------+
|avg(count)|
+----------+
|       1.0|
+----------+


Finished operation in 3.68s


#### Average sessions per weekday

In [None]:
# User session distribution over every day of week

# Deduplicate (session, day) pairs before counting so that each session is
# counted once per day. Without this step the count is the number of *events*
# per day, which does not match the "Sessions" title.
sdf_usr_ses_dist_week = (
    sdf_raw.select("user_session", "dayofyear", "dayofweek")
    .distinct()
    .groupBy("dayofyear", "dayofweek")
    .count()
)
pdf_usr_sess_time_dist = sdf_usr_ses_dist_week.toPandas()

# One box per weekday; each data point is the session count of one calendar day
fig_usr_time = px.box(pdf_usr_sess_time_dist, x="dayofweek", y="count", title="Average Sessions per Weekday")
fig_usr_time.update_layout(
    xaxis_title="Day of Week",
    yaxis_title="Sessions per day"
)