In [None]:
import pyspark
from pyspark.sql.functions import col
from pyspark.sql.functions import when

In [None]:
# Obtain the notebook's Spark session (an already-running one is reused).
spark = (
    pyspark.sql.SparkSession.builder
    .appName("eCommerce data")
    .getOrCreate()
)

#### read data from all files of all months

In [None]:
# Input location and the prefix under which result directories are written.
input_path = "data/"
output_path = "results/combined-by-category/"

# Read the CSV data under input_path into a single DataFrame, using the first
# line of each file as column names. No schema inference is requested, so the
# columns are loaded as strings.
df = spark.read.option("header", True).csv(input_path)

#### take a look at the dataframe

In [None]:
# Print the first five rows as a quick sanity check of the loaded data.
df.show(n=5)

#### get total number of events

In [None]:
# Number of rows (events) in the raw input; this triggers a full scan of the data.
total_events = df.count()
total_events

#### filter out null categories

In [None]:
# Keep only rows with a usable category_code: drop missing values (SQL NULL)
# as well as rows holding the literal string "null".
df_not_null = df.filter(
    col("category_code").isNotNull() & (col("category_code") != "null")
)

#### keep only one entry per user, product, and event type

In [None]:
# Collapse repeated events: at most one row survives for each
# (user_id, product_id, event_type) combination.
df_final = df_not_null.dropDuplicates(
    ["user_id", "product_id", "event_type"]
)

#### correct the data by replacing category_code "construction.tools.light" with "electronics.smartphone"

In [None]:
# Rows carrying the category code that is to be relabelled.
is_mislabeled = col("category_code") == "construction.tools.light"

# Rewrite category_code for those rows; every other row keeps its value.
# NOTE(review): df_corrected feeds three separate write jobs further down and is
# recomputed for each of them - consider caching it if runtime becomes an issue.
df_corrected = df_final.withColumn(
    "category_code",
    when(is_mislabeled, "electronics.smartphone").otherwise(col("category_code")),
)

#### keep only the 'view' events from the corrected data

In [None]:
# Subset of the corrected events whose event_type is exactly "view".
df_view = df_corrected.filter(
    col("event_type") == "view"
)

#### group by category code, count the events, sort by count (ascending), then write the results as CSV

In [None]:
# Count the 'view' events per category_code, order the categories by ascending
# count and write the result as CSV under output_path.
# mode("overwrite") replaces the output of a previous run; with the default
# save mode the write raises once the target directory exists, which breaks
# re-running the notebook.
(
    df_view.groupBy("category_code")
    .count()
    .sort(col("count"))
    .write.mode("overwrite")
    .csv(output_path + "view-by-category")
)

#### keep only the 'purchase' events from the corrected data

In [None]:
# Subset of the corrected events whose event_type is exactly "purchase".
df_purchase = df_corrected.filter(
    col("event_type") == "purchase"
)

#### group by category code, count the events, sort by count (ascending), then write the results as CSV

In [None]:
# Count the 'purchase' events per category_code, order the categories by
# ascending count and write the result as CSV under output_path.
# mode("overwrite") replaces the output of a previous run; with the default
# save mode the write raises once the target directory exists, which breaks
# re-running the notebook.
(
    df_purchase.groupBy("category_code")
    .count()
    .sort(col("count"))
    .write.mode("overwrite")
    .csv(output_path + "purchase-by-category")
)

#### keep only the 'cart' events from the corrected data

In [None]:
# Subset of the corrected events whose event_type is exactly "cart".
df_cart = df_corrected.filter(
    col("event_type") == "cart"
)

#### group by category code, count the events, sort by count (ascending), then write the results as CSV

In [None]:
# Count the 'cart' events per category_code, order the categories by ascending
# count and write the result as CSV under output_path.
# mode("overwrite") replaces the output of a previous run; with the default
# save mode the write raises once the target directory exists, which breaks
# re-running the notebook.
(
    df_cart.groupBy("category_code")
    .count()
    .sort(col("count"))
    .write.mode("overwrite")
    .csv(output_path + "cart-by-category")
)