In [1]:
import pyspark
from pyspark.sql.functions import col
from pyspark.sql.functions import when

In [2]:
# Build the application-named session; getOrCreate() hands back an already
# running session instead of starting a second one.
session_builder = pyspark.sql.SparkSession.builder.appName("eCommerce data")
spark = session_builder.getOrCreate()

In [3]:
# Where the raw CSV input is read from, and the base directory under which
# the per-event-type aggregates are written later on.
path = "data/"
output_path = "results/combined-by-category-brand/"

# The first line of the input supplies the column names; no schema is passed.
df = spark.read.option("header", True).csv(path)

#### get total number of events for this month

In [5]:
# Row count of the raw input, before any filtering or de-duplication.
total_events = df.count()
total_events

411709736

#### filter out null brands or categories

In [4]:
# A row is kept only if both brand and category_code are usable, i.e. neither
# a real NULL nor the literal text "null". Comparing NULL with != yields NULL,
# which a filter treats as "drop"; the explicit isNotNull() spells that out
# without changing which rows survive.
has_brand = col("brand").isNotNull() & (col("brand") != "null")
has_category = col("category_code").isNotNull() & (col("category_code") != "null")
df_not_null = df.filter(has_brand & has_category)

#### keep only one entry per user, product, and event type

In [5]:
# Each (user, product, event type) combination contributes a single row,
# no matter how many times it occurs in the input.
dedup_keys = ['user_id', 'product_id', 'event_type']
df_final = df_not_null.dropDuplicates(dedup_keys)

#### correct data by replacing category_code "construction.tools.light" with "electronics.smartphone"

In [6]:
# Rewrite one category label; every other category_code value passes through
# untouched.
wrong_category = "construction.tools.light"
right_category = "electronics.smartphone"
category = col("category_code")

df_corrected = df_final.withColumn(
    "category_code",
    when(category == wrong_category, right_category).otherwise(category),
)

#### from the corrected data, keep only the 'view' events

In [7]:
# Restrict the corrected rows to the 'view' event type.
is_view = col("event_type") == "view"
df_view = df_corrected.filter(is_view)

#### group results by category code and brand, count and sort then write results to file

In [10]:
# Number of 'view' rows per (category_code, brand), smallest count first.
view_counts = df_view.groupBy('category_code', 'brand').count().sort(col("count"))

# The writer's default save mode raises as soon as the target directory exists,
# which breaks every re-run of the notebook. Overwrite the previous output
# instead so the cell can be executed repeatedly.
view_counts.write.mode("overwrite").csv(output_path + "view-by-category-brand")

#### from the corrected data, keep only the 'purchase' events

In [11]:
# Restrict the corrected rows to the 'purchase' event type.
is_purchase = col("event_type") == "purchase"
df_purchase = df_corrected.filter(is_purchase)

#### group results by category code and brand, count and sort then write results to file

In [12]:
# Number of 'purchase' rows per (category_code, brand), smallest count first.
purchase_counts = df_purchase.groupBy('category_code', 'brand').count().sort(col("count"))

# The writer's default save mode raises as soon as the target directory exists,
# which breaks every re-run of the notebook. Overwrite the previous output
# instead so the cell can be executed repeatedly.
purchase_counts.write.mode("overwrite").csv(output_path + "purchase-by-category-brand")

#### from the corrected data, keep only the 'cart' events

In [13]:
# Restrict the corrected rows to the 'cart' event type.
is_cart = col("event_type") == "cart"
df_cart = df_corrected.filter(is_cart)

#### group results by category code and brand, count and sort then write results to file

In [14]:
# Number of 'cart' rows per (category_code, brand), smallest count first.
cart_counts = df_cart.groupBy('category_code', 'brand').count().sort(col("count"))

# The writer's default save mode raises as soon as the target directory exists,
# which breaks every re-run of the notebook. Overwrite the previous output
# instead so the cell can be executed repeatedly.
cart_counts.write.mode("overwrite").csv(output_path + "cart-by-category-brand")