In [2]:
import os
import pyspark
import pandas as pd
import pyspark.sql.functions as f
import plotly.express as px
import plotly.graph_objects as go

In [3]:
 spark = pyspark.sql.SparkSession \
     .builder \
     .appName("app_great") \
     .getOrCreate()
 sc = spark.sparkContext

In [4]:
sdf_201911 = spark.read.csv("data/2019-Nov.csv", header=True, inferSchema=True)
sdf_201910 = spark.read.csv("data/2019-Oct.csv", header=True, inferSchema=True)

# join both months together
sdf = sdf_201910.union(sdf_201911)

In [5]:

sdf = sdf.withColumn("category_class", f.split(sdf["category_code"], "\.").getItem(0))
sdf = sdf.withColumn("category_sub_class", f.split(sdf["category_code"], "\.").getItem(1))
sdf = sdf.withColumn("category_sub_sub_class", f.split(sdf["category_code"], "\.").getItem(2))

sdf = sdf.withColumn("year", f.year("event_time"))
sdf = sdf.withColumn("month", f.month("event_time"))
sdf = sdf.withColumn("weekofyear", f.weekofyear("event_time"))
sdf = sdf.withColumn("dayofyear", f.dayofyear("event_time"))
sdf = sdf.withColumn("dayofweek", f.dayofweek("event_time"))
sdf = sdf.withColumn("dayofmonth", f.dayofmonth("event_time"))
sdf = sdf.withColumn("hour", f.hour("event_time"))

sdf = sdf.withColumn('turnover', f.when(f.col('event_type') == 'purchase', f.col('price')).otherwise(0))
sdf = sdf.withColumn('bougth_quantity', f.when(f.col('event_type') == 'purchase', f.lit(1)).otherwise(0))
sdf = sdf.withColumn('viewed_quantity', f.when(f.col('event_type') == 'view', f.lit(1)).otherwise(0))
sdf = sdf.withColumn('cart_quantity', f.when(f.col('event_type') == 'cart', f.lit(1)).otherwise(0))
# None Handling
# sdf = sdf.fillna(value="not defined")

sdf_raw= sdf
sdf.createOrReplaceTempView("Data")

## Most interactions

In [8]:
spark.sql("SELECT product_id, Count(*)  \
          FROM Data \
            GROUP BY product_id \
            ORDER BY Count(*) DESC")\
          .show(10)

+----------+--------+
|product_id|count(1)|
+----------+--------+
|   1004856| 1136760|
|   1005115| 1026435|
|   1004767| 1008311|
|   4804056|  605012|
|   1004870|  537856|
|   1004833|  532695|
|   1005105|  522086|
|   1004249|  519997|
|   1005160|  476435|
|   1002544|  475186|
+----------+--------+
only showing top 10 rows



In [9]:
spark.sql("SELECT category_id, Count(*)  \
          FROM Data \
            GROUP BY category_id \
            ORDER BY Count(*) DESC")\
          .show(10)

+-------------------+--------+
|        category_id|count(1)|
+-------------------+--------+
|2053013555631882655|27882231|
|2053013553559896355| 5166320|
|2053013554415534427| 3299707|
|2053013558920217191| 3297395|
|2053013554658804075| 2917065|
|2053013563810775923| 2273270|
|2053013565983425517| 2272464|
|2053013563651392361| 2235234|
|2053013553341792533| 1824496|
|2053013563911439225| 1575096|
+-------------------+--------+
only showing top 10 rows



In [1]:
spark.sql("SELECT category_code, Count(*)  \
          FROM Data \
            GROUP BY category_code \
            ORDER BY Count(*) DESC")\
          .show(10)

NameError: name 'spark' is not defined

In [11]:
spark.sql("SELECT category_class, Count(*)  \
          FROM Data \
            GROUP BY category_class \
            ORDER BY Count(*) DESC")\
          .show(10)

+--------------+--------+
|category_class|count(1)|
+--------------+--------+
|   electronics|40141709|
|          null|35413780|
|    appliances|13457119|
|     computers| 6505575|
|       apparel| 4554025|
|     furniture| 3358989|
|          auto| 2227274|
|  construction| 1822989|
|          kids| 1327003|
|   accessories|  628276|
+--------------+--------+
only showing top 10 rows



In [12]:
spark.sql("SELECT brand, Count(*)  \
          FROM Data \
            GROUP BY brand \
            ORDER BY Count(*) DESC")\
          .show(10)

+-------+--------+
|  brand|count(1)|
+-------+--------+
|   null|15331243|
|samsung|13172020|
|  apple|10381933|
| xiaomi| 7721825|
| huawei| 2521331|
|lucente| 1840936|
|     lg| 1659394|
|  bosch| 1532149|
|   oppo| 1294585|
|   sony| 1255101|
+-------+--------+
only showing top 10 rows



## Most Views

In [13]:
spark.sql("SELECT product_id, SUM(viewed_quantity)  \
          FROM Data \
            GROUP BY product_id \
            ORDER BY SUM(viewed_quantity) DESC")\
          .show(10)

+----------+--------------------+
|product_id|sum(viewed_quantity)|
+----------+--------------------+
|   1004856|              942167|
|   1005115|              910725|
|   1004767|              861675|
|   4804056|              497431|
|   1005105|              473651|
|   1004870|              462532|
|   1004249|              462353|
|   1004833|              450464|
|   1005160|              437340|
|   1002544|              409169|
+----------+--------------------+
only showing top 10 rows



In [21]:
spark.sql("SELECT category_id, SUM(viewed_quantity)  \
          FROM Data \
            GROUP BY category_id \
            ORDER BY SUM(viewed_quantity) DESC")\
          .show(10)

+-------------------+--------------------+
|        category_id|sum(viewed_quantity)|
+-------------------+--------------------+
|2053013555631882655|            25451835|
|2053013553559896355|             4776821|
|2053013558920217191|             3189382|
|2053013554415534427|             3105713|
|2053013554658804075|             2663452|
|2053013563651392361|             2171796|
|2053013565983425517|             2160880|
|2053013563810775923|             2145086|
|2053013553341792533|             1725008|
|2053013561579406073|             1532333|
+-------------------+--------------------+
only showing top 10 rows



In [22]:
spark.sql("SELECT category_code, SUM(viewed_quantity)  \
          FROM Data \
            GROUP BY category_code \
            ORDER BY SUM(viewed_quantity) DESC")\
          .show(10)

+--------------------+--------------------+
|       category_code|sum(viewed_quantity)|
+--------------------+--------------------+
|                null|            34073918|
|electronics.smart...|            25451835|
|  electronics.clocks|             3267223|
|  computers.notebook|             3209430|
|electronics.video.tv|             3127266|
|electronics.audio...|             2663452|
|       apparel.shoes|             2596322|
|appliances.kitche...|             2225429|
|appliances.enviro...|             2217058|
|appliances.kitche...|             2145086|
+--------------------+--------------------+
only showing top 10 rows



In [18]:
spark.sql("SELECT category_class, SUM(viewed_quantity)  \
          FROM Data \
            GROUP BY category_class \
            ORDER BY SUM(viewed_quantity) DESC")\
          .show(10)

+--------------+--------------------+
|category_class|sum(viewed_quantity)|
+--------------+--------------------+
|   electronics|            37026582|
|          null|            34073918|
|    appliances|            12837916|
|     computers|             6297977|
|       apparel|             4477518|
|     furniture|             3295250|
|          auto|             2157706|
|  construction|             1759762|
|          kids|             1292002|
|   accessories|              616475|
+--------------+--------------------+
only showing top 10 rows



In [9]:
spark.sql("SELECT brand, SUM(viewed_quantity)  \
          FROM Data \
            GROUP BY brand \
            ORDER BY SUM(viewed_quantity) DESC")\
          .show(10)

+-------+--------------------+
|  brand|sum(viewed_quantity)|
+-------+--------------------+
|   null|            14922708|
|samsung|            11898628|
|  apple|             9374247|
| xiaomi|             7232401|
| huawei|             2358235|
|lucente|             1775749|
|     lg|             1574848|
|  bosch|             1480771|
|   oppo|             1203440|
|   sony|             1193071|
+-------+--------------------+
only showing top 10 rows



## Most Add_to_carts

In [14]:
spark.sql("SELECT product_id, SUM(cart_quantity)  \
          FROM Data \
            GROUP BY product_id \
            ORDER BY SUM(cart_quantity) DESC")\
          .show(10)

+----------+------------------+
|product_id|sum(cart_quantity)|
+----------+------------------+
|   1004856|            133328|
|   1004767|            102217|
|   1005115|             80923|
|   4804056|             77400|
|   1004833|             56048|
|   1004870|             54036|
|   1002544|             43790|
|   5100816|             40887|
|   1004249|             39673|
|   1005100|             38976|
+----------+------------------+
only showing top 10 rows



In [10]:
spark.sql("SELECT category_id, SUM(cart_quantity)  \
          FROM Data \
            GROUP BY category_id \
            ORDER BY SUM(cart_quantity) DESC")\
          .show(10)

+-------------------+------------------+
|        category_id|sum(cart_quantity)|
+-------------------+------------------+
|2053013555631882655|           1709731|
|2053013553559896355|            277667|
|2053013554658804075|            182276|
|2053013554415534427|            142271|
|2053013563810775923|             92264|
|2053013565983425517|             81453|
|2053013558920217191|             74233|
|2053013553341792533|             69837|
|2053013553375346967|             47899|
|2053013563911439225|             43941|
+-------------------+------------------+
only showing top 10 rows



In [11]:
spark.sql("SELECT category_code, SUM(cart_quantity)  \
          FROM Data \
            GROUP BY category_code \
            ORDER BY SUM(cart_quantity) DESC")\
          .show(10)

+--------------------+------------------+
|       category_code|sum(cart_quantity)|
+--------------------+------------------+
|electronics.smart...|           1709731|
|                null|            932219|
|electronics.audio...|            182276|
|electronics.video.tv|            142691|
|appliances.kitche...|             92264|
|  electronics.clocks|             89633|
|appliances.enviro...|             82099|
|  computers.notebook|             74724|
|appliances.kitche...|             65228|
|       apparel.shoes|             40074|
+--------------------+------------------+
only showing top 10 rows



In [12]:
spark.sql("SELECT category_class, SUM(cart_quantity)  \
          FROM Data \
            GROUP BY category_class \
            ORDER BY SUM(cart_quantity) DESC")\
          .show(10)

+--------------+------------------+
|category_class|sum(cart_quantity)|
+--------------+------------------+
|   electronics|           2198460|
|          null|            932219|
|    appliances|            445181|
|     computers|            145266|
|       apparel|             54290|
|          auto|             48229|
|  construction|             46727|
|     furniture|             43896|
|          kids|             23353|
|   accessories|              8060|
+--------------+------------------+
only showing top 10 rows



In [13]:
spark.sql("SELECT brand, SUM(cart_quantity)  \
          FROM Data \
            GROUP BY brand \
            ORDER BY SUM(cart_quantity) DESC")\
          .show(10)

+--------+------------------+
|   brand|sum(cart_quantity)|
+--------+------------------+
| samsung|            900469|
|   apple|            698749|
|  xiaomi|            364516|
|    null|            277048|
|  huawei|            115892|
|cordiant|             65317|
|    oppo|             65174|
|      lg|             62940|
|    sony|             44992|
|   artel|             40400|
+--------+------------------+
only showing top 10 rows



## Most purchases

In [15]:
spark.sql("SELECT product_id, SUM(bougth_quantity)  \
          FROM Data \
            GROUP BY product_id \
            ORDER BY SUM(bougth_quantity) DESC")\
          .show(10)

+----------+--------------------+
|product_id|sum(bougth_quantity)|
+----------+--------------------+
|   1004856|               61265|
|   1004767|               44419|
|   1005115|               34787|
|   4804056|               30181|
|   1004833|               26183|
|   1002544|               22227|
|   1004870|               21288|
|   1004249|               17971|
|   1005105|               15776|
|   1004836|               15549|
+----------+--------------------+
only showing top 10 rows



In [14]:
spark.sql("SELECT category_id, SUM(bougth_quantity)  \
          FROM Data \
            GROUP BY category_id \
            ORDER BY SUM(bougth_quantity) DESC")\
          .show(10)

+-------------------+--------------------+
|        category_id|sum(bougth_quantity)|
+-------------------+--------------------+
|2053013555631882655|              720665|
|2053013553559896355|              111832|
|2053013554658804075|               71337|
|2053013554415534427|               51723|
|2053013563810775923|               35920|
|2053013558920217191|               33780|
|2053013565983425517|               30131|
|2053013553341792533|               29651|
|2053013563651392361|               24606|
|2053013553375346967|               16830|
+-------------------+--------------------+
only showing top 10 rows



In [6]:
spark.sql("SELECT category_code, SUM(bougth_quantity)  \
          FROM Data \
            GROUP BY category_code \
            ORDER BY SUM(bougth_quantity) DESC")\
          .show(10,truncate=False)

+--------------------------------+--------------------+
|category_code                   |sum(bougth_quantity)|
+--------------------------------+--------------------+
|electronics.smartphone          |720665              |
|null                            |407643              |
|electronics.audio.headphone     |71337               |
|electronics.video.tv            |51839               |
|electronics.clocks              |41143               |
|appliances.kitchen.washer       |35920               |
|computers.notebook              |34023               |
|appliances.environment.vacuum   |30571               |
|appliances.kitchen.refrigerators|24260               |
|apparel.shoes                   |14395               |
+--------------------------------+--------------------+
only showing top 10 rows



In [17]:
spark.sql("SELECT category_class, SUM(bougth_quantity)  \
          FROM Data \
            GROUP BY category_class \
            ORDER BY SUM(bougth_quantity) DESC")\
          .show(10)

+--------------+--------------------+
|category_class|sum(bougth_quantity)|
+--------------+--------------------+
|   electronics|              916667|
|          null|              407643|
|    appliances|              174022|
|     computers|               62332|
|       apparel|               22217|
|          auto|               21339|
|     furniture|               19843|
|  construction|               16500|
|          kids|               11648|
|   accessories|                3741|
+--------------+--------------------+
only showing top 10 rows



In [16]:
spark.sql("SELECT brand, SUM(bougth_quantity)  \
          FROM Data \
            GROUP BY brand \
            ORDER BY SUM(bougth_quantity) DESC")\
          .show(10)

+--------+--------------------+
|   brand|sum(bougth_quantity)|
+--------+--------------------+
| samsung|              372923|
|   apple|              308937|
|    null|              131487|
|  xiaomi|              124908|
|  huawei|               47204|
|cordiant|               27534|
| lucente|               26137|
|    oppo|               25971|
|      lg|               21606|
|    sony|               17038|
+--------+--------------------+
only showing top 10 rows



## Most turnover

In [16]:
spark.sql("SELECT product_id, SUM(turnover)  \
          FROM Data \
            GROUP BY product_id \
            ORDER BY SUM(turnover) DESC")\
          .show(10)

+----------+--------------------+
|product_id|       sum(turnover)|
+----------+--------------------+
|   1005115|3.3032381669999957E7|
|   1005105|2.1684603370000023E7|
|   1004249|1.3545407539999992E7|
|   1005135|1.2654328769999998E7|
|   1004767|1.1004748490000015E7|
|   1002544|1.0458895980000004E7|
|   1004856|   7917932.739999985|
|   1005116|   7161938.290000007|
|   1002524|  6965532.8900000015|
|   1004870|   6057424.029999999|
+----------+--------------------+
only showing top 10 rows



In [17]:
spark.sql("SELECT category_id, SUM(turnover)  \
          FROM Data \
            GROUP BY category_id \
            ORDER BY SUM(turnover) DESC")\
          .show(10)

+-------------------+--------------------+
|        category_id|       sum(turnover)|
+-------------------+--------------------+
|2053013555631882655|3.3487128497999793E8|
|2053013554415534427|2.0837420189999986E7|
|2053013558920217191| 1.965284848000001E7|
|2053013563810775923|1.0460552920000006E7|
|2053013553341792533|   9738098.979999999|
|2053013554658804075|   9208629.480000021|
|2053013553559896355|   6956439.380000008|
|2053013563911439225|   6579577.310000001|
|2053013563651392361|   5279694.420000004|
|2053013565983425517|          4422258.17|
+-------------------+--------------------+
only showing top 10 rows



In [18]:
spark.sql("SELECT category_code, SUM(turnover)  \
          FROM Data \
            GROUP BY category_code \
            ORDER BY SUM(turnover) DESC")\
          .show(10)

+--------------------+--------------------+
|       category_code|       sum(turnover)|
+--------------------+--------------------+
|electronics.smart...|3.3487128497999793E8|
|                null| 5.280544380999989E7|
|electronics.video.tv|2.0880559019999992E7|
|  computers.notebook|1.9658316960000005E7|
|  electronics.clocks|1.1371042719999995E7|
|appliances.kitche...|1.0460552920000006E7|
|electronics.audio...|   9208629.480000021|
|appliances.kitche...|   8552734.309999997|
|appliances.enviro...|   4478737.029999998|
|  electronics.tablet|          3131226.79|
+--------------------+--------------------+
only showing top 10 rows



In [19]:
spark.sql("SELECT category_class, SUM(turnover)  \
          FROM Data \
            GROUP BY category_class \
            ORDER BY SUM(turnover) DESC")\
          .show(10)

+--------------+--------------------+
|category_class|       sum(turnover)|
+--------------+--------------------+
|   electronics| 3.817142865699979E8|
|          null| 5.280544380999989E7|
|    appliances|3.2223623589999974E7|
|     computers|2.5373205340000022E7|
|     furniture|   4216980.010000001|
|          auto|   2649036.469999999|
|  construction|  2013385.7200000002|
|       apparel|  1805962.8699999992|
|          kids|  1401903.9300000002|
|         sport|           718251.75|
+--------------+--------------------+
only showing top 10 rows



In [20]:
spark.sql("SELECT brand, SUM(turnover)  \
          FROM Data \
            GROUP BY brand \
            ORDER BY SUM(turnover) DESC")\
          .show(10)

+-------+--------------------+
|  brand|       sum(turnover)|
+-------+--------------------+
|  apple|2.3872179370000052E8|
|samsung|1.0127741347999987E8|
| xiaomi|       2.045389925E7|
|   null|1.9566319819999993E7|
| huawei|   9664104.090000004|
|     lg|   8626906.719999999|
|   acer|          6924026.05|
|lucente|          6651658.94|
|   sony|          6341082.98|
|   oppo|   5901500.519999996|
+-------+--------------------+
only showing top 10 rows

