# Initial Setup

In [1]:
# basic imports 

import os # OS e.g directory structure
import sys
import numpy as np # linear algebra
import scipy as sc  # scientific computing
import pandas as pd # data processing, file I/O
import seaborn as sns  # visualization
import matplotlib.pyplot as plt # visualization
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Spark related imports

from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import *
from pyspark.sql.types import *

from pyspark.ml.feature import StringIndexer
from pyspark.ml.fpm import FPGrowth
from pyspark.ml.fpm import PrefixSpan
from pyspark.ml.evaluation import RegressionEvaluator


# Data Exploration

In [3]:
# Peek at the raw monthly CSV exports from the shell: for each month, print a
# label followed by the first five and the last five lines of the file.
! echo "Oct-2019"
! head -n 5 dataset/2019-Oct.csv
! tail -n 5 dataset/2019-Oct.csv
! echo "Nov-2019"
! head -n 5 dataset/2019-Nov.csv
! tail -n 5 dataset/2019-Nov.csv

Oct-2019
event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
2019-10-01 00:00:00 UTC,view,44600062,2103807459595387724,,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c
2019-10-01 00:00:00 UTC,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.20,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc
2019-10-01 00:00:01 UTC,view,17200506,2053013559792632471,furniture.living_room.sofa,,543.10,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8
2019-10-01 00:00:01 UTC,view,1307067,2053013558920217191,computers.notebook,lenovo,251.74,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713
2019-10-31 23:59:58 UTC,view,2300275,2053013560530830019,electronics.camera.video,gopro,527.40,537931532,22c57267-da98-4f28-9a9c-18bb5b385193
2019-10-31 23:59:58 UTC,view,10800172,2053013554994348409,,redmond,61.75,527322328,5054190a-46cb-4211-a8f1-16fc1a060ed8
2019-10-31 23:59:58 UTC,view,5701038,2053013553970938175,auto.accessories.player,kenwood

In [4]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook: getOrCreate() hands back an
# already-running session when there is one, otherwise it starts a new one
# under the application name given below.
spark = (
    SparkSession.builder
    .appName("abd_classification")
    .getOrCreate()
)

In [5]:
# Read every CSV matching the "small" glob into a single DataFrame. The first
# line of each file is the header, and column types are inferred from the data.
sales = spark.read.csv(
    "dataset/2019-*-small.csv",
    sep=",",
    header="true",
    inferSchema="true",
)

In [6]:
# Leftover per-month checks, kept for reference only: `sales_oct` / `sales_nov`
# are not defined by the cells above (the per-month load is commented out).
#sales_oct.printSchema()
#so = sales_oct.count()

#sales_nov.printSchema()
#sv = sales_nov.count()

# Show the inferred column types, then the total number of event rows loaded.
sales.printSchema()
sales.count()

root
 |-- event_time: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)



199998

In [7]:
# Number of events of each type (view / cart / purchase).
(
    sales
    .groupBy('event_type')
    .count()
    .show(truncate=False)
)

+----------+------+
|event_type|count |
+----------+------+
|purchase  |3077  |
|view      |194617|
|cart      |2304  |
+----------+------+



In [8]:
# https://stackoverflow.com/questions/40163106/cannot-find-col-function-in-pyspark

# Count missing values per column.
# NULL is checked on every column, but NaN only on float/double columns:
# isnan() is defined for floating-point input, so calling it on string or
# timestamp columns depends on an implicit cast to double that Spark may
# reject (e.g. if `event_time` is inferred as a timestamp).
float_columns = {name for name, dtype in sales.dtypes if dtype in ("float", "double")}

missing_counts = [
    count(
        when(
            (col(c).isNull() | isnan(c)) if c in float_columns else col(c).isNull(),
            c,
        )
    ).alias(c)
    for c in sales.columns
]
sales.select(missing_counts).show()

+----------+----------+----------+-----------+-------------+-----+-----+-------+------------+
|event_time|event_type|product_id|category_id|category_code|brand|price|user_id|user_session|
+----------+----------+----------+-----------+-------------+-----+-----+-------+------------+
|         0|         0|         0|          0|        66427|30168|    0|      0|           0|
+----------+----------+----------+-----------+-------------+-----+-----+-------+------------+



In [9]:
# Summary statistics (count, mean, stddev, min, max) of the price column.
(
    sales
    .describe("price")
    .show()
)

+-------+-----------------+
|summary|            price|
+-------+-----------------+
|  count|           199998|
|   mean|284.1862630626338|
| stddev|351.4834010830769|
|    min|              0.0|
|    max|          2574.07|
+-------+-----------------+



In [10]:
# Events recorded with a price of exactly zero: keep them in `no_price`,
# preview the first rows, then report how many there are.
no_price = sales.filter(col('price') == 0)
no_price.show()
no_price.count()

+--------------------+----------+----------+-------------------+--------------------+-----+-----+---------+--------------------+
|          event_time|event_type|product_id|        category_id|       category_code|brand|price|  user_id|        user_session|
+--------------------+----------+----------+-------------------+--------------------+-----+-----+---------+--------------------+
|2019-10-01 02:21:...|      view|  53000001|2146660886926852416|                null| null|  0.0|512450748|03267357-d0e5-483...|
|2019-10-01 02:21:...|      view|  53000001|2146660886926852416|                null| null|  0.0|512450748|03267357-d0e5-483...|
|2019-10-01 02:22:...|      view|   7000684|2053013560346280633|       kids.carriage| null|  0.0|555462472|c378efe2-75b4-48f...|
|2019-10-01 02:24:...|      view|   4100157|2053013561218695907|                null| null|  0.0|531057348|6b9c87ab-f991-45c...|
|2019-10-01 02:28:...|      view|  23301316|2053013561956893455|                null| null|  0.0|

182

Group events by user session for market-basket analysis, since we have no explicit user ratings.

In [11]:
# https://stackoverflow.com/questions/48406304/groupby-and-concat-array-columns-pyspark
# For each (event type, session) pair, gather the product ids of its events
# into one list; repeated ids are kept because collect_list does not dedupe.
(
    sales
    .groupBy("event_type", 'user_session')
    .agg(collect_list('product_id').alias('products'))
    .show(20, truncate=False)
)

+----------+------------------------------------+------------------------------------+
|event_type|user_session                        |products                            |
+----------+------------------------------------+------------------------------------+
|cart      |07bf934b-baf2-40e1-9b06-2bb8aa5ecaee|[12300396]                          |
|cart      |20cd729b-6e4b-42d7-a3f5-c5b73a96f0e4|[8901769]                           |
|cart      |7724414e-3350-48cd-82a5-7cb8aad08f60|[1004856, 1004856]                  |
|cart      |89e5eee1-2925-456d-be16-1a33ff952918|[1002544]                           |
|cart      |b38a813c-1266-4922-a599-6abadef85fb6|[1004856]                           |
|cart      |ff1c1e4a-7cfb-467b-ae0a-490a4186e870|[1004833, 1004833, 1004833, 1004833]|
|purchase  |05d1d427-5e88-40eb-beba-5b1cbd15b557|[1004838]                           |
|purchase  |06053f76-9379-4b26-b370-c6b39701c635|[1004833]                           |
|purchase  |1041423c-30c4-4c68-ad8b-d4e34ce

In [12]:
# List the distinct event types present in the data.
(
    sales
    .select("event_type")
    .distinct()
    .show(truncate=False)
)

+----------+
|event_type|
+----------+
|purchase  |
|view      |
|cart      |
+----------+



In [13]:
# Preview the distinct category codes (show() prints the first 20, truncated).
(
    sales
    .select("category_code")
    .distinct()
    .show()
)

+--------------------+
|       category_code|
+--------------------+
|apparel.shoes.sli...|
|    computers.ebooks|
|computers.periphe...|
|electronics.video...|
|appliances.kitche...|
|     sport.snowboard|
|electronics.camer...|
|       apparel.shirt|
|electronics.audio...|
|appliances.kitche...|
|appliances.kitche...|
|  electronics.tablet|
|appliances.kitche...|
|auto.accessories....|
|apparel.shoes.moc...|
|       apparel.jeans|
|computers.periphe...|
|furniture.living_...|
| stationery.cartrige|
|furniture.kitchen...|
+--------------------+
only showing top 20 rows



Drop the columns `event_time`, `price` and `category_code`.

In [14]:
# Slimmed-down copy of `sales` without the columns listed below.
columns_to_drop = ["price", "event_time", "category_code"]
sales_short = sales.drop(*columns_to_drop)

In [15]:
# Confirm which columns remain after the drop.
sales_short.printSchema()

root
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- brand: string (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)



In [16]:
# Reduce the events to (user, brand) pairs and keep each pair once.
# The printed figure is the number of repeated pairs removed.
user_brand_pairs = sales_short.select('user_id', 'brand')
df_user_brand = user_brand_pairs.dropDuplicates()
print(f'duplicate rows = {user_brand_pairs.count()-df_user_brand.count()}')

duplicate rows = 120652


Recommendation based on brand, and on products viewed and purchased.

In [17]:
# One basket per session: the list of product ids of all events in that
# session (collect_list keeps repeated ids).
products_per_session = collect_list('product_id').alias('products')
df_session_basket = sales_short.groupBy('user_session').agg(products_per_session)
df_session_basket.printSchema()

root
 |-- user_session: string (nullable = true)
 |-- products: array (nullable = true)
 |    |-- element: integer (containsNull = false)



In [18]:
# Preview the per-session product baskets (first 20 rows, truncated).
df_session_basket.show()

+--------------------+--------------------+
|        user_session|            products|
+--------------------+--------------------+
|0043d905-2c15-49e...|  [1004665, 8902408]|
|03ae6182-46fb-4ce...|[1200617, 1201505...|
|04934e84-18dc-491...|  [1004838, 1005205]|
|051fa0c1-a1c1-427...|[3600661, 3600666...|
|06d381f2-b696-487...|           [2200937]|
|081bb131-ed56-4b5...|          [11500476]|
|0b9c0af2-0c06-423...|  [1004741, 1005160]|
|0c19ae22-a419-482...|[18300429, 183004...|
|0cb766a0-ff39-413...|[12700936, 127009...|
|0fce1f10-24c6-44e...|[1005157, 1005157...|
|1099e1cc-9303-41c...|          [26500614]|
|1218cd22-94b3-489...|          [26200134]|
|141db30c-0704-4bf...|[28101059, 281009...|
|15f425fb-9d08-4b2...|[26400672, 26401613]|
|178c2117-c55e-475...|[1004659, 1004659...|
|17d470d3-7df3-476...|           [1004505]|
|18870398-f6ed-4f6...|           [1801882]|
|194fc2ad-6a50-4dc...|[1004856, 1004856...|
|19797cab-6179-489...|          [45700023]|
|197e3f0d-7282-486...|          

In [19]:
# One basket per user: the brands that user interacted with. The column is
# named `products` even though it holds brand names.
brands_per_user = collect_list('brand').alias('products')
df_user_basket = df_user_brand.groupBy('user_id').agg(brands_per_user)
df_user_basket.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- products: array (nullable = true)
 |    |-- element: string (containsNull = false)



In [20]:
# Preview the per-user brand baskets (first 20 rows, truncated).
df_user_basket.show()

+---------+--------------------+
|  user_id|            products|
+---------+--------------------+
|445162060|    [flama, shivaki]|
|509481306|          [moulinex]|
|512366675|[anytek, incar, i...|
|512513760|        [jbl, baden]|
|512552482|[bosch, marshal, ...|
|512697003|[lanvin, versace,...|
|512818893|                  []|
|512907846|              [zinc]|
|512975815|           [tarkett]|
|513101609|     [trebl, denzel]|
|513136911|[braun, xiaomi, v...|
|513161211|           [higashi]|
|513165061|                  []|
|513168011|   [huawei, samsung]|
|513172205|         [microsoft]|
|513216657|             [apple]|
|513247621|  [lider, pasabahce]|
|513266555|[makita, dewalt, ...|
|513291250|             [bosch]|
|513360681|  [bts, samsung, sv]|
+---------+--------------------+
only showing top 20 rows



In [21]:
# 80/20 train/test split of the user baskets, seeded so the split is repeatable.
dftrain, dftest = df_user_basket.randomSplit(weights=[0.8, 0.2], seed=42)

# Only the training split is cached; it is the one the model is fitted on.
dftrain.cache()

# Mine frequent itemsets and association rules from the `products` column.
fpGrowth = FPGrowth(
    minSupport=0.001,
    minConfidence=0.0,
    itemsCol="products",
)
model = fpGrowth.fit(dftrain)

In [22]:
# Open question: mine session-vs-product or session-vs-brand baskets?
# This cell scores the baskets held in `dftest` / `df_user_basket` with the
# fitted FP-Growth model.

# Apply the mined association rules to the held-out baskets and show them in full.
predictions = model.transform(dftest)
predictions.show(truncate=False)

# Display frequent itemsets.
model.freqItemsets.show()

# Display generated association rules.
model.associationRules.show()

# transform() checks each basket's items against all association rules and
# collects the matching consequents in the `prediction` column. Here it is
# run over every user basket, i.e. the train and test splits together.
model.transform(df_user_basket).show()

+---------+-------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user_id  |products                                         |prediction                                                                                                                                                                                                                                            |
+---------+-------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|512366675|[anytek, incar, intego]                          |[]          

+---------+--------------------+--------------------+
|  user_id|            products|          prediction|
+---------+--------------------+--------------------+
|445162060|    [flama, shivaki]|                  []|
|509481306|          [moulinex]|                  []|
|512366675|[anytek, incar, i...|                  []|
|512513760|        [jbl, baden]|[respect, rieker,...|
|512552482|[bosch, marshal, ...|[force, xiaomi, s...|
|512697003|[lanvin, versace,...|                  []|
|512818893|                  []|                  []|
|512907846|              [zinc]|                  []|
|512975815|           [tarkett]|                  []|
|513101609|     [trebl, denzel]|                  []|
|513136911|[braun, xiaomi, v...|[apple, samsung, ...|
|513161211|           [higashi]|                  []|
|513165061|                  []|                  []|
|513168011|   [huawei, samsung]|[apple, xiaomi, l...|
|513172205|         [microsoft]|                  []|
|513216657|             [app