# Initial Setup

In [1]:
# basic imports 

import os # OS e.g directory structure
import sys
import numpy as np # linear algebra
import scipy as sc  # scientific computing
import pandas as pd # data processing, file I/O
import seaborn as sns  # visualization
import matplotlib.pyplot as plt # visualization
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Spark related imports

from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import *
from pyspark.sql.types import *

from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator


# Data Exploration

In [None]:
! echo "Oct-2019"
! head -n 5 dataset/2019-Oct.csv
! tail -n 5 dataset/2019-Oct.csv
! echo "Nov-2019"
! head -n 5 dataset/2019-Nov.csv
! tail -n 5 dataset/2019-Nov.csv

In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("abd_classification").getOrCreate()

In [4]:
sales_oct = spark.read.csv("dataset/2019-Oct-small.csv", header="true", inferSchema="true", sep=",")
sales_nov = spark.read.csv("dataset/2019-Nov-small.csv", header="true", inferSchema="true", sep=",")

In [None]:
sales_oct_small, _ = sales_oct.randomSplit([0.005, 0.995], 42)
sales_nov_small, _ = sales_nov.randomSplit([0.005, 0.995], 42)

In [5]:
sales_oct.printSchema()
so = sales_oct.count()

sales_nov.printSchema()
sv = sales_nov.count()

root
 |-- event_time: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)

root
 |-- event_time: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)



In [18]:
print(so)
print(sv)

99999
99999


In [26]:
sales_oct.groupBy('event_type').count().show(truncate=False)

+----------+-----+
|event_type|count|
+----------+-----+
|purchase  |1655 |
|view      |97129|
|cart      |1215 |
+----------+-----+



Available event types

In [None]:
sales_oct.select("user_id").show(truncate=False)

In [None]:
sales_nov.show(20, truncate=False)

In [7]:
sales_oct.describe()

DataFrame[summary: string, event_time: string, event_type: string, product_id: string, category_id: string, category_code: string, brand: string, price: string, user_id: string, user_session: string]

In [8]:
# https://stackoverflow.com/questions/40163106/cannot-find-col-function-in-pyspark

sales_nov.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in sales_nov.columns]).show()
sales_oct.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in sales_oct.columns]).show()

+----------+----------+----------+-----------+-------------+-----+-----+-------+------------+
|event_time|event_type|product_id|category_id|category_code|brand|price|user_id|user_session|
+----------+----------+----------+-----------+-------------+-----+-----+-------+------------+
|         0|         0|         0|          0|        33840|15776|    0|      0|           0|
+----------+----------+----------+-----------+-------------+-----+-----+-------+------------+

+----------+----------+----------+-----------+-------------+-----+-----+-------+------------+
|event_time|event_type|product_id|category_id|category_code|brand|price|user_id|user_session|
+----------+----------+----------+-----------+-------------+-----+-----+-------+------------+
|         0|         0|         0|          0|        32587|14392|    0|      0|           0|
+----------+----------+----------+-----------+-------------+-----+-----+-------+------------+



In [None]:
sales_oct.describe("price").show()

In [10]:
no_price = sales_oct.select('*').where(col('price')==0)
no_price.show()
no_price.count()

+--------------------+----------+----------+-------------------+--------------------+-----+-----+---------+--------------------+
|          event_time|event_type|product_id|        category_id|       category_code|brand|price|  user_id|        user_session|
+--------------------+----------+----------+-------------------+--------------------+-----+-----+---------+--------------------+
|2019-10-01 02:21:...|      view|  53000001|2146660886926852416|                null| null|  0.0|512450748|03267357-d0e5-483...|
|2019-10-01 02:21:...|      view|  53000001|2146660886926852416|                null| null|  0.0|512450748|03267357-d0e5-483...|
|2019-10-01 02:22:...|      view|   7000684|2053013560346280633|       kids.carriage| null|  0.0|555462472|c378efe2-75b4-48f...|
|2019-10-01 02:24:...|      view|   4100157|2053013561218695907|                null| null|  0.0|531057348|6b9c87ab-f991-45c...|
|2019-10-01 02:28:...|      view|  23301316|2053013561956893455|                null| null|  0.0|

119

Groupby user session using market basket since we have not user valuation

In [11]:
# https://stackoverflow.com/questions/48406304/groupby-and-concat-array-columns-pyspark
sales_oct.groupBy('user_session').agg(collect_list('product_id').alias('products')).show(20, truncate=False)

+------------------------------------+------------------------------------------------------------------------------------------------------------------------+
|user_session                        |products                                                                                                                |
+------------------------------------+------------------------------------------------------------------------------------------------------------------------+
|0043d905-2c15-49e8-bd54-cb3db0de78ca|[1004665, 8902408]                                                                                                      |
|051fa0c1-a1c1-4276-8d57-fc0f3f5e2933|[3600661, 3600666, 3601537]                                                                                             |
|06d381f2-b696-4870-8a03-9d0aab539f72|[2200937]                                                                                                               |
|0fce1f10-24c6-44ef-a07a-dde9b18403dd|[1

In [12]:
sales_oct.groupBy('user_session').agg(collect_list('brand').alias('brand')).show(30, truncate=False)

+------------------------------------+------------------------------------------------------------------------------------------------+
|user_session                        |brand                                                                                           |
+------------------------------------+------------------------------------------------------------------------------------------------+
|0043d905-2c15-49e8-bd54-cb3db0de78ca|[samsung, chicco]                                                                               |
|051fa0c1-a1c1-4276-8d57-fc0f3f5e2933|[samsung, samsung, samsung]                                                                     |
|06d381f2-b696-4870-8a03-9d0aab539f72|[canon]                                                                                         |
|0fce1f10-24c6-44ef-a07a-dde9b18403dd|[xiaomi, xiaomi, xiaomi, xiaomi, xiaomi, xiaomi, xiaomi, xiaomi, xiaomi, xiaomi, xiaomi, xiaomi]|
|1218cd22-94b3-4890-ad1a-bcee5b5bf460|[]        

In [16]:
sales_oct.select("event_type").distinct().show(truncate=False)

+----------+
|event_type|
+----------+
|purchase  |
|view      |
|cart      |
+----------+

