# Initial Setup

In [1]:
# Basic imports: standard library plus the numeric / plotting stack.

import os # OS utilities, e.g. directory structure
import sys
import numpy as np # linear algebra
import scipy as sc  # scientific computing
import pandas as pd # data processing, file I/O
import seaborn as sns  # visualization
import matplotlib.pyplot as plt # visualization
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and runtime warnings; consider
# narrowing the filter to the specific categories that are known to be noisy.
warnings.filterwarnings("ignore")

In [2]:
# Spark related imports

# NOTE(review): the two wildcard imports below place every public name of
# pyspark.sql.functions and pyspark.sql.types in the notebook namespace. Later
# cells rely on this for the unqualified col, count, when, isnan and
# collect_list calls. Names such as sum/min/max presumably shadow the Python
# built-ins of the same name -- verify before using those built-ins.
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import *
from pyspark.sql.types import *

# ML building blocks: categorical indexing, ALS recommender, regression metrics.
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator


# Data Exploration

In [None]:
# Peek at the first and last five lines of each raw monthly CSV, using shell
# commands run through IPython's "!" escape (requires a POSIX shell).
# NOTE(review): these are the full monthly files, while the load cell further
# down reads the "dataset/2019-*-small.csv" samples.
! echo "Oct-2019"
! head -n 5 dataset/2019-Oct.csv
! tail -n 5 dataset/2019-Oct.csv
! echo "Nov-2019"
! head -n 5 dataset/2019-Nov.csv
! tail -n 5 dataset/2019-Nov.csv

In [3]:
from pyspark.sql import SparkSession

# Create the notebook-wide Spark session, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("abd_classification")
    .getOrCreate()
)

In [28]:
# Load every monthly "-small" sample into a single DataFrame; the glob pattern
# matches all files named dataset/2019-<month>-small.csv.
sales = spark.read.csv(
    "dataset/2019-*-small.csv",
    header="true",       # first line of each file holds the column names
    inferSchema="true",  # let Spark infer the column types from the data
    sep=",",
)

In [29]:
# Show the inferred schema of the combined sales DataFrame, then its total
# number of rows (displayed as the cell's result).

sales.printSchema()
sales.count()

root
 |-- event_time: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)



199998

In [30]:
# Number of rows recorded for each event type.
event_type_counts = sales.groupBy("event_type").count()
event_type_counts.show(truncate=False)

+----------+------+
|event_type|count |
+----------+------+
|purchase  |3077  |
|view      |194617|
|cart      |2304  |
+----------+------+



In [31]:
# https://stackoverflow.com/questions/40163106/cannot-find-col-function-in-pyspark

# Count the missing values (null, plus NaN where applicable) in every column.
# NaN can only occur in floating-point columns, so isnan() is applied to those
# alone. Calling isnan() on string/integer columns only works through implicit
# casts to double and fails for types that cannot be cast (e.g. timestamps),
# so restricting it keeps this cell working if the schema changes.
FLOAT_DTYPES = ("double", "float")


def missing_count(name, dtype):
    """Build an aggregate column that counts the missing entries of one column.

    Parameters:
        name: column name in `sales`.
        dtype: the column's type string as reported by `sales.dtypes`.

    Returns:
        A Column, aliased to `name`, holding the number of rows whose value is
        null (or NaN, for float/double columns).
    """
    is_missing = col(name).isNull()
    if dtype in FLOAT_DTYPES:
        is_missing = is_missing | isnan(col(name))
    # when() yields a non-null literal for matching rows and null otherwise,
    # so count() tallies exactly the matching rows.
    return count(when(is_missing, name)).alias(name)


sales.select([missing_count(name, dtype) for name, dtype in sales.dtypes]).show()

+----------+----------+----------+-----------+-------------+-----+-----+-------+------------+
|event_time|event_type|product_id|category_id|category_code|brand|price|user_id|user_session|
+----------+----------+----------+-----------+-------------+-----+-----+-------+------------+
|         0|         0|         0|          0|        66427|30168|    0|      0|           0|
+----------+----------+----------+-----------+-------------+-----+-----+-------+------------+



In [32]:
# Summary statistics (count, mean, stddev, min, max) for the price column.
price_summary = sales.describe("price")
price_summary.show()

+-------+-----------------+
|summary|            price|
+-------+-----------------+
|  count|           199998|
|   mean|284.1862630626338|
| stddev|351.4834010830769|
|    min|              0.0|
|    max|          2574.07|
+-------+-----------------+



In [33]:
# Rows whose price is exactly zero: show a sample, then display how many exist.
is_free = sales["price"] == 0
no_price = sales.filter(is_free)
no_price.show()
no_price.count()

+--------------------+----------+----------+-------------------+--------------------+-----+-----+---------+--------------------+
|          event_time|event_type|product_id|        category_id|       category_code|brand|price|  user_id|        user_session|
+--------------------+----------+----------+-------------------+--------------------+-----+-----+---------+--------------------+
|2019-10-01 02:21:...|      view|  53000001|2146660886926852416|                null| null|  0.0|512450748|03267357-d0e5-483...|
|2019-10-01 02:21:...|      view|  53000001|2146660886926852416|                null| null|  0.0|512450748|03267357-d0e5-483...|
|2019-10-01 02:22:...|      view|   7000684|2053013560346280633|       kids.carriage| null|  0.0|555462472|c378efe2-75b4-48f...|
|2019-10-01 02:24:...|      view|   4100157|2053013561218695907|                null| null|  0.0|531057348|6b9c87ab-f991-45c...|
|2019-10-01 02:28:...|      view|  23301316|2053013561956893455|                null| null|  0.0|

182

Group products by event type and user session to build market baskets, since we have no explicit user ratings.

In [41]:
# https://stackoverflow.com/questions/48406304/groupby-and-concat-array-columns-pyspark
# One row per (event_type, user_session) pair, with the product ids of that
# group gathered into a list (duplicates are kept).
session_baskets = sales.groupBy("event_type", "user_session").agg(
    collect_list("product_id").alias("products")
)
session_baskets.show(20, truncate=False)

+----------+------------------------------------+------------------------------------+
|event_type|user_session                        |products                            |
+----------+------------------------------------+------------------------------------+
|cart      |07bf934b-baf2-40e1-9b06-2bb8aa5ecaee|[12300396]                          |
|cart      |20cd729b-6e4b-42d7-a3f5-c5b73a96f0e4|[8901769]                           |
|cart      |7724414e-3350-48cd-82a5-7cb8aad08f60|[1004856, 1004856]                  |
|cart      |89e5eee1-2925-456d-be16-1a33ff952918|[1002544]                           |
|cart      |b38a813c-1266-4922-a599-6abadef85fb6|[1004856]                           |
|cart      |ff1c1e4a-7cfb-467b-ae0a-490a4186e870|[1004833, 1004833, 1004833, 1004833]|
|purchase  |05d1d427-5e88-40eb-beba-5b1cbd15b557|[1004838]                           |
|purchase  |06053f76-9379-4b26-b370-c6b39701c635|[1004833]                           |
|purchase  |1041423c-30c4-4c68-ad8b-d4e34ce

In [35]:
# List the distinct event types present in the data.
event_types = sales.select("event_type").distinct()
event_types.show(truncate=False)

+----------+
|event_type|
+----------+
|purchase  |
|view      |
|cart      |
+----------+

