# Customer Analysis - Model Customer Behavior


## Import

In [1]:
# General Pyspark Implementation
import pyspark
import pyspark.sql.functions as f

# Machine Learning libary in Pyspark
import pyspark.ml as ml

## Read

In [2]:
spark = pyspark.sql.SparkSession \
    .builder \
    .appName("Product") \
    .config("spark.executor.memory", f"16g") \
    .config("spark.driver.memory", f"16g") \
    .config("spark.memory.offHeap.enabled", True) \
    .config("spark.memory.offHeap.size", f"16g") \
    .config("spark.sql.debug.maxToStringFields", f"16") \
    .getOrCreate()

# sdf_201911 = spark.read.csv("data/2019-Nov.csv", header=True, inferSchema=True)
# sdf_201910 = spark.read.csv("data/2019-Oct.csv", header=True, inferSchema=True)

# sdf = sdf_201910.union(sdf_201911)
# sdf = spark.read.csv("data/2019-Oct.csv", header=True, inferSchema=True)
sdf = spark.read.csv("data/test_data.csv", header=True, inferSchema=True)
sdf.show(5)

+--------------------+----------+----------+-------------------+--------------------+------+------+---------+--------------------+
|          event_time|event_type|product_id|        category_id|       category_code| brand| price|  user_id|        user_session|
+--------------------+----------+----------+-------------------+--------------------+------+------+---------+--------------------+
|2019-11-01 00:00:...|      view|   1003461|2053013555631882655|electronics.smart...|xiaomi|489.07|520088904|4d3b30da-a5e4-49d...|
|2019-11-01 00:00:...|      view|   5000088|2053013566100866035|appliances.sewing...|janome|293.65|530496790|8e5f4f83-366c-4f7...|
|2019-11-01 00:00:...|      view|  17302664|2053013553853497655|                null| creed| 28.31|561587266|755422e7-9040-477...|
|2019-11-01 00:00:...|      view|   3601530|2053013563810775923|appliances.kitche...|    lg|712.87|518085591|3bfb58cd-7892-48c...|
|2019-11-01 00:00:...|      view|   1004775|2053013555631882655|electronics.smart..

## Preparation

In [3]:
# Datatypes
sdf = sdf.withColumn("event_time", sdf["event_time"].cast(pyspark.sql.types.TimestampType()))
sdf = sdf.withColumn("category_id", sdf["category_id"].cast(pyspark.sql.types.StringType()))
sdf = sdf.withColumn("product_id", sdf["product_id"].cast(pyspark.sql.types.StringType()))
sdf = sdf.withColumn("user_id", sdf["user_id"].cast(pyspark.sql.types.StringType()))

sdf.printSchema()

root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: string (nullable = true)
 |-- user_session: string (nullable = true)



## Data Creation

In [4]:
sdf_base = sdf.select("user_id", "user_session", "event_type", "product_id", "price", "event_time")

sdf_base = sdf_base.withColumn("views", f.when(sdf_base.event_type == "view", 1).otherwise(0))
sdf_base = sdf_base.withColumn("purchases", f.when(sdf_base.event_type == "purchase", 1).otherwise(0))
sdf_base = sdf_base.withColumn("carts", f.when(sdf_base.event_type == "cart", 1).otherwise(0))


## Modelling - Behaviour of users

Algorithm used: FPGrowth

### View patterns

#### Data Preparation

In [5]:
# Grouping by user_id and creating column where all product_ids are concatenated into a list
sdf_user_product_list_view = sdf_base.where(sdf_base.views == 1).select("user_id", "product_id").distinct()\
    .groupBy("user_id")\
    .agg(f.collect_list('product_id')).withColumnRenamed('collect_list(product_id)', 'items')
sdf_user_product_list_view.show()


+---------+--------------------+
|  user_id|               items|
+---------+--------------------+
|518045858|[26404402, 26402378]|
|512416379|[34600011, 34600119]|
|566280399|           [4804420]|
|564599988|[22700205, 227000...|
|512651494|[24900193, 42900023]|
|515240495|          [30000218]|
|515782589|           [1800729]|
|519041881|          [12708306]|
|518398361|           [1004856]|
|517811633|  [2200958, 2201037]|
|566280536|          [17200558]|
|512367687|          [14700422]|
|513200477|           [3901174]|
|527667630|          [28600033]|
|539701280|           [2700598]|
|517702267|[52000058, 52000038]|
|514028527|[1307115, 1306421...|
|570363385|          [31501037]|
|565596995|          [12705258]|
|559362811|           [2900936]|
+---------+--------------------+
only showing top 20 rows



#### Model creation

In [6]:
#Create model
fp_view = ml.fpm.FPGrowth(minSupport=0.0001, minConfidence=0.8)

#### Model training

In [7]:
# train
fpm_view = fp_view.fit(sdf_user_product_list_view)
fpm_view.setPredictionCol("newPrediction")
fpm_view.freqItemsets.show(5)

+------------------+----+
|             items|freq|
+------------------+----+
|         [1003316]|   1|
|         [1005116]|   2|
|         [1004237]|   2|
|         [1003317]|   1|
|[1003317, 1004258]|   1|
+------------------+----+
only showing top 5 rows



#### Results

In [8]:
# eval
fpm_view.associationRules.show()

+--------------------+----------+----------+-----+--------------------+
|          antecedent|consequent|confidence| lift|             support|
+--------------------+----------+----------+-----+--------------------+
|          [17302664]|[17301421]|       1.0|151.0|0.006622516556291391|
|          [17302664]|[17301495]|       1.0|151.0|0.006622516556291391|
|          [17302664]|[17301611]|       1.0|151.0|0.006622516556291391|
|[1307340, 1307065...| [1307095]|       1.0|151.0|0.006622516556291391|
|[17301611, 17301495]|[17302664]|       1.0|151.0|0.006622516556291391|
|[17301611, 17301495]|[17301421]|       1.0|151.0|0.006622516556291391|
|          [41100055]|[41100050]|       1.0|151.0|0.006622516556291391|
|          [41100055]|[41100033]|       1.0|151.0|0.006622516556291391|
|          [41100055]|[41100058]|       1.0|151.0|0.006622516556291391|
|          [15100148]|[15100252]|       1.0|151.0|0.006622516556291391|
|[41100058, 411000...|[41100033]|       1.0|151.0|0.006622516556

### Cart patterns

#### Data Preparation

In [9]:
# Grouping by user_id and creating column where all product_ids are concatenated into a list
sdf_user_product_list_carts = sdf_base.where(sdf_base.carts == 1).select("user_id", "product_id").distinct()\
    .groupBy("user_id")\
    .agg(f.collect_list('product_id')).withColumnRenamed('collect_list(product_id)', 'items')
sdf_user_product_list_carts.show()

+---------+----------+
|  user_id|     items|
+---------+----------+
|533326659| [1005014]|
|570362657|[27800054]|
+---------+----------+



#### Model creation

In [10]:
#Create model
fp_carts = ml.fpm.FPGrowth(minSupport=0.0001, minConfidence=0.8)

#### Model training

In [11]:
# train
fpm_carts = fp_carts.fit(sdf_user_product_list_carts)
fpm_carts.setPredictionCol("newPrediction")
fpm_carts.freqItemsets.show(5)

+----------+----+
|     items|freq|
+----------+----+
| [1005014]|   1|
|[27800054]|   1|
+----------+----+



#### Results

In [12]:
# eval
fpm_view.associationRules.show()

+--------------------+----------+----------+-----+--------------------+
|          antecedent|consequent|confidence| lift|             support|
+--------------------+----------+----------+-----+--------------------+
|          [17302664]|[17301421]|       1.0|151.0|0.006622516556291391|
|          [17302664]|[17301495]|       1.0|151.0|0.006622516556291391|
|          [17302664]|[17301611]|       1.0|151.0|0.006622516556291391|
|[1307340, 1307065...| [1307095]|       1.0|151.0|0.006622516556291391|
|[17301611, 17301495]|[17302664]|       1.0|151.0|0.006622516556291391|
|[17301611, 17301495]|[17301421]|       1.0|151.0|0.006622516556291391|
|          [41100055]|[41100050]|       1.0|151.0|0.006622516556291391|
|          [41100055]|[41100033]|       1.0|151.0|0.006622516556291391|
|          [41100055]|[41100058]|       1.0|151.0|0.006622516556291391|
|          [15100148]|[15100252]|       1.0|151.0|0.006622516556291391|
|[41100058, 411000...|[41100033]|       1.0|151.0|0.006622516556

### Purchasing patterns

#### Data Preparation

In [13]:
# Grouping by user_id and creating column where all product_ids are concatenated into a list
sdf_user_product_list_purchases = sdf_base.where(sdf_base.purchases == 1).select("user_id", "product_id").distinct()\
    .groupBy("user_id")\
    .agg(f.collect_list('product_id')).withColumnRenamed('collect_list(product_id)', 'items')
sdf_user_product_list_purchases.show()

+---------+----------+
|  user_id|     items|
+---------+----------+
|559368633|[13200605]|
|513351129| [1005161]|
+---------+----------+



#### Model creation

In [14]:
#Create model
fp_purchases = ml.fpm.FPGrowth(minSupport=0.0001, minConfidence=0.8)

#### Model training

In [15]:
# train
fpm_purchases = fp_purchases.fit(sdf_user_product_list_purchases)
fpm_purchases.setPredictionCol("newPrediction")
fpm_purchases.freqItemsets.show(5)

+----------+----+
|     items|freq|
+----------+----+
| [1005161]|   1|
|[13200605]|   1|
+----------+----+



#### Results

In [16]:
# eval
fpm_purchases.associationRules.show()



+----------+----------+----------+----+-------+
|antecedent|consequent|confidence|lift|support|
+----------+----------+----------+----+-------+
+----------+----------+----------+----+-------+

