# Customer Analysis - Model Customer Behavior


## Import

In [1]:
import os

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import pyspark
import pyspark.ml as ml
import pyspark.sql.functions as f

## Read

In [2]:
# Build the Spark session used by the rest of the notebook (getOrCreate returns
# an already-running session if one exists). Memory sizes are plain strings;
# the former f-string prefixes had no placeholders and were dropped.
spark = (
    pyspark.sql.SparkSession.builder
    .appName("Product")
    .config("spark.executor.memory", "16g")
    .config("spark.driver.memory", "16g")
    .config("spark.memory.offHeap.enabled", True)
    .config("spark.memory.offHeap.size", "16g")
    .config("spark.sql.debug.maxToStringFields", "16")
    .getOrCreate()
)

# Only the October 2019 file is loaded. To analyse October + November together,
# replace the single read below with:
# sdf_201911 = spark.read.csv("data/2019-Nov.csv", header=True, inferSchema=True)
# sdf_201910 = spark.read.csv("data/2019-Oct.csv", header=True, inferSchema=True)
# sdf = sdf_201910.union(sdf_201911)
sdf = spark.read.csv("data/2019-Oct.csv", header=True, inferSchema=True)

# Preview the first rows to sanity-check the load.
sdf.show(5)

+--------------------+----------+----------+-------------------+--------------------+--------+-------+---------+--------------------+
|          event_time|event_type|product_id|        category_id|       category_code|   brand|  price|  user_id|        user_session|
+--------------------+----------+----------+-------------------+--------------------+--------+-------+---------+--------------------+
|2019-10-01 00:00:...|      view|  44600062|2103807459595387724|                null|shiseido|  35.79|541312140|72d76fde-8bb3-4e0...|
|2019-10-01 00:00:...|      view|   3900821|2053013552326770905|appliances.enviro...|    aqua|   33.2|554748717|9333dfbd-b87a-470...|
|2019-10-01 00:00:...|      view|  17200506|2053013559792632471|furniture.living_...|    null|  543.1|519107250|566511c2-e2e3-422...|
|2019-10-01 00:00:...|      view|   1307067|2053013558920217191|  computers.notebook|  lenovo| 251.74|550050854|7c90fc70-0e80-459...|
|2019-10-01 00:00:...|      view|   1004237|205301355563188265

## Preparation

In [3]:
# Datatypes
# event_time becomes a timestamp; the three identifier columns become strings.
target_types = {
    "event_time": pyspark.sql.types.TimestampType(),
    "category_id": pyspark.sql.types.StringType(),
    "product_id": pyspark.sql.types.StringType(),
    "user_id": pyspark.sql.types.StringType(),
}
for column_name, spark_type in target_types.items():
    # withColumn on an existing name replaces that column in place.
    sdf = sdf.withColumn(column_name, sdf[column_name].cast(spark_type))

# Print the resulting schema to verify the casts.
sdf.printSchema()

root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: string (nullable = true)
 |-- user_session: string (nullable = true)



## Data Creation

In [4]:
# Keep only the columns used for session/user level work and add one 0/1
# indicator column per event type (1 when the row is that event, otherwise 0).
sdf_session = (
    sdf.select("user_id", "user_session", "event_type", "product_id", "price", "event_time")
    .withColumn("views", f.when(f.col("event_type") == "view", 1).otherwise(0))
    .withColumn("purchases", f.when(f.col("event_type") == "purchase", 1).otherwise(0))
    .withColumn("carts", f.when(f.col("event_type") == "cart", 1).otherwise(0))
)

# sdf_session = sdf_session.withColumn("first_event", sdf_session.event_time)
# sdf_session = sdf_session.withColumn("last_event", sdf_session.event_time)

In [5]:
# Distinct (user, product) pairs taken from view events only.
viewed_pairs = (
    sdf_session
    .filter(f.col("views") == 1)
    .select("user_id", "product_id")
    .distinct()
)

# One row per user: that user's viewed product ids joined into a single
# comma-separated string column named "lst".
sdf_session_product_list = viewed_pairs.groupBy("user_id").agg(
    f.concat_ws(",", f.collect_list("product_id")).alias("lst")
)
sdf_session_product_list.show()

+---------+--------------------+
|  user_id|                 lst|
+---------+--------------------+
|292006376|             1004957|
|318145786|   23300385,23301205|
|344636566|            12720599|
|367369532|26500644,26500544...|
|372635850|2800433,2800382,2...|
|381624492|     1004849,1004739|
|406803663|1004849,1004565,1...|
|410429591|1201520,1201368,1...|
|411770871|7203438,7203909,7...|
|424536916|            43900002|
|426419884|     1004237,1004234|
|427847739|             6000236|
|428266518|10800148,10800206...|
|430707571|             6301402|
|433044560|1004741,1005233,1...|
|434170823|26300666,26300354...|
|435722681|            21400835|
|441522689|1004839,1004838,1...|
|442872957|             1003711|
|446867511|            25900022|
+---------+--------------------+
only showing top 20 rows



In [6]:
# Inspect the column types of the per-user product list; at this point "lst"
# is a single comma-separated string, not an array.
sdf_session_product_list.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- lst: string (nullable = false)



In [7]:
# Split the comma-separated "lst" string into an array of product ids and
# attach it as a new column named "items".
array_column = f.split(f.col("lst"), ",")
sdf_session_product_list = sdf_session_product_list.withColumn("items", array_column)

In [8]:
# Reduce the frame to the single "items" array column; this is the frame the
# FPGrowth estimator is fitted on further down.
# NOTE: this rebinds the same variable and drops "lst", so the split cell above
# cannot be re-run after this one without first rebuilding the product list.
sdf_session_product_list = sdf_session_product_list.select("items")
# Preview the first five item lists.
sdf_session_product_list.show(5)

+--------------------+
|               items|
+--------------------+
|           [1004957]|
|[23300385, 23301205]|
|          [12720599]|
|[26500644, 265005...|
|[2800433, 2800382...|
+--------------------+
only showing top 5 rows



## Data Split

## Modelling

### FPGrowth

In [9]:
# Create model
# FP-Growth frequent-pattern miner over the "items" array column.
# `ml` (pyspark.ml) is imported in the notebook's import cell.
MIN_SUPPORT = 0.0001    # minimum support threshold for a frequent itemset
MIN_CONFIDENCE = 0.8    # minimum confidence threshold for an association rule
fp = ml.fpm.FPGrowth(
    itemsCol="items",  # explicit, although "items" is also the library default
    minSupport=MIN_SUPPORT,
    minConfidence=MIN_CONFIDENCE,
)

In [10]:
# train
# Fit the estimator on the per-user item lists and set the fitted model's
# prediction column name to "newPrediction" (the setter returns the model).
fpm = fp.fit(sdf_session_product_list).setPredictionCol("newPrediction")

# Show a handful of the mined frequent itemsets with their frequencies.
fpm.freqItemsets.show(5)

+----------+------+
|     items|  freq|
+----------+------+
| [4700478]|  9715|
| [1004779]|  5903|
|[52100005]|  4398|
| [3100074]|  3540|
| [1004856]|197840|
+----------+------+
only showing top 5 rows



In [11]:
# eval
# Display the association rules derived by the fitted model
# (columns: antecedent, consequent, confidence, lift, support).
# NOTE(review): the output saved with this notebook lists rules whose confidence
# is roughly 0.61-0.74, which is below the 0.8 minConfidence configured on the
# FPGrowth estimator -- the saved output appears to come from a run with a
# lower threshold. Re-run the notebook top to bottom to refresh it.

fpm.associationRules.show()

+--------------------+----------+------------------+------------------+--------------------+
|          antecedent|consequent|        confidence|              lift|             support|
+--------------------+----------+------------------+------------------+--------------------+
|  [1004565, 1004870]| [1004767]|0.6085790884718498|10.475503614718928|0.001201801378497947|
|  [1005100, 1004741]| [1004856]|0.6121191942733477|  9.35050434992576|0.001216691538749...|
|[1004873, 1004836...| [1004767]|0.7420546932742055|12.773026166955862|0.001328864079308...|
|  [1003316, 1002544]| [1005115]|0.6085232903865213|10.755291226779603|0.001015839821582...|
|  [1004858, 1004870]| [1004856]|0.6386427898209237| 9.755668896085261|0.001121063620691...|
|  [1004857, 1004209]| [1004856]|  0.70578231292517| 10.78126719248152|0.001098562934089533|
|  [1004903, 1005100]| [1004856]|0.7117612409189084| 10.87259906499323|0.001199485131347758|
|  [1004246, 1005105]| [1005115]|0.7249247743229689|12.812618988500278

## General Comparison

## Hyperparameters & Extended Analysis