# Customer Analysis - Model Customer Behavior


## Import

In [None]:
# General Pyspark Implementation
import pyspark
import pyspark.sql.functions as f

# Machine Learning libary in Pyspark
import pyspark.ml as ml

## Read

In [None]:
spark = pyspark.sql.SparkSession \
    .builder \
    .appName("Product") \
    .config("spark.executor.memory", f"16g") \
    .config("spark.driver.memory", f"16g") \
    .config("spark.memory.offHeap.enabled", True) \
    .config("spark.memory.offHeap.size", f"16g") \
    .config("spark.sql.debug.maxToStringFields", f"16") \
    .getOrCreate()

# sdf_201911 = spark.read.csv("data/2019-Nov.csv", header=True, inferSchema=True)
# sdf_201910 = spark.read.csv("data/2019-Oct.csv", header=True, inferSchema=True)

# sdf = sdf_201910.union(sdf_201911)
sdf = spark.read.csv("data/2019-Nov.csv", header=True, inferSchema=True)
# sdf = spark.read.csv("data/test_data.csv", header=True, inferSchema=True)
sdf.show(5)

## Preparation

In [None]:
# Datatypes
sdf = sdf.withColumn("event_time", sdf["event_time"].cast(pyspark.sql.types.TimestampType()))
sdf = sdf.withColumn("category_id", sdf["category_id"].cast(pyspark.sql.types.StringType()))
sdf = sdf.withColumn("product_id", sdf["product_id"].cast(pyspark.sql.types.StringType()))
sdf = sdf.withColumn("user_id", sdf["user_id"].cast(pyspark.sql.types.StringType()))

sdf.printSchema()

## Data Creation

In [None]:
sdf_base = sdf.select("user_id", "user_session", "event_type", "product_id", "price", "event_time")

sdf_base = sdf_base.withColumn("views", f.when(sdf_base.event_type == "view", 1).otherwise(0))
sdf_base = sdf_base.withColumn("purchases", f.when(sdf_base.event_type == "purchase", 1).otherwise(0))
sdf_base = sdf_base.withColumn("carts", f.when(sdf_base.event_type == "cart", 1).otherwise(0))


## Modelling - Behaviour of users

Algorithm used: FPGrowth

### View patterns

#### Data Preparation

In [5]:
# Grouping by user_id and creating column where all product_ids are concatenated into a list
sdf_user_product_list_view = sdf_base.where(sdf_base.views == 1).select("user_id", "product_id").distinct()\
    .groupBy("user_id")\
    .agg(f.collect_list('product_id')).withColumnRenamed('collect_list(product_id)', 'items')
sdf_user_product_list_view.show()


+---------+--------------------+
|  user_id|               items|
+---------+--------------------+
|130084806|[15200482, 152008...|
|222317983|           [3701005]|
|227842721|[49100019, 491000...|
|273506385|           [1005129]|
|306133243|           [1005159]|
|330587591|[28101103, 281029...|
|332018895|[22700448, 100513...|
|335512209|[6300680, 3150115...|
|344636566|[12719156, 127014...|
|372635850|           [2800382]|
|376375665|[6301622, 4804056...|
|381624492|           [1801634]|
|393249031|           [2601298]|
|395341538|          [26007955]|
|396863890|          [29501514]|
|398277136|[8500186, 1360002...|
|405111134|[28703635, 17301863]|
|406803663|           [1004814]|
|411723376|          [31501194]|
|423846383|[5701019, 5701020...|
+---------+--------------------+
only showing top 20 rows



#### Model creation

In [6]:
#Create model
fp_view = ml.fpm.FPGrowth(minSupport=0.001, minConfidence=0.8)

#### Model training

In [7]:
# train
fpm_view = fp_view.fit(sdf_user_product_list_view)
fpm_view.setPredictionCol("newPrediction")
fpm_view.freqItemsets.show(5)

+------------------+-----+
|             items| freq|
+------------------+-----+
|         [1004957]|13244|
|[1004957, 1005239]| 5436|
|[1004957, 1005160]| 5015|
|[1004957, 1004958]| 5195|
|        [14700405]| 8328|
+------------------+-----+
only showing top 5 rows



#### Results

In [8]:
# eval
fpm_view.associationRules.show()

+--------------------+----------+------------------+------------------+--------------------+
|          antecedent|consequent|        confidence|              lift|             support|
+--------------------+----------+------------------+------------------+--------------------+
|[1003317, 1005116...| [1005115]|0.8472604188703194| 13.23062778125529|0.001083721768439...|
|[1005098, 1004858...| [1004856]|0.8008811824900511|12.210026716973738|0.001524787057466...|
|[1004739, 1005161...| [1005160]|0.8111631696883276|27.036562926250205|0.001077498147796...|
|  [1005118, 1003317]| [1005115]|0.8001285347043702|12.494626898716295|0.001347549165250...|
|[1004857, 1005098...| [1004856]|0.8200350723366945|12.502041952035642|0.001012285427148732|
|[1005161, 1004741...| [1005160]|0.8514134521654366|28.378129377559667|0.001654400722156...|
|[1005159, 1005161...| [1005160]| 0.844776119402985|28.156903030506168|0.001684707048764503|
|[1003317, 1005116...| [1005115]|  0.83670005092514|13.065719455073758

### Cart patterns

#### Data Preparation

In [9]:
# Grouping by user_id and creating column where all product_ids are concatenated into a list
sdf_user_product_list_carts = sdf_base.where(sdf_base.carts == 1).select("user_id", "product_id").distinct()\
    .groupBy("user_id")\
    .agg(f.collect_list('product_id')).withColumnRenamed('collect_list(product_id)', 'items')
sdf_user_product_list_carts.show()

+---------+--------------------+
|  user_id|               items|
+---------+--------------------+
|330587591|          [28102892]|
|335512209|          [12709562]|
|423846383|           [5701246]|
|441092065|          [11900142]|
|463869843|           [8500355]|
|468103953|  [1004038, 1004166]|
|479316462|           [5100564]|
|480046242| [4803759, 13300554]|
|487489100|  [1005169, 1005211]|
|499942185|           [1005101]|
|504807134|          [16200329]|
|504866319|  [1801904, 1801691]|
|509180654|[3601605, 1801968...|
|511728441|           [5400367]|
|512373616|[2700752, 3601248...|
|512381561|[23000053, 12720610]|
|512385518|           [1307236]|
|512386605|          [12710257]|
|512386977|[29502299, 114006...|
|512388342|  [1307582, 1307350]|
+---------+--------------------+
only showing top 20 rows



#### Model creation

In [10]:
#Create model
fp_carts = ml.fpm.FPGrowth(minSupport=0.001, minConfidence=0.8)

#### Model training

In [11]:
# train
fpm_carts = fp_carts.fit(sdf_user_product_list_carts)
fpm_carts.setPredictionCol("newPrediction")
fpm_carts.freqItemsets.show(5)

+------------------+-----+
|             items| freq|
+------------------+-----+
|         [3601485]|  878|
|         [1004856]|36048|
|         [1004767]|29550|
|[1004767, 1004856]| 3243|
|         [1005252]|  877|
+------------------+-----+
only showing top 5 rows



#### Results

In [12]:
# eval
fpm_view.associationRules.show()

+--------------------+----------+------------------+------------------+--------------------+
|          antecedent|consequent|        confidence|              lift|             support|
+--------------------+----------+------------------+------------------+--------------------+
|[1003317, 1005116...| [1005115]|0.8472604188703194| 13.23062778125529|0.001083721768439...|
|[1005098, 1004858...| [1004856]|0.8008811824900511|12.210026716973738|0.001524787057466...|
|[1004739, 1005161...| [1005160]|0.8111631696883276|27.036562926250205|0.001077498147796...|
|  [1005118, 1003317]| [1005115]|0.8001285347043702|12.494626898716295|0.001347549165250...|
|[1004857, 1005098...| [1004856]|0.8200350723366945|12.502041952035642|0.001012285427148732|
|[1005161, 1004741...| [1005160]|0.8514134521654366|28.378129377559667|0.001654400722156...|
|[1005159, 1005161...| [1005160]| 0.844776119402985|28.156903030506168|0.001684707048764503|
|[1003317, 1005116...| [1005115]|  0.83670005092514|13.065719455073758

### Purchasing patterns

#### Data Preparation

In [13]:
# Grouping by user_id and creating column where all product_ids are concatenated into a list
sdf_user_product_list_purchases = sdf_base.where(sdf_base.purchases == 1).select("user_id", "product_id").distinct()\
    .groupBy("user_id")\
    .agg(f.collect_list('product_id')).withColumnRenamed('collect_list(product_id)', 'items')
sdf_user_product_list_purchases.show()

+---------+--------------------+
|  user_id|               items|
+---------+--------------------+
|423846383|           [5701246]|
|468103953|           [1004166]|
|480046242|          [13300554]|
|509180654|  [1801968, 3601278]|
|512369688|          [38900028]|
|512373616|[2700752, 3601248...|
|512385518|           [1307236]|
|512386977|          [53300009]|
|512388760|[21400205, 214002...|
|512389344|[1005105, 1005129...|
|512394562|[12711053, 100483...|
|512395692|  [4804295, 5600355]|
|512397473|          [26300084]|
|512412345|           [5301368]|
|512414883|          [12718088]|
|512420136|[12706857, 127071...|
|512420187|          [26400273]|
|512420588|           [4804056]|
|512429276|[12800418, 620080...|
|512436169|           [3200409]|
+---------+--------------------+
only showing top 20 rows



#### Model creation

In [None]:
#Create model
fp_purchases = ml.fpm.FPGrowth(minSupport=0.0001, minConfidence=0.8)

#### Model training

In [None]:
# train
fpm_purchases = fp_purchases.fit(sdf_user_product_list_purchases)
fpm_purchases.setPredictionCol("newPrediction")
fpm_purchases.freqItemsets.show(5)

#### Results

In [None]:
# eval
fpm_purchases.associationRules.show()

### Saving models

In [None]:
fpm_view.save("models/fpm_model_view")
fpm_carts.save("models/fpm_model_carts")
fpm_purchases.save("models/fpm_model_purchases")