# Customer Analysis - Model Customer Behavior


## Import

In [1]:
import os
import pyspark
import pandas as pd
import pyspark.sql.functions as f
import pyspark.sql.types as T
import plotly.express as px
import plotly.graph_objects as go

from pyspark.ml.clustering import KMeans


## Read

In [2]:
spark = pyspark.sql.SparkSession \
    .builder \
    .master("local") \
    .appName("app_great") \
    .config("spark.executor.memory", f"16g") \
    .config("spark.driver.memory", f"16g") \
    .config("spark.memory.offHeap.enabled", True) \
    .config("spark.memory.offHeap.size", f"16g") \
    .config("spark.sql.debug.maxToStringFields", f"16") \
    .getOrCreate()
sc = spark.sparkContext

In [3]:
# # Testing / Debug

# sdf = spark.read.csv("data/test_data.csv", header=True, inferSchema=True)


In [4]:

sdf_201911 = spark.read.csv("data/2019-Nov.csv", header=True, inferSchema=True)
sdf_201910 = spark.read.csv("data/2019-Oct.csv", header=True, inferSchema=True)

sdf = sdf_201910.union(sdf_201911)
sdf = sdf_201911

# sdf.show(5)

## Preparation

In [5]:
# Datatypes
sdf = sdf.withColumn("event_time", sdf["event_time"].cast(pyspark.sql.types.TimestampType()))
sdf = sdf.withColumn("category_id", sdf["category_id"].cast(pyspark.sql.types.StringType()))
sdf = sdf.withColumn("product_id", sdf["product_id"].cast(pyspark.sql.types.StringType()))
sdf = sdf.withColumn("user_id", sdf["user_id"].cast(pyspark.sql.types.StringType()))

# Feature Splitting
sdf = sdf.withColumn("category_class", f.split(sdf["category_code"], "\.").getItem(0))
sdf = sdf.withColumn("category_sub_class", f.split(sdf["category_code"], "\.").getItem(1))
sdf = sdf.withColumn("category_sub_sub_class", f.split(sdf["category_code"], "\.").getItem(2))

sdf = sdf.withColumn("year", f.year("event_time"))
sdf = sdf.withColumn("month", f.month("event_time"))
sdf = sdf.withColumn("weekofyear", f.weekofyear("event_time"))
sdf = sdf.withColumn("dayofyear", f.dayofyear("event_time"))
sdf = sdf.withColumn("dayofweek", f.dayofweek("event_time"))
sdf = sdf.withColumn("dayofmonth", f.dayofmonth("event_time"))
sdf = sdf.withColumn("hour", f.hour("event_time"))

# None Handling
# sdf = sdf.fillna(value="not defined")

sdf.printSchema()

root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: string (nullable = true)
 |-- user_session: string (nullable = true)
 |-- category_class: string (nullable = true)
 |-- category_sub_class: string (nullable = true)
 |-- category_sub_sub_class: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- weekofyear: integer (nullable = true)
 |-- dayofyear: integer (nullable = true)
 |-- dayofweek: integer (nullable = true)
 |-- dayofmonth: integer (nullable = true)
 |-- hour: integer (nullable = true)



## Data Creation

In [6]:
sdf_session = sdf.select("user_id", "user_session", "event_type", "product_id", "price", "event_time")

sdf_session = sdf_session.withColumn("bought_product", f.when(sdf_session.event_type == "purchase", sdf_session["product_id"]).otherwise(None))

sdf_session = sdf_session.withColumn("views", f.when(sdf_session.event_type == "view", 1).otherwise(0))
sdf_session = sdf_session.withColumn("purchases", f.when(sdf_session.event_type == "purchase", 1).otherwise(0))
sdf_session = sdf_session.withColumn("carts", f.when(sdf_session.event_type == "cart", 1).otherwise(0))

sdf_session = sdf_session.withColumn("first_event", sdf_session.event_time)
sdf_session = sdf_session.withColumn("last_event", sdf_session.event_time)

In [7]:
sdf_session.show()

+---------+--------------------+----------+----------+------+-------------------+--------------+-----+---------+-----+-------------------+-------------------+
|  user_id|        user_session|event_type|product_id| price|         event_time|bought_product|views|purchases|carts|        first_event|         last_event|
+---------+--------------------+----------+----------+------+-------------------+--------------+-----+---------+-----+-------------------+-------------------+
|520088904|4d3b30da-a5e4-49d...|      view|   1003461|489.07|2019-11-01 01:00:00|          null|    1|        0|    0|2019-11-01 01:00:00|2019-11-01 01:00:00|
|530496790|8e5f4f83-366c-4f7...|      view|   5000088|293.65|2019-11-01 01:00:00|          null|    1|        0|    0|2019-11-01 01:00:00|2019-11-01 01:00:00|
|561587266|755422e7-9040-477...|      view|  17302664| 28.31|2019-11-01 01:00:01|          null|    1|        0|    0|2019-11-01 01:00:01|2019-11-01 01:00:01|
|518085591|3bfb58cd-7892-48c...|      view|   

In [8]:
sdf_session_agg = sdf_session.groupBy("user_id", "user_session").agg(f.avg("price"), f.sum("views"), f.sum("purchases"), f.sum("carts"), f.min("event_time"), f.max("event_time"), f.collect_list("bought_product"))
sdf_session_agg = sdf_session_agg.withColumn("duration", (sdf_session_agg["max(event_time)"] - sdf_session_agg["min(event_time)"]))
sdf_session_agg = sdf_session_agg.withColumn("sum(events)", (sdf_session_agg["sum(views)"] + sdf_session_agg["sum(purchases)"] + sdf_session_agg["sum(carts)"]))
sdf_session_agg = sdf_session_agg.withColumn("turnover", f.when(sdf_session_agg["sum(purchases)"] > 0, (sdf_session_agg["sum(purchases)"] *  sdf_session_agg["avg(price)"])).otherwise(0))

sdf_session_agg = sdf_session_agg.withColumn("successfull", f.when(sdf_session_agg["sum(purchases)"] > 0, 1).otherwise(0))
sdf_session_agg.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- user_session: string (nullable = true)
 |-- avg(price): double (nullable = true)
 |-- sum(views): long (nullable = true)
 |-- sum(purchases): long (nullable = true)
 |-- sum(carts): long (nullable = true)
 |-- min(event_time): timestamp (nullable = true)
 |-- max(event_time): timestamp (nullable = true)
 |-- collect_list(bought_product): array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- duration: interval (nullable = true)
 |-- sum(events): long (nullable = true)
 |-- turnover: double (nullable = true)
 |-- successfull: integer (nullable = false)



In [9]:
# sdf_session_agg.show()

In [10]:
sdf_session_agg.where(sdf_session_agg["turnover"] > 0).show()

+---------+--------------------+------------------+----------+--------------+----------+-------------------+-------------------+----------------------------+--------------------+-----------+------------------+-----------+
|  user_id|        user_session|        avg(price)|sum(views)|sum(purchases)|sum(carts)|    min(event_time)|    max(event_time)|collect_list(bought_product)|            duration|sum(events)|          turnover|successfull|
+---------+--------------------+------------------+----------+--------------+----------+-------------------+-------------------+----------------------------+--------------------+-----------+------------------+-----------+
|481246452|f09b7081-1537-483...|             63.39|         2|             1|         1|2019-11-13 05:09:54|2019-11-13 05:19:25|                  [25900023]|9 minutes 31 seconds|          4|             63.39|          1|
|509180654|9d482756-c80d-4e1...| 364.8894736842106|        16|             1|         2|2019-11-25 06:20:27|2019

In [11]:
sdf_customer_profile = sdf_session_agg.groupBy("user_id").agg(f.sum("sum(events)").alias("sum_events"), f.sum("sum(views)").alias("sum_views"), f.sum("sum(purchases)").alias("sum_purchases"), f.sum("sum(carts)").alias("sum_carts"), f.sum("turnover").alias("sum_turnover"), f.count("user_session").alias("count_session"), f.sum("successfull").alias("sum_successfull"), f.collect_list("collect_list(bought_product)").alias("bought_product"))

sdf_customer_profile = sdf_customer_profile.withColumn("avg_turnover_per_session", (sdf_customer_profile["sum_turnover"] / sdf_customer_profile["count_session"]))
sdf_customer_profile = sdf_customer_profile.withColumn("avg_events_per_session", (sdf_customer_profile["sum_events"] / sdf_customer_profile["count_session"]))


sdf_customer_profile.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- sum_events: long (nullable = true)
 |-- sum_views: long (nullable = true)
 |-- sum_purchases: long (nullable = true)
 |-- sum_carts: long (nullable = true)
 |-- sum_turnover: double (nullable = true)
 |-- count_session: long (nullable = false)
 |-- sum_successfull: long (nullable = true)
 |-- bought_product: array (nullable = false)
 |    |-- element: array (containsNull = false)
 |    |    |-- element: string (containsNull = false)
 |-- avg_turnover_per_session: double (nullable = true)
 |-- avg_events_per_session: double (nullable = true)



In [12]:
sdf_customer_profile.where(sdf_customer_profile["avg_turnover_per_session"] > 0).show()

+---------+----------+---------+-------------+---------+------------------+-------------+---------------+--------------------+------------------------+----------------------+
|  user_id|sum_events|sum_views|sum_purchases|sum_carts|      sum_turnover|count_session|sum_successfull|      bought_product|avg_turnover_per_session|avg_events_per_session|
+---------+----------+---------+-------------+---------+------------------+-------------+---------------+--------------------+------------------------+----------------------+
|423846383|        15|       12|            1|        2|             24.45|            3|              1| [[5701246], [], []]|                    8.15|                   5.0|
|468103953|        25|       22|            1|        2| 169.6809090909091|            2|              1|     [[1004166], []]|       84.84045454545455|                  12.5|
|480046242|        25|       22|            1|        2| 138.7111111111111|            4|              1|[[], [], [1330055...

## Data Split

In [17]:
(trainingData, testData) = sdf_customer_profile.where(sdf_customer_profile["avg_turnover_per_session"] > 0).randomSplit([0.7, 0.3], seed=123)

In [18]:
df_train = trainingData.toPandas()


In [32]:
df_test = testData.toPandas()


## Modelling

Task:
Cluster (unsupervised) the customer in groups:

- high 
- medium
- low
- NULL with potential
- NULL without potential


Target Feature -> Turnover

Next Steps: Find similiarites inside these customer groups:

- Buys same products
- similar behavior -> same marketing

### K-Means

In [23]:
from sklearn.cluster import KMeans

In [21]:
df_train.to_csv("customer_profile.csv")

In [22]:
# Only Customer with Turnover!
df_train.head()

Unnamed: 0,user_id,sum_events,sum_views,sum_purchases,sum_carts,sum_turnover,count_session,sum_successfull,bought_product,avg_turnover_per_session,avg_events_per_session
0,423846383,15,12,1,2,24.45,3,1,"[[5701246], [], []]",8.15,5.0
1,468103953,25,22,1,2,169.680909,2,1,"[[1004166], []]",84.840455,12.5
2,509180654,33,28,2,3,823.71614,4,2,"[[], [], [1801968], [3601278]]",205.929035,8.25
3,512369688,94,93,1,0,99.861364,20,1,"[[], [], [], [], [], [], [38900028], [], [], [...",4.993068,4.7
4,512373616,25,15,5,5,940.827231,4,2,"[[2700752, 2501514, 4502254], [], [], [1000038...",235.206808,6.25


In [30]:
X = df_train[["sum_events", "sum_views", "sum_purchases", "sum_carts", "sum_turnover", "count_session", "sum_successfull"]]

In [24]:
# model
kmeans_model = KMeans(n_clusters=6, random_state=123)

In [31]:
# train
kmeans_model.fit(X)

KMeans(n_clusters=6, random_state=123)

In [15]:
# eval


In [33]:
kmeans.cluster_centers_

AttributeError: 'KMeans' object has no attribute 'cluster_centers_'

### Random Forrest

## General Comparison

## Hyperparamter & extended Stuff