# Customer Analysis - Model Customer Behavior


## Import

In [1]:
import os
import pyspark
import pandas as pd
import pyspark.sql.functions as f
import pyspark.sql.types as T
import plotly.express as px
import plotly.graph_objects as go

## Read

In [2]:
spark = pyspark.sql.SparkSession \
    .builder \
    .master("local") \
    .appName("app_great") \
    .config("spark.executor.memory", f"16g") \
    .config("spark.driver.memory", f"16g") \
    .config("spark.memory.offHeap.enabled", True) \
    .config("spark.memory.offHeap.size", f"16g") \
    .config("spark.sql.debug.maxToStringFields", f"16") \
    .getOrCreate()
sc = spark.sparkContext

In [3]:
# Testing / Debug

# sdf = spark.read.csv("data/test_data.csv", header=True, inferSchema=True)


In [4]:

sdf_201911 = spark.read.csv("data/2019-Nov.csv", header=True, inferSchema=True)
# sdf_201910 = spark.read.csv("data/2019-Oct.csv", header=True, inferSchema=True)

# sdf = sdf_201910.union(sdf_201911)
sdf = sdf_201911


## Preparation

In [5]:
# Feature Splitting
# sdf = sdf.withColumn("category_class", f.substring_index(sdf.category_code, '.', 1))

sdf = sdf.withColumn("category_class", f.split(sdf["category_code"], "\.").getItem(0))
sdf = sdf.withColumn("category_sub_class", f.split(sdf["category_code"], "\.").getItem(1))
sdf = sdf.withColumn("category_sub_sub_class", f.split(sdf["category_code"], "\.").getItem(2))

sdf = sdf.withColumn("year", f.year("event_time"))
sdf = sdf.withColumn("month", f.month("event_time"))
sdf = sdf.withColumn("weekofyear", f.weekofyear("event_time"))
sdf = sdf.withColumn("dayofyear", f.dayofyear("event_time"))
sdf = sdf.withColumn("dayofweek", f.dayofweek("event_time"))
sdf = sdf.withColumn("dayofmonth", f.dayofmonth("event_time"))
sdf = sdf.withColumn("hour", f.hour("event_time"))

sdf = sdf.withColumn('turnover', f.when(f.col('event_type') == 'purchase', f.col('price')).otherwise(0))

sdf = sdf.withColumn('purchases', f.when(f.col('event_type') == 'purchase', f.lit(1)).otherwise(0))
sdf = sdf.withColumn('views', f.when(f.col('event_type') == 'view', f.lit(1)).otherwise(0))
sdf = sdf.withColumn('carts', f.when(f.col('event_type') == 'cart', f.lit(1)).otherwise(0))


# None Handling
# sdf = sdf.fillna(value="not defined")

sdf.printSchema()

root
 |-- event_time: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)
 |-- category_class: string (nullable = true)
 |-- category_sub_class: string (nullable = true)
 |-- category_sub_sub_class: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- weekofyear: integer (nullable = true)
 |-- dayofyear: integer (nullable = true)
 |-- dayofweek: integer (nullable = true)
 |-- dayofmonth: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- turnover: double (nullable = true)
 |-- purchases: integer (nullable = false)
 |-- views: integer (nullable = false)
 |-- carts: integer (nullable = false)



In [6]:
# sdf.show(1, vertical=True, truncate=False)

## Data Creation

In [7]:
sdf_session = sdf.select("user_id", "user_session", "event_type", "product_id", "price", "event_time",'purchases',
'views', 'carts') 

sdf_session = sdf_session.withColumn("bought_product", f.when(sdf_session.event_type == "purchase", sdf_session["product_id"]).otherwise(None))

sdf_session = sdf_session.withColumn("first_event", sdf_session.event_time)
sdf_session = sdf_session.withColumn("last_event", sdf_session.event_time)

In [8]:
# sdf_session.show()

In [9]:
sdf_session_agg = sdf_session.groupBy("user_id", "user_session").agg(f.avg("price"), f.sum("views"), f.sum("purchases"), f.sum("carts"), f.min("event_time"), f.max("event_time"), f.collect_list("bought_product"))
sdf_session_agg = sdf_session_agg.withColumn("duration", (sdf_session_agg["max(event_time)"] - sdf_session_agg["min(event_time)"]))
sdf_session_agg = sdf_session_agg.withColumn("sum(events)", (sdf_session_agg["sum(views)"] + sdf_session_agg["sum(purchases)"] + sdf_session_agg["sum(carts)"]))
sdf_session_agg = sdf_session_agg.withColumn("turnover", f.when(sdf_session_agg["sum(purchases)"] > 0, (sdf_session_agg["sum(purchases)"] *  sdf_session_agg["avg(price)"])).otherwise(0))
sdf_session_agg = sdf_session_agg.withColumn("avg(price)", f.round(sdf_session_agg["avg(price)"],2) )

sdf_session_agg = sdf_session_agg.withColumn("successfull", f.when(sdf_session_agg["sum(purchases)"] > 0, 1).otherwise(0))
sdf_session_agg.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)
 |-- avg(price): double (nullable = true)
 |-- sum(views): long (nullable = true)
 |-- sum(purchases): long (nullable = true)
 |-- sum(carts): long (nullable = true)
 |-- min(event_time): string (nullable = true)
 |-- max(event_time): string (nullable = true)
 |-- collect_list(bought_product): array (nullable = false)
 |    |-- element: integer (containsNull = false)
 |-- duration: double (nullable = true)
 |-- sum(events): long (nullable = true)
 |-- turnover: double (nullable = true)
 |-- successfull: integer (nullable = false)



In [10]:
# sdf_session_agg.show(truncate=False)

In [11]:
# sdf_session_agg.where(sdf_session_agg["turnover"] > 0).show()

In [12]:
sdf_customer_profile = sdf_session_agg.groupBy("user_id").agg(f.sum("sum(events)").alias("sum_events"), f.sum("sum(views)").alias("sum_views"), f.sum("sum(purchases)").alias("sum_purchases"), f.sum("sum(carts)").alias("sum_carts"), f.sum("turnover").alias("sum_turnover"), f.count("user_session").alias("count_session"), f.sum("successfull").alias("sum_successfull"), f.collect_list("collect_list(bought_product)").alias("bought_product"), f.collect_list("user_session").alias("user_sessions"))

sdf_customer_profile = sdf_customer_profile.withColumn("avg_turnover_per_session", (sdf_customer_profile["sum_turnover"] / sdf_customer_profile["count_session"]))
sdf_customer_profile = sdf_customer_profile.withColumn("avg_events_per_session", (sdf_customer_profile["sum_events"] / sdf_customer_profile["count_session"]))


sdf_customer_profile.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- sum_events: long (nullable = true)
 |-- sum_views: long (nullable = true)
 |-- sum_purchases: long (nullable = true)
 |-- sum_carts: long (nullable = true)
 |-- sum_turnover: double (nullable = true)
 |-- count_session: long (nullable = false)
 |-- sum_successfull: long (nullable = true)
 |-- bought_product: array (nullable = false)
 |    |-- element: array (containsNull = false)
 |    |    |-- element: integer (containsNull = false)
 |-- user_sessions: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- avg_turnover_per_session: double (nullable = true)
 |-- avg_events_per_session: double (nullable = true)



In [13]:
# sdf_customer_profile.where(sdf_customer_profile["avg_turnover_per_session"] > 0).show(truncate=False)

## Data Split

In [14]:
(trainingData, testData, devData) = sdf_customer_profile.where(sdf_customer_profile["avg_turnover_per_session"] > 0).randomSplit([0.6, 0.3, 0.1], seed=123)

In [15]:
devData = devData.limit(10000)
devData.show(5)

+-------+----------+---------+-------------+---------+------------+-------------+---------------+--------------+-------------+------------------------+----------------------+
|user_id|sum_events|sum_views|sum_purchases|sum_carts|sum_turnover|count_session|sum_successfull|bought_product|user_sessions|avg_turnover_per_session|avg_events_per_session|
+-------+----------+---------+-------------+---------+------------+-------------+---------------+--------------+-------------+------------------------+----------------------+
+-------+----------+---------+-------------+---------+------------+-------------+---------------+--------------+-------------+------------------------+----------------------+



## Modelling

### K-Means

Task:
Cluster (unsupervised) the customer in groups:

- high 
- medium
- low
- NULL with potential
- NULL without potential


Target Feature -> Turnover

In [16]:
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.clustering import KMeans
from pyspark.ml.clustering import GaussianMixture
from pyspark.ml.feature import VectorAssembler


In [17]:
dataset = sdf_customer_profile.select("sum_events", "sum_views", "sum_purchases", "sum_carts", "sum_turnover", "count_session", "sum_successfull", "avg_turnover_per_session", "avg_events_per_session")

In [18]:
dataset.show()

+----------+---------+-------------+---------+------------+-------------+---------------+------------------------+----------------------+
|sum_events|sum_views|sum_purchases|sum_carts|sum_turnover|count_session|sum_successfull|avg_turnover_per_session|avg_events_per_session|
+----------+---------+-------------+---------+------------+-------------+---------------+------------------------+----------------------+
|         1|        1|            0|        0|         0.0|            1|              0|                     0.0|                   1.0|
|         1|        1|            0|        0|         0.0|            1|              0|                     0.0|                   1.0|
|         2|        2|            0|        0|         0.0|            1|              0|                     0.0|                   2.0|
|         1|        1|            0|        0|         0.0|            1|              0|                     0.0|                   1.0|
|         1|        1|            

In [19]:
features=("sum_views", "sum_turnover", "sum_purchases", "sum_carts", "count_session")

assembler = VectorAssembler(inputCols=features,outputCol="features")

dataset=assembler.transform(dataset)
dataset.select("features").show(truncate=False)

+-----------------------+
|features               |
+-----------------------+
|(5,[0,4],[1.0,1.0])    |
|(5,[0,4],[1.0,1.0])    |
|(5,[0,4],[2.0,1.0])    |
|(5,[0,4],[1.0,1.0])    |
|(5,[0,4],[1.0,1.0])    |
|(5,[0,4],[1.0,1.0])    |
|(5,[0,4],[1.0,1.0])    |
|(5,[0,4],[4.0,1.0])    |
|(5,[0,4],[2.0,1.0])    |
|(5,[0,4],[2.0,1.0])    |
|(5,[0,4],[3.0,1.0])    |
|(5,[0,4],[1.0,1.0])    |
|(5,[0,4],[3.0,1.0])    |
|(5,[0,4],[1.0,1.0])    |
|(5,[0,4],[1.0,1.0])    |
|(5,[0,4],[1.0,1.0])    |
|(5,[0,4],[2.0,1.0])    |
|(5,[0,4],[1.0,1.0])    |
|[2.0,566.3,1.0,0.0,1.0]|
|(5,[0,4],[1.0,1.0])    |
+-----------------------+
only showing top 20 rows



In [20]:
# Trains a k-means model.
kmeans = KMeans().setK(4).setSeed(123)
model = kmeans.fit(dataset)

# Make predictions
predictions = model.transform(dataset)

# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))


# Evaluate clustering.
# cost = model.computeCost(dataset)
# print("Within Set Sum of Squared Errors = " + str(cost))

# Shows the result.
print("Cluster Centers: ")
ctr=[]
centers = model.clusterCenters()
for center in centers:
    ctr.append(center)
    print(center)

Silhouette with squared euclidean distance = 0.8898038925375745
Cluster Centers: 
[1.22137405 0.         0.         0.01526718 1.        ]
[  2.  566.3   1.    0.    1. ]
[  1.   211.92   1.     0.     1.  ]
[3.85 0.   0.   0.   1.  ]


In [21]:
predictions.show()

+----------+---------+-------------+---------+------------+-------------+---------------+------------------------+----------------------+--------------------+----------+
|sum_events|sum_views|sum_purchases|sum_carts|sum_turnover|count_session|sum_successfull|avg_turnover_per_session|avg_events_per_session|            features|prediction|
+----------+---------+-------------+---------+------------+-------------+---------------+------------------------+----------------------+--------------------+----------+
|         1|        1|            0|        0|         0.0|            1|              0|                     0.0|                   1.0| (5,[0,4],[1.0,1.0])|         0|
|         1|        1|            0|        0|         0.0|            1|              0|                     0.0|                   1.0| (5,[0,4],[1.0,1.0])|         0|
|         2|        2|            0|        0|         0.0|            1|              0|                     0.0|                   2.0| (5,[0,4],[2.

In [22]:
predictions.select("prediction").distinct().show()

+----------+
|prediction|
+----------+
|         1|
|         3|
|         2|
|         0|
+----------+



In [23]:
centers = pd.DataFrame(ctr,columns=features)

In [24]:
centers.head()

Unnamed: 0,sum_views,sum_turnover,sum_purchases,sum_carts,count_session
0,1.221374,0.0,0.0,0.015267,1.0
1,2.0,566.3,1.0,0.0,1.0
2,1.0,211.92,1.0,0.0,1.0
3,3.85,0.0,0.0,0.0,1.0


In [33]:
# Visualise result
fig = px.scatter(centers, x="sum_views", y="sum_carts", size="count_session", color="sum_turnover", hover_data=["sum_purchases"])
fig.show()

In [34]:
fig = px.scatter(predictions.toPandas(), x="sum_views", y="sum_carts", size="count_session", color="sum_turnover", hover_data=["sum_purchases"])
fig.show()

In [27]:
df = px.data.iris()
fig = px.scatter(df, x="sepal_width", y="sepal_length", color="species",
                 size='petal_length', hover_data=['petal_width'])
fig.show()

In [28]:
# Ellbow Curve


### Gaussian Mixture

In [29]:

# loads data
dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")

gmm = GaussianMixture().setK(2).setSeed(538009335)
model = gmm.fit(dataset)

print("Gaussians shown as a DataFrame: ")
model.gaussiansDF.show(truncate=False)

AnalysisException: Path does not exist: file:/d:/customeranalysis/src/data/mllib/sample_kmeans_data.txt

Next Steps: Find similiarites inside these customer groups:

- Buys same products
- similar behavior -> same marketing