# Customer Analysis - Explore Customer Behavior

In [1]:
import os
import pyspark
import pandas as pd
import pyspark.sql.functions as f
import plotly.express as px
import plotly.graph_objects as go

In [77]:
# Start (or reuse) the Spark session for this notebook.
spark = (
    pyspark.sql.SparkSession.builder
    .appName("app1")
    .getOrCreate()
)

# Load the event log; the first CSV row holds the column names and the
# column types are inferred from the data.
sdf = spark.read.csv(
    "data/test_data.csv",
    header=True,
    inferSchema=True,
)

In [78]:
# Data types: event_time becomes a timestamp, identifier columns become strings.
sdf = sdf.withColumn("event_time", f.col("event_time").cast(pyspark.sql.types.TimestampType()))
for id_column in ("category_id", "product_id", "user_id"):
    sdf = sdf.withColumn(id_column, f.col(id_column).cast(pyspark.sql.types.StringType()))

# Feature splitting: category_class is the part of category_code before the first '.'.
# Deeper levels (sub class, sub-sub class) are not extracted here; note that
# f.split() takes a regular expression, so a literal dot would have to be escaped.
sdf = sdf.withColumn("category_class", f.substring_index(f.col("category_code"), '.', 1))

# Calendar features derived from event_time (added in this order).
calendar_features = {
    "year": f.year,
    "month": f.month,
    "weekofyear": f.weekofyear,
    "dayofyear": f.dayofyear,
    "dayofweek": f.dayofweek,
    "dayofmonth": f.dayofmonth,
}
for feature_name, extract in calendar_features.items():
    sdf = sdf.withColumn(feature_name, extract("event_time"))

# None handling: nulls in string columns are replaced by a placeholder label.
sdf = sdf.na.fill(value="not defined")


In [89]:
# Revenue ("Umsatz"): the price on purchase events, 0 on every other event.
is_purchase = f.col('event_type') == 'purchase'
sdf = sdf.withColumn('Umsatz', f.when(is_purchase, f.col('price')).otherwise(0))

# One 0/1 indicator column per event type. The spelling 'bougth_quantity' is
# kept as is because the SQL cells of this notebook refer to that exact name.
event_flags = (
    ('bougth_quantity', 'purchase'),
    ('viewed_quantity', 'view'),
    ('cart_quantity', 'cart'),
)
for flag_column, event_type in event_flags:
    sdf = sdf.withColumn(
        flag_column,
        f.when(f.col('event_type') == event_type, f.lit(1)).otherwise(0),
    )

In [90]:
# Expose the enriched events to Spark SQL under the view name "Data".
sdf.createOrReplaceTempView("Data")

# sdf_raw is a second name for the same DataFrame (no copy is made); it is
# the starting point for later cells.
sdf_raw = sdf
sdf_raw.show()

+-------------------+----------+----------+-------------------+--------------------+-----------+------+---------+--------------------+--------------+----+-----+----------+---------+---------+----------+------+---------------+---------------+-------------+
|         event_time|event_type|product_id|        category_id|       category_code|      brand| price|  user_id|        user_session|category_class|year|month|weekofyear|dayofyear|dayofweek|dayofmonth|Umsatz|bougth_quantity|viewed_quantity|cart_quantity|
+-------------------+----------+----------+-------------------+--------------------+-----------+------+---------+--------------------+--------------+----+-----+----------+---------+---------+----------+------+---------------+---------------+-------------+
|2019-11-01 01:00:00|      view|   1003461|2053013555631882655|electronics.smart...|     xiaomi|489.07|520088904|4d3b30da-a5e4-49d...|   electronics|2019|   11|        44|      305|        6|         1|   0.0|              0|       

## Produkte und Umsatz

In [44]:
 # Per-product overview: mean price, number of view / cart / purchase events
 # and revenue (Umsatz), ordered by revenue, highest first.
 # The query is a triple-quoted literal so that the separation between clauses
 # does not depend on backslash continuations and the next line's indentation.
sdf_product_overview = spark.sql("""
    SELECT Product_id,
           MEAN(price),
           SUM(viewed_quantity),
           SUM(cart_quantity),
           SUM(bougth_quantity),
           SUM(Umsatz) AS Umsatz
    FROM Data
    GROUP BY Product_id
    ORDER BY SUM(Umsatz) DESC
""")
sdf_product_overview.show()

+----------+-----------+--------------------+------------------+--------------------+------+
|Product_id|mean(price)|sum(viewed_quantity)|sum(cart_quantity)|sum(bougth_quantity)|Umsatz|
+----------+-----------+--------------------+------------------+--------------------+------+
|  13200605|      566.3|                   2|                 0|                   1| 566.3|
|   1005161|     211.92|                   1|                 0|                   1|211.92|
|   5801656|     133.83|                   1|                 0|                   0|   0.0|
|   3701016|     108.09|                   1|                 0|                   0|   0.0|
|   4804194|      69.24|                   1|                 0|                   0|   0.0|
|   1307012|    2342.15|                   1|                 0|                   0|   0.0|
|  12708306|      30.43|                   1|                 0|                   0|   0.0|
|  25600085|     134.37|                   1|                 0|      

In [45]:
# Revenue of the first 10 rows of the product overview (which is ordered by
# revenue, highest first).
# Product ids are digit-only strings; Plotly would auto-detect them as numbers
# and place the bars on a linear axis, so the x-axis is forced to be categorical.
top_products_pdf = sdf_product_overview.limit(10).toPandas()
px.bar(top_products_pdf, x='Product_id', y='Umsatz').update_xaxes(type='category')

In [46]:
 # Per-brand overview: mean price, number of view / cart / purchase events and
 # revenue (Umsatz), ordered by revenue, highest first.
 # COUNT(Product_id) counts event rows with a non-null product id, not distinct
 # products. NOTE(review): use COUNT(DISTINCT Product_id) if the number of
 # products per brand is what is meant.
 # The query is a triple-quoted literal so that the separation between clauses
 # does not depend on backslash continuations and the next line's indentation.
sdf_brand_overview = spark.sql("""
    SELECT brand,
           COUNT(Product_id),
           MEAN(price),
           SUM(viewed_quantity),
           SUM(cart_quantity),
           SUM(bougth_quantity),
           SUM(Umsatz) AS Umsatz
    FROM Data
    GROUP BY brand
    ORDER BY Umsatz DESC
""")
sdf_brand_overview.show()


+-----------+-----------------+------------------+--------------------+------------------+--------------------+------+
|      brand|count(Product_id)|       mean(price)|sum(viewed_quantity)|sum(cart_quantity)|sum(bougth_quantity)|Umsatz|
+-----------+-----------------+------------------+--------------------+------------------+--------------------+------+
|not defined|               41| 176.8419512195122|                  40|                 0|                   1| 566.3|
|     xiaomi|               15|224.28133333333332|                  14|                 0|                   1|211.92|
|   goodloot|                1|              8.73|                   1|                 0|                   0|   0.0|
|   cordiant|                2|              41.7|                   2|                 0|                   0|   0.0|
|    eveline|                1|              7.59|                   1|                 0|                   0|   0.0|
|    element|                1|            133.8

In [48]:
# Revenue of the first 10 rows of the brand overview (ordered by revenue, highest first).
top_brands_pdf = sdf_brand_overview.limit(10).toPandas()
px.bar(top_brands_pdf, x='brand', y='Umsatz')

In [49]:
 # Per-category-id overview: mean price, number of view / cart / purchase events
 # and revenue (Umsatz), ordered by revenue, highest first.
 # COUNT(Product_id) counts event rows with a non-null product id, not distinct
 # products. NOTE(review): use COUNT(DISTINCT Product_id) if the number of
 # products per category is what is meant.
 # The query is a triple-quoted literal so that the separation between clauses
 # does not depend on backslash continuations and the next line's indentation.
sdf_category_id_overview = spark.sql("""
    SELECT category_id,
           COUNT(Product_id),
           MEAN(price),
           SUM(viewed_quantity),
           SUM(cart_quantity),
           SUM(bougth_quantity),
           SUM(Umsatz) AS Umsatz
    FROM Data
    GROUP BY category_id
    ORDER BY Umsatz DESC
""")
sdf_category_id_overview.show()

+-------------------+-----------------+------------------+--------------------+------------------+--------------------+------+
|        category_id|count(Product_id)|       mean(price)|sum(viewed_quantity)|sum(cart_quantity)|sum(bougth_quantity)|Umsatz|
+-------------------+-----------------+------------------+--------------------+------------------+--------------------+------+
|2053013557192163841|                3|             566.3|                   2|                 0|                   1| 566.3|
|2053013555631882655|               38| 474.5957894736841|                  36|                 1|                   1|211.92|
|2146660887346282824|                1|              8.73|                   1|                 0|                   0|   0.0|
|2053013558920217191|               18| 924.2788888888889|                  18|                 0|                   0|   0.0|
|2085718636156158307|                2|             40.93|                   2|                 0|             

In [50]:
# Revenue of the first 10 rows of the category-id overview (ordered by revenue,
# highest first).
# Category ids are long digit-only strings; Plotly would auto-detect them as
# numbers and place the bars on a linear axis, so the x-axis is forced to be
# categorical.
top_category_ids_pdf = sdf_category_id_overview.limit(10).toPandas()
px.bar(top_category_ids_pdf, x='category_id', y='Umsatz').update_xaxes(type='category')

In [51]:
 # Per-category-code overview: mean price, number of view / cart / purchase
 # events and revenue (Umsatz), ordered by revenue, highest first.
 # COUNT(Product_id) counts event rows with a non-null product id, not distinct
 # products. NOTE(review): use COUNT(DISTINCT Product_id) if the number of
 # products per category code is what is meant.
 # The query is a triple-quoted literal so that the separation between clauses
 # does not depend on backslash continuations and the next line's indentation.
sdf_category_code_overview = spark.sql("""
    SELECT category_code,
           COUNT(Product_id),
           MEAN(price),
           SUM(viewed_quantity),
           SUM(cart_quantity),
           SUM(bougth_quantity),
           SUM(Umsatz) AS Umsatz
    FROM Data
    GROUP BY category_code
    ORDER BY Umsatz DESC
""")
sdf_category_code_overview.show()

+--------------------+-----------------+------------------+--------------------+------------------+--------------------+------+
|       category_code|count(Product_id)|       mean(price)|sum(viewed_quantity)|sum(cart_quantity)|sum(bougth_quantity)|Umsatz|
+--------------------+-----------------+------------------+--------------------+------------------+--------------------+------+
|furniture.bedroom...|                4|           458.825|                   3|                 0|                   1| 566.3|
|electronics.smart...|               38| 474.5957894736841|                  36|                 1|                   1|211.92|
|auto.accessories....|                2|135.60500000000002|                   2|                 0|                   0|   0.0|
|construction.tool...|                4|          130.7775|                   4|                 0|                   0|   0.0|
|electronics.camer...|                1|            386.08|                   1|                 0|     

In [53]:
# Revenue of the first 10 rows of the category-code overview (ordered by revenue, highest first).
top_category_codes_pdf = sdf_category_code_overview.limit(10).toPandas()
px.bar(top_category_codes_pdf, x='category_code', y='Umsatz')

In [54]:
 # Per-category-class overview: mean price, number of view / cart / purchase
 # events and revenue (Umsatz), ordered by revenue, highest first.
 # COUNT(Product_id) counts event rows with a non-null product id, not distinct
 # products. NOTE(review): use COUNT(DISTINCT Product_id) if the number of
 # products per category class is what is meant.
 # The query is a triple-quoted literal so that the separation between clauses
 # does not depend on backslash continuations and the next line's indentation.
sdf_category_class_overview = spark.sql("""
    SELECT category_class,
           COUNT(Product_id),
           MEAN(price),
           SUM(viewed_quantity),
           SUM(cart_quantity),
           SUM(bougth_quantity),
           SUM(Umsatz) AS Umsatz
    FROM Data
    GROUP BY category_class
    ORDER BY Umsatz DESC
""")
sdf_category_class_overview.show()

+--------------+-----------------+------------------+--------------------+------------------+--------------------+------+
|category_class|count(Product_id)|       mean(price)|sum(viewed_quantity)|sum(cart_quantity)|sum(bougth_quantity)|Umsatz|
+--------------+-----------------+------------------+--------------------+------------------+--------------------+------+
|     furniture|               14|241.24142857142857|                  13|                 0|                   1| 566.3|
|   electronics|               57|446.76929824561387|                  55|                 1|                   1|211.92|
|  construction|                8|110.10125000000002|                   8|                 0|                   0|   0.0|
|          auto|                7|192.08428571428573|                   7|                 0|                   0|   0.0|
|    appliances|               14|341.70214285714286|                  14|                 0|                   0|   0.0|
|     computers|        

In [55]:
# Revenue of the first 10 rows of the category-class overview (ordered by revenue, highest first).
top_category_classes_pdf = sdf_category_class_overview.limit(10).toPandas()
px.bar(top_category_classes_pdf, x='category_class', y='Umsatz')

## Users und Umsatz

In [57]:
 # Per-user overview: number of view / cart / purchase events and revenue
 # (Umsatz), ordered by revenue, highest first.
 # COUNT(user_session) counts event rows with a non-null session value, not
 # distinct sessions. NOTE(review): use COUNT(DISTINCT user_session) if the
 # number of sessions per user is what is meant.
 # The query is a triple-quoted literal so that the separation between clauses
 # does not depend on backslash continuations and the next line's indentation.
sdf_user_overview = spark.sql("""
    SELECT user_id,
           COUNT(user_session),
           SUM(viewed_quantity),
           SUM(cart_quantity),
           SUM(bougth_quantity),
           SUM(Umsatz) AS Umsatz
    FROM Data
    GROUP BY user_id
    ORDER BY Umsatz DESC
""")
sdf_user_overview.show()

+---------+-------------------+--------------------+------------------+--------------------+------+
|  user_id|count(user_session)|sum(viewed_quantity)|sum(cart_quantity)|sum(bougth_quantity)|Umsatz|
+---------+-------------------+--------------------+------------------+--------------------+------+
|559368633|                  3|                   2|                 0|                   1| 566.3|
|513351129|                  2|                   1|                 0|                   1|211.92|
|552249856|                  2|                   2|                 0|                   0|   0.0|
|518398361|                  1|                   1|                 0|                   0|   0.0|
|517811633|                  2|                   2|                 0|                   0|   0.0|
|566280399|                  1|                   1|                 0|                   0|   0.0|
|512416379|                  2|                   2|                 0|                   0|   0.0|


In [59]:
# Revenue of the first 10 rows of the user overview (ordered by revenue,
# highest first).
# User ids are digit-only strings; Plotly would auto-detect them as numbers and
# place the bars on a linear axis, so the x-axis is forced to be categorical.
top_users_pdf = sdf_user_overview.limit(10).toPandas()
px.bar(top_users_pdf, x='user_id', y='Umsatz').update_xaxes(type='category')

## Zeit und Umsatz

In [68]:

# Revenue (Umsatz) per (month, day of month), ordered by revenue, highest first.
# NOTE(review): year is not part of the grouping key, so the same calendar day
# of different years would be summed together.
# The query is a triple-quoted literal so that the separation between clauses
# does not depend on backslash continuations and the next line's indentation.
sdf_month_Umsatz = spark.sql("""
    SELECT month,
           dayofmonth,
           SUM(Umsatz) AS Umsatz
    FROM Data
    GROUP BY month, dayofmonth
    ORDER BY Umsatz DESC
""")
sdf_month_Umsatz.show()


+-----+----------+-----------------+
|month|dayofmonth|           Umsatz|
+-----+----------+-----------------+
|   11|         1|778.2199999999999|
+-----+----------+-----------------+



In [69]:
# Bar chart of revenue per (day of month, month) on a two-level category axis.
df = sdf_month_Umsatz.toPandas()
fig = go.Figure()
fig.update_layout(
    template="simple_white",
    xaxis=dict(title_text="Day of Month"),
    # The bars show summed revenue, not a count of events.
    yaxis=dict(title_text="Umsatz"),
    barmode="stack",
)

# A nested x list makes Plotly draw a multi-category axis.
fig.add_trace(
    go.Bar(x=[df.dayofmonth, df.month], y=df.Umsatz),
)
fig

In [71]:
# Revenue (Umsatz) per (week of year, day of week), ordered by revenue, highest first.
# NOTE(review): year is not part of the grouping key, so the same week of
# different years would be summed together.
# The query is a triple-quoted literal so that the separation between clauses
# does not depend on backslash continuations and the next line's indentation.
sdf_week_Umsatz = spark.sql("""
    SELECT weekofyear,
           dayofweek,
           SUM(Umsatz) AS Umsatz
    FROM Data
    GROUP BY weekofyear, dayofweek
    ORDER BY Umsatz DESC
""")
sdf_week_Umsatz.show()

+----------+---------+-----------------+
|weekofyear|dayofweek|           Umsatz|
+----------+---------+-----------------+
|        44|        6|778.2199999999999|
+----------+---------+-----------------+



In [73]:
# Bar chart of revenue per (day of week, week of year) on a two-level category axis.
df = sdf_week_Umsatz.toPandas()
fig = go.Figure()
fig.update_layout(
    template="simple_white",
    # The x-axis carries the day of week (grouped with the week of year),
    # not the day of month.
    xaxis=dict(title_text="Day of Week"),
    # The bars show summed revenue, not a count of events.
    yaxis=dict(title_text="Umsatz"),
    barmode="stack",
)

# A nested x list makes Plotly draw a multi-category axis.
fig.add_trace(
    go.Bar(x=[df.dayofweek, df.weekofyear], y=df.Umsatz),
)
fig

## Korrelationsmatrix

In [108]:
# Input for the correlation matrix: the enriched events with the two id columns
# cast to numbers so that they appear in the matrix.
# - Columns are referenced by name (f.col), so the casts always apply to the
#   frame being transformed instead of depending on the separate `sdf` variable.
# - DoubleType is used because a 32-bit FloatType cannot represent 9-digit ids
#   exactly.
# NOTE(review): correlations with identifier columns carry little meaning;
# consider dropping them from the matrix.
sdf_corr = (
    sdf_raw
    .withColumn("product_id", f.col("product_id").cast(pyspark.sql.types.DoubleType()))
    .withColumn("user_id", f.col("user_id").cast(pyspark.sql.types.DoubleType()))
)

In [95]:
# Print the first 20 rows of the correlation input, with long cell values truncated.
sdf_corr.show(n=20, truncate=True)

+-------------------+----------+----------+-------------------+--------------------+-----------+------+---------+--------------------+--------------+----+-----+----------+---------+---------+----------+------+---------------+---------------+-------------+
|         event_time|event_type|product_id|        category_id|       category_code|      brand| price|  user_id|        user_session|category_class|year|month|weekofyear|dayofyear|dayofweek|dayofmonth|Umsatz|bougth_quantity|viewed_quantity|cart_quantity|
+-------------------+----------+----------+-------------------+--------------------+-----------+------+---------+--------------------+--------------+----+-----+----------+---------+---------+----------+------+---------------+---------------+-------------+
|2019-11-01 01:00:00|      view|   1003461|2053013555631882655|electronics.smart...|     xiaomi|489.07|520088904|4d3b30da-a5e4-49d...|   electronics|2019|   11|        44|      305|        6|         1|   0.0|              0|       

In [109]:
# Pairwise correlation matrix of the numeric columns, rendered as a heat map.
# The numeric columns are selected explicitly: the frame also holds string and
# timestamp columns, and DataFrame.corr() raises on those in pandas >= 2.0.
# Columns that are constant in the data yield NaN rows/columns.
corr_input_pdf = sdf_corr.toPandas().select_dtypes(include="number")
corr_input_pdf.corr().style.background_gradient(cmap='bwr')

Unnamed: 0,product_id,price,user_id,year,month,weekofyear,dayofyear,dayofweek,dayofmonth,Umsatz,bougth_quantity,viewed_quantity,cart_quantity
product_id,1.0,-0.320541,0.018191,,,,,,,-0.027387,-0.049715,0.078642,-0.065435
price,-0.320541,1.0,-0.105257,,,,,,,0.033853,0.018818,-0.033742,0.031622
user_id,0.018191,-0.105257,1.0,,,,,,,0.036053,-0.007846,0.015258,-0.015235
year,,,,,,,,,,,,,
month,,,,,,,,,,,,,
weekofyear,,,,,,,,,,,,,
dayofyear,,,,,,,,,,,,,
dayofweek,,,,,,,,,,,,,
dayofmonth,,,,,,,,,,,,,
Umsatz,-0.027387,0.033853,0.036053,,,,,,,1.0,0.909357,-0.740758,-0.005968


In [None]:
# Ideas
# - Times of day
# - Weekdays
# - Category class
# - Beginning, middle, end of the month
# - Price