In [1]:
from pyspark.sql import SparkSession, DataFrame, Window
from pyspark.sql import functions as F
from pyspark.sql.types import *

In [2]:
# Reuse the active Spark session if one exists, otherwise start a local one.
# master("local") runs Spark with a single worker thread.
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .master("local")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/11/11 11:19:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Only log ERROR and above, so WARN-level messages no longer clutter cell output.
spark.sparkContext.setLogLevel("ERROR")

In [4]:
# Every CSV column is loaded as a nullable string; nothing is type-cast at read time.
event_columns = [
    'event_time_UTC',
    'event_type',
    'product_id',
    'category_id',
    'category_code',
    'brand',
    'price',
    'user_id',
    'user_session',
]
schema = StructType(
    [StructField(column_name, StringType()) for column_name in event_columns]
)

In [5]:
# Read the event log; the first line of the file is a header row and the
# column names/types are taken from `schema`.
df = spark.read.csv("./2019-Nov.csv", schema=schema, header=True)

In [6]:
# Preview the first rows without truncating long cell values.
df.show(truncate=False)

+-----------------------+----------+----------+-------------------+--------------------------------+--------+------+---------+------------------------------------+
|event_time_UTC         |event_type|product_id|category_id        |category_code                   |brand   |price |user_id  |user_session                        |
+-----------------------+----------+----------+-------------------+--------------------------------+--------+------+---------+------------------------------------+
|2019-11-01 00:00:00 UTC|view      |1003461   |2053013555631882655|electronics.smartphone          |xiaomi  |489.07|520088904|4d3b30da-a5e4-49df-b1a8-ba5943f1dd33|
|2019-11-01 00:00:00 UTC|view      |5000088   |2053013566100866035|appliances.sewing_machine       |janome  |293.65|530496790|8e5f4f83-366c-4f70-860e-ca7417414283|
|2019-11-01 00:00:01 UTC|view      |17302664  |2053013553853497655|null                            |creed   |28.31 |561587266|755422e7-9040-477b-9bd2-6a6e8fd97387|
|2019-11-01 00:0

                                                                                

In [7]:
# Confirm that the explicit schema was applied to the loaded frame.
df.printSchema()

root
 |-- event_time_UTC: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- user_session: string (nullable = true)



In [8]:
# First five rows, returned to the driver as Row objects.
df.head(5)

[Row(event_time_UTC='2019-11-01 00:00:00 UTC', event_type='view', product_id='1003461', category_id='2053013555631882655', category_code='electronics.smartphone', brand='xiaomi', price='489.07', user_id='520088904', user_session='4d3b30da-a5e4-49df-b1a8-ba5943f1dd33'),
 Row(event_time_UTC='2019-11-01 00:00:00 UTC', event_type='view', product_id='5000088', category_id='2053013566100866035', category_code='appliances.sewing_machine', brand='janome', price='293.65', user_id='530496790', user_session='8e5f4f83-366c-4f70-860e-ca7417414283'),
 Row(event_time_UTC='2019-11-01 00:00:01 UTC', event_type='view', product_id='17302664', category_id='2053013553853497655', category_code=None, brand='creed', price='28.31', user_id='561587266', user_session='755422e7-9040-477b-9bd2-6a6e8fd97387'),
 Row(event_time_UTC='2019-11-01 00:00:01 UTC', event_type='view', product_id='3601530', category_id='2053013563810775923', category_code='appliances.kitchen.washer', brand='lg', price='712.87', user_id='51808

In [9]:
# Last five rows, returned to the driver as Row objects.
df.tail(5)

[Row(event_time_UTC='2019-11-30 23:59:58 UTC', event_type='view', product_id='15700137', category_id='2053013559733912211', category_code=None, brand=None, price='277.74', user_id='532714000', user_session='02b4131c-0112-4231-aafa-ceaa08e77c1b'),
 Row(event_time_UTC='2019-11-30 23:59:58 UTC', event_type='view', product_id='28719425', category_id='2053013565639492569', category_code='apparel.shoes', brand='baden', price='62.81', user_id='545223467', user_session='734c5eef-0742-4f8b-9d22-48f75b0bc359'),
 Row(event_time_UTC='2019-11-30 23:59:59 UTC', event_type='view', product_id='1004833', category_id='2053013555631882655', category_code='electronics.smartphone', brand='samsung', price='167.03', user_id='557794415', user_session='6fecf566-ebb0-4e70-a243-cdc13ce044cb'),
 Row(event_time_UTC='2019-11-30 23:59:59 UTC', event_type='view', product_id='2701706', category_id='2053013563911439225', category_code='appliances.kitchen.refrigerators', brand='samsung', price='566.27', user_id='5316074

# 1. 해당 전체 기간에서, KST 기준으로 active user 수가 제일 큰 날짜를 구하세요

In [10]:
# Add `event_time`: the event time as KST wall-clock time, and keep only the rows
# whose KST month is November.
#
# The raw value looks like '2019-11-01 00:00:00 UTC'. The trailing zone marker is
# stripped and the rest is parsed with an explicit pattern, then shifted from UTC
# to Asia/Seoul. This replaces `string + INTERVAL 9 HOURS`, which relied on an
# implicit string -> timestamp -> string cast whose rendered value depends on
# spark.sql.session.timeZone (it is +9h only when the session zone is UTC).
#
# `event_time` stays a 'yyyy-MM-dd HH:mm:ss' string, as the later cells expect.
utc_wall_clock = F.to_timestamp(
    F.regexp_replace('event_time_UTC', r'\s*UTC$', ''),
    'yyyy-MM-dd HH:mm:ss',
)
kst_wall_clock = F.from_utc_timestamp(utc_wall_clock, 'Asia/Seoul')

df = (
    df.withColumn('event_time', F.date_format(kst_wall_clock, 'yyyy-MM-dd HH:mm:ss'))
      .filter(F.month('event_time') == 11)
)

In [11]:
# Distinct users per day of month (taken from `event_time`), busiest day first.
daily_user_count = F.count_distinct('user_id').alias('u_count')
max_active = (
    df.groupBy(F.dayofmonth('event_time').alias('date'))
      .agg(daily_user_count)
      .orderBy(F.col('u_count').desc())
)

In [12]:
# Run the aggregation and bring the per-day counts to the driver as a list of Rows
# (the list keeps the descending u_count order defined on `max_active`).
max_user = max_active.collect()

                                                                                

In [13]:
# Display the collected (date, u_count) rows.
max_user

[Row(date=17, u_count=499430),
 Row(date=16, u_count=476465),
 Row(date=15, u_count=360900),
 Row(date=18, u_count=341859),
 Row(date=14, u_count=305355),
 Row(date=19, u_count=297422),
 Row(date=21, u_count=296069),
 Row(date=20, u_count=291765),
 Row(date=11, u_count=291596),
 Row(date=12, u_count=288691),
 Row(date=13, u_count=285032),
 Row(date=7, u_count=282676),
 Row(date=8, u_count=281517),
 Row(date=29, u_count=280558),
 Row(date=10, u_count=274830),
 Row(date=9, u_count=271257),
 Row(date=5, u_count=270277),
 Row(date=30, u_count=263591),
 Row(date=4, u_count=262783),
 Row(date=26, u_count=260737),
 Row(date=6, u_count=256212),
 Row(date=27, u_count=254825),
 Row(date=22, u_count=253994),
 Row(date=25, u_count=253317),
 Row(date=28, u_count=249214),
 Row(date=23, u_count=248794),
 Row(date=3, u_count=244636),
 Row(date=24, u_count=243357),
 Row(date=2, u_count=229279),
 Row(date=1, u_count=163146)]

In [14]:
# `max_user` is sorted by u_count descending, so its first row is the busiest day.
target_date = max_user[0]["date"]

In [15]:
target_date  # day of month with the largest number of active users

17

---

# 2. 1의 날짜에서, 세션이 가장 긴 사용자 10명에 대해 "user_id, session_id, 세션 시간"을 구하세요

In [16]:
# Events that fall on the target day of month.
# There is deliberately no orderBy here: every later cell consumes `tmp` through a
# group-by aggregation, so a global sort on user_session would not affect any result
# and would only add a full sort each time the frame or its temp view is queried.
tmp = df.filter(F.dayofmonth('event_time') == target_date)

In [17]:
# Check the columns of the single-day frame, including the derived `event_time`.
tmp.printSchema()

root
 |-- event_time_UTC: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- user_session: string (nullable = true)
 |-- event_time: string (nullable = true)



In [18]:
# Expose `tmp` to Spark SQL as the temp view 'tmp'. Any earlier view of the same
# name is replaced in a single call, so the cell can be re-run safely.
tmp.createOrReplaceTempView('tmp')

### sql

In [19]:
# Ten longest sessions of the target day: user, session id and length in minutes.
# The length is the difference between the last and first event in epoch seconds,
# the same definition the DataFrame-API cell uses. This replaces re-assembling the
# value from the hour/minute/second fields of an interval, which repeated the
# aggregates three times and did not count any day field.
# Grouping uses the real column name rather than the select alias.
spark.sql("""
        select first(user_id) as user_id,
               user_session as session_id,
               round(
                   (unix_timestamp(max(event_time)) - unix_timestamp(min(event_time))) / 60, 1
               ) as session_time_minute
        from tmp
        group by user_session
        order by session_time_minute desc
        limit 10
        """).show(truncate=False)



+---------+------------------------------------+-------------------+
|user_id  |session_id                          |session_time_minute|
+---------+------------------------------------+-------------------+
|565022209|a7c5906e-5dd8-4175-aeca-eb5615844e67|1306.3             |
|568848552|f2d487ec-1a93-47cd-916a-5e46db85472c|1278.5             |
|546179105|c28e0611-7590-49cb-9c19-1d4f6540592a|1264.1             |
|557268031|a7b1bde2-4493-4c4b-9517-8b8a0a9cf494|1236.6             |
|554760857|c5fc6a55-1735-4636-8490-b6dfd060164c|1212.2             |
|543658395|8c85d761-88c9-45f6-b844-4e506c58f9b3|1196.2             |
|532969916|1ffae9b4-50e0-4d76-b354-6df7b8d19445|1167.6             |
|544784642|cfd07110-7047-4ac5-bb17-a4cbd05a491b|1112.3             |
|561409785|a7db81e5-7d03-404b-806b-063148c86e1b|1095.7             |
|524688046|0193ea4b-88eb-40ad-bae9-2b00ecd1481f|1092.3             |
+---------+------------------------------------+-------------------+



                                                                                

### spark method

In [20]:
# Ten longest sessions of the target day, written with the DataFrame API.
# First/last event of each session as epoch seconds (timestamp cast to long).
session_start = F.to_timestamp(F.min("event_time")).cast("long")
session_end = F.to_timestamp(F.max("event_time")).cast("long")
session_minutes = F.round((session_end - session_start) / 60, 1)

(
    tmp.groupby("user_session")
    .agg(
        F.first("user_id").alias("user_id"),
        F.col('user_session').alias('session_id'),
        session_minutes.alias('session_time(minute)'),
    )
    .orderBy("session_time(minute)", ascending=False)
    .limit(10)
    .select('user_id', 'session_id', 'session_time(minute)')
    .show(truncate=False)
)



+---------+------------------------------------+--------------------+
|user_id  |session_id                          |session_time(minute)|
+---------+------------------------------------+--------------------+
|565022209|a7c5906e-5dd8-4175-aeca-eb5615844e67|1306.3              |
|568848552|f2d487ec-1a93-47cd-916a-5e46db85472c|1278.5              |
|546179105|c28e0611-7590-49cb-9c19-1d4f6540592a|1264.1              |
|557268031|a7b1bde2-4493-4c4b-9517-8b8a0a9cf494|1236.6              |
|554760857|c5fc6a55-1735-4636-8490-b6dfd060164c|1212.2              |
|543658395|8c85d761-88c9-45f6-b844-4e506c58f9b3|1196.2              |
|532969916|1ffae9b4-50e0-4d76-b354-6df7b8d19445|1167.6              |
|544784642|cfd07110-7047-4ac5-bb17-a4cbd05a491b|1112.3              |
|561409785|a7db81e5-7d03-404b-806b-063148c86e1b|1095.7              |
|524688046|0193ea4b-88eb-40ad-bae9-2b00ecd1481f|1092.3              |
+---------+------------------------------------+--------------------+



                                                                                

---

# 3. 1의 날짜에서 15분 단위로 active user 수를 구하세요

### sql

In [21]:
# SQL text for the 'HH:MM' label of the START of an event's 15-minute bucket.
# Start of the bucket, in minutes since midnight.
bucket_start_min = "floor((hour(event_time)*60 + minute(event_time))/15)*15"

start1 = f"string(floor(({bucket_start_min})/60))"  # hour part
start2 = f"string(mod({bucket_start_min}, 60))"     # minute part

# right(concat('0', x), 2) keeps the last two characters, i.e. zero-pads one digit.
query = (
    f"concat( right(concat('0',{start1}), 2),\n"
    '                    ":",\n'
    f"                    right(concat('0',{start2}), 2)\n"
    "                )"
)

In [22]:
# SQL text for the 'HH:MM' label of the END of an event's 15-minute bucket.
# End of the bucket (start + 15), in minutes since midnight.
bucket_end_min = "floor((hour(event_time)*60 + minute(event_time))/15)*15+15"

end1 = f"string(floor(({bucket_end_min})/60))"  # hour part
end2 = f"string(mod({bucket_end_min}, 60))"     # minute part

# right(concat('0', x), 2) keeps the last two characters, i.e. zero-pads one digit.
query2 = (
    f"concat( right(concat('0',{end1}), 2), \n"
    '                    ":",\n'
    f"                    right(concat('0',{end2}), 2)\n"
    "                    \n"
    "                )"
)

In [23]:
# Distinct users per 15-minute bucket of the target day, labelled with the
# 'HH:MM' start/end expressions built in `query` and `query2`.
bucket_sql = f"""
        SELECT {query} as start, {query2} as end, count(distinct(user_id)) as active_users
        from tmp
        group by floor((hour(event_time)*60 + minute(event_time))/15)
        order by start
        """
spark.sql(bucket_sql).show(n=200, truncate=False)

[Stage 30:>                                                         (0 + 1) / 1]

+-----+-----+------------+
|start|end  |active_users|
+-----+-----+------------+
|00:00|00:15|18397       |
|00:15|00:30|17563       |
|00:30|00:45|14695       |
|00:45|01:00|16119       |
|01:00|01:15|15257       |
|01:15|01:30|15763       |
|01:30|01:45|16024       |
|01:45|02:00|12350       |
|02:00|02:15|12825       |
|02:15|02:30|14053       |
|02:30|02:45|13852       |
|02:45|03:00|14612       |
|03:00|03:15|12782       |
|03:15|03:30|12220       |
|03:30|03:45|14555       |
|03:45|04:00|13697       |
|04:00|04:15|12041       |
|04:15|04:30|10895       |
|04:30|04:45|9556        |
|04:45|05:00|8205        |
|05:00|05:15|7099        |
|05:15|05:30|6040        |
|05:30|05:45|5243        |
|05:45|06:00|4398        |
|06:00|06:15|3800        |
|06:15|06:30|3220        |
|06:30|06:45|2844        |
|06:45|07:00|2480        |
|07:00|07:15|2228        |
|07:15|07:30|606         |
|07:30|07:45|891         |
|07:45|08:00|1308        |
|08:00|08:15|1296        |
|08:15|08:30|1228        |
|

                                                                                

### spark method

In [24]:
# DataFrame-API version of the 15-minute active-user table.
minute_of_day = F.hour('event_time') * 60 + F.minute('event_time')
# Start of the 15-minute bucket, in minutes since midnight.
bucket_start = F.floor(minute_of_day / 15) * 15


def format_hhmm(minutes):
    """Render a minutes-since-midnight column as a zero-padded 'HH:MM' string column."""
    return F.concat(
        F.format_string('%02d', F.floor(minutes / 60)),
        F.lit(':'),
        F.format_string('%02d', minutes % 60),
    )


(
    tmp.groupby(bucket_start.alias('m'))
    .agg(F.count_distinct('user_id').alias('active_users'))
    .withColumn('start', format_hhmm(F.col('m')))
    .withColumn('end', format_hhmm(F.col('m') + 15))
    .select('start', 'end', 'active_users')
    .orderBy('start')
    .show(200)
)

[Stage 36:>                                                         (0 + 1) / 1]

+-----+-----+------------+
|start|  end|active_users|
+-----+-----+------------+
|00:00|00:15|       18397|
|00:15|00:30|       17563|
|00:30|00:45|       14695|
|00:45|01:00|       16119|
|01:00|01:15|       15257|
|01:15|01:30|       15763|
|01:30|01:45|       16024|
|01:45|02:00|       12350|
|02:00|02:15|       12825|
|02:15|02:30|       14053|
|02:30|02:45|       13852|
|02:45|03:00|       14612|
|03:00|03:15|       12782|
|03:15|03:30|       12220|
|03:30|03:45|       14555|
|03:45|04:00|       13697|
|04:00|04:15|       12041|
|04:15|04:30|       10895|
|04:30|04:45|        9556|
|04:45|05:00|        8205|
|05:00|05:15|        7099|
|05:15|05:30|        6040|
|05:30|05:45|        5243|
|05:45|06:00|        4398|
|06:00|06:15|        3800|
|06:15|06:30|        3220|
|06:30|06:45|        2844|
|06:45|07:00|        2480|
|07:00|07:15|        2228|
|07:15|07:30|         606|
|07:30|07:45|         891|
|07:45|08:00|        1308|
|08:00|08:15|        1296|
|08:15|08:30|        1228|
|

                                                                                

---

# 4. 1의 날짜에서 view → cart → purchase 이벤트 진행에 따른 funnel 수치를 구하세요

In [25]:
# Window that walks the funnel in order: view -> cart -> purchase.
# The step order is spelled out explicitly. The previous version sorted on the last
# letter of event_type ('w' > 't' > 'e'), which gave this order only by coincidence.
# Any other event type gets no rank and is placed after the known steps.
funnel_step = (
    F.when(F.col('event_type') == 'view', 1)
     .when(F.col('event_type') == 'cart', 2)
     .when(F.col('event_type') == 'purchase', 3)
)
window = Window.orderBy(funnel_step.asc_nulls_last())

In [26]:
# Funnel table for the target day: one row per event_type, walked in `window` order.
#   activated_user - number of distinct user_id values that have this event type
#   lag            - activated_user of the preceding row (0 on the first row)
#   Bounce_Rate    - 1 - activated_user / lag, rounded to 3 places; 0 when lag is 0
#   Residual_rate  - activated_user / max(activated_user) over `window`, rounded to 3 places
#                    NOTE(review): `window` sets no explicit frame, so this is presumably a
#                    running max up to the current row; it equals the first step's count
#                    only while the counts never increase down the funnel - confirm.
#   Drop_rate      - the preceding row's (Residual_rate - next row's Residual_rate), i.e. the
#                    previous Residual_rate minus the current one, times 100, rounded to
#                    1 place, with a literal "%p" suffix; "0.0%p" on the first row
# NOTE(review): users are counted independently per event type; nothing here checks that
# a user counted at one step also appears in the earlier steps.
tmp.groupby('event_type').agg(F.count_distinct('user_id').alias('activated_user'))\
.withColumn('lag', F.lag('activated_user', 1, 0).over(window))\
.withColumn('Bounce_Rate', F.when(F.col('lag') == 0, 0).otherwise(F.round(1 - F.col('activated_user')/F.col('lag'), 3)))\
.withColumn('Residual_rate', F.round(F.col('activated_user')/F.max("activated_user").over(window), 3))\
.withColumn('Drop_rate', F.concat(F.round(100*F.lag(F.col('Residual_rate')-F.lag('Residual_rate', -1, 0).over(window), 1, 0).over(window), 1), F.lit("%p")))\
.select("event_type", "activated_user", "Bounce_Rate", "Residual_rate", "Drop_rate").show()

[Stage 42:>                                                         (0 + 1) / 1]

+----------+--------------+-----------+-------------+---------+
|event_type|activated_user|Bounce_Rate|Residual_rate|Drop_rate|
+----------+--------------+-----------+-------------+---------+
|      view|        498268|        0.0|          1.0|    0.0%p|
|      cart|        157923|      0.683|        0.317|   68.3%p|
|  purchase|         92930|      0.412|        0.187|   13.0%p|
+----------+--------------+-----------+-------------+---------+



                                                                                