In [1]:
from pyspark.sql import SparkSession, DataFrame, Window
from pyspark.sql import functions as F
from pyspark.sql.types import *

In [2]:
spark = SparkSession.builder.master("local").appName("SparkSQL").getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/11/11 08:15:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
spark.sparkContext.setLogLevel("ERROR")

In [4]:
schema = StructType(
    [
        StructField('event_time', StringType()),
        StructField('event_type', StringType()),
        StructField('product_id', StringType()),
        StructField('category_id', StringType()),
        StructField('category_code', StringType()),
        StructField('brand', StringType()),
        StructField('price', StringType()),
        StructField('user_id', StringType()),
        StructField('user_session', StringType()),
    ]
)

In [5]:
df = spark.read.option("header",True).schema(schema).csv("./2019-Nov.csv")#.limit(10000)

In [6]:
df.show(truncate=False)

+-----------------------+----------+----------+-------------------+--------------------------------+--------+------+---------+------------------------------------+
|event_time             |event_type|product_id|category_id        |category_code                   |brand   |price |user_id  |user_session                        |
+-----------------------+----------+----------+-------------------+--------------------------------+--------+------+---------+------------------------------------+
|2019-11-01 00:00:00 UTC|view      |1003461   |2053013555631882655|electronics.smartphone          |xiaomi  |489.07|520088904|4d3b30da-a5e4-49df-b1a8-ba5943f1dd33|
|2019-11-01 00:00:00 UTC|view      |5000088   |2053013566100866035|appliances.sewing_machine       |janome  |293.65|530496790|8e5f4f83-366c-4f70-860e-ca7417414283|
|2019-11-01 00:00:01 UTC|view      |17302664  |2053013553853497655|null                            |creed   |28.31 |561587266|755422e7-9040-477b-9bd2-6a6e8fd97387|
|2019-11-01 00:0

                                                                                

In [7]:
df.printSchema()

root
 |-- event_time: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- user_session: string (nullable = true)



In [8]:
df.head(5)

[Row(event_time='2019-11-01 00:00:00 UTC', event_type='view', product_id='1003461', category_id='2053013555631882655', category_code='electronics.smartphone', brand='xiaomi', price='489.07', user_id='520088904', user_session='4d3b30da-a5e4-49df-b1a8-ba5943f1dd33'),
 Row(event_time='2019-11-01 00:00:00 UTC', event_type='view', product_id='5000088', category_id='2053013566100866035', category_code='appliances.sewing_machine', brand='janome', price='293.65', user_id='530496790', user_session='8e5f4f83-366c-4f70-860e-ca7417414283'),
 Row(event_time='2019-11-01 00:00:01 UTC', event_type='view', product_id='17302664', category_id='2053013553853497655', category_code=None, brand='creed', price='28.31', user_id='561587266', user_session='755422e7-9040-477b-9bd2-6a6e8fd97387'),
 Row(event_time='2019-11-01 00:00:01 UTC', event_type='view', product_id='3601530', category_id='2053013563810775923', category_code='appliances.kitchen.washer', brand='lg', price='712.87', user_id='518085591', user_sess

In [9]:
df.tail(5)

[Row(event_time='2019-11-30 23:59:58 UTC', event_type='view', product_id='15700137', category_id='2053013559733912211', category_code=None, brand=None, price='277.74', user_id='532714000', user_session='02b4131c-0112-4231-aafa-ceaa08e77c1b'),
 Row(event_time='2019-11-30 23:59:58 UTC', event_type='view', product_id='28719425', category_id='2053013565639492569', category_code='apparel.shoes', brand='baden', price='62.81', user_id='545223467', user_session='734c5eef-0742-4f8b-9d22-48f75b0bc359'),
 Row(event_time='2019-11-30 23:59:59 UTC', event_type='view', product_id='1004833', category_id='2053013555631882655', category_code='electronics.smartphone', brand='samsung', price='167.03', user_id='557794415', user_session='6fecf566-ebb0-4e70-a243-cdc13ce044cb'),
 Row(event_time='2019-11-30 23:59:59 UTC', event_type='view', product_id='2701706', category_id='2053013563911439225', category_code='appliances.kitchen.refrigerators', brand='samsung', price='566.27', user_id='531607492', user_sessio

# 1. 해당 전체 기간에서, KST 기준으로 active user 수가 제일 큰 날짜를 구하세요

In [10]:
max_active = df.groupby(F.dayofmonth('event_time').alias('date')).agg(F.count_distinct('user_id').alias('u_count')).orderBy('u_count', ascending=False)

In [11]:
max_user = max_active.collect()

                                                                                

In [12]:
max_user

[Row(date=17, u_count=487501),
 Row(date=16, u_count=487327),
 Row(date=15, u_count=379786),
 Row(date=18, u_count=319537),
 Row(date=14, u_count=308842),
 Row(date=11, u_count=292400),
 Row(date=20, u_count=291900),
 Row(date=19, u_count=290798),
 Row(date=21, u_count=287079),
 Row(date=29, u_count=286699),
 Row(date=8, u_count=286053),
 Row(date=12, u_count=285994),
 Row(date=13, u_count=285408),
 Row(date=7, u_count=278988),
 Row(date=10, u_count=276198),
 Row(date=4, u_count=275612),
 Row(date=9, u_count=268643),
 Row(date=5, u_count=262338),
 Row(date=25, u_count=257613),
 Row(date=6, u_count=256887),
 Row(date=27, u_count=256546),
 Row(date=30, u_count=255497),
 Row(date=28, u_count=253979),
 Row(date=26, u_count=252527),
 Row(date=22, u_count=249401),
 Row(date=23, u_count=249362),
 Row(date=24, u_count=240526),
 Row(date=3, u_count=240187),
 Row(date=2, u_count=234685),
 Row(date=1, u_count=223108)]

In [13]:
target_date = max_user[0].date

In [14]:
target_date

17

---

# 2. 1의 날짜에서, 세션이 가장 긴 사용자 10명에 대해 "user_id, session_id, 세션시간"를 구하세요

In [15]:
tmp = df.filter(F.dayofmonth(df.event_time) == target_date).orderBy('user_session')

In [16]:
tmp.printSchema()

root
 |-- event_time: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- user_session: string (nullable = true)



In [17]:
spark.catalog.dropTempView('tmp')
tmp.createTempView('tmp')

### sql

In [18]:
spark.sql("""
        select first(user_id) as user_id, 
                user_session as session_id, 
                round(
                    (extract(hours from to_timestamp(max(event_time))-to_timestamp(min(event_time)))*60 
                    + extract(mins from to_timestamp(max(event_time))-to_timestamp(min(event_time)))
                    + extract(seconds from to_timestamp(max(event_time))-to_timestamp(min(event_time)))/60), 1
                ) as session_time_minute
        from tmp
        group by session_id
        order by session_time_minute desc
        limit 10
        """).show(truncate=False)



+---------+------------------------------------+-------------------+
|user_id  |session_id                          |session_time_minute|
+---------+------------------------------------+-------------------+
|554374958|99deb4f4-244c-45dc-92c8-313e306f2539|1380.2             |
|543279666|f95cd639-e8b9-4acb-8ede-30f89ef576d5|942.7              |
|570553739|716b81b7-ddee-4cc3-9c36-a54058764eec|903.3              |
|542858277|8befbdcf-f19d-44ff-bd7d-e593fd1f3899|891.5              |
|571492866|0c55689b-d534-46df-802c-0dac3b586f7c|869.7              |
|537823560|d5044df5-456b-4210-ad57-3c03196a250f|866.4              |
|541886641|e21cda3d-1a35-482c-9cd2-afac9c2c56de|864.0              |
|512545111|93e7e655-4aac-4376-a18f-9a4bf5150788|858.8              |
|571080943|e288e4e2-8d14-478a-a850-c326eb76e5b9|830.9              |
|560169653|23e07646-ca7d-4fb8-b67e-325f1adac294|827.8              |
+---------+------------------------------------+-------------------+



                                                                                

### spark method

In [19]:
tmp\
.groupby("user_session")\
.agg(F.first("user_id").alias("user_id"),
     F.col('user_session').alias('session_id'),
     F.round((F.to_timestamp(F.max("event_time")).cast("long") - F.to_timestamp(F.min("event_time")).cast("long"))/60, 1).alias('session_time(minute)')
    )\
.orderBy("session_time(minute)", ascending=False)\
.limit(10)\
.select('user_id', 'session_id', 'session_time(minute)')\
.show(truncate=False)



+---------+------------------------------------+--------------------+
|user_id  |session_id                          |session_time(minute)|
+---------+------------------------------------+--------------------+
|554374958|99deb4f4-244c-45dc-92c8-313e306f2539|1380.2              |
|543279666|f95cd639-e8b9-4acb-8ede-30f89ef576d5|942.7               |
|570553739|716b81b7-ddee-4cc3-9c36-a54058764eec|903.3               |
|542858277|8befbdcf-f19d-44ff-bd7d-e593fd1f3899|891.5               |
|571492866|0c55689b-d534-46df-802c-0dac3b586f7c|869.7               |
|537823560|d5044df5-456b-4210-ad57-3c03196a250f|866.4               |
|541886641|e21cda3d-1a35-482c-9cd2-afac9c2c56de|864.0               |
|512545111|93e7e655-4aac-4376-a18f-9a4bf5150788|858.8               |
|571080943|e288e4e2-8d14-478a-a850-c326eb76e5b9|830.9               |
|560169653|23e07646-ca7d-4fb8-b67e-325f1adac294|827.8               |
+---------+------------------------------------+--------------------+



                                                                                

---

# 3. 1의 날짜의 15분단위로 active user 수를 구하세요

### sql

In [20]:
start1 = "string(floor((floor((hour(event_time)*60 + minute(event_time))/15)*15)/60))"
start2 = "string(mod(floor((hour(event_time)*60 + minute(event_time))/15)*15, 60))"

query = f"""concat( right(concat('0',{start1}), 2),
                    ":",
                    right(concat('0',{start2}), 2)
                )"""

In [21]:
end1 = "string(floor((floor((hour(event_time)*60 + minute(event_time))/15)*15+15)/60))"
end2 = "string(mod(floor((hour(event_time)*60 + minute(event_time))/15)*15+15, 60))"

query2 = f"""concat( right(concat('0',{end1}), 2), 
                    ":",
                    right(concat('0',{end2}), 2)
                    
                )"""

In [22]:
spark.sql(f"""
        SELECT {query} as start, {query2} as end, count(distinct(user_id)) as active_users
        from tmp
        group by floor((hour(event_time)*60 + minute(event_time))/15)
        order by start
        """).show(200, truncate=False)

[Stage 30:>                                                         (0 + 1) / 1]

+-----+-----+------------+
|start|end  |active_users|
+-----+-----+------------+
|00:00|00:15|1195        |
|00:15|00:30|1302        |
|00:30|00:45|1409        |
|00:45|01:00|1559        |
|01:00|01:15|1867        |
|01:15|01:30|2218        |
|01:30|01:45|2759        |
|01:45|02:00|1853        |
|02:00|02:15|2816        |
|02:15|02:30|5012        |
|02:30|02:45|6316        |
|02:45|03:00|7390        |
|03:00|03:15|8662        |
|03:15|03:30|9896        |
|03:30|03:45|11422       |
|03:45|04:00|12621       |
|04:00|04:15|13958       |
|04:15|04:30|15367       |
|04:30|04:45|16570       |
|04:45|05:00|17640       |
|05:00|05:15|18104       |
|05:15|05:30|19420       |
|05:30|05:45|20132       |
|05:45|06:00|20670       |
|06:00|06:15|20012       |
|06:15|06:30|21134       |
|06:30|06:45|21221       |
|06:45|07:00|20840       |
|07:00|07:15|19900       |
|07:15|07:30|20254       |
|07:30|07:45|20425       |
|07:45|08:00|20337       |
|08:00|08:15|20294       |
|08:15|08:30|20156       |
|

                                                                                

### spark method

In [26]:
tmp.groupby(F.floor(F.floor(F.hour('event_time')*60 + F.minute('event_time'))/15))\
.agg(F.round(F.floor(F.floor(F.hour('event_time')*60 + F.minute('event_time'))/15)*15).alias('m'), F.count_distinct('user_id').alias('active_users'))\
.withColumn("start", F.concat(F.format_string('%02d', F.floor(F.col('m')/60)),F.lit(':'),F.format_string('%02d', F.col('m')%60)))\
.withColumn("end", F.concat(F.format_string("%02d",F.floor((F.col('m')+15)/60)), F.lit(":"), F.format_string("%02d",F.floor((F.col('m')+15)%60))))\
.select('start', 'end', 'active_users')\
.orderBy('start')\
.show(200)



+-----+-----+------------+
|start|  end|active_users|
+-----+-----+------------+
|00:00|00:15|        1195|
|00:15|00:30|        1302|
|00:30|00:45|        1409|
|00:45|01:00|        1559|
|01:00|01:15|        1867|
|01:15|01:30|        2218|
|01:30|01:45|        2759|
|01:45|02:00|        1853|
|02:00|02:15|        2816|
|02:15|02:30|        5012|
|02:30|02:45|        6316|
|02:45|03:00|        7390|
|03:00|03:15|        8662|
|03:15|03:30|        9896|
|03:30|03:45|       11422|
|03:45|04:00|       12621|
|04:00|04:15|       13958|
|04:15|04:30|       15367|
|04:30|04:45|       16570|
|04:45|05:00|       17640|
|05:00|05:15|       18104|
|05:15|05:30|       19420|
|05:30|05:45|       20132|
|05:45|06:00|       20670|
|06:00|06:15|       20012|
|06:15|06:30|       21134|
|06:30|06:45|       21221|
|06:45|07:00|       20840|
|07:00|07:15|       19900|
|07:15|07:30|       20254|
|07:30|07:45|       20425|
|07:45|08:00|       20337|
|08:00|08:15|       20294|
|08:15|08:30|       20156|
|

                                                                                

---

# 4. 1의 날짜에서 view → cart → purchase 이벤트 진행에 따른 funnel 수치를 구하세요

In [24]:
window = Window.orderBy(F.desc(F.substring('event_type', -1, 1)))

In [25]:
tmp.groupby('event_type').agg(F.count_distinct('user_id').alias('activated_user'))\
.withColumn('lag', F.lag('activated_user', 1, 0).over(window))\
.withColumn('Bounce_Rate', F.when(F.col('lag') == 0, 0).otherwise(F.round(1 - F.col('activated_user')/F.col('lag'), 3)))\
.withColumn('Residual_rate', F.round(F.col('activated_user')/F.max("activated_user").over(window), 3))\
.withColumn('Drop_rate', F.concat(F.round(100*F.lag(F.col('Residual_rate')-F.lag('Residual_rate', -1, 0).over(window), 1, 0).over(window), 1), F.lit("%p")))\
.select("event_type", "activated_user", "Bounce_Rate", "Residual_rate", "Drop_rate").show()

[Stage 42:>                                                         (0 + 1) / 1]

+----------+--------------+-----------+-------------+---------+
|event_type|activated_user|Bounce_Rate|Residual_rate|Drop_rate|
+----------+--------------+-----------+-------------+---------+
|      view|        486485|        0.0|          1.0|    0.0%p|
|      cart|        156741|      0.678|        0.322|   67.8%p|
|  purchase|        113889|      0.273|        0.234|    8.8%p|
+----------+--------------+-----------+-------------+---------+



                                                                                