In [1]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import dayofmonth, to_date, count_distinct, hour,minute
from pyspark.sql.types import *

from pprint import pprint

In [17]:
spark = SparkSession.builder.master("local").appName("SparkSQL").getOrCreate()

In [18]:
spark.sparkContext.setLogLevel("ERROR")

In [19]:
schema = StructType(
    [
        StructField('event_time', StringType()),
        StructField('event_type', StringType()),
        StructField('product_id', StringType()),
        StructField('category_id', StringType()),
        StructField('category_code', StringType()),
        StructField('brand', StringType()),
        StructField('price', StringType()),
        StructField('user_id', StringType()),
        StructField('user_session', StringType()),
    ]
)

In [73]:
df = spark.read.option("header",True).schema(schema).csv("./2019-Nov.csv")#.limit(10000)

In [6]:
df.show(truncate=False)

+-----------------------+----------+----------+-------------------+--------------------------------+--------+------+---------+------------------------------------+
|event_time             |event_type|product_id|category_id        |category_code                   |brand   |price |user_id  |user_session                        |
+-----------------------+----------+----------+-------------------+--------------------------------+--------+------+---------+------------------------------------+
|2019-11-01 00:00:00 UTC|view      |1003461   |2053013555631882655|electronics.smartphone          |xiaomi  |489.07|520088904|4d3b30da-a5e4-49df-b1a8-ba5943f1dd33|
|2019-11-01 00:00:00 UTC|view      |5000088   |2053013566100866035|appliances.sewing_machine       |janome  |293.65|530496790|8e5f4f83-366c-4f70-860e-ca7417414283|
|2019-11-01 00:00:01 UTC|view      |17302664  |2053013553853497655|null                            |creed   |28.31 |561587266|755422e7-9040-477b-9bd2-6a6e8fd97387|
|2019-11-01 00:0

In [7]:
df.printSchema()

root
 |-- event_time: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- user_session: string (nullable = true)



In [8]:
df.head(5)

[Row(event_time='2019-11-01 00:00:00 UTC', event_type='view', product_id='1003461', category_id='2053013555631882655', category_code='electronics.smartphone', brand='xiaomi', price='489.07', user_id='520088904', user_session='4d3b30da-a5e4-49df-b1a8-ba5943f1dd33'),
 Row(event_time='2019-11-01 00:00:00 UTC', event_type='view', product_id='5000088', category_id='2053013566100866035', category_code='appliances.sewing_machine', brand='janome', price='293.65', user_id='530496790', user_session='8e5f4f83-366c-4f70-860e-ca7417414283'),
 Row(event_time='2019-11-01 00:00:01 UTC', event_type='view', product_id='17302664', category_id='2053013553853497655', category_code=None, brand='creed', price='28.31', user_id='561587266', user_session='755422e7-9040-477b-9bd2-6a6e8fd97387'),
 Row(event_time='2019-11-01 00:00:01 UTC', event_type='view', product_id='3601530', category_id='2053013563810775923', category_code='appliances.kitchen.washer', brand='lg', price='712.87', user_id='518085591', user_sess

In [None]:
df.tail(5)



# 1. 해당 전체 기간에서, KST 기준으로 active user 수가 제일 큰 날짜를 구하세요

In [74]:
max_active = df.groupby(dayofmonth('event_time').alias('date')).agg(count_distinct('user_id').alias('u_count')).orderBy('u_count', ascending=False)

In [75]:
max_user = max_active.collect()

                                                                                

In [76]:
max_user

[Row(date=17, u_count=487501),
 Row(date=16, u_count=487327),
 Row(date=15, u_count=379786),
 Row(date=18, u_count=319537),
 Row(date=14, u_count=308842),
 Row(date=11, u_count=292400),
 Row(date=20, u_count=291900),
 Row(date=19, u_count=290798),
 Row(date=21, u_count=287079),
 Row(date=29, u_count=286699),
 Row(date=8, u_count=286053),
 Row(date=12, u_count=285994),
 Row(date=13, u_count=285408),
 Row(date=7, u_count=278988),
 Row(date=10, u_count=276198),
 Row(date=4, u_count=275612),
 Row(date=9, u_count=268643),
 Row(date=5, u_count=262338),
 Row(date=25, u_count=257613),
 Row(date=6, u_count=256887),
 Row(date=27, u_count=256546),
 Row(date=30, u_count=255497),
 Row(date=28, u_count=253979),
 Row(date=26, u_count=252527),
 Row(date=22, u_count=249401),
 Row(date=23, u_count=249362),
 Row(date=24, u_count=240526),
 Row(date=3, u_count=240187),
 Row(date=2, u_count=234685),
 Row(date=1, u_count=223108)]

In [77]:
target_date = max_user[0].date

In [78]:
target_date

17

---

# 1의 날짜에서, 세션이 가장 긴 사용자 10명에 대해 "user_id, session_id, 세션시간"를 구하세요

In [79]:
tmp = df.filter(dayofmonth(df.event_time) == target_date).orderBy('user_session')

In [80]:
tmp.printSchema()

root
 |-- event_time: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- user_session: string (nullable = true)



In [81]:
spark.catalog.dropTempView('tmp')
tmp.createTempView('tmp')

In [145]:
spark.sql("""
        select first(t.user_id) as user_id, last(user_id) as last
        from tmp t
        group by t.user_session
        having last != user_id
        """).show(truncate=False)
#세션아이디가 같으면서 유저아이디가 다른 경우가 있는지 확인



+-------+----+
|user_id|last|
+-------+----+
+-------+----+



                                                                                


spark.sql("""
        select t.user_session, min(t.event_time) as start, max(t.event_time) as end, bigint(to_timestamp(max(t.event_time)))-bigint(to_timestamp(min(t.event_time))) as keep, first(t.user_id) as user_id, last(user_id) as last
        from tmp t
        group by t.user_session
        having last != user_id
        order by keep desc
        
        """).show(truncate=False)

In [None]:
spark.sql("""
        select t.user_id, 
        from tmp t
        group by t.user_id
        limit 10
        """).show(truncate=False)

# 3. 1의 날짜의 15분단위로 active user 수를 구하세요

In [259]:
std = 1572566400

spark.sql("""
        with RECURSIVE time as (
            select 0 MINUTE
            union
            select MINUTE+15
            from time
        )
        SELECT *, count(*) as 'COUNT'
        FROM (
            select minute(to_timestamp(event_time)) as MINUTE from tmp
            union all
            select MINUTE from time
        ) as f
        group by MINUTE
        order by MINUTE
        """).show(truncate=False)

In [28]:
spark.sql("""
        SELECT floor((hour(event_time)*60 + minute(event_time))/15)*15 as m, count(distinct(user_id))
        from tmp
        group by floor((hour(event_time)*60 + minute(event_time))/15)
        """).show(truncate=False)



+---+-----------------------+
|m  |count(DISTINCT user_id)|
+---+-----------------------+
|0  |542                    |
|15 |628                    |
|30 |768                    |
|45 |669                    |
+---+-----------------------+



                                                                                

query = """concat( string( floor((floor((hour(event_time)*60 + minute(event_time))/15)*15)/60) ), 
                    ":",
                    string( mod( floor((hour(event_time)*60 + minute(event_time))/15)*15, 60)))"""

query2 = """concat( string( floor((floor((hour(event_time)*60 + minute(event_time))/15)*15+15)/60) ), 
                    ":",
                    string( mod( floor((hour(event_time)*60 + minute(event_time))/15)*15+15, 60)))"""

---

In [92]:
start1 = "string(floor((floor((hour(event_time)*60 + minute(event_time))/15)*15)/60))"

In [93]:
start2 = "string(mod(floor((hour(event_time)*60 + minute(event_time))/15)*15, 60))"

In [94]:
query = f"""concat( if(length({start1})=2, 
                    {start1}, 
                    concat(0,{start1})), 
                    
                    ":",
                    
                    if(length({start2})=2,
                    {start2},
                    concat(0, {start2}))
                )"""

In [98]:
end1 = "string(floor((floor((hour(event_time)*60 + minute(event_time))/15)*15+15)/60))"

In [101]:
end2 = "string(mod(floor((hour(event_time)*60 + minute(event_time))/15)*15+15, 60))"

In [102]:
query2 = f"""concat( if(length({end1})=2, 
                    {end1}, 
                    concat(0,{end1})), 
                    
                    ":",
                    
                    if(length({end2})=2,
                    {end2},
                    concat(0, {end2})
                    )
                )"""

In [103]:
spark.sql(f"""
        SELECT {query} as start, {query2} as end, count(distinct(user_id))
        from tmp
        group by floor((hour(event_time)*60 + minute(event_time))/15)
        order by start
        """).show(200, truncate=False)

[Stage 78:>                                                         (0 + 1) / 1]

+-----+-----+-----------------------+
|start|end  |count(DISTINCT user_id)|
+-----+-----+-----------------------+
|00:00|00:15|1195                   |
|00:15|00:30|1302                   |
|00:30|00:45|1409                   |
|00:45|01:00|1559                   |
|01:00|01:15|1867                   |
|01:15|01:30|2218                   |
|01:30|01:45|2759                   |
|01:45|02:00|1853                   |
|02:00|02:15|2816                   |
|02:15|02:30|5012                   |
|02:30|02:45|6316                   |
|02:45|03:00|7390                   |
|03:00|03:15|8662                   |
|03:15|03:30|9896                   |
|03:30|03:45|11422                  |
|03:45|04:00|12621                  |
|04:00|04:15|13958                  |
|04:15|04:30|15367                  |
|04:30|04:45|16570                  |
|04:45|05:00|17640                  |
|05:00|05:15|18104                  |
|05:15|05:30|19420                  |
|05:30|05:45|20132                  |
|05:45|06:00

                                                                                

In [334]:
spark.sql("""
        SELECT floor(bigint(to_timestamp(event_time))/60/15)*15, floor(bigint(to_timestamp(event_time))/60/15)*15-15, count(distinct(user_id))
        from tmp
        group by floor(bigint(to_timestamp(event_time))/60/15)*15
        """).show(truncate=False)



+----------------------------------------------------+-----------------------------------------------------------+-----------------------+
|(FLOOR(((to_timestamp(event_time) / 60) / 15)) * 15)|((FLOOR(((to_timestamp(event_time) / 60) / 15)) * 15) - 15)|count(DISTINCT user_id)|
+----------------------------------------------------+-----------------------------------------------------------+-----------------------+
|26209440                                            |26209425                                                   |542                    |
|26209455                                            |26209440                                                   |628                    |
|26209470                                            |26209455                                                   |768                    |
|26209485                                            |26209470                                                   |669                    |
+--------------------------

                                                                                