In [1]:
import os
import pandas as pd
from dotenv import load_dotenv
from snowflake.snowpark import Session

In [3]:
load_dotenv()

True

In [4]:
ACCOUNT = os.getenv('ACCOUNT')
USER = os.getenv('SP_USER')
PASSWORD = os.getenv('PASSWORD')
ROLE = os.getenv('ROLE')
WAREHOUSE = os.getenv('WAREHOUSE')
DATABASE = os.getenv('DATABASE')
SCHEMA = os.getenv('SCHEMA')

print(ACCOUNT, USER, ROLE, WAREHOUSE, DATABASE, SCHEMA)

hum.us-east-2.aws ericatuva RUP UVARND_WH CORE CLIENT


In [5]:
connection_parameters = {
    'account': os.getenv('ACCOUNT'),
    'user': USER,
    'password': os.getenv('PASSWORD'),
    'role': os.getenv('ROLE'),
    'warehouse': os.getenv('WAREHOUSE'),
    'database': os.getenv('DATABASE'),
    'schema': os.getenv('SCHEMA')
}

In [6]:
test_session = Session.builder.configs(connection_parameters).create()

**Snowflake function documentation:**
- [COALESCE](https://docs.snowflake.com/en/sql-reference/functions/coalesce.html)
- [DATEDIFF](https://docs.snowflake.com/en/sql-reference/functions/datediff.html)
- [LAG](https://docs.snowflake.com/en/sql-reference/functions/lag.html) / [LEAD](https://docs.snowflake.com/en/sql-reference/functions/lead.html)
- [ROW_NUMBER](https://docs.snowflake.com/en/sql-reference/functions/row_number.html)
- [LISTAGG](https://docs.snowflake.com/en/sql-reference/functions/listagg.html) / [Additional LISTAGG](https://stephenallwright.com/snowflake-listagg/)

In [7]:
events_query = """
WITH events_summary AS (
    SELECT
        visitor_id
        , id
        , event
        -- use date if created is null
        , COALESCE(created, date) AS created
    FROM event
    WHERE DATE_TRUNC('year', day) = DATE('2022-01-01')
    ORDER BY 1, 3
)
, visitor_events AS (
    SELECT
        visitor_id
        , COUNT(DISTINCT id) AS total_events
    FROM events_summary
    GROUP BY 1
)
, events_ranked AS (
    SELECT 
        visitor_id
        , event
        , created
        -- ranking of event per visitor based on created time
        , ROW_NUMBER() OVER (PARTITION BY visitor_id ORDER BY created) AS event_rank
    FROM events_summary
)
, events_with_diff AS (
    SELECT 
        *
        -- subtract created of next event in the list per each visitor
        , DATEDIFF('second', created, LEAD(created) OVER(PARTITION BY visitor_id ORDER BY event_rank)) AS seconds_to_next_event
    FROM events_ranked
    ORDER BY visitor_id, event_rank
)

SELECT 
    e.visitor_id
    , v.total_events
    -- change seconds_to_next_event to 0 if null so that it is included in the listagg
    , LISTAGG(CONCAT('(', e.event, ',', e.created, ',', COALESCE(e.seconds_to_next_event, 0), ',', e.event_rank, ')'), ';') 
    WITHIN GROUP (ORDER BY event_rank) AS events
FROM events_with_diff e
JOIN visitor_events v ON e.visitor_id = v.visitor_id
GROUP BY e.visitor_id, v.total_events
ORDER BY e.visitor_id
"""

In [8]:
print('querying...')
query_results = test_session.sql(events_query).collect()
print('query done')
test_session.close()
print('session closed')

querying...
query done
session closed


In [10]:
# convert to pandas df
query_json = list(map(lambda x: x.as_dict(), query_results))
query_df = pd.DataFrame(query_json)
query_df.to_csv('rnn_clean_data.csv', index = 0)
# query_df = pd.read_csv('rnn_clean_data.csv')

In [11]:
query_df

Unnamed: 0,VISITOR_ID,TOTAL_EVENTS,EVENTS
0,00000404-5bf6-4798-89d8-1592758d661e,3,"(pageview,2022-05-11 00:00:00.000,0,1);(pagevi..."
1,000008b9-45d5-4ed2-8cfe-2463853d5640,4,"(pageview,2022-12-28 03:01:23.498,5,1);(post-r..."
2,00001624-b501-4b87-8cd6-a90760e47df8,4,"(pageview,2022-09-02 10:22:19.588,5,1);(post-r..."
3,00001e6f-ce60-4bf4-b356-4ab63181ccc3,4,"(pageview,2022-10-11 06:36:42.247,6,1);(post-r..."
4,00002124-5ca5-4a6f-9fd9-839062c866b3,8,"(pageview,2022-10-26 10:04:59.893,5,1);(post-r..."
...,...,...,...
1894585,ffffd7ea-c335-412e-a6f9-1676e2533cd5,12,"(pageview,2022-09-09 20:15:09.686,5,1);(post-r..."
1894586,ffffd8d1-e23e-49d4-9a1d-f24244edaeea,4,"(pageview,2022-11-16 06:13:44.099,5,1);(post-r..."
1894587,ffffe48e-5fde-4163-8e92-5d0d95bbc91a,1,"(pageview,2022-08-20 23:55:37.560,0,1)"
1894588,ffffe76a-5434-4064-9cc3-274504915aeb,5,"(pageview,2022-07-21 09:53:01.046,4,1);(post-r..."


**EVENTS** column format:

(*event type*, *created time*, *seconds until next event*, *event rank based on time*)

Each event is split by a *";"* delimeter

In [17]:
test_data = query_df\
    .where(lambda x: (x.TOTAL_EVENTS >= 20) & (x.TOTAL_EVENTS <= 50)).dropna()
test_data

Unnamed: 0,VISITOR_ID,TOTAL_EVENTS,EVENTS
69,00029129-ebca-4282-b970-92cb271690e2,30.0,"(post-read-start,2022-04-06 00:00:00.000,0,1);..."
119,0004ccae-284b-4146-a697-b2b172365819,48.0,"(pageview,2022-10-26 00:13:32.781,6,1);(post-r..."
144,0005b34e-f539-4a7e-b399-abacd9f50460,36.0,"(pageview,2022-08-04 20:46:27.645,5,1);(post-r..."
181,0006bd67-88ad-4dcd-aff6-c6f75f7e982f,22.0,"(pageview,2022-10-19 18:47:24.309,5,1);(post-r..."
254,000908be-b690-43ce-8551-8dbff8c0c36b,22.0,"(pageview,2022-09-12 08:01:45.222,17,1);(pagev..."
...,...,...,...
1894468,fffb6b28-6f5e-4eba-b3c3-01f0566bc569,33.0,"(pageview,2022-09-27 03:19:39.283,5,1);(post-r..."
1894471,fffb8a41-3ef6-4191-9680-f8cff8db4733,20.0,"(pageview,2022-10-21 06:52:53.263,5,1);(post-r..."
1894476,fffba2e7-9038-482e-bdf2-ee4bcf4d443f,22.0,"(pageview,2022-12-02 00:26:49.084,5,1);(post-r..."
1894520,fffd1135-fb73-4ba5-87e9-6cec137fac04,33.0,"(pageview,2022-09-25 09:21:10.723,5,1);(post-r..."


53k users with events between 20 and 50 for 2022