In [1]:
import os
import pandas as pd
from dotenv import load_dotenv
from snowflake.snowpark import Session

In [2]:
load_dotenv()

True

In [3]:
ACCOUNT = os.getenv('ACCOUNT')
USER = os.getenv('SP_USER')
PASSWORD = os.getenv('PASSWORD')
ROLE = os.getenv('ROLE')
WAREHOUSE = os.getenv('WAREHOUSE')
DATABASE = os.getenv('DATABASE')
SCHEMA = os.getenv('SCHEMA')

print(ACCOUNT, USER, ROLE, WAREHOUSE, DATABASE, SCHEMA)

hum.us-east-2.aws ericatuva RUP UVARND_WH CORE CLIENT


In [4]:
connection_parameters = {
    'account': os.getenv('ACCOUNT'),
    'user': USER,
    'password': os.getenv('PASSWORD'),
    'role': os.getenv('ROLE'),
    'warehouse': os.getenv('WAREHOUSE'),
    'database': os.getenv('DATABASE'),
    'schema': os.getenv('SCHEMA')
}

In [5]:
test_session = Session.builder.configs(connection_parameters).create()

In [6]:
def query_snowpark(query, test_session=test_session):
    print('querying...')
    query_results = test_session.sql(query).collect()
    print('query done')
    
    query_json = list(map(lambda x: x.as_dict(), query_results))
    query_df = pd.DataFrame(query_json)
    
    return query_df

In [7]:
# EVENT META

event_meta_query = """
SELECT
    e.id
    , em.name AS meta_name
    , em.value AS meta_value
FROM event e
LEFT JOIN event_meta em ON e.id = em.event_id
WHERE DATE_TRUNC('year', e.day) = DATE('2022-01-01')
"""
event_meta_df = query_snowpark(event_meta_query)

querying...
query done


In [8]:
event_meta_df.META_NAME.unique()

array([None, 'referer', 'tags', 'day', 'description', 'title', 'image',
       'content_type', 'utm_campaign', 'utm_content', 'utm_medium',
       'utm_term', 'utm_source'], dtype=object)

In [9]:
unique_events = len(event_meta_df.ID.unique())

In [10]:
unique_events

10863469

### Content Type

In [11]:
event_meta_df[event_meta_df['META_NAME'] == 'content_type']\
    .groupby('META_VALUE')['ID'].nunique()

META_VALUE
journal_article    1068966
microsite_home           4
Name: ID, dtype: int64

- Total Events: 10,863,469
- Journal Articles: 1,068,966
- Microsite Home: 4

### Other Event Meta Types

In [12]:
event_meta_df.groupby('META_NAME')['ID'].nunique()

META_NAME
content_type    1068970
day             2011605
description     1886980
image            279918
referer         2011605
tags             642979
title           1886909
utm_campaign      10174
utm_content       10174
utm_medium        10174
utm_source        10174
utm_term          10174
Name: ID, dtype: int64

In [13]:
content_query = """
SELECT 
    c.type
    , COUNT(DISTINCT e.id) AS events
FROM event e
LEFT JOIN content c ON CONCAT(e.source, '_', e.content_id) = c.id
WHERE DATE_TRUNC('year', e.day) = DATE('2022-01-01')
GROUP BY 1
"""
content_df = query_snowpark(content_query)

querying...
query done


In [24]:
content_df

Unnamed: 0,TYPE,EVENTS
0,issue,125641
1,journal_article,8012589
2,account_management,179789
3,search,197223
4,,1623851
5,in-brief,39
6,self-serve,129483
7,cross-ref-citation,12
8,microsite_home,594842


### Referer & URL

In [14]:
referer_query = """
SELECT
    CASE
        WHEN referer IS NULL THEN 'NULL'
        WHEN referer LIKE '%rupress.org%' THEN 'RUPRESS'
        WHEN referer LIKE '%scholar.google%' THEN 'GOOGLE SCHOLAR'
        WHEN referer LIKE '%google%' THEN 'GOOGLE'
        WHEN referer LIKE '%pubmed%' THEN 'PUBMED'
        ELSE 'OTHER'
    END AS referer_group
    , COUNT(DISTINCT id) AS events
FROM event
WHERE DATE_TRUNC('year', day) = DATE('2022-01-01')
GROUP BY 1
ORDER BY 2 DESC
"""

referer_df = query_snowpark(referer_query)

querying...
query done


In [15]:
referer_df

Unnamed: 0,REFERER_GROUP,EVENTS
0,OTHER,4166569
1,GOOGLE,2933301
2,PUBMED,1428813
3,RUPRESS,1355904
4,GOOGLE SCHOLAR,978882


In [16]:
url_query = """
SELECT
    CASE
        WHEN url LIKE '%article%' THEN 'ARTICLE'
        ELSE 'OTHER'
    END AS url_type
    , COUNT(DISTINCT id) AS events
FROM event
WHERE DATE_TRUNC('year', day) = DATE('2022-01-01')
GROUP BY 1
ORDER BY 2 DESC
"""

url_df = query_snowpark(url_query)

querying...
query done


In [17]:
url_df

Unnamed: 0,URL_TYPE,EVENTS
0,ARTICLE,9516546
1,OTHER,1346923


### Tags & Keywords

In [18]:
tag_query = """
WITH tag_table AS (
    SELECT 
        e.id
        , value AS tag
    FROM event e,
    LATERAL FLATTEN(input => e.tags) f
    WHERE DATE_TRUNC('year', e.day) = DATE('2022-01-01')
)

SELECT
    tag
    , COUNT(DISTINCT id) AS events
FROM tag_table
GROUP BY 1
ORDER BY 2 DESC
"""

tag_df = query_snowpark(tag_query)

querying...
query done


In [19]:
tag_df

Unnamed: 0,TAG,EVENTS
0,"""mice""",2216857
1,"""t-lymphocytes""",996746
2,"""tissue membrane""",698284
3,"""signal transduction""",619142
4,"""antibodies""",585891
...,...,...
34604,"""N-formylated peptides""",1
34605,"""ciguatoxins""",1
34606,"""osteoclast; islet amyloid polypeptide; CTR; C...",1
34607,"""forehead hematoma""",1


In [20]:
tag_df.head(20)

Unnamed: 0,TAG,EVENTS
0,"""mice""",2216857
1,"""t-lymphocytes""",996746
2,"""tissue membrane""",698284
3,"""signal transduction""",619142
4,"""antibodies""",585891
5,"""neoplasms""",581571
6,"""infections""",499700
7,"""genes""",425608
8,"""hum_immunopathogenesis""",414480
9,"""mitochondria""",365549


In [21]:
keyword_query = """
SELECT
    ck.keyword
    , COUNT(DISTINCT e.id) AS EVENTS
FROM event e
LEFT JOIN content_keyword ck ON CONCAT(e.source, '_', e.content_id) = ck.content_id
WHERE DATE_TRUNC('year', e.day) = DATE('2022-01-01')
GROUP BY 1
ORDER BY 2 DESC
"""

keyword_df = query_snowpark(keyword_query)

querying...
query done


In [22]:
keyword_df

Unnamed: 0,KEYWORD,EVENTS
0,mice,2500110
1,,2125695
2,t-lymphocytes,1133698
3,tissue membrane,768383
4,signal transduction,703489
...,...,...
23662,cochlear implants,1
23663,phosphorylases,1
23664,confidence interval,1
23665,supraoptic nucleus,1


Are tag or keyword groups available in the data?

In [7]:
new_features_query = """
-- NEW CONTENT SCORE
with content_aggs AS (
    SELECT
        COUNT(DISTINCT id) AS total_content
    FROM content

)
,new_content AS (
    SELECT
        *
        , (
            (COALESCE(pdf_click, 0) * 4 * (1 - 0.25))
            + (COALESCE(pageview, 0) * 1 * (1 - 0.25))
            + (COALESCE(post_read_start, 0) * 2 * (1 - 0.2))
            + (COALESCE(post_read_mid, 0) * 5 * (1 - 0.1))
            + (COALESCE(post_read_end, 0) * 6 * (1 - 0.05))
            + (COALESCE(scroll, 0) * 2 * (1 - 0.15))
        ) * (100 / POW(total_content, 0.8)) AS new_content_score
    FROM content
    JOIN content_aggs ON 1 = 1
)
-- USER LEVEL
, event_expanded AS (
    SELECT 
        e.id AS event_id
        , e.set_profile AS profile_id
        , LOWER(e.referer) AS referer
        , e.created
        -- ranking of event per visitor based on created time
        , ROW_NUMBER() OVER (PARTITION BY e.set_profile ORDER BY e.created) AS event_rank
        , p.user_id AS email
        , LOWER(c.type) AS content_type
        , c.id AS content_id
        , c.new_content_score AS content_score
    FROM event e
    LEFT JOIN new_content c ON CONCAT(e.source, '_', e.content_id) = c.id
    LEFT JOIN profile p ON e.set_profile = p.id
    WHERE DATE_TRUNC('year', e.day) = DATE('2022-01-01')
)
, events_with_diff AS (
    SELECT 
        *
        -- subtract created of next event in the list per each visitor
        , DATEDIFF('second', created, LEAD(created) OVER(PARTITION BY profile_id ORDER BY event_rank)) AS seconds_to_next_event
    FROM event_expanded
)
, idle_hours AS (
    SELECT 
        AVG(seconds_to_next_event) AS mean_time
        , 1/AVG(seconds_to_next_event) AS lam
        , (-1 * AVG(seconds_to_next_event) * LN(0.05)) / (60 * 60) AS avg_idle_hours
    FROM events_with_diff
)
, event_aggs AS (
    SELECT
        profile_id
        , COUNT(DISTINCT event_id) AS events
    FROM events_with_diff
    GROUP BY 1
)
, event_aggs_first_40 AS (
    SELECT
        e.profile_id
        , COUNT(DISTINCT CASE WHEN (e.seconds_to_next_event / (60*60)) >= i.avg_idle_hours THEN event_id END) + 1 AS cycles
        , COUNT(DISTINCT CASE WHEN content_type LIKE '%article%' THEN event_id END) AS article_events
        , COUNT(DISTINCT content_id) AS distinct_content
        , COUNT(DISTINCT CASE WHEN content_type LIKE '%article%' THEN content_id END) AS distinct_articles
        , COUNT(DISTINCT CASE WHEN content_type LIKE '%article%' AND referer like '%www.google.com%' THEN content_id END) AS distinct_articles_from_google
        , AVG(content_score) AS average_content_score
    FROM events_with_diff e
    JOIN idle_hours i ON 1 = 1
    WHERE e.event_rank <= 40 -- first 40 events only
    GROUP BY 1
)
, time_to_forty AS (
    SELECT
        a.profile_id
        , DATEDIFF('day', a.created, b.created) AS days_to_40
    FROM event_expanded a
    LEFT JOIN event_expanded b ON a.profile_id = b.profile_id AND b.event_rank = 40
    WHERE a.event_rank = 1
)

SELECT
    ea.profile_id
    , CASE
        WHEN e.events >= 40 THEN 1
        ELSE 0
    END AS reached_40_events
    , CASE
        WHEN e.events >= 80 THEN 1
        ELSE 0
    END AS reached_80_events
    , ea.cycles AS event_cycles_f40
    , ea.distinct_articles AS distinct_articles_f40
    , CASE
        WHEN ea.distinct_articles > 0 THEN ea.distinct_articles_from_google / ea.distinct_articles
        ELSE 0
    END AS percent_google_articles_f40
    , CASE
        WHEN ea.distinct_content > 0 THEN ea.distinct_articles / ea.distinct_content
        ELSE 0
    END AS percent_article_content_f40
    , COALESCE(ea.average_content_score, 0) AS average_content_score_f40
    , ttf.days_to_40 AS days_to_forty_events
FROM event_aggs_first_40 ea
JOIN event_aggs e ON ea.profile_id = e.profile_id
LEFT JOIN time_to_forty ttf ON ea.profile_id = ttf.profile_id
"""

new_features_df = query_snowpark(new_features_query)

querying...
query done


In [8]:
new_features_df

Unnamed: 0,PROFILE_ID,REACHED_40_EVENTS,REACHED_80_EVENTS,EVENT_CYCLES_F40,DISTINCT_ARTICLES_F40,PERCENT_GOOGLE_ARTICLES_F40,PERCENT_ARTICLE_CONTENT_F40,AVERAGE_CONTENT_SCORE_F40,DAYS_TO_FORTY_EVENTS
0,A_ZSgoIBGoM3uBpF2kSF,1,0,6,8,0.250000,1.000000,0.000000,70.0
1,kjKCZ4IB8Tq1gs32qWCB,1,1,4,1,0.000000,0.500000,0.000000,30.0
2,Ji-sNYQBwWEOklUHtLOe,1,0,2,4,0.000000,0.666667,39.837483,7.0
3,vvCZE4QBwWEOklUHM3Ep,1,0,4,2,0.500000,1.000000,0.000000,42.0
4,VeHWX4IB8Tq1gs32ty7z,1,0,8,2,0.000000,1.000000,0.000000,112.0
...,...,...,...,...,...,...,...,...,...
1889034,krf1E4ABoj1uvCUbkRm_,0,0,1,1,0.000000,1.000000,0.000000,
1889035,MKK6NoUBEEdskq5qNOCK,0,0,1,1,1.000000,1.000000,0.000000,
1889036,5yzmwYMBu9aDgDX1XOD-,0,0,1,0,0.000000,0.000000,0.000000,
1889037,6sKrzn8Bale87oUgfoA8,0,0,1,1,0.000000,1.000000,0.000000,


In [9]:
# >= 40 events
reached_40 = new_features_df[new_features_df['REACHED_40_EVENTS'] == 1]

In [10]:
reached_40

Unnamed: 0,PROFILE_ID,REACHED_40_EVENTS,REACHED_80_EVENTS,EVENT_CYCLES_F40,DISTINCT_ARTICLES_F40,PERCENT_GOOGLE_ARTICLES_F40,PERCENT_ARTICLE_CONTENT_F40,AVERAGE_CONTENT_SCORE_F40,DAYS_TO_FORTY_EVENTS
0,A_ZSgoIBGoM3uBpF2kSF,1,0,6,8,0.250000,1.000000,0.000000,70.0
1,kjKCZ4IB8Tq1gs32qWCB,1,1,4,1,0.000000,0.500000,0.000000,30.0
2,Ji-sNYQBwWEOklUHtLOe,1,0,2,4,0.000000,0.666667,39.837483,7.0
3,vvCZE4QBwWEOklUHM3Ep,1,0,4,2,0.500000,1.000000,0.000000,42.0
4,VeHWX4IB8Tq1gs32ty7z,1,0,8,2,0.000000,1.000000,0.000000,112.0
...,...,...,...,...,...,...,...,...,...
1830352,ajcJaIIB8Tq1gs32PCYb,1,0,8,4,0.250000,1.000000,0.000000,123.0
1830353,iHhAY4QBwWEOklUHgUTe,1,0,4,3,0.000000,1.000000,0.000000,29.0
1830354,-AQuoIIBGoM3uBpF_07P,1,1,1,1,0.000000,1.000000,0.000000,3.0
1830355,ITAUCoIBUgM-564P3GZN,1,0,1,7,0.000000,1.000000,0.000000,1.0


In [11]:
reached_40.to_csv('new_features_40.csv', index = 0)

In [11]:
# < 40 events
new_features_df[new_features_df['REACHED_40_EVENTS'] == 0]

Unnamed: 0,PROFILE_ID,REACHED_40_EVENTS,REACHED_80_EVENTS,EVENT_CYCLES_F40,DISTINCT_ARTICLES_F40,PERCENT_GOOGLE_ARTICLES_F40,PERCENT_ARTICLE_CONTENT_F40,AVERAGE_CONTENT_SCORE_F40,DAYS_TO_FORTY_EVENTS
625,fH_OhIMB28LFeUkXzGu-,0,0,1,1,1.000000,1.000000,0.0,
626,FKHrLoUBEEdskq5qUwkz,0,0,1,1,0.000000,1.000000,0.0,
627,dvsjboMB28LFeUkXf-66,0,0,1,1,1.000000,1.000000,0.0,
628,mE2Q6IQBRkWobPtwnd9G,0,0,1,1,0.000000,1.000000,0.0,
629,Y5PaCoQBu9aDgDX11FeC,0,0,1,1,1.000000,1.000000,0.0,
...,...,...,...,...,...,...,...,...,...
1889034,krf1E4ABoj1uvCUbkRm_,0,0,1,1,0.000000,1.000000,0.0,
1889035,MKK6NoUBEEdskq5qNOCK,0,0,1,1,1.000000,1.000000,0.0,
1889036,5yzmwYMBu9aDgDX1XOD-,0,0,1,0,0.000000,0.000000,0.0,
1889037,6sKrzn8Bale87oUgfoA8,0,0,1,1,0.000000,1.000000,0.0,


In [13]:
new_features_df.groupby(['REACHED_40_EVENTS', 'REACHED_80_EVENTS'])['PROFILE_ID'].count()

REACHED_40_EVENTS  REACHED_80_EVENTS
0                  0                    1868862
1                  0                      14393
                   1                       5784
Name: PROFILE_ID, dtype: int64

In [14]:
new_features_df.to_csv('new_features.csv', index = 0)

In [15]:
test_session.close()
print('session closed')

session closed


In [7]:
new_features_query_2 = """
-- NEW CONTENT SCORE
with content_aggs AS (
    SELECT
        COUNT(DISTINCT id) AS total_content
    FROM content

)
,new_content AS (
    SELECT
        *
        , (
            (COALESCE(pdf_click, 0) * 4 * (1 - 0.25))
            + (COALESCE(pageview, 0) * 1 * (1 - 0.25))
            + (COALESCE(post_read_start, 0) * 2 * (1 - 0.2))
            + (COALESCE(post_read_mid, 0) * 5 * (1 - 0.1))
            + (COALESCE(post_read_end, 0) * 6 * (1 - 0.05))
            + (COALESCE(scroll, 0) * 2 * (1 - 0.15))
        ) * (100 / POW(total_content, 0.8)) AS new_content_score
    FROM content
    JOIN content_aggs ON 1 = 1
)
-- USER LEVEL
, event_expanded AS (
    SELECT 
        e.id AS event_id
        , e.set_profile AS profile_id
        , LOWER(e.referer) AS referer
        , e.created
        -- ranking of event per visitor based on created time
        , ROW_NUMBER() OVER (PARTITION BY e.set_profile ORDER BY e.created) AS event_rank
        , p.user_id AS email
        , LOWER(c.type) AS content_type
        , c.id AS content_id
        , c.new_content_score AS content_score
    FROM event e
    LEFT JOIN new_content c ON CONCAT(e.source, '_', e.content_id) = c.id
    LEFT JOIN profile p ON e.set_profile = p.id
    WHERE DATE_TRUNC('year', e.day) >= DATE('2022-01-01') -- 2022 to Present
)
, events_with_diff AS (
    SELECT 
        *
        -- subtract created of next event in the list per each visitor
        , DATEDIFF('second', created, LEAD(created) OVER(PARTITION BY profile_id ORDER BY event_rank)) AS seconds_to_next_event
    FROM event_expanded
)
, idle_hours AS (
    SELECT 
        AVG(seconds_to_next_event) AS mean_time
        , 1/AVG(seconds_to_next_event) AS lam
        , (-1 * AVG(seconds_to_next_event) * LN(0.05)) / (60 * 60) AS avg_idle_hours
    FROM events_with_diff
)
, event_aggs AS (
    SELECT
        profile_id
        , COUNT(DISTINCT event_id) AS events
        , MAX(created) AS latest_event
    FROM events_with_diff
    GROUP BY 1
)
, event_aggs_first_{0} AS (
    SELECT
        e.profile_id
        , COUNT(DISTINCT CASE WHEN (e.seconds_to_next_event / (60*60)) >= i.avg_idle_hours THEN event_id END) + 1 AS cycles
        , COUNT(DISTINCT CASE WHEN content_type LIKE '%article%' THEN event_id END) AS article_events
        , COUNT(DISTINCT content_id) AS distinct_content
        , COUNT(DISTINCT CASE WHEN content_type LIKE '%article%' THEN content_id END) AS distinct_articles
        , COUNT(DISTINCT CASE WHEN content_type LIKE '%article%' AND referer like '%www.google.com%' THEN content_id END) AS distinct_articles_from_google
        , AVG(content_score) AS average_content_score
    FROM events_with_diff e
    JOIN idle_hours i ON 1 = 1
    WHERE e.event_rank <= {0} -- first {0} events only
    GROUP BY 1
)
, time_to_{0} AS (
    SELECT
        a.profile_id
        , DATEDIFF('day', a.created, b.created) AS days_to_{0}
    FROM event_expanded a
    LEFT JOIN event_expanded b ON a.profile_id = b.profile_id AND b.event_rank = 40
    WHERE a.event_rank = 1
)

SELECT
    ea.profile_id
    , CASE
        WHEN e.events >= {0} THEN 1
        ELSE 0
    END AS reached_{0}_events
    , CASE
        WHEN e.events >= {1} THEN 1
        ELSE 0
    END AS reached_{1}_events
    , CASE
        WHEN DATEDIFF('day', e.latest_event, CURRENT_TIMESTAMP()) <= 21 THEN 1
        ELSE 0
    END AS recent_last_event
    , ea.cycles AS event_cycles_f{0}
    , ea.distinct_articles AS distinct_articles_f{0}
    , CASE
        WHEN ea.distinct_articles > 0 THEN ea.distinct_articles_from_google / ea.distinct_articles
        ELSE 0
    END AS percent_google_articles_f{0}
    , CASE
        WHEN ea.distinct_content > 0 THEN ea.distinct_articles / ea.distinct_content
        ELSE 0
    END AS percent_article_content_f{0}
    , COALESCE(ea.average_content_score, 0) AS average_content_score_f{0}
    , ttf.days_to_{0} AS days_to_{0}_events
FROM event_aggs_first_{0} ea
JOIN event_aggs e ON ea.profile_id = e.profile_id
LEFT JOIN time_to_{0} ttf ON ea.profile_id = ttf.profile_id
""".format(40, 80)

print(new_features_query_2)

new_features_df_2 = query_snowpark(new_features_query_2)


-- NEW CONTENT SCORE
with content_aggs AS (
    SELECT
        COUNT(DISTINCT id) AS total_content
    FROM content

)
,new_content AS (
    SELECT
        *
        , (
            (COALESCE(pdf_click, 0) * 4 * (1 - 0.25))
            + (COALESCE(pageview, 0) * 1 * (1 - 0.25))
            + (COALESCE(post_read_start, 0) * 2 * (1 - 0.2))
            + (COALESCE(post_read_mid, 0) * 5 * (1 - 0.1))
            + (COALESCE(post_read_end, 0) * 6 * (1 - 0.05))
            + (COALESCE(scroll, 0) * 2 * (1 - 0.15))
        ) * (100 / POW(total_content, 0.8)) AS new_content_score
    FROM content
    JOIN content_aggs ON 1 = 1
)
-- USER LEVEL
, event_expanded AS (
    SELECT 
        e.id AS event_id
        , e.set_profile AS profile_id
        , LOWER(e.referer) AS referer
        , e.created
        -- ranking of event per visitor based on created time
        , ROW_NUMBER() OVER (PARTITION BY e.set_profile ORDER BY e.created) AS event_rank
        , p.user_id AS email
        , LOWER(c.type

In [9]:
new_features_df_2

Unnamed: 0,PROFILE_ID,REACHED_40_EVENTS,REACHED_80_EVENTS,RECENT_LAST_EVENT,EVENT_CYCLES_F40,DISTINCT_ARTICLES_F40,PERCENT_GOOGLE_ARTICLES_F40,PERCENT_ARTICLE_CONTENT_F40,AVERAGE_CONTENT_SCORE_F40,DAYS_TO_FORTY_EVENTS
0,xGzjdYIBGoM3uBpFUxC3,1,1,1,7,4,0.000000,0.333333,466.762777,98.0
1,ouWpFoYBRkWobPtwxxK8,1,0,1,2,6,0.000000,0.600000,187.361993,10.0
2,E2HS1oMB28LFeUkXfxkk,1,0,0,1,1,0.000000,1.000000,0.000000,0.0
3,8iILiIIBGoM3uBpFODuU,1,1,0,2,2,0.000000,0.666667,39.110081,22.0
4,WIX2P4QBEEdskq5qbdjI,1,0,0,4,5,0.800000,0.833333,0.000000,25.0
...,...,...,...,...,...,...,...,...,...,...
2234625,Jg4vH4QBwWEOklUHEhYq,0,0,0,1,1,0.000000,1.000000,0.000000,
2234626,VhQr34EBuqp2E9ofm_zS,0,0,0,1,1,0.000000,1.000000,0.000000,
2234627,2sjIaYUBEEdskq5qAiXu,0,0,0,1,1,0.000000,1.000000,0.000000,
2234628,S9znE4YBRkWobPtwMdnB,0,0,0,1,1,0.000000,1.000000,0.000000,


In [11]:
reached_40 = new_features_df_2[new_features_df_2['REACHED_40_EVENTS'] == 1]

In [24]:
reached_40_80 = reached_40[~((reached_40['REACHED_80_EVENTS'] == 0) & (reached_40['RECENT_LAST_EVENT'] == 1))]

In [25]:
reached_40_80.to_csv('reached_40_80.csv', index=0)

In [7]:
thresh_1 = 16
thresh_2 = 80

classification_query = """
WITH event_expanded AS (
    SELECT 
        e.id AS event_id
        , e.set_profile AS profile_id
        , LOWER(e.referer) AS referer
        , e.created
        -- ranking of event per visitor based on created time
        , ROW_NUMBER() OVER (PARTITION BY e.set_profile ORDER BY e.created) AS event_rank
        , p.user_id AS email
        , LOWER(c.type) AS content_type
        , c.id AS content_id
        , c.score AS content_score
    FROM event e
    LEFT JOIN content c ON CONCAT(e.source, '_', e.content_id) = c.id
    LEFT JOIN profile p ON e.set_profile = p.id
    WHERE DATE_TRUNC('year', e.day) >= DATE('2022-01-01') -- 2022 to Present
)
, events_with_diff AS (
    SELECT 
        *
        -- subtract created of next event in the list per each visitor
        , DATEDIFF('second', created, LEAD(created) OVER(PARTITION BY profile_id ORDER BY event_rank)) AS seconds_to_next_event
    FROM event_expanded
)
, idle_hours AS (
    SELECT 
        AVG(seconds_to_next_event) AS mean_time
        , 1/AVG(seconds_to_next_event) AS lam
        , (-1 * AVG(seconds_to_next_event) * LN(0.05)) / (60 * 60) AS avg_idle_hours
    FROM events_with_diff
)
, event_aggs AS (
    SELECT
        profile_id
        , COUNT(DISTINCT event_id) AS events
        , MAX(created) AS latest_event
    FROM events_with_diff
    GROUP BY 1
)
, event_aggs_first_{0} AS (
    SELECT
        e.profile_id
        , COUNT(DISTINCT CASE WHEN (e.seconds_to_next_event / (60*60)) >= i.avg_idle_hours THEN event_id END) + 1 AS cycles
        , COUNT(DISTINCT CASE WHEN content_type LIKE '%article%' THEN event_id END) AS article_events
        , COUNT(DISTINCT content_id) AS distinct_content
        , COUNT(DISTINCT CASE WHEN content_type LIKE '%article%' THEN content_id END) AS distinct_articles
        , COUNT(DISTINCT CASE WHEN content_type LIKE '%article%' AND referer like '%www.google.com%' THEN content_id END) AS distinct_articles_from_google
        , AVG(content_score) AS average_content_score
    FROM events_with_diff e
    JOIN idle_hours i ON 1 = 1
    WHERE e.event_rank <= {0} -- first {0} events only
    GROUP BY 1
)
, time_to_{0} AS (
    SELECT
        a.profile_id
        , DATEDIFF('day', a.created, b.created) AS days_to_{0}
    FROM event_expanded a
    LEFT JOIN event_expanded b ON a.profile_id = b.profile_id AND b.event_rank = {0}
    WHERE a.event_rank = 1
)

SELECT
    ea.profile_id
    , CASE
        WHEN e.events >= {0} THEN 1
        ELSE 0
    END AS reached_{0}_events
    , CASE
        WHEN e.events >= {1} THEN 1
        ELSE 0
    END AS reached_{1}_events
    , CASE
        WHEN DATEDIFF('day', e.latest_event, CURRENT_TIMESTAMP()) <= 21 THEN 1
        ELSE 0
    END AS recent_last_event
    , ea.cycles AS event_cycles_f{0}
    , ea.distinct_articles AS distinct_articles_f{0}
    , CASE
        WHEN ea.distinct_articles > 0 THEN ea.distinct_articles_from_google / ea.distinct_articles
        ELSE 0
    END AS percent_google_articles_f{0}
    , CASE
        WHEN ea.distinct_content > 0 THEN ea.distinct_articles / ea.distinct_content
        ELSE 0
    END AS percent_article_content_f{0}
    , COALESCE(ea.average_content_score, 0) AS average_content_score_f{0}
    , ttf.days_to_{0} AS days_to_{0}_events
FROM event_aggs_first_{0} ea
JOIN event_aggs e ON ea.profile_id = e.profile_id
LEFT JOIN time_to_{0} ttf ON ea.profile_id = ttf.profile_id
""".format(thresh_1, thresh_2)

classification_df = query_snowpark(classification_query)

querying...
query done


In [8]:
reached_first = classification_df[classification_df['REACHED_{}_EVENTS'.format(thresh_1)] == 1]
reached_first.to_csv('reached_{0}_first_{0}.csv'.format(thresh_1))
# reached_first_second = reached_first[~((reached_first['REACHED_{}_EVENTS'.format(thresh_2)] == 0) & (reached_first['RECENT_LAST_EVENT'] == 1))]
# reached_first_second.to_csv('reached_{0}_{1}.csv'.format(thresh_1, thresh_2), index=0)

In [11]:
reached_first.sort_values('PROFILE_ID')

Unnamed: 0,PROFILE_ID,REACHED_16_EVENTS,REACHED_80_EVENTS,RECENT_LAST_EVENT,EVENT_CYCLES_F16,DISTINCT_ARTICLES_F16,PERCENT_GOOGLE_ARTICLES_F16,PERCENT_ARTICLE_CONTENT_F16,AVERAGE_CONTENT_SCORE_F16,DAYS_TO_16_EVENTS
21,--0p74IB28LFeUkXSg6b,1,0,0,2,2,0.000000,1.000000,0.0,133.0
8,--6dYIIB8Tq1gs32cnxN,1,0,0,1,1,1.000000,1.000000,0.0,7.0
9,--7CGYYBRkWobPtw2Qjb,1,0,1,3,1,1.000000,1.000000,0.0,27.0
10,--7egIIBGoM3uBpFKICG,1,0,0,1,4,1.000000,1.000000,0.0,5.0
14,--7xtoIBGoM3uBpFf0FV,1,0,0,3,2,0.000000,1.000000,0.0,103.0
...,...,...,...,...,...,...,...,...,...,...
2246601,zzlKjIIBGoM3uBpFBagQ,1,0,1,3,3,0.666667,1.000000,0.0,133.0
2246725,zzpjDYQBEEdskq5qGMuW,1,0,0,2,1,0.000000,1.000000,0.0,17.0
2246315,zzrPDIQBEEdskq5qZRER,1,0,1,4,3,0.333333,1.000000,0.0,129.0
2246473,zzsQPYQBwWEOklUHCNVJ,1,0,0,1,0,0.000000,0.000000,0.0,2.0


In [14]:
clustering_query = """
WITH event_expanded AS (
    SELECT 
        e.id AS event_id
        , e.set_profile AS profile_id
        , LOWER(e.referer) AS referer
        , e.created
        -- ranking of event per visitor based on created time
        , ROW_NUMBER() OVER (PARTITION BY e.set_profile ORDER BY e.created) AS event_rank
        , p.user_id AS email
        , LOWER(c.type) AS content_type
        , c.id AS content_id
        , c.score AS content_score
    FROM event e
    LEFT JOIN content c ON CONCAT(e.source, '_', e.content_id) = c.id
    LEFT JOIN profile p ON e.set_profile = p.id
    WHERE DATE_TRUNC('year', e.day) >= DATE('2022-01-01') -- 2022 to Present
)
, events_with_diff AS (
    SELECT 
        *
        -- subtract created of next event in the list per each visitor
        , DATEDIFF('second', created, LEAD(created) OVER(PARTITION BY profile_id ORDER BY event_rank)) AS seconds_to_next_event
    FROM event_expanded
)
, idle_hours AS (
    SELECT 
        AVG(seconds_to_next_event) AS mean_time
        , 1/AVG(seconds_to_next_event) AS lam
        , (-1 * AVG(seconds_to_next_event) * LN(0.05)) / (60 * 60) AS avg_idle_hours
    FROM events_with_diff
)
, event_aggs AS (
    SELECT
        profile_id
        , COUNT(DISTINCT event_id) AS events
        , COUNT(DISTINCT CASE WHEN (seconds_to_next_event / (60*60)) >= i.avg_idle_hours THEN event_id END) + 1 AS cycles
        , COUNT(DISTINCT CASE WHEN content_type LIKE '%article%' THEN event_id END) AS article_events
        , COUNT(DISTINCT content_id) AS distinct_content
        , COUNT(DISTINCT CASE WHEN content_type LIKE '%article%' THEN content_id END) AS distinct_articles
        , COUNT(DISTINCT CASE WHEN content_type LIKE '%article%' AND referer like '%www.google.com%' THEN content_id END) AS distinct_articles_from_google
        , AVG(content_score) AS average_content_score
        , MAX(created) AS latest_event_time
        , MIN(created) AS first_event_time
    FROM events_with_diff
    JOIN idle_hours i ON 1 = 1
    GROUP BY 1
)
, time_to_{0} AS (
    SELECT
        a.profile_id
        , DATEDIFF('day', a.created, b.created) AS days_to_{0}
    FROM event_expanded a
    LEFT JOIN event_expanded b ON a.profile_id = b.profile_id AND b.event_rank = {0}
    WHERE a.event_rank = 1
)

SELECT
    ea.profile_id
    , CASE
        WHEN ea.events >= {0} THEN 1
        ELSE 0
    END AS reached_{0}_events
    , CASE
        WHEN DATEDIFF('day', ea.latest_event_time, CURRENT_TIMESTAMP()) <= 21 THEN 1
        ELSE 0
    END AS recent_last_event
    , ea.cycles AS event_cycles_all
    , ea.distinct_articles AS distinct_articles_all
    , CASE
        WHEN ea.distinct_articles > 0 THEN ea.distinct_articles_from_google / ea.distinct_articles
        ELSE 0
    END AS percent_google_articles_all
    , CASE
        WHEN ea.distinct_content > 0 THEN ea.distinct_articles / ea.distinct_content
        ELSE 0
    END AS percent_article_content_all
    , COALESCE(ea.average_content_score, 0) AS average_content_score_all
    , ttf.days_to_{0} AS days_to_{0}_events
    , ea.first_event_time
    , ea.latest_event_time
FROM event_aggs ea
LEFT JOIN time_to_{0} ttf ON ea.profile_id = ttf.profile_id
""".format(thresh_1)

clustering_df = query_snowpark(clustering_query)

querying...
query done


In [15]:
reached_first_all = clustering_df[clustering_df['REACHED_{}_EVENTS'.format(thresh_1)] == 1]
reached_first_all.to_csv('reached_{0}_all.csv'.format(thresh_1))

In [16]:
reached_first_all.sort_values('PROFILE_ID')

Unnamed: 0,PROFILE_ID,REACHED_16_EVENTS,RECENT_LAST_EVENT,EVENT_CYCLES_ALL,DISTINCT_ARTICLES_ALL,PERCENT_GOOGLE_ARTICLES_ALL,PERCENT_ARTICLE_CONTENT_ALL,AVERAGE_CONTENT_SCORE_ALL,DAYS_TO_16_EVENTS,FIRST_EVENT_TIME,LATEST_EVENT_TIME
24,--0p74IB28LFeUkXSg6b,1,0,2,2,0.000000,1.000000,0.000000,133.0,2022-08-30 14:30:38.171966,2023-01-10 16:43:55.720219
27,--6dYIIB8Tq1gs32cnxN,1,0,1,1,1.000000,1.000000,0.000000,7.0,2022-08-02 22:11:48.120981,2022-08-09 16:19:25.990118
1,--7CGYYBRkWobPtw2Qjb,1,1,4,1,1.000000,1.000000,0.000000,27.0,2023-02-04 00:10:45.911994,2023-03-17 20:28:24.300215
16,--7egIIBGoM3uBpFKICG,1,0,11,11,1.000000,0.916667,1303.650794,5.0,2022-08-09 04:30:20.132357,2023-01-04 17:26:37.806226
5,--7xtoIBGoM3uBpFf0FV,1,0,3,2,0.000000,1.000000,0.000000,103.0,2022-08-19 16:30:57.064488,2022-11-30 19:41:43.155668
...,...,...,...,...,...,...,...,...,...,...,...
2245720,zzlKjIIBGoM3uBpFBagQ,1,1,5,5,0.800000,1.000000,0.000000,133.0,2022-08-11 09:43:58.953482,2023-03-14 00:54:31.666582
2246346,zzpjDYQBEEdskq5qGMuW,1,0,5,1,0.000000,1.000000,0.000000,17.0,2022-10-25 04:25:09.614383,2023-02-11 00:25:21.186098
2245847,zzrPDIQBEEdskq5qZRER,1,1,4,3,0.333333,1.000000,0.000000,129.0,2022-10-25 01:43:50.150416,2023-03-03 07:59:42.533219
2246738,zzsQPYQBwWEOklUHCNVJ,1,0,1,0,0.000000,0.000000,0.000000,2.0,2022-11-03 10:36:12.009305,2022-11-08 11:59:11.293372
