In [1]:
## trello card: https://trello.com/c/W0EUPHdT/1482-data-request-grab-the-mic

%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd

import utils as u

# Default size for every figure drawn in this notebook, as (width, height).
plt.rcParams['figure.figsize'] = (12, 9)

  """)


### Load data:

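1) Member event log data for calendar year 2018 (2018-01-01 up to, but not including, 2019-01-01)  
2) Users who signed up for GTM (campaign `8017`) before 2019-01-01, with a flag indicating whether GTM was their first campaign signup (i.e. whether they were new members)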

Note: `load_connection` (in `utils`) must be supplied with the correct Postgres db/user parameters before the queries below will run.

In [2]:
# Every member event logged in calendar year 2018
# (half-open window: 2018-01-01 inclusive, 2019-01-01 exclusive).
sql = '''
SELECT *
FROM member_event_log
WHERE "timestamp" >= '2018-01-01'
AND "timestamp" < '2019-01-01'
'''

# NOTE(review): u.load_connection() is a project helper; presumably it returns
# a Postgres connection/engine usable by pandas -- confirm in utils.
df = pd.read_sql(sql=sql, con=u.load_connection())

In [3]:
# One row per member who signed up for campaign '8017' (presumably Grab the
# Mic, per the card linked at the top) before 2019-01-01.
#
# The LEFT JOIN attaches a flag to members whose *first* signup, ordered by
# created_at, was campaign '8017': new_GTM_member is 1 for them and NULL for
# everyone else. The unquoted alias comes back from the database lower-cased,
# which is why later cells read the column as 'new_gtm_member'.
GTM_members = '''
SELECT DISTINCT s.northstar_id, gtm_first_users.new_GTM_member
FROM signups s
LEFT JOIN (
    SELECT f.northstar_id,
       CASE WHEN f.first_campaign = '8017' THEN 1 ELSE NULL END as new_GTM_member
    FROM (
        SELECT 
            northstar_id, 
            first_value(campaign_id) OVER (PARTITION BY northstar_id ORDER BY created_at) AS first_campaign
        FROM signups
        WHERE created_at < '2019-01-01'
    ) f
    WHERE f.first_campaign = '8017'
) gtm_first_users
ON gtm_first_users.northstar_id = s.northstar_id
WHERE s.campaign_id = '8017'
AND s.created_at < '2019-01-01'
'''

df_gtm_users = pd.read_sql(sql=GTM_members, con=u.load_connection())

In [4]:
# (rows in the GTM user table, rows flagged as GTM-first signups)
n_gtm_users = len(df_gtm_users)
n_new_gtm_users = int(df_gtm_users['new_gtm_member'].notna().sum())
(n_gtm_users, n_new_gtm_users)

(379654, 161982)

In [5]:
# Tag every event row that belongs to a GTM participant with 1;
# rows for everyone else are left as NaN in the new column.
gtm_participant_ids = df_gtm_users['northstar_id'].unique()
is_gtm_participant = df['northstar_id'].isin(gtm_participant_ids)
df.loc[is_gtm_participant, 'GTM_participant'] = 1

In [6]:
# Tag every event row whose member has a non-null new_gtm_member flag with 1;
# all other rows are left as NaN in the new column.
has_new_member_flag = df_gtm_users['new_gtm_member'].notnull()
gtm_first_signup_ids = df_gtm_users.loc[has_new_member_flag, 'northstar_id'].unique()
df.loc[df['northstar_id'].isin(gtm_first_signup_ids), 'GTM_first_signup'] = 1

In [7]:
# Split the event log into three cohorts:
#   returning -> GTM participant without the first-signup flag
#   new       -> GTM participant with the first-signup flag
#   nonGTM    -> everyone else (participant flag is NaN, so the comparison is False)
is_participant = df['GTM_participant'].eq(1)
is_first_signup = df['GTM_first_signup'].eq(1)
lacks_first_signup = df['GTM_first_signup'].isnull()

df_GTM_members_returning = df[is_participant & lacks_first_signup]
df_GTM_members_new = df[is_participant & is_first_signup]
df_nonGTM_members = df[~is_participant]

In [8]:
# Number of event rows (not members) in each cohort: (returning, new, non-GTM).
tuple(len(cohort) for cohort in (df_GTM_members_returning, df_GTM_members_new, df_nonGTM_members))

(1819624, 390194, 5079855)

### How many actions per user per month, on average?

In [9]:
def _monthly_event_counts(events):
    """Count event-log rows per member per calendar month.

    Indexes ``events`` by its 'timestamp' column, then groups by
    'northstar_id' and by month (``pd.Grouper(freq='M')``) and returns the
    size of each group as a Series indexed by (northstar_id, month).

    NOTE(review): member/month pairs with no events are expected to be absent
    rather than counted as 0, so downstream per-member averages would cover
    active months only -- confirm.
    """
    by_member_and_month = ['northstar_id', pd.Grouper(freq='M')]
    return events.set_index('timestamp').groupby(by_member_and_month).size()


returning_monthly_num_active = _monthly_event_counts(df_GTM_members_returning)
new_monthly_num_active = _monthly_event_counts(df_GTM_members_new)
nonGTM_monthly_num_active = _monthly_event_counts(df_nonGTM_members)

In [10]:
# Average monthly action count per returning GTM member,
# summarised across members as (median, mean).
returning_avg_monthly_actions = returning_monthly_num_active.groupby(level='northstar_id').mean()
(returning_avg_monthly_actions.median(), returning_avg_monthly_actions.mean())

(2.6666666666666665, 3.674246928854904)

In [11]:
# Average monthly action count per new GTM member,
# summarised across members as (median, mean).
new_avg_monthly_actions = new_monthly_num_active.groupby(level='northstar_id').mean()
(new_avg_monthly_actions.median(), new_avg_monthly_actions.mean())

(1.0, 2.553407670112482)

In [12]:
# Average monthly action count per non-GTM member,
# summarised across members as (median, mean).
nonGTM_avg_monthly_actions = nonGTM_monthly_num_active.groupby(level='northstar_id').mean()
(nonGTM_avg_monthly_actions.median(), nonGTM_avg_monthly_actions.mean())

(2.0, 3.2346665208158356)

### What actions are these groups doing?

In [13]:
# Share of each action type among returning GTM members' events.
returning_action_types = df_GTM_members_returning['action_type']
returning_action_types.value_counts(normalize=True)

messaged_gambit          0.294773
signup                   0.257759
site_access              0.098863
bertly_link_click        0.083846
site_login               0.066855
post                     0.063797
bertly_link_uncertain    0.055973
clicked_link             0.048605
account_creation         0.029529
Name: action_type, dtype: float64

In [14]:
# Share of each action type among new GTM members' events.
new_action_types = df_GTM_members_new['action_type']
new_action_types.value_counts(normalize=True)

post                     0.215557
signup                   0.179418
site_access              0.168022
site_login               0.144069
account_creation         0.112977
clicked_link             0.095998
messaged_gambit          0.054547
bertly_link_click        0.016528
bertly_link_uncertain    0.012883
Name: action_type, dtype: float64

In [15]:
# Share of each action type among non-GTM members' events.
nonGTM_action_types = df_nonGTM_members['action_type']
nonGTM_action_types.value_counts(normalize=True)

messaged_gambit          0.267357
signup                   0.151917
clicked_link             0.127766
bertly_link_click        0.112758
site_access              0.096506
site_login               0.079741
account_creation         0.074857
bertly_link_uncertain    0.060214
post                     0.028885
Name: action_type, dtype: float64

### How many signups per user?

In [16]:
# Number of 'signup' events per returning GTM member
# (members with no signup events in the log do not appear).
returning_signup_events = df_GTM_members_returning.loc[
    df_GTM_members_returning['action_type'].eq('signup')]
returning_num_signups = returning_signup_events.groupby('northstar_id').size()

In [17]:
# Number of 'signup' events per new GTM member
# (members with no signup events in the log do not appear).
new_signup_events = df_GTM_members_new.loc[
    df_GTM_members_new['action_type'].eq('signup')]
new_num_signups = new_signup_events.groupby('northstar_id').size()

In [18]:
# Number of 'signup' events per non-GTM member
# (members with no signup events in the log do not appear).
nonGTM_signup_events = df_nonGTM_members.loc[
    df_nonGTM_members['action_type'].eq('signup')]
nonGTM_num_signups = nonGTM_signup_events.groupby('northstar_id').size()

In [19]:
# Distribution of signups per returning GTM member (members with >= 1 signup).
returning_signup_summary = returning_num_signups.describe()
returning_signup_summary

count    198003.000000
mean          2.368772
std           1.785707
min           1.000000
25%           1.000000
50%           2.000000
75%           3.000000
max         107.000000
dtype: float64

In [20]:
# Distribution of signups per new GTM member (members with >= 1 signup).
new_signup_summary = new_num_signups.describe()
new_signup_summary

count    48959.000000
mean         1.429931
std          1.009143
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max         64.000000
dtype: float64

In [21]:
# Distribution of signups per non-GTM member (members with >= 1 signup).
nonGTM_signup_summary = nonGTM_num_signups.describe()
nonGTM_signup_summary

count    579030.000000
mean          1.332770
std           0.802298
min           1.000000
25%           1.000000
50%           1.000000
75%           1.000000
max          97.000000
dtype: float64

### How many posts (RBs) per user?

In [22]:
# Number of 'post' events per returning GTM member
# (members with no post events in the log do not appear).
returning_post_events = df_GTM_members_returning.loc[
    df_GTM_members_returning['action_type'].eq('post')]
returning_num_posts = returning_post_events.groupby('northstar_id').size()

In [23]:
# Number of 'post' events per new GTM member
# (members with no post events in the log do not appear).
new_post_events = df_GTM_members_new.loc[
    df_GTM_members_new['action_type'].eq('post')]
new_num_posts = new_post_events.groupby('northstar_id').size()

In [24]:
# Number of 'post' events per non-GTM member
# (members with no post events in the log do not appear).
nonGTM_post_events = df_nonGTM_members.loc[
    df_nonGTM_members['action_type'].eq('post')]
nonGTM_num_posts = nonGTM_post_events.groupby('northstar_id').size()

In [25]:
# Distribution of posts per returning GTM member (members with >= 1 post).
returning_post_summary = returning_num_posts.describe()
returning_post_summary

count    76527.000000
mean         1.516929
std          2.798618
min          1.000000
25%          1.000000
50%          1.000000
75%          1.000000
max        326.000000
dtype: float64

In [26]:
# Distribution of posts per new GTM member (members with >= 1 post).
new_post_summary = new_num_posts.describe()
new_post_summary

count    76000.000000
mean         1.106697
std          0.731350
min          1.000000
25%          1.000000
50%          1.000000
75%          1.000000
max         59.000000
dtype: float64

In [27]:
# Distribution of posts per non-GTM member (members with >= 1 post).
nonGTM_post_summary = nonGTM_num_posts.describe()
nonGTM_post_summary

count    99596.000000
mean         1.473252
std          8.817974
min          1.000000
25%          1.000000
50%          1.000000
75%          1.000000
max       1506.000000
dtype: float64

### What channel is most popular for each group?

In [28]:
# Event-row count per channel for returning GTM members.
returning_events_by_channel = df_GTM_members_returning.groupby(by='channel')
returning_channel = returning_events_by_channel.size()

In [29]:
# Convert per-channel counts into shares of all returning-member events.
returning_channel.div(returning_channel.sum())

channel
email                   0.048799
niche_coregistration    0.006997
other                   0.035892
sms                     0.590042
web                     0.318270
dtype: float64

In [30]:
# Event-row count per channel for new GTM members.
new_events_by_channel = df_GTM_members_new.groupby(by='channel')
new_channel = new_events_by_channel.size()

In [31]:
# Convert per-channel counts into shares of all new-member events.
new_channel.div(new_channel.sum())

channel
email                   0.096241
niche_coregistration    0.002405
other                   0.190105
sms                     0.104082
web                     0.607167
dtype: float64

In [32]:
# Event-row count per channel for non-GTM members.
nonGTM_events_by_channel = df_nonGTM_members.groupby(by='channel')
nonGTM_channel = nonGTM_events_by_channel.size()

In [33]:
# Convert per-channel counts into shares of all non-GTM-member events.
nonGTM_channel.div(nonGTM_channel.sum())

channel
email                   0.128415
niche_coregistration    0.039300
other                   0.004983
sms                     0.497897
web                     0.329405
dtype: float64