In [1]:
%load_ext edx_jupyter

The edx_jupyter extension is already loaded. To reload it, use:
  %reload_ext edx_jupyter


In [2]:
%%vertica
SELECT 'Connected to Vertica!'

OPS_WAREHOUSE_PASSWORD: ········


Unnamed: 0,?column?
0,Connected to Vertica!


In [3]:
import datetime

This is an arbitrary date in the past (must be at least 14 days prior to now). It represents the simulated start of our experiment.

In [4]:
intervention_date = datetime.date(2017, 5, 1)
end_date = intervention_date + datetime.timedelta(days=7)
earliest_content_availability_date = intervention_date - datetime.timedelta(days=14)

In [5]:
date_to_string_format = '%Y-%m-%d'
intervention_date_str = intervention_date.strftime(date_to_string_format)
end_date_str = end_date.strftime(date_to_string_format)

## Identify the courses to analyze

In [6]:
%%vertica (earliest_content_availability_date=earliest_content_availability_date, intervention_date=intervention_date, end_date=end_date)
DROP TABLE IF EXISTS ex_courses;

CREATE local TEMPORARY TABLE ex_courses
ON COMMIT PRESERVE ROWS
AS SELECT
    *
FROM
    business_intelligence.course_master
WHERE
        (content_language is NULL OR content_language = 'en-us')
    AND course_start_date <= :earliest_content_availability_date

In [7]:
%%vertica
SELECT COUNT(*) FROM ex_courses

Unnamed: 0,COUNT
0,2579


## Compute the content availability date for each enrollment

In [8]:
%%vertica
DROP TABLE IF EXISTS user_content_availability;

CREATE local TEMPORARY TABLE user_content_availability
ON COMMIT PRESERVE ROWS AS
SELECT
    CASE
        WHEN ec.course_start_date >= uc.first_enrollment_time THEN ec.course_start_date
        WHEN ec.course_start_date < uc.first_enrollment_time THEN DATE(uc.first_enrollment_time)
    END AS content_availability_date,
    uc.user_id,
    uc.course_id,
    uc.first_verified_enrollment_time
FROM
    production.d_user_course uc
JOIN
    ex_courses ec
ON
    ec.course_id = uc.course_id

## Users who actually visited the Course Outline page

In [10]:
%%vertica (intervention_date_str=intervention_date_str, end_date_str=end_date_str)
DROP TABLE IF EXISTS ex_events_with_course_id;

CREATE local TEMPORARY TABLE ex_events_with_course_id
ON COMMIT PRESERVE ROWS AS
SELECT
    user_id,
    LEFT(RIGHT(url, LENGTH(url) - 32), LENGTH(RIGHT(url, LENGTH(url) - 32)) - 8) AS computed_course_id,
    date,
    event_source,
    event_type
FROM
    experimental_events_run14.event_records
WHERE
    event_source = 'client' AND
    event_type = 'page' AND
    url LIKE 'https://courses.edx.org/courses/%/course/' AND
    date BETWEEN :intervention_date_str AND :end_date_str AND
    user_id is not NULL AND
    --- Filter out the 190,809 rows in the table with non-numeric user_ids (out of 987,541,397 rows)
    REGEXP_LIKE(user_id, '^[0-9]+')

In [11]:
%%vertica
SELECT * FROM ex_events_with_course_id LIMIT 10

Unnamed: 0,user_id,computed_course_id,date,event_source,event_type
0,5777288,course-v1:UBCx+ITSx+1T2017,2017-05-02,client,page
1,14320565,course-v1:RITx+CYBER501x+1T2017,2017-05-02,client,page
2,11961209,course-v1:DelftX+BMI.3x+1T2017,2017-05-02,client,page
3,13723560,course-v1:MITx+7.28.1x+1T2017,2017-05-02,client,page
4,10151328,course-v1:IDBx+IDB6x+1T2017,2017-05-02,client,page
5,14032987,course-v1:DavidsonX+D001x+1T2017,2017-05-02,client,page
6,10106784,course-v1:DavidsonX+D001x+1T2016,2017-05-02,client,page
7,14032987,course-v1:DavidsonX+D001x+1T2017,2017-05-02,client,page
8,5421858,course-v1:BerkeleyX+BJC.34x+1T2017,2017-05-02,client,page
9,3519339,course-v1:HarvardX+CS50+AP,2017-05-02,client,page


In [12]:
%%vertica (intervention_date=intervention_date, end_date=end_date, earliest_content_availability_date=earliest_content_availability_date)
DROP TABLE IF EXISTS ex_enrollments;

CREATE local TEMPORARY TABLE ex_enrollments
ON COMMIT PRESERVE ROWS AS
SELECT
    e.user_id,
    ca.course_id,
    ca.content_availability_date,
    ca.first_verified_enrollment_time
FROM
    ex_events_with_course_id e
JOIN
    user_content_availability ca
ON
    ca.course_id = e.computed_course_id
    AND e.user_id = ca.user_id
    AND ca.content_availability_date BETWEEN :earliest_content_availability_date AND :end_date
GROUP BY 1, 2, 3, 4

In [13]:
%%vertica
select * from ex_enrollments limit 10

Unnamed: 0,user_id,course_id,content_availability_date,first_verified_enrollment_time
0,1000,course-v1:PennX+ROBO1x+1T2017,2017-04-17,NaT
1,10000403,course-v1:MITx+6.041x_4+1T2017,2017-04-29,NaT
2,10000416,course-v1:UQx+IELTSx+3T2016,2017-04-25,NaT
3,10000556,course-v1:DelftX+EX102+1T2017,2017-04-25,NaT
4,10000556,course-v1:Microsoft+DAT205x+3T2016,2017-04-25,2017-04-26 08:31:43.904746
5,10000556,course-v1:Microsoft+DAT206x+2T2017,2017-04-25,NaT
6,1000069,course-v1:DartmouthX+DART.MUS.02X+2T2017,2017-04-20,NaT
7,10001185,course-v1:HarvardX+SPU27x+1T2017,2017-04-22,NaT
8,10001432,course-v1:Microsoft+DAT206x+2T2017,2017-05-02,NaT
9,10001448,course-v1:IITBombayX+CS101.1x+1T2017,2017-05-08,NaT


## Required sample size by VTR

In [15]:
%%vertica -> naa_test
SELECT COUNT(*) FROM ex_enrollments

In [19]:
naa_test.COUNT[0]

Unnamed: 0,COUNT
0,65956


Count of rows in ex_enrollments is: **65,995**.

In [15]:
%%vertica (end_date=end_date, intervention_date=intervention_date) -> results

SELECT
    SUM(value) AS 'bookings',
    AVG(value) AS 'bookings_per_enrollment',
    SUM(did_upgrade)/COUNT(*) AS 'conversion_rate',
    COUNT(*) AS 'cnt_enrollments'
FROM
(
    SELECT
        CASE
            WHEN enr.first_verified_enrollment_time BETWEEN :intervention_date AND :end_date THEN crs.course_seat_price
            ELSE 0
        END AS value,
        CASE
            WHEN enr.first_verified_enrollment_time BETWEEN :intervention_date AND :end_date THEN 1
            ELSE 0
        END AS did_upgrade
    FROM
        ex_enrollments AS enr
    JOIN
            ex_courses AS crs
        ON  crs.course_id = enr.course_id
) AS enr

In [16]:
baseline_conversion_rate = float(results.loc[0].conversion_rate)

In [17]:
print('Baseline conversion rate: {0}%'.format(round(baseline_conversion_rate*100, 2)))

Baseline conversion rate: 1.35%


In [36]:
relative_increase_in_rate = 0.1
power = 0.8
significance_level = 0.1

In [37]:
! sudo pip install statsmodels

[33mThe directory '/home/developer/.cache/pip/http' or its parent directory is not owned by the current user and the cache has been disabled. Please check the permissions and owner of that directory. If executing pip with sudo, you may want sudo's -H flag.[0m
[33mThe directory '/home/developer/.cache/pip' or its parent directory is not owned by the current user and caching wheels has been disabled. check the permissions and owner of that directory. If executing pip with sudo, you may want sudo's -H flag.[0m


In [38]:
import statsmodels.stats.api as sms

In [39]:
es = sms.proportion_effectsize(baseline_conversion_rate, baseline_conversion_rate * (1 + relative_increase_in_rate))

In [40]:
round(sms.NormalIndPower().solve_power(es, power=power, alpha=significance_level, ratio=1))

94961

Given these conversion rates, it looks like we will need about **94,961** users in each variant.

# Required sample size by engagement

In [18]:
%%vertica (end_date=end_date, intervention_date=intervention_date) -> results

SELECT
    SUM(engaged_in_intervention_week)/COUNT(*) AS 'engaged_in_intervention_week_rate',
    SUM(engaged_in_first_week)/COUNT(*) AS 'engaged_in_first_week_rate',
    SUM(engaged_in_second_week)/COUNT(*) AS 'engaged_in_second_week_rate',
    COUNT(*) AS 'cnt_enrollments'
FROM
(
    SELECT
        bi_engage.user_id,
        bi_engage.course_id,
        CASE
            WHEN DATEDIFF('day', :intervention_date, bi_engage.date) BETWEEN 0 AND 6 THEN 1
            ELSE 0
        END AS engaged_in_intervention_week,
        CASE
            WHEN DATEDIFF('day', :end_date, bi_engage.date) BETWEEN 0 AND 6 THEN 1
            ELSE 0
        END AS engaged_in_first_week,
        CASE
            WHEN DATEDIFF('day', :end_date, bi_engage.date) BETWEEN 7 AND 13 THEN 1
            ELSE 0
        END AS engaged_in_second_week
    FROM
        ex_enrollments AS enr
    JOIN
        business_intelligence.activity_engagement_user_daily bi_engage
    ON
        enr.course_id = bi_engage.course_id AND
        enr.user_id = bi_engage.user_id
) AS enr

In [19]:
results

Unnamed: 0,engaged_in_intervention_week_rate,engaged_in_first_week_rate,engaged_in_second_week_rate,cnt_enrollments
0,0.3408458474540306,0.1966146701667808,0.1051822706687602,329894


In [20]:
relative_increase_in_rate = 0.1
power = 0.8
significance_level = 0.1

In [21]:
! sudo pip install statsmodels

[33mThe directory '/home/developer/.cache/pip/http' or its parent directory is not owned by the current user and the cache has been disabled. Please check the permissions and owner of that directory. If executing pip with sudo, you may want sudo's -H flag.[0m
[33mThe directory '/home/developer/.cache/pip' or its parent directory is not owned by the current user and caching wheels has been disabled. check the permissions and owner of that directory. If executing pip with sudo, you may want sudo's -H flag.[0m


In [22]:
import statsmodels.stats.api as sms

In [23]:
baseline_exp_week_rate, baseline_1st_week_rate, baseline_2nd_week_rate = (
    float(results.loc[0].engaged_in_intervention_week_rate),
    float(results.loc[0].engaged_in_first_week_rate),
    float(results.loc[0].engaged_in_second_week_rate)
)   

In [24]:
print('Baseline engagement rates: experiment week: {}%, 1st week: {}%, 2nd week: {}%'.format(
    round(baseline_exp_week_rate*100, 2),
    round(baseline_1st_week_rate*100, 2),
    round(baseline_2nd_week_rate*100, 2)
))

Baseline engagement rates: experiment week: 34.08%, 1st week: 19.66%, 2nd week: 10.52%


In [25]:
sample_sizes = []
for rate in (baseline_exp_week_rate, baseline_1st_week_rate, baseline_2nd_week_rate):
    es = sms.proportion_effectsize(rate, rate * (1 + relative_increase_in_rate))
    size = round(sms.NormalIndPower().solve_power(es, power=power, alpha=significance_level, ratio=1))
    sample_sizes.append(size)

In [35]:
print(
    'It looks like we will need a sample size of either \n'
    '\t{} (experiment week), \n'
    '\t{} (1st week after experiment ends), or \n'
    '\t{} (2nd week after experiment ends) \n'
    'depending on which week we want to track.'
    .format(sample_sizes[0], sample_sizes[1], sample_sizes[2])
)

It looks like we will need a sample size of either 
	2444 (experiment week), 
	5237 (1st week after experiment ends), or 
	10974 (2nd week after experiment ends) 
depending on which week we want to track.


Given these conversion rates, it looks like we will need about (**2,444** or) **5,237** (or **10,974**) users in each variant.