# Setup

In [1]:
%%vertica
SELECT 'Connected to Vertica!' AS ''
-- Smoke test that returns one constant row through the vertica cell magic.

Unnamed: 0,Unnamed: 1
0,Connected to Vertica!


In [2]:
# Set the Vertica session time zone to UTC.
%vertica SET TIME ZONE to UTC;

In [3]:
%%query [lms]
SELECT 'Connected to the LMS read replica!' AS ''
-- Smoke test that returns one constant row through the lms connection of the query magic.

Unnamed: 0,Unnamed: 1
0,Connected to the LMS read replica!


In [4]:
import datetime

# Configuration

## Enrollment Date Config

In [5]:
from dateutil.tz import tzutc

# Start of the enrollment window (timezone-aware, UTC). Edit this value for each run.
enrollment_start = datetime.datetime(
    year=2017, month=8, day=2, hour=11, minute=0, tzinfo=tzutc())
# The window is exactly one week long.
enrollment_end = enrollment_start + datetime.timedelta(weeks=1)

# Check that the window starts on a Wednesday (ISO weekday 3); currently disabled.
# assert(enrollment_start.isoweekday() == 3)

## Segmentation Config

In [6]:
# Number of user segments: every treatment group plus one control group.
# Segment ids are drawn from 0 .. num_user_segments - 1 when users are assigned below:
#   0 = Control
#   1 = Hard Deadlines
#   2 = Soft Deadlines
num_user_segments = 3

## Experiment Identifier

In [7]:
# Experiment id: used to filter experiments_experimentdata rows in the LMS queries
# and written into every record built for the experiments API.
experiment_id = 8

# Segmented Enrollments

## Load segmentation data from the LMS

In [8]:
%%query [lms] (experiment_id) -> already_segmented
SELECT
    -- key and value are decoded in the following cell (course id and a JSON payload)
    exp.key,
    exp.value,
    exp.user_id
FROM
    experiments_experimentdata AS exp
WHERE
    -- keep only rows that belong to the configured experiment
    experiment_id = :experiment_id

In [9]:
import json
import ciso8601
import pandas as pd

# Decode the raw experiment rows into analysis-friendly columns, in place.
#
# The guard makes the cell safe to re-run: once `key` and `value` have been
# consumed and removed, executing it again is a no-op instead of a KeyError.
if {'key', 'value'}.issubset(already_segmented.columns):
    # Parse every JSON payload exactly once and reuse it for all derived columns
    # (the payload used to be re-parsed for each of the three columns).
    payloads = already_segmented['value'].apply(json.loads)

    # The key is the course id behind a fixed 'cohort_availability_date.' marker.
    already_segmented['course_id'] = already_segmented['key'].apply(
        lambda key: key.replace('cohort_availability_date.', ''))
    already_segmented['segment'] = payloads.apply(
        lambda payload: payload['segment'])
    # Keep only the calendar date of the stored cohort timestamp.
    already_segmented['cohort_availability'] = payloads.apply(
        lambda payload: ciso8601.parse_datetime(payload['cohort_availability']).date())
    # Payloads without a stored reason are treated as 'visit'.
    already_segmented['segmentation_reason'] = payloads.apply(
        lambda payload: payload.get('segmentation_reason', 'visit'))
    already_segmented['user_id'] = already_segmented['user_id'].astype(int)

    # The raw columns are no longer needed once decoded.
    del already_segmented['key']
    del already_segmented['value']

In [10]:
# Number of (user, course) rows already segmented for this experiment.
already_segmented.shape[0]

99907

In [11]:
# Rows per cohort availability date.
already_segmented.groupby('cohort_availability')['user_id'].count()

cohort_availability
2017-07-26      267
2017-08-02    98741
2017-08-09      899
Name: user_id, dtype: int64

## Load any newly content-available users from the enrollments table

In [12]:
%%query [lms] (enrollment_start, enrollment_end, experiment_id) -> enrollments
SELECT
    sce.user_id,
    sce.course_id,
    sce.created AS content_availability,
    au.username
FROM
    student_courseenrollment sce
JOIN auth_user au ON au.id = sce.user_id
-- only learners whose pref-lang preference is en
JOIN user_api_userpreference up ON up.`key`='pref-lang' AND up.value='en' AND up.user_id = sce.user_id
-- only courses that have a verified mode
JOIN course_modes_coursemode cm ON cm.course_id = sce.course_id AND cm.mode_slug = 'verified'
JOIN course_overviews_courseoverview co ON co.id = sce.course_id
-- anti-join (completed by exp.user_id IS NULL below) that drops users with any row for this experiment
-- NOTE(review) the match is on user and experiment only, not on course, confirm that is intended
LEFT JOIN experiments_experimentdata AS exp ON exp.user_id = sce.user_id AND exp.experiment_id = :experiment_id
WHERE
        -- NOTE(review) the upper bound is cast to DATE, which presumably drops its time of day, confirm
        sce.created BETWEEN :enrollment_start AND CAST(:enrollment_end AS DATE)
    AND exp.user_id IS NULL
    -- NOTE(review) hard-coded cutoff date, confirm it is updated together with the enrollment window
    AND cm.expiration_datetime > '2017-08-23'
    AND co.self_paced
    AND co.start <= :enrollment_start

In [13]:
# Number of newly enrolled, not yet segmented rows returned by the query.
enrollments.shape[0]

20

In [14]:
import pandas as pd
# Convert the enrollment creation time into pandas datetimes so `.date()` can be used on it.
created_at = enrollments['content_availability']
enrollments['content_availability'] = pd.to_datetime(created_at)

In [15]:
def _floor_to_wednesday(ts):
    """Return the calendar date of the most recent Wednesday on or before ``ts``."""
    day = ts.date()
    # ISO weekday 3 is Wednesday, so (weekday + 4) % 7 is the number of days since it.
    days_since_wednesday = (day.isoweekday() + 4) % 7
    return day - datetime.timedelta(days_since_wednesday)


enrollments['cohort_availability'] = enrollments.content_availability.apply(_floor_to_wednesday)

In [16]:
# Keep only the enrollments that fall into the cohort of the configured window.
# The cohort date is derived from `enrollment_start` (floored to its Wednesday with
# the same rule used for `cohort_availability`) instead of being repeated as a
# literal, so changing the window in the config cell cannot leave a stale date here.
window_start_day = enrollment_start.date()
cohort_date = window_start_day - datetime.timedelta((window_start_day.isoweekday() + 4) % 7)

unsegmented_users = enrollments.loc[
    enrollments.cohort_availability == cohort_date,
    ['user_id', 'course_id', 'cohort_availability', 'username'],
].copy()

In [17]:
# Mark every row of this batch as segmented because of an enrollment
# (rows loaded from the LMS fall back to 'visit' when no reason was stored).
unsegmented_users['segmentation_reason'] = 'enrollment'

In [18]:
import numpy as np
# Give each user a segment id drawn at random from 0 .. num_user_segments - 1.
# No seed is set, so the assignment differs on every execution.
unsegmented_users['segment'] = np.random.randint(
    0, num_user_segments, size=unsegmented_users.shape[0])

In [19]:
# The twenty courses with the most newly segmented users, largest first.
(unsegmented_users
    .groupby('course_id')['user_id']
    .count()
    .sort_values(ascending=False)
    .head(20))

course_id
course-v1:UQx+IELTSx+2T2017              2
course-v1:W3Cx+HTML5.0x+1T2017           1
course-v1:IITBombayX+CS101.1x+1T2017     1
course-v1:DelftX+DDA691x+1T2017          1
course-v1:EPFLx+FndBioImg2x+1T2017       1
course-v1:ETHx+ETHx-FC-03x+2T2017        1
course-v1:ETHx+FC-01x+2016_T2            1
course-v1:HarvardX+GSD1x+1T2017          1
course-v1:HarvardX+GSE2x+2T2017          1
course-v1:HarvardX+SW47x+2T2017          1
course-v1:MichiganX+UX501x+3T2016        1
course-v1:Microsoft+DAT203.1x+3T2017     1
course-v1:Microsoft+DAT203.2x+3T2017     1
course-v1:NotreDameX+DS101x+1T2017       1
course-v1:PennX+ROBO1x+1T2017            1
course-v1:PennX+SD1x+2T2017              1
course-v1:UPValenciaX+BSP101x+1T2017     1
course-v1:UPValenciaX+UNY201.x+2T2017    1
course-v1:CornellX+ENGR2000X+1T2017      1
Name: user_id, dtype: int64

In [20]:
# Users per segment (at most the first twenty segments).
unsegmented_users.groupby('segment')['user_id'].count().head(20)

segment
0    10
1     1
2     9
Name: user_id, dtype: int64

In [21]:
# Users per cohort availability date.
unsegmented_users.groupby('cohort_availability')['user_id'].count()

cohort_availability
2017-08-02    20
Name: user_id, dtype: int64

# LMS

In [22]:
import edx_secret

# Which LMS deployment this notebook talks to: 'stage' or 'prod'.
environment = 'prod'

# Base URL of each known LMS deployment.
LMS_BASE_URLS = {
    'stage': 'https://courses.stage.edx.org',
    'prod': 'https://courses.edx.org',
}


def resolve_server_base_url(environment):
    """Return the LMS base URL for ``environment``.

    Args:
        environment: deployment name, one of the keys of ``LMS_BASE_URLS``.

    Returns:
        The base URL string for that deployment.

    Raises:
        ValueError: if ``environment`` is not a known deployment. A real exception
            type is required here; raising a bare string is itself a ``TypeError``
            in Python 3 and would hide the actual message.
    """
    try:
        return LMS_BASE_URLS[environment]
    except KeyError:
        raise ValueError(f'Unrecognized environment: {environment}') from None


server_base_url = resolve_server_base_url(environment)

# Look up the OAuth2 client credentials through edx_secret, under names that embed
# the upper-cased environment (OPS_<ENV>_EDX_OAUTH_CLIENT_ID / ..._SECRET).
env_tag = environment.upper()
oauth2_client_id = edx_secret.read_secret_from_env_or_prompt(
    f'OPS_{env_tag}_EDX_OAUTH_CLIENT_ID')
oauth2_client_secret = edx_secret.read_secret_from_env_or_prompt(
    f'OPS_{env_tag}_EDX_OAUTH_CLIENT_SECRET')

In [23]:
from edx_rest_api_client.client import EdxRestApiClient

# Normalise the base URL once; both endpoints below are built from it.
lms_root = server_base_url.strip('/')

# Exchange the OAuth2 client credentials for a JWT access token.
access_token_url = lms_root + '/oauth2/access_token/'
access_token, expires = EdxRestApiClient.get_oauth_access_token(
    access_token_url,
    oauth2_client_id,
    oauth2_client_secret,
    token_type='jwt',
)
print('JWT access token expires at: {}'.format(expires))

# REST client for the experiments API, authenticated with the JWT obtained above.
experiments_api_url = lms_root + '/api/experiments/v0/'
client = EdxRestApiClient(experiments_api_url, jwt=access_token)

print(f'Connected to {server_base_url}')

JWT access token expires at: 2017-08-22 00:59:48.673051
Connected to https://courses.edx.org


## Insert User Data

In [75]:
import json
from pprint import pprint as pp
from slumber.exceptions import HttpClientError, HttpServerError
import ipywidgets as widgets
from IPython.display import display

def _experiment_record(row):
    """Build the experiments API payload for one row of ``unsegmented_users``.

    The per-course details are JSON-encoded into ``value``; ``key`` is the course
    id behind the 'cohort_availability_date.' marker.
    """
    cohort_stamp = datetime.datetime.strftime(
        row.cohort_availability, '%Y-%m-%dT%H:%M:%S.000Z')
    segment_info = {
        'cohort_availability': cohort_stamp,
        # cast to a plain int so the value is JSON-serialisable
        'segment': int(row.segment),
        'segmentation_reason': row.segmentation_reason,
    }
    return {
        'experiment_id': experiment_id,
        'user': row.username,
        'key': 'cohort_availability_date.' + row.course_id,
        'value': json.dumps(segment_info),
    }


data = [_experiment_record(row) for row in unsegmented_users.itertuples()]

# A progress bar sized to the number of records, plus running success / failure
# counters; they are displayed in this order and updated by the loop below.
progress = widgets.IntProgress(min=0, max=len(data))
succeeded = widgets.IntText(value=0, description='Succeeded: ')
failed = widgets.IntText(value=0, description='Failed: ')
display(progress, succeeded, failed)

from collections import defaultdict
# While True, nothing is sent: the loop only walks the records and updates the
# widgets. Set to False to actually POST each record to the experiments API.
# (The POST call used to be commented out while the success counter still ran,
# which made a no-op run look like a successful upload.)
dry_run = True

# Number of error messages seen per error key, across all failed records.
errors = defaultdict(int)

# (record, decoded error) pairs for every record the API rejected.
failed_learners = []
for learner in data:
    try:
        if not dry_run:
            client.data.post(learner)
        succeeded.value += 1
    except (HttpClientError, HttpServerError) as ex:
        failed.value += 1
        try:
            error = ex.response.json()
        except ValueError:
            # The error body is not JSON (presumably e.g. an HTML error page); keep
            # going with the raw text instead of aborting the whole upload.
            error = {'non_json_response': [getattr(ex.response, 'text', '')]}
        if not isinstance(error, dict):
            # Normalise non-object JSON bodies so they can be tallied like the rest.
            error = {'error': error}
        failed_learners.append((learner, error))
        for ek, ev in error.items():
            # A plain string is a single message; only collections are counted by length.
            errors[ek] += len(ev) if isinstance(ev, (list, tuple, dict)) else 1
    finally:
        progress.value += 1

if dry_run:
    print('Dry run: no records were sent to the experiments API.')
pp(dict(errors))

{'non_field_errors': 41}
