In [1]:
import pandas as pd
import numpy as np
import vertica_python
import os
import seaborn as sns

In [2]:
%matplotlib inline

In [3]:
# Warehouse credentials are read from the environment so they never appear in
# the notebook. os.getenv returns None when a variable is not set.
username = os.getenv('VERTICA_USERNAME')
password = os.getenv('VERTICA_PASSWORD')

# Connection settings, unpacked as keyword arguments into
# vertica_python.connect() further down.
# NOTE(review): ssl is disabled here -- confirm that is intended for this host.
conn_info = dict(
    host='warehouse.analytics.edx.org',
    port=5433,
    user=username,
    password=password,
    database='warehouse',
    read_timeout=600,
    unicode_error='strict',
    ssl=False,
)

In [4]:
# SQL text for the enrollment / verification breakdown.
#
# Each distinct (user, course) row of production.d_user_course is joined to
# today's snapshot of business_intelligence.course_availability and to
# business_intelligence.course_master, then counted per combination of:
#   1. pacing_type
#   2. verified_after_enrollment    - TRUE when there is no verified enrollment
#                                     time, or when DATEDIFF in minutes between
#                                     first enrollment and first verified
#                                     enrollment exceeds 60
#   3. enrolled_before_course_start - 'After' / 'First Week' /
#                                     'One Week Before' / '> 1 Week Before',
#                                     comparing first enrollment time with the
#                                     course start date (+/- 7, presumably days)
#   4. is_microsoft                 - 'Microsoft' vs. 'Not Microsoft' partner
#   5. price                        - seat price (cast to INT) bucket label
#   6. duration                     - days from first enrollment to first
#                                     verified enrollment, or to NOW() when the
#                                     user never verified
#   7. verified                     - whether a verified enrollment time exists
#
# Only courses starting after 2016-01-01 and enrollments made before the
# course's verification end date are kept.
#
# NOTE(review): NOW() appears in both the duration column and the
#   course_availability filter, so results depend on the day the query runs.
# NOTE(review): the first price bucket uses BETWEEN 0 AND 25, which includes a
#   price of 0 although its label reads '0 < price' -- confirm intent.
# NOTE(review): the price CASE has no ELSE, so a price matching none of the
#   branches yields a NULL bucket.
# NOTE(review): has_verification_deadline_passed and is_WL are selected in the
#   course_master sub-select but not referenced by the outer query; because of
#   SELECT DISTINCT they could still affect row multiplicity -- verify.
# The COUNT(*) column carries no alias.
query = """
SELECT
    pacing_type,
    CASE
        WHEN duc.first_verified_enrollment_time IS NULL
            THEN TRUE
        ELSE DATEDIFF('minute', duc.first_enrollment_time, duc.first_verified_enrollment_time) > 60
    END AS verified_after_enrollment,
    CASE
        WHEN duc.first_enrollment_time > (ca.course_start_date + 7) THEN 'After'
        WHEN duc.first_enrollment_time > ca.course_start_date THEN 'First Week'
        WHEN (duc.first_enrollment_time + 7) > ca.course_start_date THEN 'One Week Before'
        ELSE '> 1 Week Before'
    END AS enrolled_before_course_start,
    CASE
        WHEN c.course_partner = 'Microsoft' THEN 'Microsoft'
        ELSE 'Not Microsoft'
    END AS is_microsoft,
    CASE
        WHEN c.course_seat_price :: INT BETWEEN 0 AND 25 THEN '1) 0 < price <= 25'
        WHEN c.course_seat_price :: INT BETWEEN 26 AND 75 THEN '2) 25 < price <= 75'
        WHEN c.course_seat_price :: INT BETWEEN 76 AND 150 THEN '3) 75 < price <= 150'
        WHEN c.course_seat_price :: INT > 150 THEN '4) 150 < price'
    END AS price,
    CASE
        WHEN duc.first_verified_enrollment_time IS NULL THEN DATEDIFF('day', duc.first_enrollment_time, NOW())
        ELSE DATEDIFF('day', duc.first_enrollment_time, duc.first_verified_enrollment_time)
    END AS duration,
    duc.first_verified_enrollment_time IS NOT NULL AS verified,
    COUNT(*)
FROM
(
    SELECT DISTINCT
        user_id,
        course_id,
        first_enrollment_time,
        first_verified_enrollment_time
    FROM production.d_user_course
) AS duc
JOIN
(
    SELECT DISTINCT
        course_id,
        course_start_date
    FROM business_intelligence.course_availability
    WHERE date = NOW()::DATE
) AS ca
    ON duc.course_id = ca.course_id
JOIN 
(
    SELECT DISTINCT
        course_id,
        course_partner,
        pacing_type,
        course_seat_price,
        course_verification_end_date,
        has_verification_deadline_passed,
        is_WL
    FROM
        business_intelligence.course_master
) AS c ON c.course_id = duc.course_id
WHERE
    ca.course_start_date > '2016-01-01'::DATE
    AND duc.first_enrollment_time < c.course_verification_end_date

GROUP BY 1, 2, 3, 4, 5, 6, 7
ORDER BY 1, 5, 2, 3, 4, 6, 7

"""

In [5]:
# Run the query against the warehouse and load the result into a DataFrame.
# The connection is released in a `finally` clause so that a failing execute
# or fetch cannot leave a session open on the server.
connection = vertica_python.connect(**conn_info)
try:
    # A 'dict' cursor yields each row as a mapping keyed by column name, which
    # lets pandas build named columns directly from the fetched rows.
    cur = connection.cursor('dict')
    cur.execute(query)
    # fetchall() materialises every row, so nothing is needed from the
    # connection once this returns.
    extract = cur.fetchall()
finally:
    connection.close()

df = pd.DataFrame(extract)

In [6]:
# Group the extracted rows by pacing type. No aggregation is applied, so the
# cell's value is the GroupBy object itself.
df.groupby(by=['pacing_type'])

Unnamed: 0,pacing_type,verified_after_enrollment,enrolled_before_course_start,is_microsoft,price,duration,verified,COUNT
0,instructor_paced,False,> 1 Week Before,Not Microsoft,1) 0 < price <= 25,0,True,17160
1,instructor_paced,False,> 1 Week Before,Not Microsoft,1) 0 < price <= 25,1,True,64
2,instructor_paced,False,After,Not Microsoft,1) 0 < price <= 25,0,True,2318
3,instructor_paced,False,After,Not Microsoft,1) 0 < price <= 25,1,True,8
4,instructor_paced,False,First Week,Not Microsoft,1) 0 < price <= 25,0,True,4020
5,instructor_paced,False,First Week,Not Microsoft,1) 0 < price <= 25,1,True,10
6,instructor_paced,False,One Week Before,Not Microsoft,1) 0 < price <= 25,0,True,5944
7,instructor_paced,False,One Week Before,Not Microsoft,1) 0 < price <= 25,1,True,40
8,instructor_paced,True,> 1 Week Before,Not Microsoft,1) 0 < price <= 25,0,True,1222
9,instructor_paced,True,> 1 Week Before,Not Microsoft,1) 0 < price <= 25,1,True,940
