In [None]:
# pandas-related imports
import pandas as pd

# Numpy
import numpy as np

# database interaction imports
import sqlalchemy

In [None]:
# To connect to the database we need the database host and the database name.
# The URL built below carries only those two values.
host = 'stuffed.adrf.info'
DB = 'appliedda'

connection_string = "postgresql://{}/{}".format(host, DB)
engine = sqlalchemy.create_engine(connection_string)

# Hold ONE open connection for the whole notebook. Later cells create temporary
# tables, which PostgreSQL keeps per session; executing through the pooled engine
# does not guarantee that every later query runs on the session that created them.
conn = engine.connect()

# Number of Community college graduates by rural/urban county of residency

In [None]:
# Store the 2012-13 academic year graduates in a temporary table:
# awards earned in terms 4 and 1 of 2012 plus terms 2 and 3 of 2013,
# joined to the demographics table to pick up cnty_code.
# conn.execute is used instead of pd.read_sql because the statement returns no rows.
# Any existing temp table is dropped first so the cell can be re-run in the same
# session without failing because all_grads already exists.
qry = '''
drop table if exists pg_temp.all_grads;
create temp table all_grads as
select a.*, b.cnty_code
from data_ohio_olda_2018.oh_hei_long a
join data_ohio_olda_2018.oh_hei_demo b on a.ssn_hash=b.ssn_hash
where (degcert_yr_earned = '2012' and (degcert_term_earned = '4' or degcert_term_earned = '1')) or
    (degcert_yr_earned = '2013' and (degcert_term_earned = '2' or degcert_term_earned = '3'))
'''
conn.execute(qry)

> Definition of rural counties: Nonmetro - Urban population of 20,000 or more, not adjacent to a metro area                                              
> Definition of urban counties: Metro - Counties in metro areas of 1 million population or more                                                                                                                                         

In [None]:
# Label each community college graduate's county of residence as Urban or Rural.
# The query joins the all_grads temp table to the oh_hei_campus_county_lkp lookup
# table (keeping campus types 'TC', 'SC' and 'CC') and to
# public.usda_rural_urban_codes_2013 (Ohio rows only).
# m_nm = 'True' is labelled 'Urban' and m_nm = 'False' is labelled 'Rural'.
# The last 3 characters of fips, cast to int, are matched against cnty_code.
qry = '''
with grad_community_college as (select ssn_hash, cnty_code
					from all_grads a
					join data_ohio_olda_2018.oh_hei_campus_county_lkp lkp on a.degcert_campus = lkp.campus_num
					where lkp.campus_type_code in ('TC', 'SC', 'CC')),
	 rural_urban as (select right(fips, 3)::int as fips, county_name, 
                     case
                     when m_nm = 'True' then 'Urban'
                     when m_nm = 'False' then 'Rural'
                     end urban_rural
					from public.usda_rural_urban_codes_2013 
					where state = 'OH')
select distinct a.ssn_hash, urban_rural, b.county_name
from grad_community_college a
join rural_urban b on a.cnty_code = b.fips;
'''
df=pd.read_sql(qry, conn)

In [None]:
# Preview of graduates with their county of residency and its rural/urban label.
# Only the first rows are shown: the frame holds one row per individual, so
# rendering all of it adds nothing and leaves student-level records in the saved output.
df.head()

In [None]:
# Distinct graduates in each urban/rural county-of-residency group
(
    df
    .groupby('urban_rural')['ssn_hash']
    .agg('nunique')
)

# Community college on-time graduation, late graduation, and dropouts

This section describes the process for determining which college students are on-time graduates vs. late graduates and dropouts. Overall the steps are:
1. Select 2010-11 community college enrollees from the full population of higher education students.
2. Find first-time enrollment from school year 2010-2011 for each student
3. Merge the 2010-11 enrollee table with the table of community college graduates to find the first instance of an earned award
4. Categorize graduates as those who graduated within 2 years of their first enrollment vs. those who graduated in more than 2 years
5. Categorize the remaining enrollees as dropouts <br>
<br>
This procedure does not apply to Ohio Technical Center students. The OTC data includes a `student_result` column which indicates whether or not the student completed the course.

### First-time enrollment during school year 2010-2011

In [None]:
# Build the comm_coll temp table: community college enrollment records up to the
# end of school year 2010-2011, each with a calendar quarter and a quarter start date.
#
# enroll_term is recoded to a calendar quarter (term 2 -> 1, 3 -> 2, 4 -> 3, 1 -> 4)
# and to the first day of that quarter (January, April, July, October).
#
# The 2011 filter keeps terms 2 and 3, i.e. quarters 1 and 2 under the recoding
# above. Filtering on terms 1 and 2 instead would drop the April 2011 quarter and
# pull in the October 2011 quarter, which the first-enrollment query that reads
# this table does not select.
#
# The table is dropped first so the cell can be re-run in the same session.
# '%%' is a literal '%' escaped for the database driver.
qry = '''
drop table if exists pg_temp.comm_coll;
create temp table comm_coll as (
with cc as (select distinct ssn_hash, enroll_campus, enroll_yr_num,
                   case when a.enroll_term = 2 then 1
                        when a.enroll_term = 3 then 2
                        when a.enroll_term = 4 then 3
                        when a.enroll_term = 1 then 4
                        end as enroll_quarter,
                   case when enroll_term = 4 then
                            format('%%s-%%s-01', enroll_yr_num, 7)::date
                        when enroll_term = 1 then
                            format('%%s-%%s-01', enroll_yr_num, 10)::date
                        when enroll_term = 2 then
                            format('%%s-%%s-01', enroll_yr_num, 1)::date
                        when enroll_term = 3 then
                            format('%%s-%%s-01', enroll_yr_num, 4)::date
                        end as enroll_date
            from data_ohio_olda_2018.oh_hei_long as a
            where (enroll_yr_num = '2011' and (enroll_term = '2' or enroll_term = '3')) or
                  enroll_yr_num < '2011')
select ssn_hash, enroll_date, enroll_yr_num, enroll_quarter
from cc
join data_ohio_olda_2018.oh_hei_campus_county_lkp lkp on enroll_campus = lkp.campus_num
where lkp.campus_type_code in ('TC', 'SC', 'CC'))
;
'''

conn.execute(qry)

In [None]:
## First enrollment school year 2010-2011
# For every student in comm_coll, take the earliest enroll_date among records in
# year 2011 quarters 1-2 or year 2010 quarters 3-4; the result has one row per ssn_hash.
# NOTE(review): enrollments before these quarters are not excluded, so "first"
# means first within this window - confirm that is the intended definition.
qry = '''
select distinct ssn_hash, min(enroll_date) as enroll_date
from comm_coll
where (enroll_yr_num = '2011' and (enroll_quarter = 1 or enroll_quarter = 2)) or 
      (enroll_yr_num = '2010' and (enroll_quarter = 3 or enroll_quarter = 4))
group by ssn_hash;
'''
df1 = pd.read_sql(qry, conn)

In [None]:
# How many distinct students have a first enrollment date in school year 2010-2011
df1['ssn_hash'].nunique()

In [None]:
# Students per first enrollment date within school year 2010-2011
(
    df1
    .groupby('enroll_date')['ssn_hash']
    .agg('count')
)

### Community College graduates

In [None]:
# Community college degrees/certificates earned from 2010 onward, read straight
# into a DataFrame (no temp table is created here).
#
# degcert_term_earned is recoded to a calendar quarter (term 2 -> 1, 3 -> 2,
# 4 -> 3, 1 -> 4) and to the first day of that quarter.
#
# The whole year condition is parenthesised: AND binds tighter than OR in SQL,
# so without the outer parentheses the campus-type filter would not apply to
# awards earned after 2010.
# NOTE(review): for 2010 only terms '1' and '2' are kept - confirm these are the
# terms intended.
# '%%' is a literal '%' escaped for the database driver.
qry = '''
select distinct a.ssn_hash,
       a.degcert_yr_earned,
       case when a.degcert_term_earned = 2 then 1
            when a.degcert_term_earned = 3 then 2
            when a.degcert_term_earned = 4 then 3
            when a.degcert_term_earned = 1 then 4
            end as degcert_quarter_earned,
       case when a.degcert_term_earned = 4 then
                format('%%s-%%s-01', degcert_yr_earned, 7)::date
            when a.degcert_term_earned = 1 then
                format('%%s-%%s-01', degcert_yr_earned, 10)::date
            when a.degcert_term_earned = 2 then
                format('%%s-%%s-01', degcert_yr_earned, 1)::date
            when a.degcert_term_earned = 3 then
                format('%%s-%%s-01', degcert_yr_earned, 4)::date
            end as deg_date
from data_ohio_olda_2018.oh_hei_long as a
join data_ohio_olda_2018.oh_hei_campus_county_lkp lkp2 on a.degcert_campus = lkp2.campus_num
where lkp2.campus_type_code in ('TC', 'SC', 'CC') and
      ((degcert_yr_earned = '2010' and degcert_term_earned in ('1', '2')) or
       degcert_yr_earned > '2010');
'''

df2 = pd.read_sql(qry, conn)
df2.head()

### First graduation date, after first-time enrollment

In [None]:
# Left-join community college awards onto the 2010-11 enrollees, so enrollees
# without any award are kept (their award columns are missing), then add a
# constant id column.
enroll_grad = (
    df1
    .merge(df2, on='ssn_hash', how='left')
    .assign(id=1)
)

In [None]:
# Earliest deg_date per student; missing values are replaced by the placeholder -1
# first, so students without an award end up with -1.
first_grad = (
    enroll_grad
    .fillna(-1)
    .groupby(['ssn_hash'])['deg_date']
    .agg('min')
)

In [None]:
# Turn the per-student Series into a two-column frame (ssn_hash, deg_date)
first_grad = first_grad.to_frame().reset_index()

In [None]:
# Bring the enrollment and award details back for each student's earliest award
# (joined on the columns the two frames share)
first_enr_grad = first_grad.merge(enroll_grad, how='left')

In [None]:
# Drop the rows that carry the -1 placeholder instead of an award date
has_award = first_enr_grad['deg_date'].ne(-1)
first_enr_grad = first_enr_grad[has_award]

In [None]:
# (rows, columns) of first_enr_grad
first_enr_grad.shape

### Merge first-time enrollment with first graduation date

In [None]:
# Convert both date columns to datetime64[ns] so they can be subtracted.
# assign() returns a new frame; first_enr_grad was produced by filtering another
# frame, and assigning columns on it directly triggers SettingWithCopyWarning.
first_enr_grad = first_enr_grad.assign(
    deg_date=first_enr_grad['deg_date'].astype('datetime64[ns]'),
    enroll_date=first_enr_grad['enroll_date'].astype('datetime64[ns]'),
)

In [None]:
# Preview only: the frame has one row per student, so show the first rows
# rather than rendering the whole table into the saved output.
first_enr_grad.head()

In [None]:
# Number of whole years between first enrollment and the award date.
# Casting a timedelta column to 'timedelta64[Y]' is rejected by pandas >= 2.0,
# so the same value is computed explicitly: elapsed time divided by NumPy's
# mean year (365.2425 days), rounded down.
elapsed = first_enr_grad['deg_date'] - first_enr_grad['enroll_date']
first_enr_grad = first_enr_grad.assign(
    yr_before_grad=np.floor(elapsed / pd.Timedelta(days=365.2425))
)
# Keep only awards dated on or after the enrollment date, i.e. drop awards from
# programs that started before school year 2010-2011.
# NOTE(review): deg_date is each student's earliest award, so a student whose
# earliest award predates enrollment is dropped here even if a later award
# exists - confirm this is intended.
first_enr_grad = first_enr_grad.loc[first_enr_grad['yr_before_grad'] >= 0]

In [None]:
# Preview only: show the first rows instead of the whole student-level table.
first_enr_grad.head()

In [None]:
# Classifying graduates as on-time (within 2 years of first enrollment) or late.
# The comparison uses the actual dates: yr_before_grad holds whole years rounded
# down, so testing it with "<= 2" would also label awards earned up to just under
# 3 years after enrollment as '2yrs_or_less'.
two_year_mark = first_enr_grad['enroll_date'] + pd.DateOffset(years=2)
on_time = first_enr_grad['deg_date'] <= two_year_mark
late = first_enr_grad['deg_date'] > two_year_mark

# An explicit copy of a list-selected subset keeps the assignments below from
# writing into a view of first_enr_grad.
grad_groups = first_enr_grad[['ssn_hash', 'yr_before_grad']].copy()
# 'NULL' remains only where neither comparison holds (a missing date).
grad_groups['college_graduation'] = 'NULL'
grad_groups.loc[on_time, 'college_graduation'] = '2yrs_or_less'
grad_groups.loc[late, 'college_graduation'] = 'more_than_2yrs'

In [None]:
# Preview only: show the first rows instead of the whole student-level table.
grad_groups.head()

In [None]:
# Attach the graduation group to every 2010-11 enrollee; enrollees that are not
# in grad_groups keep a missing value (joined on the columns the frames share)
enr_grad_10_11 = df1.merge(grad_groups, how='left')

In [None]:
# Enrollees without a graduation group are the ones that didn't graduate: label them 'dropouts'
enr_grad_10_11 = enr_grad_10_11.assign(
    college_graduation=enr_grad_10_11['college_graduation'].fillna('dropouts')
)

In [None]:
# Number of 2010-11 enrollees in each graduation outcome group
(
    enr_grad_10_11
    .groupby('college_graduation')['ssn_hash']
    .agg('count')
)