### Import libraries and necessary datasets

In [122]:
import psycopg2
import pandas as pd
import numpy as np

df_wg = pd.read_csv('full_database.csv')
df_wg.head()


Unnamed: 0,education_attained,enrollment_status,age,sample_weight,employed
0,5,0,40.0,90.0,1
1,6,0,11.0,78.0,0
2,4,0,9.0,60.0,0
3,11,0,48.0,109.0,1
4,11,0,48.0,108.0,1


In [123]:
df_wg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39202 entries, 0 to 39201
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   education_attained  39202 non-null  int64  
 1   enrollment_status   39202 non-null  int64  
 2   age                 39202 non-null  float64
 3   sample_weight       39202 non-null  float64
 4   employed            39202 non-null  int64  
dtypes: float64(2), int64(3)
memory usage: 1.5 MB


In [124]:
df_wg[(df_wg['employed'] == 0)].count()

education_attained    8496
enrollment_status     8496
age                   8496
sample_weight         8496
employed              8496
dtype: int64

In [125]:
#add ranges for binning of age groups
df_wg['ranges'] = (df_wg['age'] -16)//3
df_wg


Unnamed: 0,education_attained,enrollment_status,age,sample_weight,employed,ranges
0,5,0,40.0,90.0,1,8.0
1,6,0,11.0,78.0,0,-2.0
2,4,0,9.0,60.0,0,-3.0
3,11,0,48.0,109.0,1,10.0
4,11,0,48.0,108.0,1,10.0
...,...,...,...,...,...,...
39197,21,0,72.0,3.0,1,18.0
39198,19,0,69.0,5.0,1,17.0
39199,16,0,94.0,3.0,1,26.0
39200,22,0,94.0,7.0,1,26.0


In [126]:
#replace ranges output with grouping names.  
df_wg.loc[df_wg['ranges'] == 0.0, 'ranges'] = '16-18'
df_wg.loc[df_wg['ranges'] == 1.0, 'ranges'] = '19-21'
df_wg.loc[df_wg['ranges'] == 2.0, 'ranges'] = '22-24'
df_wg


Unnamed: 0,education_attained,enrollment_status,age,sample_weight,employed,ranges
0,5,0,40.0,90.0,1,8
1,6,0,11.0,78.0,0,-2
2,4,0,9.0,60.0,0,-3
3,11,0,48.0,109.0,1,10
4,11,0,48.0,108.0,1,10
...,...,...,...,...,...,...
39197,21,0,72.0,3.0,1,18
39198,19,0,69.0,5.0,1,17
39199,16,0,94.0,3.0,1,26
39200,22,0,94.0,7.0,1,26


## Total Opportunity Youth
Pulling the total number of opportunity youth as defined by: "not in school, not working, between the ages of 16 and 24" from 2016 report.

In [127]:
df_total_OY = df_wg[(df_wg['employed'] == 0) & (df_wg['age'].between(16,24) & (df_wg['enrollment_status'] == 0))]
df_total_OY


Unnamed: 0,education_attained,enrollment_status,age,sample_weight,employed,ranges
155,19,0,20.0,16.0,0,19-21
203,20,0,20.0,12.0,0,19-21
332,16,0,21.0,20.0,0,19-21
349,17,0,22.0,19.0,0,22-24
624,19,0,19.0,6.0,0,19-21
...,...,...,...,...,...,...
39126,18,0,19.0,7.0,0,19-21
39140,19,0,20.0,14.0,0,19-21
39154,18,0,19.0,16.0,0,19-21
39160,19,0,19.0,17.0,0,19-21


In [128]:
df_oy_gr = df_total_OY.groupby('ranges').sum()
df_oy_gr

Unnamed: 0_level_0,education_attained,enrollment_status,age,sample_weight,employed
ranges,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
16-18,1221,0,1477.0,2203.0,0
19-21,1768,0,2084.0,2324.0,0
22-24,1549,0,2017.0,2134.0,0


## Total Working Without a Diploma (and also out of school) 



In [129]:
df_total_working = df_wg[(df_wg['education_attained'] < 16) 
        & (df_wg['employed'] == 1) & (df_wg['age'].between(16,24)) 
        & (df_wg['enrollment_status'] == 0)]
df_total_working

Unnamed: 0,education_attained,enrollment_status,age,sample_weight,employed,ranges
31,13,0,17.0,36.0,1,16-18
49,12,0,17.0,13.0,1,16-18
64,15,0,18.0,24.0,1,16-18
189,14,0,24.0,20.0,1,22-24
215,13,0,18.0,16.0,1,16-18
...,...,...,...,...,...,...
39079,14,0,23.0,17.0,1,22-24
39086,12,0,16.0,5.0,1,16-18
39135,14,0,18.0,12.0,1,16-18
39156,14,0,18.0,2.0,1,16-18


In [130]:
df_working_gr = df_total_working.groupby('ranges').sum()
df_working_gr

Unnamed: 0_level_0,education_attained,enrollment_status,age,sample_weight,employed
ranges,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
16-18,13299,0,16772.0,23387.0,1003
19-21,1546,0,2346.0,3300.0,118
22-24,1132,0,2184.0,2670.0,95


## Total Not Oportunity Youth

EDUCATION ATTAINED BOUNDRIES? a not OY is: working, has a degree (high school at minimum?

In [131]:
df_total_NOY = df_wg[(df_wg['education_attained'] >= 16)  
        & (df_wg['employed'] == 1) & (df_wg['age'].between(16,24)) 
        & (df_wg.enrollment_status.isin([0, 1]))]
df_total_NOY

Unnamed: 0,education_attained,enrollment_status,age,sample_weight,employed,ranges
13,21,0,24.0,20.0,1,22-24
14,21,0,24.0,15.0,1,22-24
28,18,0,19.0,30.0,1,19-21
37,16,1,19.0,36.0,1,19-21
40,18,1,18.0,15.0,1,16-18
...,...,...,...,...,...,...
39141,19,0,20.0,7.0,1,19-21
39150,20,0,20.0,13.0,1,19-21
39166,19,0,20.0,17.0,1,19-21
39188,19,0,19.0,6.0,1,19-21


In [132]:
df_noy_gr = df_total_NOY.groupby('ranges').sum()
df_noy_gr

Unnamed: 0_level_0,education_attained,enrollment_status,age,sample_weight,employed
ranges,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
16-18,4561,26,4743.0,6011.0,266
19-21,17576,110,19660.0,23904.0,983
22-24,21698,112,26549.0,30833.0,1152


## Opportunity Youth No Diploma 
I ALSO TRIED TO DO IT FROM LARGER DATA SET INSTEAD OF FROM SUBSET, results were the same

In [95]:
df_OY_ND = df_total_OY[(df_total_OY['education_attained'] < 16)]
df_OY_ND


Unnamed: 0,education_attained,enrollment_status,age,sample_weight,employed,ranges
963,13,0,16.0,74.0,0,16-18
1024,1,0,19.0,29.0,0,19-21
1571,13,0,16.0,12.0,0,16-18
1694,12,0,16.0,5.0,0,16-18
1842,14,0,20.0,73.0,0,19-21
...,...,...,...,...,...,...
35591,15,0,22.0,10.0,0,22-24
36536,14,0,18.0,18.0,0,16-18
37218,14,0,18.0,15.0,0,16-18
38460,14,0,17.0,18.0,0,16-18


In [133]:
df_oy_nd_gr = df_OY_ND.groupby('ranges').sum()
df_oy_nd_gr

Unnamed: 0_level_0,education_attained,enrollment_status,age,sample_weight,employed
ranges,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
16-18,896,0,1141.0,1671.0,0
19-21,270,0,416.0,540.0,0
22-24,156,0,277.0,296.0,0


## Opportunity Youth High School Diploma or GED 

In [134]:
##Using data dictionary: '16' = Regular high school diploma ,  '17' = GED or alternative credential
df_OY_HS = df_total_OY[(df_total_OY.education_attained.isin([16, 17]))]
df_OY_HS

Unnamed: 0,education_attained,enrollment_status,age,sample_weight,employed,ranges
332,16,0,21.0,20.0,0,19-21
349,17,0,22.0,19.0,0,22-24
922,17,0,24.0,22.0,0,22-24
1166,16,0,18.0,17.0,0,16-18
1752,16,0,21.0,21.0,0,19-21
...,...,...,...,...,...,...
36158,16,0,20.0,48.0,0,19-21
36653,16,0,21.0,26.0,0,19-21
36733,16,0,20.0,57.0,0,19-21
37620,16,0,19.0,18.0,0,19-21


In [135]:
df_oy_hs_gr = df_OY_HS.groupby('ranges').sum()
df_oy_hs_gr

Unnamed: 0_level_0,education_attained,enrollment_status,age,sample_weight,employed
ranges,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
16-18,161,0,179.0,280.0,0
19-21,550,0,679.0,832.0,0
22-24,472,0,660.0,742.0,0


## Opportunity Youth Some College, No Degree 

In [136]:
#Using data dictionary: '18' =  Some college, but less than 1 year ,  '19' = 1 or more years of college credit, no degree
df_OY_SC = df_total_OY[(df_total_OY.education_attained.isin([18, 19]))]
df_OY_SC

Unnamed: 0,education_attained,enrollment_status,age,sample_weight,employed,ranges
155,19,0,20.0,16.0,0,19-21
624,19,0,19.0,6.0,0,19-21
1897,19,0,21.0,13.0,0,19-21
2725,19,0,20.0,6.0,0,19-21
2850,19,0,24.0,20.0,0,22-24
...,...,...,...,...,...,...
39126,18,0,19.0,7.0,0,19-21
39140,19,0,20.0,14.0,0,19-21
39154,18,0,19.0,16.0,0,19-21
39160,19,0,19.0,17.0,0,19-21


In [137]:
df_oy_sc_gr = df_OY_SC.groupby('ranges').sum()
df_oy_sc_gr

Unnamed: 0_level_0,education_attained,enrollment_status,age,sample_weight,employed
ranges,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
16-18,164,0,157.0,252.0,0
19-21,746,0,784.0,763.0,0
22-24,425,0,526.0,595.0,0


## Opportunity Youth Degree(Associate or Higher)

In [138]:
#Using data dictionary: 20-24: Associates, masters, doctorate, etc
df_OY_YD = df_total_OY[(df_total_OY['education_attained'] >= 20)]
df_OY_YD

Unnamed: 0,education_attained,enrollment_status,age,sample_weight,employed,ranges
203,20,0,20.0,12.0,0,19-21
1230,21,0,22.0,14.0,0,22-24
3181,21,0,24.0,18.0,0,22-24
3400,20,0,23.0,6.0,0,22-24
3954,21,0,21.0,16.0,0,19-21
5120,20,0,23.0,27.0,0,22-24
6337,20,0,22.0,77.0,0,22-24
7069,21,0,22.0,15.0,0,22-24
8157,20,0,21.0,15.0,0,19-21
9367,20,0,22.0,24.0,0,22-24


In [139]:
df_oy_yd_gr = df_OY_YD.groupby('ranges').sum()
df_oy_yd_gr

Unnamed: 0_level_0,education_attained,enrollment_status,age,sample_weight,employed
ranges,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
19-21,202,0,205.0,189.0,0
22-24,496,0,554.0,501.0,0


# RENAME SAMPLE WEIGHT COLUMNS

In [140]:
df_oy_gr = df_oy_gr.rename(columns={'sample_weight': 'Total_OY'})
df_oy_gr

Unnamed: 0_level_0,education_attained,enrollment_status,age,Total_OY,employed
ranges,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
16-18,1221,0,1477.0,2203.0,0
19-21,1768,0,2084.0,2324.0,0
22-24,1549,0,2017.0,2134.0,0


In [141]:
df_working_gr= df_working_gr.rename(columns={'sample_weight': 'Working'})
df_working_gr

Unnamed: 0_level_0,education_attained,enrollment_status,age,Working,employed
ranges,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
16-18,13299,0,16772.0,23387.0,1003
19-21,1546,0,2346.0,3300.0,118
22-24,1132,0,2184.0,2670.0,95


In [142]:
df_noy_gr= df_noy_gr.rename(columns={'sample_weight': 'Not_OY'})
df_noy_gr

Unnamed: 0_level_0,education_attained,enrollment_status,age,Not_OY,employed
ranges,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
16-18,4561,26,4743.0,6011.0,266
19-21,17576,110,19660.0,23904.0,983
22-24,21698,112,26549.0,30833.0,1152


In [143]:
df_oy_nd_gr= df_oy_nd_gr.rename(columns={'sample_weight': 'OY_No_Diploma'})
df_oy_nd_gr

Unnamed: 0_level_0,education_attained,enrollment_status,age,OY_No_Diploma,employed
ranges,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
16-18,896,0,1141.0,1671.0,0
19-21,270,0,416.0,540.0,0
22-24,156,0,277.0,296.0,0


In [144]:
df_oy_hs_gr= df_oy_hs_gr.rename(columns={'sample_weight': 'OY_Diploma_GED'})
df_oy_hs_gr

Unnamed: 0_level_0,education_attained,enrollment_status,age,OY_Diploma_GED,employed
ranges,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
16-18,161,0,179.0,280.0,0
19-21,550,0,679.0,832.0,0
22-24,472,0,660.0,742.0,0


In [145]:
df_oy_sc_gr= df_oy_sc_gr.rename(columns={'sample_weight': 'OY_College_NoDegree'})
df_oy_sc_gr

Unnamed: 0_level_0,education_attained,enrollment_status,age,OY_College_NoDegree,employed
ranges,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
16-18,164,0,157.0,252.0,0
19-21,746,0,784.0,763.0,0
22-24,425,0,526.0,595.0,0


In [146]:
df_oy_yd_gr= df_oy_yd_gr.rename(columns={'sample_weight': 'OY_Degree'})
df_oy_yd_gr

Unnamed: 0_level_0,education_attained,enrollment_status,age,OY_Degree,employed
ranges,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
19-21,202,0,205.0,189.0,0
22-24,496,0,554.0,501.0,0


# Merged Table-- Totals of OY no degree, High School/GED, some college no degree, college degree
ADD TOTAL OY COLUMN

In [147]:
merged_oy = df_oy_nd_gr
merged_oy.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, 16-18 to 22-24
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   education_attained  3 non-null      int64  
 1   enrollment_status   3 non-null      int64  
 2   age                 3 non-null      float64
 3   OY_No_Diploma       3 non-null      float64
 4   employed            3 non-null      int64  
dtypes: float64(2), int64(3)
memory usage: 144.0+ bytes


In [149]:
merged_oy['OY_Diploma_GED'] = df_oy_hs_gr['OY_Diploma_GED'].values
merged_oy['OY_College_NoDegree'] = df_oy_sc_gr['OY_College_NoDegree'].values
#merged_oy['OY_Degree'] = df_oy_yd_gr['OY_Degree'].values  THIS IS NOT WORKING BECAUSE INDEXING IS DIFFERENT, DONT HAVE 16-18 CATEGORY
merged_oy
merged_oy2 = merged_oy
merged_oy2

Unnamed: 0_level_0,education_attained,enrollment_status,age,OY_No_Diploma,employed,OY_Diploma_GED,OY_College_NoDegree
ranges,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
16-18,896,0,1141.0,1671.0,0,280.0,252.0
19-21,270,0,416.0,540.0,0,832.0,763.0
22-24,156,0,277.0,296.0,0,742.0,595.0


In [150]:
del merged_oy2['education_attained']
del merged_oy2['enrollment_status']
del merged_oy2['age']
del merged_oy2['employed']

In [151]:
merged_oy2

Unnamed: 0_level_0,OY_No_Diploma,OY_Diploma_GED,OY_College_NoDegree
ranges,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
16-18,1671.0,280.0,252.0
19-21,540.0,832.0,763.0
22-24,296.0,742.0,595.0


In [152]:
merged_oy_comb = merged_oy2
merged_oy_comb
#MISSING ADVANCED DEGREE


Unnamed: 0_level_0,OY_No_Diploma,OY_Diploma_GED,OY_College_NoDegree
ranges,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
16-18,1671.0,280.0,252.0
19-21,540.0,832.0,763.0
22-24,296.0,742.0,595.0


# Merged Table-- Totals by OY, Working, and Not OY
ADD TOTAL POPULATION COLUMN

In [118]:
merged_totals = df_oy_gr 

merged_totals['Working'] = df_working_gr['Working'].values
merged_totals['Not_OY'] = df_noy_gr['Not_OY'].values
merged_totals

Unnamed: 0_level_0,education_attained,enrollment_status,age,Total_OY,employed,Working,Not_OY
ranges,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
16-18,1221,0,1477.0,2203.0,0,23387.0,6011.0
19-21,1768,0,2084.0,2324.0,0,3300.0,23904.0
22-24,1549,0,2017.0,2134.0,0,2670.0,30833.0


In [119]:
del merged_totals['education_attained']
del merged_totals['enrollment_status']
del merged_totals['age']
del merged_totals['employed']

In [120]:
merged_totals_comb= merged_totals
merged_totals

Unnamed: 0_level_0,Total_OY,Working,Not_OY
ranges,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
16-18,2203.0,23387.0,6011.0
19-21,2324.0,3300.0,23904.0
22-24,2134.0,2670.0,30833.0
