# Data Aggregation

We pull the data that was queried and cleaned in 'data_preparation.ipynb'.  'full_database.csv' represents the entire population of the 7 PUMA regions that we chose to study.

In [270]:
import pandas as pd
import numpy as np

In [271]:
# Read the cleaned extract into memory, then print its schema
# (column names, non-null counts, dtypes) as a quick sanity check.
full_database_path = 'tables/full_database.csv'
df_wg = pd.read_csv(full_database_path)
df_wg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39202 entries, 0 to 39201
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   education_attained  39202 non-null  int64  
 1   enrollment_status   39202 non-null  int64  
 2   age                 39202 non-null  float64
 3   sample_weight       39202 non-null  float64
 4   employed            39202 non-null  int64  
dtypes: float64(2), int64(3)
memory usage: 1.5 MB


### Focus on the Youth

Create a subset of the data containing only youth aged 16 through 24 (inclusive), and bin them into 3 age ranges: 16-18, 19-21, and 22-24.

In [272]:
# Keep only respondents aged 16-24 (inclusive). The explicit .copy() makes
# `youth` an independent frame, so adding a column below does not write to a
# slice of `df_wg` (which is what triggers pandas' SettingWithCopyWarning).
youth = df_wg[(df_wg.age >= 16) & (df_wg.age <= 24)].copy()

# Bin ages into three 3-year ranges: 0 -> 16-18, 1 -> 19-21, 2 -> 22-24.
youth['age_range'] = (youth.age - 16) // 3

youth.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  youth['age_range'] = (youth.age-16)//3


Unnamed: 0,education_attained,enrollment_status,age,sample_weight,employed,age_range
13,21,0,24.0,20.0,1,2.0
14,21,0,24.0,15.0,1,2.0
28,18,1,19.0,30.0,0,1.0
31,13,1,17.0,36.0,0,0.0
37,16,0,19.0,36.0,0,1.0


## Create Aggregate Tables

Define the age-range labels and the boolean filters that are reused by the aggregations below.

In [273]:
# Display labels for the age_range bins 0, 1 and 2, in that order
ages = ['16-18', '19-21', '22-24']

# Boolean row masks over `youth`, reused by the aggregation cells
no_diploma = youth['education_attained'].lt(16)
not_enrolled = youth['enrollment_status'].eq(0)
not_employed = youth['employed'].eq(0)
# "Opportunity youth": neither enrolled nor employed
opportunity = not_enrolled & not_employed

### Table of Total Population of Youth in Region

In [274]:
# One boolean mask per output column. 'Total Population' keeps every row.
population_masks = {
    'Total Population': pd.Series(True, index=youth.index),
    'Opportunity Youth': opportunity,
    # employed, not enrolled, and without a diploma
    'Working Without Diploma': no_diploma & ~not_employed & not_enrolled,
    'Not an Opportunity Youth': ~opportunity,
}

# Weighted head-count per age bin for each sub-population. The weight column
# is selected *before* aggregating so only that one column is summed.
total_population = (
    pd.DataFrame({
        label: youth[mask].groupby('age_range')['sample_weight'].sum()
        for label, mask in population_masks.items()
    })
    # a bin with no rows in a sub-population is a count of zero, not missing
    .fillna(0)
    # groupby returns the bins sorted (0, 1, 2), matching the order of `ages`
    .assign(ages=ages)
    .set_index('ages')
)

total_population

Unnamed: 0_level_0,Total Population,Opportunity Youth,Working Without Diploma,Not an Opportunity Youth
ages,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
16-18,33791.0,1941.0,449.0,31850.0
19-21,29915.0,4361.0,1539.0,25554.0
22-24,36277.0,5228.0,1737.0,31049.0


### Table of Opportunity Youth

In [275]:
# One boolean mask per output column; each is combined with `opportunity`
# below so every column counts opportunity youth only.
education_masks = {
    'Opportunity Youth': pd.Series(True, index=youth.index),
    'No diploma': no_diploma,
    'HS diploma or GED': youth.education_attained.isin([16, 17]),
    'Some College, no degree': youth.education_attained.isin([18, 19]),
    'Degree (Associate or higher)': youth.education_attained > 19,
}

# Weighted head-count per age bin for each education level. The weight column
# is selected *before* aggregating so only that one column is summed.
opportunity_youth = (
    pd.DataFrame({
        label: youth[opportunity & mask].groupby('age_range')['sample_weight'].sum()
        for label, mask in education_masks.items()
    })
    # an age bin with nobody at a given education level is a count of zero
    .fillna(0)
    # groupby returns the bins sorted (0, 1, 2), matching the order of `ages`
    .assign(ages=ages)
    .set_index('ages')
)

opportunity_youth

Unnamed: 0_level_0,Opportunity Youth,No diploma,HS diploma or GED,"Some College, no degree",Degree (Associate or higher)
ages,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
16-18,1941.0,986.0,812.0,143.0,0.0
19-21,4361.0,1180.0,2448.0,601.0,132.0
22-24,5228.0,1429.0,2171.0,1098.0,530.0


### Pivot the Tables

Pivot each table so that the age ranges become columns, then add a percentage column for each age range along with row totals.

In [276]:
def _pivot_with_percentages(table, base_row):
    """Pivot an age-indexed count table and add percentage and total columns.

    Parameters
    ----------
    table : pd.DataFrame
        One row per age range, one column per population category.
    base_row : str
        Name of the category (a column of `table`) used as the 100%
        denominator for every percentage.

    Returns
    -------
    pd.DataFrame
        One row per category. Each age range contributes a count column
        followed by a '%<age range>' column; the last two columns are
        'Totals' (row sum of the counts) and '%Total'. Percentages are
        rounded to 2 decimals.
    """
    # Rows of `table` become columns: categories down the side, ages across.
    counts = pd.DataFrame({age: table.loc[age] for age in table.index})

    summary = pd.DataFrame(index=counts.index)
    for age in counts.columns:
        summary[age] = counts[age]
        summary[f'%{age}'] = (counts[age] / counts.loc[base_row, age] * 100).round(2)

    summary['Totals'] = counts.sum(axis=1)
    # Look the denominator up by label: integer keys such as `series[0]` on a
    # label-indexed Series rely on a deprecated positional fallback.
    summary['%Total'] = (summary['Totals'] / summary.loc[base_row, 'Totals'] * 100).round(2)
    return summary


# NOTE: both names are rebound from the age-indexed tables to their pivoted
# versions, so this cell cannot be re-run on its own; re-run the two table
# cells above first.
opportunity_youth = _pivot_with_percentages(opportunity_youth, 'Opportunity Youth')
total_population = _pivot_with_percentages(total_population, 'Total Population')

opportunity_youth

Unnamed: 0,16-18,%16-18,19-21,%19-21,22-24,%22-24,Totals,%Total
Opportunity Youth,1941.0,100.0,4361.0,100.0,5228.0,100.0,11530.0,100.0
No diploma,986.0,50.8,1180.0,27.06,1429.0,27.33,3595.0,31.18
HS diploma or GED,812.0,41.83,2448.0,56.13,2171.0,41.53,5431.0,47.1
"Some College, no degree",143.0,7.37,601.0,13.78,1098.0,21.0,1842.0,15.98
Degree (Associate or higher),0.0,0.0,132.0,3.03,530.0,10.14,662.0,5.74


In [277]:
# Display the pivoted total-population table built in the previous cell
total_population

Unnamed: 0,16-18,%16-18,19-21,%19-21,22-24,%22-24,Totals,%Total
Total Population,33791.0,100.0,29915.0,100.0,36277.0,100.0,99983.0,100.0
Opportunity Youth,1941.0,5.74,4361.0,14.58,5228.0,14.41,11530.0,11.53
Working Without Diploma,449.0,1.33,1539.0,5.14,1737.0,4.79,3725.0,3.73
Not an Opportunity Youth,31850.0,94.26,25554.0,85.42,31049.0,85.59,88453.0,88.47


## Export Tables

In [279]:
# Write each finished table to its own CSV file (row labels included)
exports = {
    'tables/opportunity_youth.csv': opportunity_youth,
    'tables/total_population.csv': total_population,
}
for csv_path, table in exports.items():
    table.to_csv(csv_path)