# Data Aggregation

We pull the data that was queried and cleaned in 'data_preparation.ipynb'.  'full_database.csv' represents the entire population of the 7 PUMA regions that we chose to study.

In [2]:
import pandas as pd
import numpy as np

In [3]:
df_wg = pd.read_csv('full_database.csv')
df_wg.shape

(39202, 5)

In [4]:
df_wg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39202 entries, 0 to 39201
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   education_attained  39202 non-null  int64  
 1   enrollment_status   39202 non-null  int64  
 2   age                 39202 non-null  float64
 3   sample_weight       39202 non-null  float64
 4   employed            39202 non-null  int64  
dtypes: float64(2), int64(3)
memory usage: 1.5 MB


### Focus on the Youth

Create a subset of the data representing only the youth between 16 and 25 in the data, and bin youth into 3 age ranges.

In [5]:
youth = df_wg[(df_wg.age >=16) & (df_wg.age <=24)]

youth['age_range'] = (youth.age-16)//3

youth.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  youth['age_range'] = (youth.age-16)//3


Unnamed: 0,education_attained,enrollment_status,age,sample_weight,employed,age_range
13,21,0,24.0,20.0,1,2.0
14,21,0,24.0,15.0,1,2.0
28,18,1,19.0,30.0,0,1.0
31,13,1,17.0,36.0,0,0.0
37,16,0,19.0,36.0,0,1.0


## Create Aggregate Tables

create some variables to help us with the aggregation

In [6]:
# labels for age ranges
ages = ['16-18', '19-21', '22-24']

# variables for common filters
no_diploma = (youth['education_attained'] < 16)
not_enrolled = (youth['enrollment_status'] == 0)
not_employed = (youth['employed'] == 0)
opportunity = (not_enrolled & not_employed)

### Table of Total Population of Youth in Region

In [7]:
total_population = pd.DataFrame()

total_population['Total Population'] = youth.groupby('age_range').sum()['sample_weight']
total_population['Opportunity Youth'] = youth[opportunity].groupby('age_range').sum()['sample_weight']
total_population['Working Without Diploma'] = youth[no_diploma & (not_employed == False) & not_enrolled].groupby('age_range').sum()['sample_weight']
total_population['Not an Opportunity Youth'] = youth[opportunity == False].groupby('age_range').sum()['sample_weight']

total_population['ages'] = ages
total_population.set_index('ages', inplace = True)

total_population

Unnamed: 0_level_0,Total Population,Opportunity Youth,Working Without Diploma,Not an Opportunity Youth
ages,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
16-18,33791.0,1941.0,449.0,31850.0
19-21,29915.0,4361.0,1539.0,25554.0
22-24,36277.0,5228.0,1737.0,31049.0


### Table of Opportunity Youth

In [8]:
opportunity_youth = pd.DataFrame()

opportunity_youth['Opportunity Youth'] = youth[opportunity].groupby('age_range').sum()['sample_weight']
opportunity_youth['No diploma'] = youth[no_diploma & opportunity].groupby('age_range').sum()['sample_weight']
opportunity_youth['HS diploma or GED'] = youth[youth.education_attained.isin([16,17]) & opportunity].groupby('age_range').sum()['sample_weight']
opportunity_youth['Some College, no degree'] = youth[opportunity & youth.education_attained.isin([18,19])].groupby('age_range').sum()['sample_weight']
opportunity_youth['Degree (Associate or higher)'] = youth[opportunity & (youth.education_attained > 19)].groupby('age_range').sum()['sample_weight']
opportunity_youth.fillna(0, inplace = True)

opportunity_youth['ages'] = ages
opportunity_youth.set_index('ages', inplace = True)

opportunity_youth

Unnamed: 0_level_0,Opportunity Youth,No diploma,HS diploma or GED,"Some College, no degree",Degree (Associate or higher)
ages,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
16-18,1941.0,986.0,812.0,143.0,0.0
19-21,4361.0,1180.0,2448.0,601.0,132.0
22-24,5228.0,1429.0,2171.0,1098.0,530.0


In [12]:
pivot = pd.DataFrame()
pivot['16-18'] = opportunity_youth.iloc[0,:]

pivot['19-21'] = opportunity_youth.iloc[1,:]
pivot['22-14'] = opportunity_youth.iloc[2,:]
pivot

Unnamed: 0,16-18,19-21,22-14
Opportunity Youth,1941.0,4361.0,5228.0
No diploma,986.0,1180.0,1429.0
HS diploma or GED,812.0,2448.0,2171.0
"Some College, no degree",143.0,601.0,1098.0
Degree (Associate or higher),0.0,132.0,530.0


In [25]:
percent = pd.DataFrame()
for col in pivot.columns:
    percent[col] = pivot[col]
    per = (pivot[col]/pivot.loc["Opportunity Youth",col]*100).round(2)
    percent[f'%{col}'] = f'${per}'
percent

SyntaxError: EOL while scanning string literal (<ipython-input-25-2375992821be>, line 5)

for col in total_population.columns:
    total_population[col + '%'] = (100*total_population[col]/total_population['Total Population']).round(2)
    total_population[col + '%'] = total_population[col + '%'].astype(str) + '%'
total_population

col2 = ['Total Population', 'Total Population%',
        'Opportunity Youth','Opportunity Youth%', 
        'Working Without Diploma', 'Working Without Diploma%',
       'Not an Opportunity Youth','Not an Opportunity Youth%']
total_population = total_population[col2]
total_population

for col in opportunity_youth.columns:
    opportunity_youth[col + '%'] = (100*opportunity_youth[col]/opportunity_youth['Opportunity Youth']).round(2)
    opportunity_youth[col + '%'] = opportunity_youth[col + '%'].astype(str) + '%'
opportunity_youth

col1 = ['Opportunity Youth','Opportunity Youth%',
        'No diploma','No diploma%', 
        'HS diploma or GED', 'HS diploma or GED%',
       'Some College, no degree', 'Some College, no degree%',
       'Degree (Associate or higher)','Degree (Associate or higher)%']
opportunity_youth = opportunity_youth[col1]
opportunity_youth

In [145]:
opportunity_youth.to_csv('opportunity_youth.csv')
total_population.to_csv('total_population.csv')