# Demo 5.2 acs5 Data Profile (*acs/acs5/profile*)    

- Census Dataset:  *acs/acs5/profile*   

- **Vintage: 2022**    
 
- **Examples**: https://api.census.gov/data/2022/acs/acs5/profile/examples.html  
- **Variables**:  https://api.census.gov/data/2022/acs/acs5/profile/variables.html 
- **Home Page**: https://www.census.gov/data/developers/data-sets/acs-5year.html  
 




- **Question**:  What is the percent of population for each NC County  who have a Graduate degrees and who are 25 or older?     

  


In [1]:
import pandas as pd
import json
import requests
import pprint

import plotly.express as px

# 1. Load Spatial file of Boundary Lines  

In [2]:
f = open('Data_Spatial/Data_SpatialRegion/cb_2018_us_region_5m.geojson')
geo_boundaries = json.load(f)

In [3]:
type(geo_boundaries)

dict

In [4]:
print(json.dumps(geo_boundaries['features'][:2], indent=2))

[
  {
    "type": "Feature",
    "properties": {
      "REGIONCE": "1",
      "AFFGEOID": "0200000US1",
      "GEOID": "1",
      "NAME": "Northeast",
      "LSAD": "68",
      "ALAND": 419357835545,
      "AWATER": 50259300137
    },
    "geometry": {
      "type": "MultiPolygon",
      "coordinates": [
        [
          [
            [
              -68.376591,
              44.113762
            ],
            [
              -68.375382,
              44.11646
            ],
            [
              -68.365514,
              44.124079
            ],
            [
              -68.358388,
              44.125082
            ],
            [
              -68.35301,
              44.127884
            ],
            [
              -68.346724,
              44.127749
            ],
            [
              -68.330716,
              44.110598
            ],
            [
              -68.331032,
              44.10758
            ],
            [
              -68.338012,
   

In [5]:
geo_boundaries['features']

[{'type': 'Feature',
  'properties': {'REGIONCE': '1',
   'AFFGEOID': '0200000US1',
   'GEOID': '1',
   'NAME': 'Northeast',
   'LSAD': '68',
   'ALAND': 419357835545,
   'AWATER': 50259300137},
  'geometry': {'type': 'MultiPolygon',
   'coordinates': [[[[-68.376591, 44.113762],
      [-68.375382, 44.11646],
      [-68.365514, 44.124079],
      [-68.358388, 44.125082],
      [-68.35301, 44.127884],
      [-68.346724, 44.127749],
      [-68.330716, 44.110598],
      [-68.331032, 44.10758],
      [-68.338012, 44.101473],
      [-68.365176, 44.101464],
      [-68.376593, 44.112207],
      [-68.376591, 44.113762]]],
    [[[-68.371235, 44.193003],
      [-68.364469, 44.197534],
      [-68.355279, 44.199096],
      [-68.333227, 44.207308],
      [-68.32423, 44.205732],
      [-68.31606, 44.200244],
      [-68.314789, 44.197157],
      [-68.318476, 44.196608],
      [-68.321178, 44.199032],
      [-68.332639, 44.192131],
      [-68.339029, 44.171839],
      [-68.347416, 44.169459],
      [-68

# Get Data from API Call  

### 1. Build the API Request URL

### 1. How to Start your Data Request  
- Requests always begin with: https://api.census.gov/data  

In [6]:
base_url = "https://api.census.gov/data"

### 2. Add the Dataset Name


In [7]:
dataset_name = "/2022/acs/acs5/profile"

### 3. Start your Variable Request


In [8]:
get_start = "?get="

### 4. Add your Variables  
- **DP02_0066PE**: Percent of Population 25 years and over with Graduate or professional degree


In [9]:
get_variables = "NAME,DP02_0154E,DP02_0154PE"

###   5. Add your Geography  


In [10]:
#geography = "&for=state:*"
geography = "&for=region:*"

###   6. Put it all Together 

In [11]:
request_url = base_url + dataset_name + get_start + get_variables + geography
print("request_url = ", request_url)

request_url =  https://api.census.gov/data/2022/acs/acs5/profile?get=NAME,DP02_0154E,DP02_0154PE&for=region:*


### 2. Use *requests* library to make the API call

In [12]:
# Make API Call
r = requests.get(request_url)

api_results = r.json()

In [13]:
print(api_results)

[['NAME', 'DP02_0154E', 'DP02_0154PE', 'region'], ['Northeast Region', '19718458', '88.8', '1'], ['Midwest Region', '24066084', '87.7', '2'], ['South Region', '41779862', '87.1', '3'], ['West Region', '25519510', '90.7', '4']]


In [14]:
#pprint makes it possible to see the structure of the returned data -- but it can be very, very long!
pprint.pprint(api_results)

[['NAME', 'DP02_0154E', 'DP02_0154PE', 'region'],
 ['Northeast Region', '19718458', '88.8', '1'],
 ['Midwest Region', '24066084', '87.7', '2'],
 ['South Region', '41779862', '87.1', '3'],
 ['West Region', '25519510', '90.7', '4']]


In [15]:
type(api_results)

list

### 3. Get the data into a Dataframe  
- These Census Data results are in a list and have a specific form:  
  - The first element is a list of column names  
  - The remaining list elements are data  
  
  

In [16]:
df = pd.DataFrame(api_results)

print(df.shape)
df.head()

(5, 4)


Unnamed: 0,0,1,2,3
0,NAME,DP02_0154E,DP02_0154PE,region
1,Northeast Region,19718458,88.8,1
2,Midwest Region,24066084,87.7,2
3,South Region,41779862,87.1,3
4,West Region,25519510,90.7,4


### 4. Get the first Row into columns and then get rid of it  

In [17]:
df.columns = df.iloc[0]

df =df.iloc[1:]

print("Region")
print(df.shape)
df.head()

Region
(4, 4)


Unnamed: 0,NAME,DP02_0154E,DP02_0154PE,region
1,Northeast Region,19718458,88.8,1
2,Midwest Region,24066084,87.7,2
3,South Region,41779862,87.1,3
4,West Region,25519510,90.7,4


# Change Data Types as Needed

In [18]:
df.dtypes

0
NAME           object
DP02_0154E     object
DP02_0154PE    object
region         object
dtype: object

# Rename variable names

In [19]:
cols_to_rename = {
                  'DP02_0154E': 'Number of Households with a broadband Internet subscription',
                  'DP02_0154PE':'Percent of Households with a broadband Internet subscription',
                  'NAME': 'Region',
                  'region':'GEOID'

                  
                  
                  
                  
                  
                 }

df.rename(columns= cols_to_rename, inplace=True)

print(df.shape)
df

(4, 4)


Unnamed: 0,Region,Number of Households with a broadband Internet subscription,Percent of Households with a broadband Internet subscription,GEOID
1,Northeast Region,19718458,88.8,1
2,Midwest Region,24066084,87.7,2
3,South Region,41779862,87.1,3
4,West Region,25519510,90.7,4


## Removing region from region name

In [20]:
df['Region'] = df['Region'].str.replace(' Region','')
print(df.shape)
df

(4, 4)


Unnamed: 0,Region,Number of Households with a broadband Internet subscription,Percent of Households with a broadband Internet subscription,GEOID
1,Northeast,19718458,88.8,1
2,Midwest,24066084,87.7,2
3,South,41779862,87.1,3
4,West,25519510,90.7,4


In [21]:
df['Number of Households with a broadband Internet subscription'] = df['Number of Households with a broadband Internet subscription'].astype(int)
df['GEOID'] = df['GEOID'].astype(int)

df.dtypes

0
Region                                                          object
Number of Households with a broadband Internet subscription      int32
Percent of Households with a broadband Internet subscription    object
GEOID                                                            int32
dtype: object

In [22]:
df.head()

Unnamed: 0,Region,Number of Households with a broadband Internet subscription,Percent of Households with a broadband Internet subscription,GEOID
1,Northeast,19718458,88.8,1
2,Midwest,24066084,87.7,2
3,South,41779862,87.1,3
4,West,25519510,90.7,4


## Group by one

In [23]:
df.columns

Index(['Region', 'Number of Households with a broadband Internet subscription',
       'Percent of Households with a broadband Internet subscription',
       'GEOID'],
      dtype='object', name=0)

In [24]:
categories_to_groupby = ["Region"] 
measures_to_group = ['Number of Households with a broadband Internet subscription']

ser = df.groupby(categories_to_groupby)[measures_to_group].sum()

ser

Unnamed: 0_level_0,Number of Households with a broadband Internet subscription
Region,Unnamed: 1_level_1
Midwest,24066084
Northeast,19718458
South,41779862
West,25519510


In [25]:
print("ser is a " , type(ser))

# Check if it is a Series or Dataframe and name accordingly
if (isinstance(ser,pd.Series)): 
    print('That is a Series')
    df_grouped = ser.to_frame()
else:
    print("Not a Series but a Dataframe")
    df_grouped = ser
    
print(df_grouped.shape)
df_grouped.head()

ser is a  <class 'pandas.core.frame.DataFrame'>
Not a Series but a Dataframe
(4, 1)


Unnamed: 0_level_0,Number of Households with a broadband Internet subscription
Region,Unnamed: 1_level_1
Midwest,24066084
Northeast,19718458
South,41779862
West,25519510


In [26]:
df_grouped.reset_index(inplace=True)

print("Highest number of households by region")
print(df_grouped.shape)
df_grouped.head()

Highest number of households by region
(4, 2)


Unnamed: 0,Region,Number of Households with a broadband Internet subscription
0,Midwest,24066084
1,Northeast,19718458
2,South,41779862
3,West,25519510


In [27]:
#Sorting the values 
df_grouped.sort_values(by=['Number of Households with a broadband Internet subscription'], ascending=[False], inplace=True )


print(df.shape)
df.head(10)

(4, 4)


Unnamed: 0,Region,Number of Households with a broadband Internet subscription,Percent of Households with a broadband Internet subscription,GEOID
1,Northeast,19718458,88.8,1
2,Midwest,24066084,87.7,2
3,South,41779862,87.1,3
4,West,25519510,90.7,4


# Plotting the data


#### Horizontal Bar Chart

In [28]:
fig = px.bar(df_grouped,
            x='Region',
            y='Number of Households with a broadband Internet subscription',
            labels={'Number of Households with a broadband Internet subscription': 'Number of Households'},
            orientation = 'v',
            template = 'simple_white',
            title='Comparing Broadband Internet Access in U.S. Regions (2022)'
            )

fig.update_traces(textposition='auto',
                  #texttemplate='%{text:$.3s}'
                  #texttemplate='%{text:,}'       # This adds commas
                  #texttemplate='%{text:,.0f}'   # This adds commas and formats to zero decimal places
                  #texttemplate='%{text:,.1f}'   # This adds commas and formats to one decimal places
                  #texttemplate='%{text:$,.0f}'   # This adds commas and formats to two decimal places
                  #texttemplate='%{text:$,.2f}'  # This adds $, commas, and formats to two decimal places
                  #texttemplate='%{text:.0%}'    # This give a Percent format with zero decimal places
                  #texttemplate='%{text:.1%}'    # This give a Percent format with one decimal place
                  #texttemplate='%{text:.2%}'    # This give a Percent format with two decimal place
                  #texttemplate='%{text:.2s}M'
                  #text=[f"{v:.1f}M" for v in values_in_millions],  # Convert to 1 decimal point and add "M"
                  texttemplate='%{y:.3s}'
    
                 )

fig.show()

In [None]:
csv_file_to_create = "Regions.csv"

filename_with_path = "Data/" + csv_file_to_create
df.to_csv(filename_with_path, index=False)