In [243]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re

from concurrent.futures import ThreadPoolExecutor

from utilities.preprocessors import column_summary, model_population_table
from utilities.visualizers import disp_cat_feat, view_feat_outliers


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [244]:
# Folder holding one Excel workbook per state and per period
DATA_DIR = './data/population-data'
# Files living in DATA_DIR that are not per-state workbooks
EXCLUSIONS = ["us_populations_per_state_2001_to_2021.csv"]

# os.listdir returns entries in arbitrary order; sorting makes the file order
# (and therefore the row order of anything built from it) reproducible
files = sorted(file for file in os.listdir(DATA_DIR) if file not in EXCLUSIONS)

# Split the workbooks by the period encoded in their file names
populations_00_10 = [file for file in files if "2000-2010" in file]
populations_10_19 = [file for file in files if "2010-2019" in file]
populations_20_23 = [file for file in files if "2020-2023" in file]
len(populations_00_10), len(populations_10_19), len(populations_20_23)

(51, 51, 51)

# Read sample excel sheet

In [245]:
# Load the Alabama 2000-2010 workbook as-is: no header row is inferred and
# every cell is kept as a generic Python object
test_df = pd.read_excel(
    io=os.path.join(DATA_DIR, "Alabama_2000-2010.xls"),
    header=None,
    dtype=object,
)
test_df.head(n=40)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,table with row headers in column A and column ...,,,,,,,,,,,,,
1,Table 2. Intercensal Estimates of the Resident...,,,,,,,,,,,,,
2,Sex and Age,"April 1, 20001",Intercensal Estimates (as of July 1),,,,,,,,,,"April 1, 20102","July 1, 20103"
3,,,2000,2001.0,2002.0,2003.0,2004.0,2005.0,2006.0,2007.0,2008.0,2009.0,,
4,BOTH SEXES,4447207,4452173,4467634.0,4480089.0,4503491.0,4530729.0,4569805.0,4628981.0,4672840.0,4718206.0,4757938.0,4779736,4785298
5,.Under 5 years,296000,295185,296624.0,296046.0,295204.0,295970.0,296441.0,297222.0,300300.0,304842.0,305412.0,304957,304840
6,.5 to 9 years,315369,313178,307526.0,302632.0,299148.0,297554.0,298450.0,303581.0,306013.0,306682.0,307864.0,308229,308125
7,.10 to 14 years,320266,321372,323615.0,325008.0,326642.0,326228.0,323028.0,321867.0,320407.0,319503.0,319072.0,319655,319314
8,.15 to 19 years,324583,325612,321866.0,320749.0,321655.0,325095.0,330753.0,337003.0,341279.0,345580.0,346611.0,343471,341504
9,.20 to 24 years,306876,309170,318741.0,322812.0,326983.0,326749.0,326727.0,326239.0,327293.0,328751.0,332117.0,335322,336601


In [246]:
# Row label of the first row whose column 0 is exactly "MALE"
male_start = test_df.loc[test_df[0].eq("MALE")].index.tolist()[0]
male_start

39

In [247]:
# Keep every row from the "MALE" header down to the end of the sheet
pop_brackets = test_df.iloc[male_start:, :]
pop_brackets

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
39,MALE,2146560,2149338,2158138,2165719,2179422,2192872,2213382,2243501,2265565,2287949,2309779,2320188,2323317
40,.Under 5 years,151071,150609,151410,150856,150594,150699,150960,151442,153128,155061,155463,155265,155196
41,.5 to 9 years,161798,160685,157513,154832,152874,151948,152574,155157,156345,156770,157145,157340,157294
42,.10 to 14 years,164637,165170,166253,166796,167376,167198,165333,164608,163819,163445,163165,163417,163222
43,.15 to 19 years,164416,165156,163598,163527,164178,165836,169052,172295,174268,176205,176744,175151,174172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112,Note: Median age is calculated based on single...,,,,,,,,,,,,,
113,Suggested Citation:,,,,,,,,,,,,,
114,Table 2. Intercensal Estimates of the Resident...,,,,,,,,,,,,,
115,"Source: U.S. Census Bureau, Population Division",,,,,,,,,,,,,


In [248]:
# Row label of the "FEMALE" section header
female_start = pop_brackets.loc[pop_brackets[0].eq("FEMALE")].index.tolist()[0]
# Exactly two ".Median age (years)" rows are expected below "MALE":
# the first closes the male section, the second closes the female section
(male_end, female_end) = pop_brackets.loc[pop_brackets[0].eq(".Median age (years)")].index.tolist()
(male_end, female_end)

(73, 108)

# split the excel spreadsheet into the male and female population brackets

In [249]:
# Rows from the "MALE" header up to (not including) its median-age row
male_pop_bracket = test_df.iloc[male_start:male_end, :]
male_pop_bracket

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
39,MALE,2146560.0,2149338.0,2158138.0,2165719.0,2179422.0,2192872.0,2213382.0,2243501.0,2265565.0,2287949.0,2309779.0,2320188.0,2323317.0
40,.Under 5 years,151071.0,150609.0,151410.0,150856.0,150594.0,150699.0,150960.0,151442.0,153128.0,155061.0,155463.0,155265.0,155196.0
41,.5 to 9 years,161798.0,160685.0,157513.0,154832.0,152874.0,151948.0,152574.0,155157.0,156345.0,156770.0,157145.0,157340.0,157294.0
42,.10 to 14 years,164637.0,165170.0,166253.0,166796.0,167376.0,167198.0,165333.0,164608.0,163819.0,163445.0,163165.0,163417.0,163222.0
43,.15 to 19 years,164416.0,165156.0,163598.0,163527.0,164178.0,165836.0,169052.0,172295.0,174268.0,176205.0,176744.0,175151.0,174172.0
44,.20 to 24 years,151811.0,152937.0,157924.0,160193.0,163064.0,163013.0,163055.0,163368.0,163868.0,164488.0,165830.0,167520.0,168170.0
45,.25 to 29 years,149270.0,148063.0,141826.0,138866.0,138346.0,139913.0,143069.0,148916.0,151122.0,153665.0,154238.0,153716.0,154413.0
46,.30 to 34 years,148685.0,148363.0,148924.0,149479.0,149716.0,147796.0,145535.0,141715.0,140442.0,140890.0,144437.0,146424.0,147553.0
47,.35 to 39 years,166595.0,165784.0,161913.0,156961.0,152711.0,149728.0,148720.0,151475.0,153426.0,153863.0,153311.0,151078.0,150161.0
48,.40 to 44 years,168344.0,168611.0,169104.0,168292.0,167519.0,167409.0,165646.0,163182.0,159582.0,155950.0,154308.0,152707.0,152560.0


In [250]:
# Rows from the "FEMALE" header up to (not including) its median-age row
female_pop_bracket = test_df.iloc[female_start:female_end, :]
female_pop_bracket

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
74,FEMALE,2300647.0,2302835.0,2309496.0,2314370.0,2324069.0,2337857.0,2356423.0,2385480.0,2407275.0,2430257.0,2448159.0,2459548.0,2461981.0
75,.Under 5 years,144929.0,144576.0,145214.0,145190.0,144610.0,145271.0,145481.0,145780.0,147172.0,149781.0,149949.0,149692.0,149644.0
76,.5 to 9 years,153571.0,152493.0,150013.0,147800.0,146274.0,145606.0,145876.0,148424.0,149668.0,149912.0,150719.0,150889.0,150831.0
77,.10 to 14 years,155629.0,156202.0,157362.0,158212.0,159266.0,159030.0,157695.0,157259.0,156588.0,156058.0,155907.0,156238.0,156092.0
78,.15 to 19 years,160167.0,160456.0,158268.0,157222.0,157477.0,159259.0,161701.0,164708.0,167011.0,169375.0,169867.0,168320.0,167332.0
79,.20 to 24 years,155065.0,156233.0,160817.0,162619.0,163919.0,163736.0,163672.0,162871.0,163425.0,164263.0,166287.0,167802.0,168431.0
80,.25 to 29 years,151927.0,150616.0,144937.0,142137.0,142041.0,143951.0,147499.0,153897.0,156497.0,157839.0,157961.0,157318.0,157516.0
81,.30 to 34 years,153157.0,152816.0,153262.0,153189.0,153006.0,152100.0,149728.0,146533.0,145591.0,146382.0,148915.0,151464.0,152567.0
82,.35 to 39 years,173718.0,172909.0,168957.0,164153.0,159449.0,155830.0,154736.0,157188.0,159044.0,160006.0,159486.0,157352.0,156281.0
83,.40 to 44 years,176874.0,177209.0,178029.0,176731.0,175672.0,174844.0,173653.0,170861.0,166904.0,162815.0,159544.0,158364.0,158196.0


#### Remove the following
* column `1`, column `12`, and column `13` (column `1` and column `12` hold the April 1 census figures rather than the July 1 estimates we want, and column `13` is the July 1, 2010 estimate, which already exists in the next set of population years)
* rows that contain only a dot symbol in column `0` and `NaN` everywhere else, i.e. `[. NaN NaN NaN NaN ... NaN]`
* and the `MALE` header row

#### we also rename the columns to be `bracket`, `2000`, `2001`, `2002`, `2003`, `2004`, `2005`, `2006`, `2007`, `2008`, `2009`

In [251]:
# Keep every row whose label is neither the lone "." separator nor the "MALE" header
cond = ~male_pop_bracket[0].isin([".", "MALE"])
# Column 0 is the bracket label; columns 2..11 are the July 1 estimates for 2000..2009
name_map = {0: "bracket", **{col: 1998 + col for col in range(2, 12)}}
temp_male = (
    male_pop_bracket.loc[cond]
    .drop(columns=[1, 12, 13])
    .rename(columns=name_map)
    .reset_index(drop=True)
)
temp_male

Unnamed: 0,bracket,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009
0,.Under 5 years,150609,151410,150856,150594,150699,150960,151442,153128,155061,155463
1,.5 to 9 years,160685,157513,154832,152874,151948,152574,155157,156345,156770,157145
2,.10 to 14 years,165170,166253,166796,167376,167198,165333,164608,163819,163445,163165
3,.15 to 19 years,165156,163598,163527,164178,165836,169052,172295,174268,176205,176744
4,.20 to 24 years,152937,157924,160193,163064,163013,163055,163368,163868,164488,165830
5,.25 to 29 years,148063,141826,138866,138346,139913,143069,148916,151122,153665,154238
6,.30 to 34 years,148363,148924,149479,149716,147796,145535,141715,140442,140890,144437
7,.35 to 39 years,165784,161913,156961,152711,149728,148720,151475,153426,153863,153311
8,.40 to 44 years,168611,169104,168292,167519,167409,165646,163182,159582,155950,154308
9,.45 to 49 years,153919,157109,160859,163830,165310,167466,169420,169469,169523,170289


#### we also apply transformations to the bracket column by renaming say `.5 to 9 years` to `5 <= 9`

In [252]:
def helper(bracket: str | None):
    bracket = bracket.lower()
    keyword = re.search(r"(under|to|and over)", bracket)
    keyword = np.nan if not keyword else keyword[0]
    numbers = re.findall(r"\d+", bracket)
    # print(keyword)
    # print(numbers)

    # e.g. "under 5" becomes "< 5"
    if keyword == "under":
        return f"< {numbers[-1]}"
    
    # e.g. "5 to 9" becomes "5 <= 9"
    elif keyword == "to":
        return f"{numbers[0]} <= {numbers[-1]}"
    
    # e.g. "9 and over" becomes ">= 9"
    elif keyword == "and over": 
        return f">= {numbers[-1]}"

In [253]:
# Rewrite every raw bracket label with the `helper` defined above
temp_male["bracket"] = temp_male["bracket"].map(helper)
temp_male

Unnamed: 0,bracket,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009
0,< 5,150609,151410,150856,150594,150699,150960,151442,153128,155061,155463
1,5 <= 9,160685,157513,154832,152874,151948,152574,155157,156345,156770,157145
2,10 <= 14,165170,166253,166796,167376,167198,165333,164608,163819,163445,163165
3,15 <= 19,165156,163598,163527,164178,165836,169052,172295,174268,176205,176744
4,20 <= 24,152937,157924,160193,163064,163013,163055,163368,163868,164488,165830
5,25 <= 29,148063,141826,138866,138346,139913,143069,148916,151122,153665,154238
6,30 <= 34,148363,148924,149479,149716,147796,145535,141715,140442,140890,144437
7,35 <= 39,165784,161913,156961,152711,149728,148720,151475,153426,153863,153311
8,40 <= 44,168611,169104,168292,167519,167409,165646,163182,159582,155950,154308
9,45 <= 49,153919,157109,160859,163830,165310,167466,169420,169469,169523,170289


#### we remove duplicated bracket rows (rows that are identical in every column)

In [254]:
# Drop rows that repeat an earlier row in every column, keeping the first
# occurrence and renumbering the index from 0
temp_male = temp_male.drop_duplicates(keep="first", ignore_index=True)

#### now when we transpose this dataframe...

In [255]:
# Swap rows and columns: brackets become columns, years become the index
temp_male = temp_male.transpose()
temp_male

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
bracket,< 5,5 <= 9,10 <= 14,15 <= 19,20 <= 24,25 <= 29,30 <= 34,35 <= 39,40 <= 44,45 <= 49,...,5 <= 13,14 <= 17,18 <= 64,18 <= 24,25 <= 44,45 <= 64,>= 65,>= 16,>= 18,15 <= 44
2000,150609,160685,165170,165156,152937,148063,148363,165784,168611,153919,...,293198,131231,1342158,219519,630821,491818,232142,1639774,1574300,948914
2001,151410,157513,166253,163598,157924,141826,148924,161913,169104,157109,...,291195,131039,1349690,223054,621767,504869,234804,1650255,1584494,943289
2002,150856,154832,166796,163527,160193,138866,149479,156961,168292,160859,...,289177,131255,1357543,224916,613598,519029,236888,1660508,1594431,937318
2003,150594,152874,167376,164178,163064,138346,149716,152711,167519,163830,...,287311,131583,1369733,228598,608292,532843,240201,1675823,1609934,935534
2004,150699,151948,167198,165836,163013,139913,147796,149728,167409,165310,...,284421,134325,1380525,229249,604846,546430,242902,1689549,1623427,933695
2005,150960,152574,165333,169052,163055,143069,145535,148720,165646,167466,...,283902,136558,1393897,229554,602970,561373,248065,1709133,1641962,935077
2006,151442,155157,164608,172295,163368,148916,141715,151475,163182,169420,...,286388,138224,1413074,230816,605288,576970,254373,1737495,1667447,940951
2007,153128,156345,163819,174268,163868,151122,140442,153426,159582,169469,...,286623,139050,1427223,232627,604572,590024,259541,1758143,1686764,942708
2008,155061,156770,163445,176205,164488,153665,140890,153863,155950,169523,...,287133,137553,1441219,236222,604368,600629,266983,1778263,1708202,945061


#### we want the first row (which now holds the age brackets) to become the column headers, and the index (which now holds the years) to become a regular column

In [256]:
# Move the transposed index (the "bracket" label followed by the years) into a
# regular column, then promote the first row to be the column headers.
# Note: the "bracket" value is NOT excluded here - it becomes the header of the
# first column and is renamed to "year" in a later cell.
temp_male = temp_male.reset_index()
headers = temp_male.iloc[0]
temp_male.columns = headers
# drop the row that was just promoted to headers
temp_male = temp_male.iloc[1:]

In [257]:
# The column headed "bracket" actually holds the years, so label it accordingly
final_name_map = dict(bracket="year")
final_male_pop_bracket = temp_male.rename(final_name_map, axis="columns")
final_male_pop_bracket

Unnamed: 0,year,< 5,5 <= 9,10 <= 14,15 <= 19,20 <= 24,25 <= 29,30 <= 34,35 <= 39,40 <= 44,...,5 <= 13,14 <= 17,18 <= 64,18 <= 24,25 <= 44,45 <= 64,>= 65,>= 16,>= 18,15 <= 44
1,2000,150609,160685,165170,165156,152937,148063,148363,165784,168611,...,293198,131231,1342158,219519,630821,491818,232142,1639774,1574300,948914
2,2001,151410,157513,166253,163598,157924,141826,148924,161913,169104,...,291195,131039,1349690,223054,621767,504869,234804,1650255,1584494,943289
3,2002,150856,154832,166796,163527,160193,138866,149479,156961,168292,...,289177,131255,1357543,224916,613598,519029,236888,1660508,1594431,937318
4,2003,150594,152874,167376,164178,163064,138346,149716,152711,167519,...,287311,131583,1369733,228598,608292,532843,240201,1675823,1609934,935534
5,2004,150699,151948,167198,165836,163013,139913,147796,149728,167409,...,284421,134325,1380525,229249,604846,546430,242902,1689549,1623427,933695
6,2005,150960,152574,165333,169052,163055,143069,145535,148720,165646,...,283902,136558,1393897,229554,602970,561373,248065,1709133,1641962,935077
7,2006,151442,155157,164608,172295,163368,148916,141715,151475,163182,...,286388,138224,1413074,230816,605288,576970,254373,1737495,1667447,940951
8,2007,153128,156345,163819,174268,163868,151122,140442,153426,159582,...,286623,139050,1427223,232627,604572,590024,259541,1758143,1686764,942708
9,2008,155061,156770,163445,176205,164488,153665,140890,153863,155950,...,287133,137553,1441219,236222,604368,600629,266983,1778263,1708202,945061
10,2009,155463,157145,163165,176744,165830,154238,144437,153311,154308,...,287583,136327,1456515,238974,606294,611247,273891,1800104,1730406,948868


In [258]:
# Tag every row of this table with the sex it describes
final_male_pop_bracket = final_male_pop_bracket.assign(sex="Male")

In [259]:
# Tag every row of this table with the state the workbook belongs to
final_male_pop_bracket = final_male_pop_bracket.assign(state="Alabama")

In [260]:
# Display the finished male table: one row per year, one column per age bracket,
# plus the "sex" and "state" columns
final_male_pop_bracket

Unnamed: 0,year,< 5,5 <= 9,10 <= 14,15 <= 19,20 <= 24,25 <= 29,30 <= 34,35 <= 39,40 <= 44,...,18 <= 64,18 <= 24,25 <= 44,45 <= 64,>= 65,>= 16,>= 18,15 <= 44,sex,state
1,2000,150609,160685,165170,165156,152937,148063,148363,165784,168611,...,1342158,219519,630821,491818,232142,1639774,1574300,948914,Male,Alabama
2,2001,151410,157513,166253,163598,157924,141826,148924,161913,169104,...,1349690,223054,621767,504869,234804,1650255,1584494,943289,Male,Alabama
3,2002,150856,154832,166796,163527,160193,138866,149479,156961,168292,...,1357543,224916,613598,519029,236888,1660508,1594431,937318,Male,Alabama
4,2003,150594,152874,167376,164178,163064,138346,149716,152711,167519,...,1369733,228598,608292,532843,240201,1675823,1609934,935534,Male,Alabama
5,2004,150699,151948,167198,165836,163013,139913,147796,149728,167409,...,1380525,229249,604846,546430,242902,1689549,1623427,933695,Male,Alabama
6,2005,150960,152574,165333,169052,163055,143069,145535,148720,165646,...,1393897,229554,602970,561373,248065,1709133,1641962,935077,Male,Alabama
7,2006,151442,155157,164608,172295,163368,148916,141715,151475,163182,...,1413074,230816,605288,576970,254373,1737495,1667447,940951,Male,Alabama
8,2007,153128,156345,163819,174268,163868,151122,140442,153426,159582,...,1427223,232627,604572,590024,259541,1758143,1686764,942708,Male,Alabama
9,2008,155061,156770,163445,176205,164488,153665,140890,153863,155950,...,1441219,236222,604368,600629,266983,1778263,1708202,945061,Male,Alabama
10,2009,155463,157145,163165,176744,165830,154238,144437,153311,154308,...,1456515,238974,606294,611247,273891,1800104,1730406,948868,Male,Alabama


In [261]:
# Summarise each column with the project helper `column_summary`
# NOTE(review): judging by its output it reports dtype, null counts and distinct values - confirm in utilities.preprocessors
column_summary(final_male_pop_bracket)

Unnamed: 0,col_name,col_dtype,num_of_nulls,num_of_non_nulls,num_of_distinct_values,distinct_values_counts
0,year,object,0,10,10,"{2000: 1, 2001: 1, 2002: 1, 2003: 1, 2004: 1, ..."
1,< 5,object,0,10,10,"{150609: 1, 151410: 1, 150856: 1, 150594: 1, 1..."
2,5 <= 9,object,0,10,10,"{160685: 1, 157513: 1, 154832: 1, 152874: 1, 1..."
3,10 <= 14,object,0,10,10,"{165170: 1, 166253: 1, 166796: 1, 167376: 1, 1..."
4,15 <= 19,object,0,10,10,"{165156: 1, 163598: 1, 163527: 1, 164178: 1, 1..."
5,20 <= 24,object,0,10,10,"{152937: 1, 157924: 1, 160193: 1, 163064: 1, 1..."
6,25 <= 29,object,0,10,10,"{148063: 1, 141826: 1, 138866: 1, 138346: 1, 1..."
7,30 <= 34,object,0,10,10,"{148363: 1, 148924: 1, 149479: 1, 149716: 1, 1..."
8,35 <= 39,object,0,10,10,"{165784: 1, 161913: 1, 156961: 1, 152711: 1, 1..."
9,40 <= 44,object,0,10,10,"{168611: 1, 169104: 1, 168292: 1, 167519: 1, 1..."


#### We've done our preprocessing on the male population age brackets; now we have to do this same preprocessing on the female demographic. We can achieve this by writing a function that implements the prototype above for both the male and the female populations, and combines the resulting dataframes into a single dataframe for easy collation

In [262]:
# Run the project helper that packages the prototype above on the sample workbook
# NOTE(review): presumably returns the male and female tables stacked into one frame - confirm in utilities.preprocessors
model_population_table(test_df, "Alabama")

Unnamed: 0,year,< 5,5 <= 9,10 <= 14,15 <= 19,20 <= 24,25 <= 29,30 <= 34,35 <= 39,40 <= 44,...,18 <= 64,18 <= 24,25 <= 44,45 <= 64,>= 65,>= 16,>= 18,15 <= 44,sex,state
0,2000,150609,160685,165170,165156,152937,148063,148363,165784,168611,...,1342158,219519,630821,491818,232142,1639774,1574300,948914,male,Alabama
1,2001,151410,157513,166253,163598,157924,141826,148924,161913,169104,...,1349690,223054,621767,504869,234804,1650255,1584494,943289,male,Alabama
2,2002,150856,154832,166796,163527,160193,138866,149479,156961,168292,...,1357543,224916,613598,519029,236888,1660508,1594431,937318,male,Alabama
3,2003,150594,152874,167376,164178,163064,138346,149716,152711,167519,...,1369733,228598,608292,532843,240201,1675823,1609934,935534,male,Alabama
4,2004,150699,151948,167198,165836,163013,139913,147796,149728,167409,...,1380525,229249,604846,546430,242902,1689549,1623427,933695,male,Alabama
5,2005,150960,152574,165333,169052,163055,143069,145535,148720,165646,...,1393897,229554,602970,561373,248065,1709133,1641962,935077,male,Alabama
6,2006,151442,155157,164608,172295,163368,148916,141715,151475,163182,...,1413074,230816,605288,576970,254373,1737495,1667447,940951,male,Alabama
7,2007,153128,156345,163819,174268,163868,151122,140442,153426,159582,...,1427223,232627,604572,590024,259541,1758143,1686764,942708,male,Alabama
8,2008,155061,156770,163445,176205,164488,153665,140890,153863,155950,...,1441219,236222,604368,600629,266983,1778263,1708202,945061,male,Alabama
9,2009,155463,157145,163165,176744,165830,154238,144437,153311,154308,...,1456515,238974,606294,611247,273891,1800104,1730406,948868,male,Alabama


In [263]:
def helper(file):
    """Read one state workbook from DATA_DIR and reshape it with `model_population_table`.

    NOTE(review): this redefines `helper` and shadows the bracket-label helper
    defined earlier in the notebook; re-running the earlier `.apply(helper)`
    cell after this one would call this function instead.

    Args:
        file: workbook file name inside DATA_DIR, e.g. "Alabama_2000-2010.xls".

    Returns:
        Whatever `model_population_table` returns for the workbook and the
        state name parsed from `file`.
    """
    file_path = os.path.join(DATA_DIR, file)

    # Everything before the "_YYYY-YYYY" period suffix is the state name, so
    # multi-word names are kept whole instead of being cut at the first
    # non-letter character
    named = re.search(r"^(.+?)_\d{4}-\d{4}", file)
    if named:
        state = named[1]
    else:
        # Fall back to the leading run of letters for unexpected file names
        leading = re.search(r"(^[A-Za-z]+)", file)
        state = "Unknown" if not leading else leading[0]

    # read excel file
    df = pd.read_excel(file_path, dtype=object, header=None)
    state_population = model_population_table(df, state)
    return state_population


# Load the 2000-2010 workbooks concurrently; map() yields results in input order
with ThreadPoolExecutor() as exe:
    state_populations = [*exe.map(helper, populations_00_10)]

# Stack the per-state frames row-wise under a fresh 0..n-1 index
state_populations_df = pd.concat(objs=state_populations, ignore_index=True, axis=0)

In [265]:
state_populations_df

Unnamed: 0,year,< 5,5 <= 9,10 <= 14,15 <= 19,20 <= 24,25 <= 29,30 <= 34,35 <= 39,40 <= 44,...,18 <= 64,18 <= 24,25 <= 44,45 <= 64,>= 65,>= 16,>= 18,15 <= 44,sex,state
0,2000,150609,160685,165170,165156,152937,148063,148363,165784,168611,...,1342158,219519,630821,491818,232142,1639774,1574300,948914,male,Alabama
1,2001,151410,157513,166253,163598,157924,141826,148924,161913,169104,...,1349690,223054,621767,504869,234804,1650255,1584494,943289,male,Alabama
2,2002,150856,154832,166796,163527,160193,138866,149479,156961,168292,...,1357543,224916,613598,519029,236888,1660508,1594431,937318,male,Alabama
3,2003,150594,152874,167376,164178,163064,138346,149716,152711,167519,...,1369733,228598,608292,532843,240201,1675823,1609934,935534,male,Alabama
4,2004,150699,151948,167198,165836,163013,139913,147796,149728,167409,...,1380525,229249,604846,546430,242902,1689549,1623427,933695,male,Alabama
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1015,2005,16204,15752,17150,18617,19016,15535,14928,14850,18479,...,159656,26697,63792,69167,34028,200947,193684,101425,female,Wyoming
1016,2006,16819,15919,16859,18461,19148,16650,14884,15099,17607,...,162338,26663,64240,71435,34480,204146,196818,101849,female,Wyoming
1017,2007,17882,16355,16839,18468,19377,17760,15120,15691,16780,...,165606,26835,65351,73420,35084,208067,200690,103196,female,Wyoming
1018,2008,18618,16929,17231,18513,19163,18690,15710,15978,16377,...,168422,26822,66755,74845,35928,211714,204350,104431,female,Wyoming


In [None]:
# Note: this sums only the under-five age bracket per year;
# including all the other age brackets would give
# a bigger total population value per year
state_populations_df.groupby("year").agg(
    total_population=pd.NamedAgg(column="< 5", aggfunc="sum")
)

Unnamed: 0_level_0,total_population
year,Unnamed: 1_level_1
2000,19178293
2001,19298217
2002,19429192
2003,19592446
2004,19785885
2005,19917400
2006,19938883
2007,20125962
2008,20271127
2009,20244518
