In [453]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re

from concurrent.futures import ThreadPoolExecutor

from utilities.preprocessors import column_summary, model_population_table
from utilities.visualizers import disp_cat_feat, view_feat_outliers


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [454]:
DATA_DIR = './data/population-data'
EXCLUSIONS = ["us_populations_per_state_2001_to_2021.csv"]

# every entry of the data directory except the explicitly excluded files
files = [name for name in os.listdir(DATA_DIR) if name not in EXCLUSIONS]

# bucket the workbooks by the year range embedded in their file names
populations_00_10 = [name for name in files if "2000-2010" in name]
populations_10_19 = [name for name in files if "2010-2019" in name]
populations_20_23 = [name for name in files if "2020-2023" in name]
len(populations_00_10), len(populations_10_19), len(populations_20_23)

(51, 51, 51)

# Read sample excel sheet

In [455]:
# load the raw workbook with no header row (header=None) and every cell kept as a Python object
test_df = pd.read_excel(os.path.join(DATA_DIR, "Alabama_2000-2010.xls"), dtype=object, header=None)
test_df.head(40)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,table with row headers in column A and column ...,,,,,,,,,,,,,
1,Table 2. Intercensal Estimates of the Resident...,,,,,,,,,,,,,
2,Sex and Age,"April 1, 20001",Intercensal Estimates (as of July 1),,,,,,,,,,"April 1, 20102","July 1, 20103"
3,,,2000,2001.0,2002.0,2003.0,2004.0,2005.0,2006.0,2007.0,2008.0,2009.0,,
4,BOTH SEXES,4447207,4452173,4467634.0,4480089.0,4503491.0,4530729.0,4569805.0,4628981.0,4672840.0,4718206.0,4757938.0,4779736,4785298
5,.Under 5 years,296000,295185,296624.0,296046.0,295204.0,295970.0,296441.0,297222.0,300300.0,304842.0,305412.0,304957,304840
6,.5 to 9 years,315369,313178,307526.0,302632.0,299148.0,297554.0,298450.0,303581.0,306013.0,306682.0,307864.0,308229,308125
7,.10 to 14 years,320266,321372,323615.0,325008.0,326642.0,326228.0,323028.0,321867.0,320407.0,319503.0,319072.0,319655,319314
8,.15 to 19 years,324583,325612,321866.0,320749.0,321655.0,325095.0,330753.0,337003.0,341279.0,345580.0,346611.0,343471,341504
9,.20 to 24 years,306876,309170,318741.0,322812.0,326983.0,326749.0,326727.0,326239.0,327293.0,328751.0,332117.0,335322,336601


In [456]:
# index of the first row whose label is exactly "MALE", i.e. where the male block starts
male_start = test_df[test_df[0] == "MALE"].index.to_list()[0]
male_start

39

In [457]:
# everything from the "MALE" row to the end of the sheet (male block, female block and the footnotes)
pop_brackets = test_df.iloc[male_start:]
pop_brackets

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
39,MALE,2146560,2149338,2158138,2165719,2179422,2192872,2213382,2243501,2265565,2287949,2309779,2320188,2323317
40,.Under 5 years,151071,150609,151410,150856,150594,150699,150960,151442,153128,155061,155463,155265,155196
41,.5 to 9 years,161798,160685,157513,154832,152874,151948,152574,155157,156345,156770,157145,157340,157294
42,.10 to 14 years,164637,165170,166253,166796,167376,167198,165333,164608,163819,163445,163165,163417,163222
43,.15 to 19 years,164416,165156,163598,163527,164178,165836,169052,172295,174268,176205,176744,175151,174172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112,Note: Median age is calculated based on single...,,,,,,,,,,,,,
113,Suggested Citation:,,,,,,,,,,,,,
114,Table 2. Intercensal Estimates of the Resident...,,,,,,,,,,,,,
115,"Source: U.S. Census Bureau, Population Division",,,,,,,,,,,,,


In [458]:
# index of the first "FEMALE" row, i.e. where the female block starts
female_start = pop_brackets[pop_brackets[0] == "FEMALE"].index.to_list()[0]
# the unpacking requires exactly two ".Median age (years)" rows from the "MALE" row onwards;
# the first one closes the male block and the second one closes the female block
male_end, female_end = pop_brackets[pop_brackets[0] == ".Median age (years)"].index.to_list()
male_end, female_end

(73, 108)

# split the excel spreadsheet into the male and female population brackets

In [459]:
# rows from the "MALE" header up to, but not including, the male median-age row
male_pop_bracket = test_df.iloc[male_start:male_end]
male_pop_bracket

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
39,MALE,2146560.0,2149338.0,2158138.0,2165719.0,2179422.0,2192872.0,2213382.0,2243501.0,2265565.0,2287949.0,2309779.0,2320188.0,2323317.0
40,.Under 5 years,151071.0,150609.0,151410.0,150856.0,150594.0,150699.0,150960.0,151442.0,153128.0,155061.0,155463.0,155265.0,155196.0
41,.5 to 9 years,161798.0,160685.0,157513.0,154832.0,152874.0,151948.0,152574.0,155157.0,156345.0,156770.0,157145.0,157340.0,157294.0
42,.10 to 14 years,164637.0,165170.0,166253.0,166796.0,167376.0,167198.0,165333.0,164608.0,163819.0,163445.0,163165.0,163417.0,163222.0
43,.15 to 19 years,164416.0,165156.0,163598.0,163527.0,164178.0,165836.0,169052.0,172295.0,174268.0,176205.0,176744.0,175151.0,174172.0
44,.20 to 24 years,151811.0,152937.0,157924.0,160193.0,163064.0,163013.0,163055.0,163368.0,163868.0,164488.0,165830.0,167520.0,168170.0
45,.25 to 29 years,149270.0,148063.0,141826.0,138866.0,138346.0,139913.0,143069.0,148916.0,151122.0,153665.0,154238.0,153716.0,154413.0
46,.30 to 34 years,148685.0,148363.0,148924.0,149479.0,149716.0,147796.0,145535.0,141715.0,140442.0,140890.0,144437.0,146424.0,147553.0
47,.35 to 39 years,166595.0,165784.0,161913.0,156961.0,152711.0,149728.0,148720.0,151475.0,153426.0,153863.0,153311.0,151078.0,150161.0
48,.40 to 44 years,168344.0,168611.0,169104.0,168292.0,167519.0,167409.0,165646.0,163182.0,159582.0,155950.0,154308.0,152707.0,152560.0


In [460]:
# rows from the "FEMALE" header up to, but not including, the female median-age row
female_pop_bracket = test_df.iloc[female_start:female_end]
female_pop_bracket

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
74,FEMALE,2300647.0,2302835.0,2309496.0,2314370.0,2324069.0,2337857.0,2356423.0,2385480.0,2407275.0,2430257.0,2448159.0,2459548.0,2461981.0
75,.Under 5 years,144929.0,144576.0,145214.0,145190.0,144610.0,145271.0,145481.0,145780.0,147172.0,149781.0,149949.0,149692.0,149644.0
76,.5 to 9 years,153571.0,152493.0,150013.0,147800.0,146274.0,145606.0,145876.0,148424.0,149668.0,149912.0,150719.0,150889.0,150831.0
77,.10 to 14 years,155629.0,156202.0,157362.0,158212.0,159266.0,159030.0,157695.0,157259.0,156588.0,156058.0,155907.0,156238.0,156092.0
78,.15 to 19 years,160167.0,160456.0,158268.0,157222.0,157477.0,159259.0,161701.0,164708.0,167011.0,169375.0,169867.0,168320.0,167332.0
79,.20 to 24 years,155065.0,156233.0,160817.0,162619.0,163919.0,163736.0,163672.0,162871.0,163425.0,164263.0,166287.0,167802.0,168431.0
80,.25 to 29 years,151927.0,150616.0,144937.0,142137.0,142041.0,143951.0,147499.0,153897.0,156497.0,157839.0,157961.0,157318.0,157516.0
81,.30 to 34 years,153157.0,152816.0,153262.0,153189.0,153006.0,152100.0,149728.0,146533.0,145591.0,146382.0,148915.0,151464.0,152567.0
82,.35 to 39 years,173718.0,172909.0,168957.0,164153.0,159449.0,155830.0,154736.0,157188.0,159044.0,160006.0,159486.0,157352.0,156281.0
83,.40 to 44 years,176874.0,177209.0,178029.0,176731.0,175672.0,174844.0,173653.0,170861.0,166904.0,162815.0,159544.0,158364.0,158196.0


#### Remove the following
* column `1`, column `12`, and column `13` (columns `1` and `12` contain the April 1 population estimates rather than the July 1 estimates used for every other year, and column `13` is the July 1, 2010 estimate, which already exists in the next set of population years)
* rows that are mostly NaN and hold only a dot symbol in column `0`, i.e. `[. NaN NaN NaN NaN ... NaN]`
* and the `MALE` total row

#### we also rename the columns to be `bracket`, `2000`, `2001`, `2002`, `2003`, `2004`, `2005`, `2006`, `2007`, `2008`, `2009`

In [461]:
# columns 1, 12 and 13 are the April 1 / July 1 2010 columns that get dropped
cols_to_remove = [1, 12, 13]

# keep only rows whose label is neither a bare "." nor the "MALE" total row
cond = ~((male_pop_bracket[0] == ".") | (male_pop_bracket[0] == "MALE"))

# column 0 holds the bracket label; columns 2..11 hold the years 2000..2009
name_map = {0: "bracket", **{col: col + 1998 for col in range(2, 12)}}

temp_male = (
    male_pop_bracket[cond]
    .drop(columns=cols_to_remove)
    .rename(columns=name_map)
    .reset_index(drop=True)
)
temp_male

Unnamed: 0,bracket,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009
0,.Under 5 years,150609,151410,150856,150594,150699,150960,151442,153128,155061,155463
1,.5 to 9 years,160685,157513,154832,152874,151948,152574,155157,156345,156770,157145
2,.10 to 14 years,165170,166253,166796,167376,167198,165333,164608,163819,163445,163165
3,.15 to 19 years,165156,163598,163527,164178,165836,169052,172295,174268,176205,176744
4,.20 to 24 years,152937,157924,160193,163064,163013,163055,163368,163868,164488,165830
5,.25 to 29 years,148063,141826,138866,138346,139913,143069,148916,151122,153665,154238
6,.30 to 34 years,148363,148924,149479,149716,147796,145535,141715,140442,140890,144437
7,.35 to 39 years,165784,161913,156961,152711,149728,148720,151475,153426,153863,153311
8,.40 to 44 years,168611,169104,168292,167519,167409,165646,163182,159582,155950,154308
9,.45 to 49 years,153919,157109,160859,163830,165310,167466,169420,169469,169523,170289


#### we also apply transformations to the bracket column by renaming, say, `.5 to 9 years` to `_5_to_9`

In [462]:
def helper(bracket: str | None):
    bracket = bracket.lower()
    keyword = re.search(r"(under|to|and over|\+)", bracket)
    keyword = np.nan if not keyword else keyword[0]
    numbers = re.findall(r"\d+", bracket)
    # print(keyword)
    # print(numbers)

    # e.g. "under 5" becomes "_under_5"
    if keyword == "under":
        return f"_under_{numbers[-1]}"
    
    # e.g. "5 to 9" becomes "_5_to_9"
    elif keyword == "to":
        return f"_{numbers[0]}_to_{numbers[-1]}"
    
    # e.g. "9 and over" becomes "_9_and_over"
    elif keyword == "and over" or keyword == "+": 
        return f"_{numbers[-1]}_and_over"
    
    return f"_{numbers[-1]}"

In [463]:
# normalise every raw bracket label, e.g. ".5 to 9 years" -> "_5_to_9"
temp_male["bracket"] = temp_male["bracket"].apply(helper)
temp_male

Unnamed: 0,bracket,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009
0,_under_5,150609,151410,150856,150594,150699,150960,151442,153128,155061,155463
1,_5_to_9,160685,157513,154832,152874,151948,152574,155157,156345,156770,157145
2,_10_to_14,165170,166253,166796,167376,167198,165333,164608,163819,163445,163165
3,_15_to_19,165156,163598,163527,164178,165836,169052,172295,174268,176205,176744
4,_20_to_24,152937,157924,160193,163064,163013,163055,163368,163868,164488,165830
5,_25_to_29,148063,141826,138866,138346,139913,143069,148916,151122,153665,154238
6,_30_to_34,148363,148924,149479,149716,147796,145535,141715,140442,140890,144437
7,_35_to_39,165784,161913,156961,152711,149728,148720,151475,153426,153863,153311
8,_40_to_44,168611,169104,168292,167519,167409,165646,163182,159582,155950,154308
9,_45_to_49,153919,157109,160859,163830,165310,167466,169420,169469,169523,170289


#### we remove the brackets that have duplicates

In [464]:
# no `subset` is passed, so a row is dropped only when it repeats an earlier row
# in every column (bracket label and all year values); the index is renumbered
temp_male = temp_male.drop_duplicates(ignore_index=True)

#### now when we transpose this dataframe...

In [465]:
# flip the frame: the "bracket" labels and each year become rows, each age bracket becomes a column
temp_male = temp_male.T
temp_male

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
bracket,_under_5,_5_to_9,_10_to_14,_15_to_19,_20_to_24,_25_to_29,_30_to_34,_35_to_39,_40_to_44,_45_to_49,...,_5_to_13,_14_to_17,_18_to_64,_18_to_24,_25_to_44,_45_to_64,_65_and_over,_16_and_over,_18_and_over,_15_to_44
2000,150609,160685,165170,165156,152937,148063,148363,165784,168611,153919,...,293198,131231,1342158,219519,630821,491818,232142,1639774,1574300,948914
2001,151410,157513,166253,163598,157924,141826,148924,161913,169104,157109,...,291195,131039,1349690,223054,621767,504869,234804,1650255,1584494,943289
2002,150856,154832,166796,163527,160193,138866,149479,156961,168292,160859,...,289177,131255,1357543,224916,613598,519029,236888,1660508,1594431,937318
2003,150594,152874,167376,164178,163064,138346,149716,152711,167519,163830,...,287311,131583,1369733,228598,608292,532843,240201,1675823,1609934,935534
2004,150699,151948,167198,165836,163013,139913,147796,149728,167409,165310,...,284421,134325,1380525,229249,604846,546430,242902,1689549,1623427,933695
2005,150960,152574,165333,169052,163055,143069,145535,148720,165646,167466,...,283902,136558,1393897,229554,602970,561373,248065,1709133,1641962,935077
2006,151442,155157,164608,172295,163368,148916,141715,151475,163182,169420,...,286388,138224,1413074,230816,605288,576970,254373,1737495,1667447,940951
2007,153128,156345,163819,174268,163868,151122,140442,153426,159582,169469,...,286623,139050,1427223,232627,604572,590024,259541,1758143,1686764,942708
2008,155061,156770,163445,176205,164488,153665,140890,153863,155950,169523,...,287133,137553,1441219,236222,604368,600629,266983,1778263,1708202,945061


#### we want the first row (which now holds our age brackets) to become the column headers, and the index (which holds our years) to become a regular column instead

In [466]:
# move the index ("bracket" plus the years) into a regular column, promote the
# first row (the bracket names) to column headers, then drop that row from the data.
# The leading header is still "bracket" at this point.
# NOTE: this cell reassigns temp_male, so running it twice promotes a data row to headers
temp_male = temp_male.reset_index()
headers = temp_male.iloc[0]
temp_male.columns = headers
temp_male = temp_male.iloc[1:]

In [467]:
# after the transpose the leading "bracket" column holds the years, so name it accordingly
final_name_map = {"bracket": "year"}
final_male_pop_bracket = temp_male.rename(columns=final_name_map)
final_male_pop_bracket

Unnamed: 0,year,_under_5,_5_to_9,_10_to_14,_15_to_19,_20_to_24,_25_to_29,_30_to_34,_35_to_39,_40_to_44,...,_5_to_13,_14_to_17,_18_to_64,_18_to_24,_25_to_44,_45_to_64,_65_and_over,_16_and_over,_18_and_over,_15_to_44
1,2000,150609,160685,165170,165156,152937,148063,148363,165784,168611,...,293198,131231,1342158,219519,630821,491818,232142,1639774,1574300,948914
2,2001,151410,157513,166253,163598,157924,141826,148924,161913,169104,...,291195,131039,1349690,223054,621767,504869,234804,1650255,1584494,943289
3,2002,150856,154832,166796,163527,160193,138866,149479,156961,168292,...,289177,131255,1357543,224916,613598,519029,236888,1660508,1594431,937318
4,2003,150594,152874,167376,164178,163064,138346,149716,152711,167519,...,287311,131583,1369733,228598,608292,532843,240201,1675823,1609934,935534
5,2004,150699,151948,167198,165836,163013,139913,147796,149728,167409,...,284421,134325,1380525,229249,604846,546430,242902,1689549,1623427,933695
6,2005,150960,152574,165333,169052,163055,143069,145535,148720,165646,...,283902,136558,1393897,229554,602970,561373,248065,1709133,1641962,935077
7,2006,151442,155157,164608,172295,163368,148916,141715,151475,163182,...,286388,138224,1413074,230816,605288,576970,254373,1737495,1667447,940951
8,2007,153128,156345,163819,174268,163868,151122,140442,153426,159582,...,286623,139050,1427223,232627,604572,590024,259541,1758143,1686764,942708
9,2008,155061,156770,163445,176205,164488,153665,140890,153863,155950,...,287133,137553,1441219,236222,604368,600629,266983,1778263,1708202,945061
10,2009,155463,157145,163165,176744,165830,154238,144437,153311,154308,...,287583,136327,1456515,238974,606294,611247,273891,1800104,1730406,948868


In [468]:
# tag every row with the sex this block describes
# NOTE(review): the model_population_table output further down shows lowercase "male" — confirm which casing is intended
final_male_pop_bracket["sex"] = "Male"

In [469]:
# tag every row with the state this workbook belongs to
final_male_pop_bracket["state"] = "Alabama"

In [470]:
# show the finished male table of the prototype, now with the sex and state columns
final_male_pop_bracket

Unnamed: 0,year,_under_5,_5_to_9,_10_to_14,_15_to_19,_20_to_24,_25_to_29,_30_to_34,_35_to_39,_40_to_44,...,_18_to_64,_18_to_24,_25_to_44,_45_to_64,_65_and_over,_16_and_over,_18_and_over,_15_to_44,sex,state
1,2000,150609,160685,165170,165156,152937,148063,148363,165784,168611,...,1342158,219519,630821,491818,232142,1639774,1574300,948914,Male,Alabama
2,2001,151410,157513,166253,163598,157924,141826,148924,161913,169104,...,1349690,223054,621767,504869,234804,1650255,1584494,943289,Male,Alabama
3,2002,150856,154832,166796,163527,160193,138866,149479,156961,168292,...,1357543,224916,613598,519029,236888,1660508,1594431,937318,Male,Alabama
4,2003,150594,152874,167376,164178,163064,138346,149716,152711,167519,...,1369733,228598,608292,532843,240201,1675823,1609934,935534,Male,Alabama
5,2004,150699,151948,167198,165836,163013,139913,147796,149728,167409,...,1380525,229249,604846,546430,242902,1689549,1623427,933695,Male,Alabama
6,2005,150960,152574,165333,169052,163055,143069,145535,148720,165646,...,1393897,229554,602970,561373,248065,1709133,1641962,935077,Male,Alabama
7,2006,151442,155157,164608,172295,163368,148916,141715,151475,163182,...,1413074,230816,605288,576970,254373,1737495,1667447,940951,Male,Alabama
8,2007,153128,156345,163819,174268,163868,151122,140442,153426,159582,...,1427223,232627,604572,590024,259541,1758143,1686764,942708,Male,Alabama
9,2008,155061,156770,163445,176205,164488,153665,140890,153863,155950,...,1441219,236222,604368,600629,266983,1778263,1708202,945061,Male,Alabama
10,2009,155463,157145,163165,176744,165830,154238,144437,153311,154308,...,1456515,238974,606294,611247,273891,1800104,1730406,948868,Male,Alabama


In [471]:
# per-column overview (dtype, null counts, distinct values) produced by the project helper
column_summary(final_male_pop_bracket)

Unnamed: 0,col_name,col_dtype,num_of_nulls,num_of_non_nulls,num_of_distinct_values,distinct_values_counts
0,year,object,0,10,10,"{2000: 1, 2001: 1, 2002: 1, 2003: 1, 2004: 1, ..."
1,_under_5,object,0,10,10,"{150609: 1, 151410: 1, 150856: 1, 150594: 1, 1..."
2,_5_to_9,object,0,10,10,"{160685: 1, 157513: 1, 154832: 1, 152874: 1, 1..."
3,_10_to_14,object,0,10,10,"{165170: 1, 166253: 1, 166796: 1, 167376: 1, 1..."
4,_15_to_19,object,0,10,10,"{165156: 1, 163598: 1, 163527: 1, 164178: 1, 1..."
5,_20_to_24,object,0,10,10,"{152937: 1, 157924: 1, 160193: 1, 163064: 1, 1..."
6,_25_to_29,object,0,10,10,"{148063: 1, 141826: 1, 138866: 1, 138346: 1, 1..."
7,_30_to_34,object,0,10,10,"{148363: 1, 148924: 1, 149479: 1, 149716: 1, 1..."
8,_35_to_39,object,0,10,10,"{165784: 1, 161913: 1, 156961: 1, 152711: 1, 1..."
9,_40_to_44,object,0,10,10,"{168611: 1, 169104: 1, 168292: 1, 167519: 1, 1..."


#### We've done our preprocessing on the male population age brackets; now we have to do this same preprocessing on the female demographic. We can achieve this by writing a function that implements our prototype above, applies it not only to the male population but also to the female one, and combines the resulting dataframes into one single dataframe for easy collation

In [472]:
# run the project helper on the same sample workbook
# NOTE: `cols_to_remove` must still be [1, 12, 13] here; a later cell reassigns the
# same name for the 2010-2019 layout, so this cell breaks if run out of order
model_population_table(test_df, "Alabama", cols_to_remove, year_range="2000-2009")

Unnamed: 0,year,_under_5,_5_to_9,_10_to_14,_15_to_19,_20_to_24,_25_to_29,_30_to_34,_35_to_39,_40_to_44,...,_18_to_64,_18_to_24,_25_to_44,_45_to_64,_65_and_over,_16_and_over,_18_and_over,_15_to_44,sex,state
0,2000,150609,160685,165170,165156,152937,148063,148363,165784,168611,...,1342158,219519,630821,491818,232142,1639774,1574300,948914,male,Alabama
1,2001,151410,157513,166253,163598,157924,141826,148924,161913,169104,...,1349690,223054,621767,504869,234804,1650255,1584494,943289,male,Alabama
2,2002,150856,154832,166796,163527,160193,138866,149479,156961,168292,...,1357543,224916,613598,519029,236888,1660508,1594431,937318,male,Alabama
3,2003,150594,152874,167376,164178,163064,138346,149716,152711,167519,...,1369733,228598,608292,532843,240201,1675823,1609934,935534,male,Alabama
4,2004,150699,151948,167198,165836,163013,139913,147796,149728,167409,...,1380525,229249,604846,546430,242902,1689549,1623427,933695,male,Alabama
5,2005,150960,152574,165333,169052,163055,143069,145535,148720,165646,...,1393897,229554,602970,561373,248065,1709133,1641962,935077,male,Alabama
6,2006,151442,155157,164608,172295,163368,148916,141715,151475,163182,...,1413074,230816,605288,576970,254373,1737495,1667447,940951,male,Alabama
7,2007,153128,156345,163819,174268,163868,151122,140442,153426,159582,...,1427223,232627,604572,590024,259541,1758143,1686764,942708,male,Alabama
8,2008,155061,156770,163445,176205,164488,153665,140890,153863,155950,...,1441219,236222,604368,600629,266983,1778263,1708202,945061,male,Alabama
9,2009,155463,157145,163165,176744,165830,154238,144437,153311,154308,...,1456515,238974,606294,611247,273891,1800104,1730406,948868,male,Alabama


In [473]:
def concur_model_pop_tables(file, cols_to_remove, year_range):
    """Read one state's population workbook and model it into a table.

    Parameters
    ----------
    file : str
        Workbook file name inside ``DATA_DIR``, expected to look like
        ``<State>_<YYYY>-<YYYY>.<ext>`` (e.g. ``Alabama_2000-2010.xls``).
    cols_to_remove : list
        Column labels forwarded to ``model_population_table``.
    year_range : str
        Year range forwarded to ``model_population_table``,
        e.g. ``"2000-2009"``.

    Returns
    -------
    Whatever ``model_population_table`` returns for this workbook.
    """
    file_path = os.path.join(DATA_DIR, file)

    # The state is everything before the "_YYYY-YYYY" suffix. Taking only the
    # leading run of letters would cut multi-word names at the first space or
    # underscore ("New York" -> "New") and merge several states under one label.
    # The prefix is kept exactly as it is spelled in the file name.
    named = re.match(r"(.+?)[ _]\d{4}-\d{4}", file)
    if named:
        state = named[1]
    else:
        # unexpected file name: fall back to the leading letters, else "Unknown"
        leading = re.match(r"[A-Za-z]+", file)
        state = leading[0] if leading else "Unknown"

    # read excel file: no header row, every cell kept as a Python object
    df = pd.read_excel(file_path, dtype=object, header=None)
    state_population = model_population_table(df, state, cols_to_remove, year_range=year_range)
    return state_population

In [474]:
# model every 2000-2010 workbook on a thread pool; each call gets the same
# column list and the same "2000-2009" year range
with ThreadPoolExecutor() as exe:
    state_populations_00_10 = list(
        exe.map(
            lambda file: concur_model_pop_tables(file, cols_to_remove, "2000-2009"),
            populations_00_10,
        )
    )

# stack the per-workbook tables vertically under a fresh 0..n-1 index
state_populations_df_00_10 = pd.concat(
    state_populations_00_10,
    axis=0,
    ignore_index=True,
)

In [475]:
# preview the combined 2000-2009 table built from every processed workbook
state_populations_df_00_10

Unnamed: 0,year,_under_5,_5_to_9,_10_to_14,_15_to_19,_20_to_24,_25_to_29,_30_to_34,_35_to_39,_40_to_44,...,_18_to_64,_18_to_24,_25_to_44,_45_to_64,_65_and_over,_16_and_over,_18_and_over,_15_to_44,sex,state
0,2000,150609,160685,165170,165156,152937,148063,148363,165784,168611,...,1342158,219519,630821,491818,232142,1639774,1574300,948914,male,Alabama
1,2001,151410,157513,166253,163598,157924,141826,148924,161913,169104,...,1349690,223054,621767,504869,234804,1650255,1584494,943289,male,Alabama
2,2002,150856,154832,166796,163527,160193,138866,149479,156961,168292,...,1357543,224916,613598,519029,236888,1660508,1594431,937318,male,Alabama
3,2003,150594,152874,167376,164178,163064,138346,149716,152711,167519,...,1369733,228598,608292,532843,240201,1675823,1609934,935534,male,Alabama
4,2004,150699,151948,167198,165836,163013,139913,147796,149728,167409,...,1380525,229249,604846,546430,242902,1689549,1623427,933695,male,Alabama
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1015,2005,16204,15752,17150,18617,19016,15535,14928,14850,18479,...,159656,26697,63792,69167,34028,200947,193684,101425,female,Wyoming
1016,2006,16819,15919,16859,18461,19148,16650,14884,15099,17607,...,162338,26663,64240,71435,34480,204146,196818,101849,female,Wyoming
1017,2007,17882,16355,16839,18468,19377,17760,15120,15691,16780,...,165606,26835,65351,73420,35084,208067,200690,103196,female,Wyoming
1018,2008,18618,16929,17231,18513,19163,18690,15710,15978,16377,...,168422,26822,66755,74845,35928,211714,204350,104431,female,Wyoming


In [476]:
# note: this sums only the under-5 age bracket (both sexes, all processed
# workbooks); including the other age brackets would give a bigger total per year.
# The frame also holds overlapping brackets (e.g. _18_to_64 and _18_to_24),
# so summing every bracket column would double count people.
state_populations_df_00_10.groupby(by="year").agg(total_population=("_under_5", "sum"))

Unnamed: 0_level_0,total_population
year,Unnamed: 1_level_1
2000,19178293
2001,19298217
2002,19429192
2003,19592446
2004,19785885
2005,19917400
2006,19938883
2007,20125962
2008,20271127
2009,20244518


# Reading sample excel file for year 2010-2019

In [477]:
# load the 2010-2019 sample workbook the same way: no header row, cells kept as Python objects
test_df_10_19 = pd.read_excel(os.path.join(DATA_DIR, "Alabama_2010-2019.xlsx"), dtype=object, header=None)
test_df_10_19

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,36
0,table with row headers in column A and column ...,,,,,,,,,,...,,,,,,,,,,
1,Annual Estimates of the Resident Population by...,,,,,,,,,,...,,,,,,,,,,
2,.Age,2010-04-01 00:00:00,,,,,,Population Estimate (as of July 1),,,...,,,,,,,,,,
3,,Census,,,Estimates Base,,,2010,,,...,,2017,,,2018,,,2019,,
4,,Total\nPopulation,Male,Female,Total\nPopulation,Male,Female,Total\nPopulation,Male,Female,...,Female,Total\nPopulation,Male,Female,Total\nPopulation,Male,Female,Total\nPopulation,Male,Female
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,Note: The estimates are based on the 2010 Cens...,,,,,,,,,,...,,,,,,,,,,
95,Suggested Citation:,,,,,,,,,,...,,,,,,,,,,
96,Annual Estimates of the Resident Population by...,,,,,,,,,,...,,,,,,,,,,
97,"Source: U.S. Census Bureau, Population Division",,,,,,,,,,...,,,,,,,,,,


In [478]:
# index of the first row labelled ".0" (age 0), i.e. where the single-year-of-age rows start
start_index = test_df_10_19[test_df_10_19[0] == ".0"].index.to_list()[0]
start_index

6

In [479]:
# index of the first row labelled ".Median Age (years)", used as the exclusive end of the data rows
end_index = test_df_10_19[test_df_10_19[0] == ".Median Age (years)"].index.to_list()[0]
end_index

93

#### Extract necessary rows

In [480]:
# rows from the ".0" row up to, but not including, the median-age row
pop_brackets_10_19 = test_df_10_19.iloc[start_index: end_index]
pop_brackets_10_19

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,36
6,.0,60056,30465,29591,60056,30465,29591,59271,30156,29115,...,28978,58460,29701,28759,57853,29542,28311,56901,29080,27821
7,.1,59832,30587,29245,59832,30587,29245,59765,30481,29284,...,29268,59091,29940,29151,58664,29775,28889,58290,29711,28579
8,.2,62283,31607,30676,62283,31607,30676,62054,31545,30509,...,28909,59753,30298,29455,59507,30189,29318,59073,29954,29119
9,.3,61996,31725,30271,61998,31727,30271,62087,31737,30350,...,28496,59131,30103,29028,59981,30465,29516,59799,30366,29433
10,.4,60790,30881,29909,60791,30881,29910,61080,31090,29990,...,29070,58169,29636,28533,59290,30197,29093,60294,30604,29690
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,.82,18069,6760,11309,18077,6764,11313,17979,6775,11204,...,10904,19441,7867,11574,19627,7987,11640,19594,8020,11574
89,.83,16401,6056,10345,16401,6056,10345,16523,6088,10435,...,10066,16997,6832,10165,18047,7243,10804,18222,7337,10885
90,.84,14530,5200,9330,14537,5202,9335,14651,5266,9385,...,9539,15426,6016,9410,15689,6179,9510,16660,6559,10101
91,.85+,75684,22859,52825,75715,22864,52851,76243,23110,53133,...,58765,89262,29812,59450,90410,30582,59828,91543,31322,60221


#### remove duplicates

In [481]:
# drop rows that repeat an earlier row in every column (no `subset` is passed)
temp = pop_brackets_10_19.drop_duplicates()
temp

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,36
6,.0,60056,30465,29591,60056,30465,29591,59271,30156,29115,...,28978,58460,29701,28759,57853,29542,28311,56901,29080,27821
7,.1,59832,30587,29245,59832,30587,29245,59765,30481,29284,...,29268,59091,29940,29151,58664,29775,28889,58290,29711,28579
8,.2,62283,31607,30676,62283,31607,30676,62054,31545,30509,...,28909,59753,30298,29455,59507,30189,29318,59073,29954,29119
9,.3,61996,31725,30271,61998,31727,30271,62087,31737,30350,...,28496,59131,30103,29028,59981,30465,29516,59799,30366,29433
10,.4,60790,30881,29909,60791,30881,29910,61080,31090,29990,...,29070,58169,29636,28533,59290,30197,29093,60294,30604,29690
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,.82,18069,6760,11309,18077,6764,11313,17979,6775,11204,...,10904,19441,7867,11574,19627,7987,11640,19594,8020,11574
89,.83,16401,6056,10345,16401,6056,10345,16523,6088,10435,...,10066,16997,6832,10165,18047,7243,10804,18222,7337,10885
90,.84,14530,5200,9330,14537,5202,9335,14651,5266,9385,...,9539,15426,6016,9410,15689,6179,9510,16660,6559,10101
91,.85+,75684,22859,52825,75715,22864,52851,76243,23110,53133,...,58765,89262,29812,59450,90410,30582,59828,91543,31322,60221


#### remove rows with fewer than 5 non-NaN values

In [482]:
# `thresh` counts non-NA cells: a row is kept only if it has at least 5 of them
temp = temp.dropna(thresh=5, axis=0)
temp

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,36
6,.0,60056,30465,29591,60056,30465,29591,59271,30156,29115,...,28978,58460,29701,28759,57853,29542,28311,56901,29080,27821
7,.1,59832,30587,29245,59832,30587,29245,59765,30481,29284,...,29268,59091,29940,29151,58664,29775,28889,58290,29711,28579
8,.2,62283,31607,30676,62283,31607,30676,62054,31545,30509,...,28909,59753,30298,29455,59507,30189,29318,59073,29954,29119
9,.3,61996,31725,30271,61998,31727,30271,62087,31737,30350,...,28496,59131,30103,29028,59981,30465,29516,59799,30366,29433
10,.4,60790,30881,29909,60791,30881,29910,61080,31090,29990,...,29070,58169,29636,28533,59290,30197,29093,60294,30604,29690
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,.81,18920,7346,11574,18925,7348,11577,19011,7420,11591,...,12383,21075,8660,12415,20996,8668,12328,22580,9291,13289
88,.82,18069,6760,11309,18077,6764,11313,17979,6775,11204,...,10904,19441,7867,11574,19627,7987,11640,19594,8020,11574
89,.83,16401,6056,10345,16401,6056,10345,16523,6088,10435,...,10066,16997,6832,10165,18047,7243,10804,18222,7337,10885
90,.84,14530,5200,9330,14537,5202,9335,14651,5266,9385,...,9539,15426,6016,9410,15689,6179,9510,16660,6559,10101


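#### remove columns 1 to 6 (the census and estimates-base columns), then every third column starting at column 7 (the `Total Population` column of each year)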

In [483]:
# columns 1-6, followed by every third column from 7 up to the last column
cols_to_remove = [*range(1, 7), *range(7, temp.shape[1], 3)]
cols_to_remove

[1, 2, 3, 4, 5, 6, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34]

In [484]:
# drop the census/estimates-base and per-year total columns, leaving the label
# column plus one Male and one Female column per year
temp = temp.drop(columns=cols_to_remove)
temp

Unnamed: 0,0,8,9,11,12,14,15,17,18,20,...,23,24,26,27,29,30,32,33,35,36
6,.0,30156,29115,30443,28723,29730,28787,29176,28220,29715,...,30028,29090,29856,28978,29701,28759,29542,28311,29080,27821
7,.1,30481,29284,30183,29203,30522,28724,29737,28879,29304,...,29846,28846,30152,29268,29940,29151,29775,28889,29711,28579
8,.2,31545,30509,30578,29399,30260,29164,30541,28832,29770,...,29463,28489,30047,28909,30298,29455,30189,29318,29954,29119
9,.3,31737,30350,31646,30543,30539,29354,30207,29249,30592,...,29896,28929,29530,28496,30103,29028,30465,29516,30366,29433
10,.4,31090,29990,31751,30360,31589,30566,30496,29358,30257,...,30557,28826,29903,29070,29636,28533,30197,29093,30604,29690
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,.81,7420,11591,7826,12017,7681,11847,7914,11684,7944,...,8018,11611,8528,12383,8660,12415,8668,12328,9291,13289
88,.82,6775,11204,6819,10824,7206,11257,7057,11064,7343,...,7257,10734,7409,10904,7867,11574,7987,11640,8020,11574
89,.83,6088,10435,6145,10485,6201,10099,6524,10500,6434,...,6677,10200,6666,10066,6832,10165,7243,10804,7337,10885
90,.84,5266,9385,5474,9704,5537,9761,5591,9376,5888,...,5855,9658,6054,9539,6016,9410,6179,9510,6559,10101


In [485]:
def helper(bracket: str | None):
    bracket = bracket.lower()
    keyword = re.search(r"(under|to|and over|\+)", bracket)
    keyword = np.nan if not keyword else keyword[0]
    numbers = re.findall(r"\d+", bracket)
    # print(keyword)
    # print(numbers)

    # e.g. "under 5" becomes "_under_5"
    if keyword == "under":
        return f"_under_{numbers[-1]}"
    
    # e.g. "5 to 9" becomes "_5_to_9"
    elif keyword == "to":
        return f"_{numbers[0]}_to_{numbers[-1]}"
    
    # e.g. "9 and over" becomes "_9_and_over"
    elif keyword == "and over" or keyword == "+": 
        return f"_{numbers[-1]}_and_over"
    
    return f"_{numbers[-1]}"

In [486]:
# Normalise the raw age labels in column 0 (e.g. ".81") into column-name
# suffixes (e.g. "_81") with `helper`.
temp[0] = temp[0].apply(helper)
temp

Unnamed: 0,0,8,9,11,12,14,15,17,18,20,...,23,24,26,27,29,30,32,33,35,36
6,_0,30156,29115,30443,28723,29730,28787,29176,28220,29715,...,30028,29090,29856,28978,29701,28759,29542,28311,29080,27821
7,_1,30481,29284,30183,29203,30522,28724,29737,28879,29304,...,29846,28846,30152,29268,29940,29151,29775,28889,29711,28579
8,_2,31545,30509,30578,29399,30260,29164,30541,28832,29770,...,29463,28489,30047,28909,30298,29455,30189,29318,29954,29119
9,_3,31737,30350,31646,30543,30539,29354,30207,29249,30592,...,29896,28929,29530,28496,30103,29028,30465,29516,30366,29433
10,_4,31090,29990,31751,30360,31589,30566,30496,29358,30257,...,30557,28826,29903,29070,29636,28533,30197,29093,30604,29690
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,_81,7420,11591,7826,12017,7681,11847,7914,11684,7944,...,8018,11611,8528,12383,8660,12415,8668,12328,9291,13289
88,_82,6775,11204,6819,10824,7206,11257,7057,11064,7343,...,7257,10734,7409,10904,7867,11574,7987,11640,8020,11574
89,_83,6088,10435,6145,10485,6201,10099,6524,10500,6434,...,6677,10200,6666,10066,6832,10165,7243,10804,7337,10885
90,_84,5266,9385,5474,9704,5537,9761,5591,9376,5888,...,5855,9658,6054,9539,6016,9410,6179,9510,6559,10101


In [487]:
# Build the column labels: a single-element tuple for the age-bracket column,
# followed by one (year, sex) pair per data column for 2010-2019.
years = [year for year in range(2010, 2020) for _ in range(2)]
genders = ["male", "female"] * 10
multi_index_list = [("bracket", ), *zip(years, genders)]
multi_index_list

[('bracket',),
 (2010, 'male'),
 (2010, 'female'),
 (2011, 'male'),
 (2011, 'female'),
 (2012, 'male'),
 (2012, 'female'),
 (2013, 'male'),
 (2013, 'female'),
 (2014, 'male'),
 (2014, 'female'),
 (2015, 'male'),
 (2015, 'female'),
 (2016, 'male'),
 (2016, 'female'),
 (2017, 'male'),
 (2017, 'female'),
 (2018, 'male'),
 (2018, 'female'),
 (2019, 'male'),
 (2019, 'female')]

In [488]:
# Turn the tuples into a two-level column index. The 1-tuple ("bracket",) has
# no second element, so pandas pads its second level with NaN.
multi_index = pd.MultiIndex.from_tuples(multi_index_list)
multi_index

MultiIndex([('bracket',      nan),
            (     2010,   'male'),
            (     2010, 'female'),
            (     2011,   'male'),
            (     2011, 'female'),
            (     2012,   'male'),
            (     2012, 'female'),
            (     2013,   'male'),
            (     2013, 'female'),
            (     2014,   'male'),
            (     2014, 'female'),
            (     2015,   'male'),
            (     2015, 'female'),
            (     2016,   'male'),
            (     2016, 'female'),
            (     2017,   'male'),
            (     2017, 'female'),
            (     2018,   'male'),
            (     2018, 'female'),
            (     2019,   'male'),
            (     2019, 'female')],
           )

In [489]:
# Inspect the first label to confirm the NaN padding: ("bracket", nan)
multi_index[0]

('bracket', nan)

In [490]:
# Replace the positional column labels with the (year, sex) MultiIndex.
# The frame must have exactly as many columns as `multi_index` has entries.
temp.columns = multi_index
temp

Unnamed: 0_level_0,bracket,2010,2010,2011,2011,2012,2012,2013,2013,2014,...,2015,2015,2016,2016,2017,2017,2018,2018,2019,2019
Unnamed: 0_level_1,NaN,male,female,male,female,male,female,male,female,male,...,male,female,male,female,male,female,male,female,male,female
6,_0,30156,29115,30443,28723,29730,28787,29176,28220,29715,...,30028,29090,29856,28978,29701,28759,29542,28311,29080,27821
7,_1,30481,29284,30183,29203,30522,28724,29737,28879,29304,...,29846,28846,30152,29268,29940,29151,29775,28889,29711,28579
8,_2,31545,30509,30578,29399,30260,29164,30541,28832,29770,...,29463,28489,30047,28909,30298,29455,30189,29318,29954,29119
9,_3,31737,30350,31646,30543,30539,29354,30207,29249,30592,...,29896,28929,29530,28496,30103,29028,30465,29516,30366,29433
10,_4,31090,29990,31751,30360,31589,30566,30496,29358,30257,...,30557,28826,29903,29070,29636,28533,30197,29093,30604,29690
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,_81,7420,11591,7826,12017,7681,11847,7914,11684,7944,...,8018,11611,8528,12383,8660,12415,8668,12328,9291,13289
88,_82,6775,11204,6819,10824,7206,11257,7057,11064,7343,...,7257,10734,7409,10904,7867,11574,7987,11640,8020,11574
89,_83,6088,10435,6145,10485,6201,10099,6524,10500,6434,...,6677,10200,6666,10066,6832,10165,7243,10804,7337,10885
90,_84,5266,9385,5474,9704,5537,9761,5591,9376,5888,...,5855,9658,6054,9539,6016,9410,6179,9510,6559,10101


In [491]:
# Transpose so every (year, sex) pair becomes a row and every age bracket a column.
temp = temp.T
temp

Unnamed: 0,Unnamed: 1,6,7,8,9,10,11,12,13,14,15,...,82,83,84,85,86,87,88,89,90,91
bracket,,_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,...,_76,_77,_78,_79,_80,_81,_82,_83,_84,_85_and_over
2010,male,30156,30481,31545,31737,31090,30927,31061,30900,31101,32785,...,10947,10386,9704,8868,8372,7420,6775,6088,5266,23110
2010,female,29115,29284,30509,30350,29990,29716,29670,29698,29972,31412,...,14436,13913,13596,13086,12666,11591,11204,10435,9385,53133
2011,male,30443,30183,30578,31646,31751,31109,30999,31135,30925,31096,...,11724,10404,9791,9096,8256,7826,6819,6145,5474,24011
2011,female,28723,29203,29399,30543,30360,29965,29806,29748,29894,29972,...,15400,13943,13402,12851,12487,12017,10824,10485,9704,53836
2012,male,29730,30522,30260,30539,31589,31789,31075,30964,31144,30984,...,11783,11100,9871,9175,8459,7681,7206,6201,5537,25043
2012,female,28787,28724,29164,29354,30566,30309,29974,29759,29765,29913,...,15451,14832,13454,12689,12299,11847,11257,10099,9761,54955
2013,male,29176,29737,30541,30207,30496,31608,31669,31045,31020,31189,...,11764,11238,10451,9228,8494,7914,7057,6524,5591,25770
2013,female,28220,28879,28832,29249,29358,30539,30275,30027,29791,29754,...,15353,14873,14344,12713,12113,11684,11064,10500,9376,55764
2014,male,29715,29304,29770,30592,30257,30552,31603,31630,31119,31159,...,12443,11198,10646,9814,8626,7944,7343,6434,5888,26690


In [492]:
# Move the two index levels (year and sex) into ordinary columns; pandas
# names them level_0 and level_1.
temp = temp.reset_index()
temp

Unnamed: 0,level_0,level_1,6,7,8,9,10,11,12,13,...,82,83,84,85,86,87,88,89,90,91
0,bracket,,_0,_1,_2,_3,_4,_5,_6,_7,...,_76,_77,_78,_79,_80,_81,_82,_83,_84,_85_and_over
1,2010,male,30156,30481,31545,31737,31090,30927,31061,30900,...,10947,10386,9704,8868,8372,7420,6775,6088,5266,23110
2,2010,female,29115,29284,30509,30350,29990,29716,29670,29698,...,14436,13913,13596,13086,12666,11591,11204,10435,9385,53133
3,2011,male,30443,30183,30578,31646,31751,31109,30999,31135,...,11724,10404,9791,9096,8256,7826,6819,6145,5474,24011
4,2011,female,28723,29203,29399,30543,30360,29965,29806,29748,...,15400,13943,13402,12851,12487,12017,10824,10485,9704,53836
5,2012,male,29730,30522,30260,30539,31589,31789,31075,30964,...,11783,11100,9871,9175,8459,7681,7206,6201,5537,25043
6,2012,female,28787,28724,29164,29354,30566,30309,29974,29759,...,15451,14832,13454,12689,12299,11847,11257,10099,9761,54955
7,2013,male,29176,29737,30541,30207,30496,31608,31669,31045,...,11764,11238,10451,9228,8494,7914,7057,6524,5591,25770
8,2013,female,28220,28879,28832,29249,29358,30539,30275,30027,...,15353,14873,14344,12713,12113,11684,11064,10500,9376,55764
9,2014,male,29715,29304,29770,30592,30257,30552,31603,31630,...,12443,11198,10646,9814,8626,7944,7343,6434,5888,26690


In [493]:
# Promote the first row (the bracket labels) to column headers and keep the
# remaining rows, renumbered from 0, as the data.
headers, temp = temp.iloc[0], temp.iloc[1:].reset_index(drop=True)
temp.columns = headers
temp

Unnamed: 0,bracket,NaN,_0,_1,_2,_3,_4,_5,_6,_7,...,_76,_77,_78,_79,_80,_81,_82,_83,_84,_85_and_over
0,2010,male,30156,30481,31545,31737,31090,30927,31061,30900,...,10947,10386,9704,8868,8372,7420,6775,6088,5266,23110
1,2010,female,29115,29284,30509,30350,29990,29716,29670,29698,...,14436,13913,13596,13086,12666,11591,11204,10435,9385,53133
2,2011,male,30443,30183,30578,31646,31751,31109,30999,31135,...,11724,10404,9791,9096,8256,7826,6819,6145,5474,24011
3,2011,female,28723,29203,29399,30543,30360,29965,29806,29748,...,15400,13943,13402,12851,12487,12017,10824,10485,9704,53836
4,2012,male,29730,30522,30260,30539,31589,31789,31075,30964,...,11783,11100,9871,9175,8459,7681,7206,6201,5537,25043
5,2012,female,28787,28724,29164,29354,30566,30309,29974,29759,...,15451,14832,13454,12689,12299,11847,11257,10099,9761,54955
6,2013,male,29176,29737,30541,30207,30496,31608,31669,31045,...,11764,11238,10451,9228,8494,7914,7057,6524,5591,25770
7,2013,female,28220,28879,28832,29249,29358,30539,30275,30027,...,15353,14873,14344,12713,12113,11684,11064,10500,9376,55764
8,2014,male,29715,29304,29770,30592,30257,30552,31603,31630,...,12443,11198,10646,9814,8626,7944,7343,6434,5888,26690
9,2014,female,28606,28392,28837,28835,29251,29356,30664,30312,...,16331,14794,14397,13592,12202,11471,10915,10360,9816,56858


In [494]:
# Rename the "bracket" and NaN column labels to "year" and "sex" respectively.
# DataFrame.rename returns a new frame, so assign it back; otherwise `temp`
# keeps the old labels and no longer matches what this cell displays.
temp = temp.rename(columns={np.nan: "sex", "bracket": "year"})
temp

Unnamed: 0,year,sex,_0,_1,_2,_3,_4,_5,_6,_7,...,_76,_77,_78,_79,_80,_81,_82,_83,_84,_85_and_over
0,2010,male,30156,30481,31545,31737,31090,30927,31061,30900,...,10947,10386,9704,8868,8372,7420,6775,6088,5266,23110
1,2010,female,29115,29284,30509,30350,29990,29716,29670,29698,...,14436,13913,13596,13086,12666,11591,11204,10435,9385,53133
2,2011,male,30443,30183,30578,31646,31751,31109,30999,31135,...,11724,10404,9791,9096,8256,7826,6819,6145,5474,24011
3,2011,female,28723,29203,29399,30543,30360,29965,29806,29748,...,15400,13943,13402,12851,12487,12017,10824,10485,9704,53836
4,2012,male,29730,30522,30260,30539,31589,31789,31075,30964,...,11783,11100,9871,9175,8459,7681,7206,6201,5537,25043
5,2012,female,28787,28724,29164,29354,30566,30309,29974,29759,...,15451,14832,13454,12689,12299,11847,11257,10099,9761,54955
6,2013,male,29176,29737,30541,30207,30496,31608,31669,31045,...,11764,11238,10451,9228,8494,7914,7057,6524,5591,25770
7,2013,female,28220,28879,28832,29249,29358,30539,30275,30027,...,15353,14873,14344,12713,12113,11684,11064,10500,9376,55764
8,2014,male,29715,29304,29770,30592,30257,30552,31603,31630,...,12443,11198,10646,9814,8626,7944,7343,6434,5888,26690
9,2014,female,28606,28392,28837,28835,29251,29356,30664,30312,...,16331,14794,14397,13592,12202,11471,10915,10360,9816,56858


In [495]:
# Run the packaged helper on the same Alabama sample. Its output has the same
# year/sex/age columns as the manual result above plus a `state` column.
# NOTE(review): presumably it wraps the manual steps above - confirm in utilities.preprocessors.
model_population_table(test_df_10_19, "Alabama", cols_to_remove, year_range="2010-2019")

Unnamed: 0,year,sex,_0,_1,_2,_3,_4,_5,_6,_7,...,_77,_78,_79,_80,_81,_82,_83,_84,_85_and_over,state
0,2010,male,30156,30481,31545,31737,31090,30927,31061,30900,...,10386,9704,8868,8372,7420,6775,6088,5266,23110,Alabama
1,2010,female,29115,29284,30509,30350,29990,29716,29670,29698,...,13913,13596,13086,12666,11591,11204,10435,9385,53133,Alabama
2,2011,male,30443,30183,30578,31646,31751,31109,30999,31135,...,10404,9791,9096,8256,7826,6819,6145,5474,24011,Alabama
3,2011,female,28723,29203,29399,30543,30360,29965,29806,29748,...,13943,13402,12851,12487,12017,10824,10485,9704,53836,Alabama
4,2012,male,29730,30522,30260,30539,31589,31789,31075,30964,...,11100,9871,9175,8459,7681,7206,6201,5537,25043,Alabama
5,2012,female,28787,28724,29164,29354,30566,30309,29974,29759,...,14832,13454,12689,12299,11847,11257,10099,9761,54955,Alabama
6,2013,male,29176,29737,30541,30207,30496,31608,31669,31045,...,11238,10451,9228,8494,7914,7057,6524,5591,25770,Alabama
7,2013,female,28220,28879,28832,29249,29358,30539,30275,30027,...,14873,14344,12713,12113,11684,11064,10500,9376,55764,Alabama
8,2014,male,29715,29304,29770,30592,30257,30552,31603,31630,...,11198,10646,9814,8626,7944,7343,6434,5888,26690,Alabama
9,2014,female,28606,28392,28837,28835,29251,29356,30664,30312,...,14794,14397,13592,12202,11471,10915,10360,9816,56858,Alabama


In [496]:
# Process every 2010-2019 file on a thread pool; each call gets the same
# column-removal list and year range. Results come back in input order.
with ThreadPoolExecutor() as exe:
    state_populations_10_19 = list(exe.map(
        lambda file: concur_model_pop_tables(file, cols_to_remove, "2010-2019"),
        populations_10_19,
    ))

# Stack the per-file tables row-wise under a fresh 0..n-1 index.
state_populations_df_10_19 = pd.concat(state_populations_10_19, ignore_index=True)

In [497]:
# Preview the combined 2010-2019 table (one row per state, year and sex).
state_populations_df_10_19

Unnamed: 0,year,sex,_0,_1,_2,_3,_4,_5,_6,_7,...,_77,_78,_79,_80,_81,_82,_83,_84,_85_and_over,state
0,2010,male,30156,30481,31545,31737,31090,30927,31061,30900,...,10386,9704,8868,8372,7420,6775,6088,5266,23110,Alabama
1,2010,female,29115,29284,30509,30350,29990,29716,29670,29698,...,13913,13596,13086,12666,11591,11204,10435,9385,53133,Alabama
2,2011,male,30443,30183,30578,31646,31751,31109,30999,31135,...,10404,9791,9096,8256,7826,6819,6145,5474,24011,Alabama
3,2011,female,28723,29203,29399,30543,30360,29965,29806,29748,...,13943,13402,12851,12487,12017,10824,10485,9704,53836,Alabama
4,2012,male,29730,30522,30260,30539,31589,31789,31075,30964,...,11100,9871,9175,8459,7681,7206,6201,5537,25043,Alabama
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1015,2017,female,3480,3447,3512,3599,3627,3508,3700,3939,...,1618,1468,1345,1269,1178,1126,1024,916,6741,Wyoming
1016,2018,male,3419,3661,3817,3882,3772,3774,3907,3817,...,1448,1361,1291,1154,1027,927,815,717,4109,Wyoming
1017,2018,female,3260,3427,3453,3496,3600,3590,3496,3657,...,1604,1555,1400,1272,1227,1110,1047,984,6801,Wyoming
1018,2019,male,3367,3415,3667,3788,3867,3737,3797,3888,...,1609,1379,1281,1218,1078,969,862,751,4128,Wyoming


# Reading a sample Excel file for the years 2020-2023

In [498]:
# Load the raw Alabama 2020-2023 sheet without inferring a header row and
# with every cell kept as a plain Python object.
test_df_20_23 = pd.read_excel(
    os.path.join(DATA_DIR, "Alabama_2020-2023.xlsx"),
    header=None,
    dtype=object,
)
test_df_20_23

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,table with row headers in column A and column ...,,,,,,,,,,,,,,,
1,Annual Estimates of the Resident Population by...,,,,,,,,,,,,,,,
2,Age,"April 1, 2020 Estimates Base",,,Population Estimate (as of July 1),,,,,,,,,,,
3,,,,,2020,,,2021,,,2022,,,2023,,
4,,Total\nPopulation,Male,Female,Total\nPopulation,Male,Female,Total\nPopulation,Male,Female,Total\nPopulation,Male,Female,Total\nPopulation,Male,Female
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Notes: The estimates are developed from a base...,,,,,,,,,,,,,,,
96,Suggested Citation:,,,,,,,,,,,,,,,
97,Annual Estimates of the Resident Population by...,,,,,,,,,,,,,,,
98,"Source: U.S. Census Bureau, Population Division",,,,,,,,,,,,,,,


#### Columns 1-3 (the April 1, 2020 estimates base) and the total-population columns 4, 7, 10, and 13 can be discarded, leaving only the male/female columns for each year

In [500]:
# Columns to discard: 1-3 hold the April 1, 2020 estimates base, and 4, 7,
# 10, 13 hold each year's total population.
cols_to_remove = [*range(1, 5), *range(7, test_df_20_23.shape[1], 3)]
cols_to_remove

[1, 2, 3, 4, 7, 10, 13]

In [501]:
# Process every 2020-2023 file on a thread pool; each call gets the same
# column-removal list and year range. Results come back in input order.
with ThreadPoolExecutor() as exe:
    state_populations_20_23 = list(exe.map(
        lambda file: concur_model_pop_tables(file, cols_to_remove, "2020-2023"),
        populations_20_23,
    ))

# Stack the per-file tables row-wise under a fresh 0..n-1 index.
state_populations_df_20_23 = pd.concat(state_populations_20_23, ignore_index=True)

In [None]:
# Preview the combined 2020-2023 table (one row per state, year and sex).
state_populations_df_20_23

Unnamed: 0,year,sex,_0,_1,_2,_3,_4,_5,_6,_7,...,_77,_78,_79,_80,_81,_82,_83,_84,_85_and_over,state
0,2020,male,29226,29793,30649,30918,31404,31427,31161,30767,...,15009,12590,11076,9931,9194,8338,7114,6371,29300,Alabama
1,2020,female,27808,28444,29270,29548,30261,30440,30093,29812,...,19130,16251,14509,13607,12813,11985,10503,9722,57072,Alabama
2,2021,male,28809,29500,29962,30875,31193,31593,31607,31411,...,14160,14069,11674,10116,9140,8311,7522,6245,29522,Alabama
3,2021,female,27932,28003,28675,29474,29746,30432,30652,30372,...,17970,18305,15264,13750,12834,11861,11040,9683,56638,Alabama
4,2022,male,29383,28991,29804,30200,31144,31366,31859,31843,...,13888,13382,13190,10782,9390,8309,7514,6728,30198,Alabama
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403,2021,female,2956,3119,3083,3226,3364,3503,3488,3536,...,1852,1782,1518,1362,1253,1143,1054,988,6201,Wyoming
404,2022,male,3182,3126,3191,3252,3353,3572,3762,3801,...,1709,1643,1618,1323,1131,997,905,807,3964,Wyoming
405,2022,female,2814,2971,3104,3101,3252,3358,3530,3507,...,1816,1786,1686,1446,1294,1190,1058,1007,6131,Wyoming
406,2023,male,3101,3242,3136,3205,3283,3382,3585,3807,...,1707,1634,1554,1541,1235,1047,917,835,4158,Wyoming
