In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re
import ast

from concurrent.futures import ThreadPoolExecutor

from utilities.preprocessors import column_summary, model_population_table, model_population_by_sex_race_ho_table
from utilities.visualizers import disp_cat_feat, view_feat_outliers


%load_ext autoreload
%autoreload 2

In [2]:
DATA_DIR = './data/population-data'
EXCLUSIONS = ["us_populations_per_state_2001_to_2021.csv"]
files = list(filter(lambda file: not file in EXCLUSIONS, os.listdir(DATA_DIR)))
populations_by_sex_age_00_10 = list(filter(lambda file: "2000-2010" in file and "by_sex_and_age" in file, files))
populations_by_sex_race_ho_00_10 = list(filter(lambda file: "2000-2010" in file and "by_sex_race_and_ho" in file, files))
populations_by_sex_age_10_19 = list(filter(lambda file: "2010-2019" in file and "by_sex_and_age" in file, files))
populations_by_sex_race_ho_10_19 = list(filter(lambda file: "2010-2019" in file and "by_sex_race_and_ho" in file, files))
populations_by_sex_age_20_23 = list(filter(lambda file: "2020-2023" in file and "by_sex_and_age" in file, files))
populations_by_sex_race_ho_20_23 = list(filter(lambda file: "2020-2023" in file and "by_sex_race_and_ho" in file, files))
len(populations_by_sex_age_00_10), len(populations_by_sex_age_10_19), len(populations_by_sex_age_20_23),

(51, 51, 51)

In [3]:
len(populations_by_sex_race_ho_00_10), len(populations_by_sex_race_ho_10_19), len(populations_by_sex_race_ho_20_23),

(51, 51, 51)

# Read sample excel sheet

In [4]:
test_df = pd.read_excel(os.path.join(DATA_DIR, "Alabama_pop_by_sex_and_age_2000-2010.xls"), dtype=object, header=None)
test_df.head(40)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,table with row headers in column A and column ...,,,,,,,,,,,,,
1,Table 2. Intercensal Estimates of the Resident...,,,,,,,,,,,,,
2,Sex and Age,"April 1, 20001",Intercensal Estimates (as of July 1),,,,,,,,,,"April 1, 20102","July 1, 20103"
3,,,2000,2001.0,2002.0,2003.0,2004.0,2005.0,2006.0,2007.0,2008.0,2009.0,,
4,BOTH SEXES,4447207,4452173,4467634.0,4480089.0,4503491.0,4530729.0,4569805.0,4628981.0,4672840.0,4718206.0,4757938.0,4779736,4785298
5,.Under 5 years,296000,295185,296624.0,296046.0,295204.0,295970.0,296441.0,297222.0,300300.0,304842.0,305412.0,304957,304840
6,.5 to 9 years,315369,313178,307526.0,302632.0,299148.0,297554.0,298450.0,303581.0,306013.0,306682.0,307864.0,308229,308125
7,.10 to 14 years,320266,321372,323615.0,325008.0,326642.0,326228.0,323028.0,321867.0,320407.0,319503.0,319072.0,319655,319314
8,.15 to 19 years,324583,325612,321866.0,320749.0,321655.0,325095.0,330753.0,337003.0,341279.0,345580.0,346611.0,343471,341504
9,.20 to 24 years,306876,309170,318741.0,322812.0,326983.0,326749.0,326727.0,326239.0,327293.0,328751.0,332117.0,335322,336601


In [5]:
male_start = test_df[test_df[0] == "MALE"].index.to_list()[0]
male_start

39

In [6]:
pop_brackets = test_df.iloc[male_start:]
pop_brackets

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
39,MALE,2146560,2149338,2158138,2165719,2179422,2192872,2213382,2243501,2265565,2287949,2309779,2320188,2323317
40,.Under 5 years,151071,150609,151410,150856,150594,150699,150960,151442,153128,155061,155463,155265,155196
41,.5 to 9 years,161798,160685,157513,154832,152874,151948,152574,155157,156345,156770,157145,157340,157294
42,.10 to 14 years,164637,165170,166253,166796,167376,167198,165333,164608,163819,163445,163165,163417,163222
43,.15 to 19 years,164416,165156,163598,163527,164178,165836,169052,172295,174268,176205,176744,175151,174172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112,Note: Median age is calculated based on single...,,,,,,,,,,,,,
113,Suggested Citation:,,,,,,,,,,,,,
114,Table 2. Intercensal Estimates of the Resident...,,,,,,,,,,,,,
115,"Source: U.S. Census Bureau, Population Division",,,,,,,,,,,,,


In [7]:
female_start = pop_brackets[pop_brackets[0] == "FEMALE"].index.to_list()[0]
male_end, female_end = pop_brackets[pop_brackets[0] == ".Median age (years)"].index.to_list()
male_end, female_end

(73, 108)

# split the excel spreadsheet into the male and female population brackets

In [8]:
male_pop_bracket = test_df.iloc[male_start:male_end]
male_pop_bracket

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
39,MALE,2146560.0,2149338.0,2158138.0,2165719.0,2179422.0,2192872.0,2213382.0,2243501.0,2265565.0,2287949.0,2309779.0,2320188.0,2323317.0
40,.Under 5 years,151071.0,150609.0,151410.0,150856.0,150594.0,150699.0,150960.0,151442.0,153128.0,155061.0,155463.0,155265.0,155196.0
41,.5 to 9 years,161798.0,160685.0,157513.0,154832.0,152874.0,151948.0,152574.0,155157.0,156345.0,156770.0,157145.0,157340.0,157294.0
42,.10 to 14 years,164637.0,165170.0,166253.0,166796.0,167376.0,167198.0,165333.0,164608.0,163819.0,163445.0,163165.0,163417.0,163222.0
43,.15 to 19 years,164416.0,165156.0,163598.0,163527.0,164178.0,165836.0,169052.0,172295.0,174268.0,176205.0,176744.0,175151.0,174172.0
44,.20 to 24 years,151811.0,152937.0,157924.0,160193.0,163064.0,163013.0,163055.0,163368.0,163868.0,164488.0,165830.0,167520.0,168170.0
45,.25 to 29 years,149270.0,148063.0,141826.0,138866.0,138346.0,139913.0,143069.0,148916.0,151122.0,153665.0,154238.0,153716.0,154413.0
46,.30 to 34 years,148685.0,148363.0,148924.0,149479.0,149716.0,147796.0,145535.0,141715.0,140442.0,140890.0,144437.0,146424.0,147553.0
47,.35 to 39 years,166595.0,165784.0,161913.0,156961.0,152711.0,149728.0,148720.0,151475.0,153426.0,153863.0,153311.0,151078.0,150161.0
48,.40 to 44 years,168344.0,168611.0,169104.0,168292.0,167519.0,167409.0,165646.0,163182.0,159582.0,155950.0,154308.0,152707.0,152560.0


In [9]:
female_pop_bracket = test_df.iloc[female_start:female_end]
female_pop_bracket

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
74,FEMALE,2300647.0,2302835.0,2309496.0,2314370.0,2324069.0,2337857.0,2356423.0,2385480.0,2407275.0,2430257.0,2448159.0,2459548.0,2461981.0
75,.Under 5 years,144929.0,144576.0,145214.0,145190.0,144610.0,145271.0,145481.0,145780.0,147172.0,149781.0,149949.0,149692.0,149644.0
76,.5 to 9 years,153571.0,152493.0,150013.0,147800.0,146274.0,145606.0,145876.0,148424.0,149668.0,149912.0,150719.0,150889.0,150831.0
77,.10 to 14 years,155629.0,156202.0,157362.0,158212.0,159266.0,159030.0,157695.0,157259.0,156588.0,156058.0,155907.0,156238.0,156092.0
78,.15 to 19 years,160167.0,160456.0,158268.0,157222.0,157477.0,159259.0,161701.0,164708.0,167011.0,169375.0,169867.0,168320.0,167332.0
79,.20 to 24 years,155065.0,156233.0,160817.0,162619.0,163919.0,163736.0,163672.0,162871.0,163425.0,164263.0,166287.0,167802.0,168431.0
80,.25 to 29 years,151927.0,150616.0,144937.0,142137.0,142041.0,143951.0,147499.0,153897.0,156497.0,157839.0,157961.0,157318.0,157516.0
81,.30 to 34 years,153157.0,152816.0,153262.0,153189.0,153006.0,152100.0,149728.0,146533.0,145591.0,146382.0,148915.0,151464.0,152567.0
82,.35 to 39 years,173718.0,172909.0,168957.0,164153.0,159449.0,155830.0,154736.0,157188.0,159044.0,160006.0,159486.0,157352.0,156281.0
83,.40 to 44 years,176874.0,177209.0,178029.0,176731.0,175672.0,174844.0,173653.0,170861.0,166904.0,162815.0,159544.0,158364.0,158196.0


#### Remove the following
* column `1`, column `12`, and column `13` (the reasoning is these contain only the population estimates of april 1 and not the most recent one which is supposed to be at july 1, and that column `13` is the year 2010 which already exists in the next population years)
* rows with mostly Nan and the a dot symbol in column `1` i.e. `[. Nan Nan Nan Nan Nan ... Nan]`
* and the male column 

#### we also rename the columns to be `bracket`, `2000`, `2001`, `2002`, `2003`, `2004`, `2005`, `2006`, `2007`, `2008`, `2009`

In [10]:
cols_to_remove = [1, 12, 13]
cond = (male_pop_bracket[0] != ".") & (male_pop_bracket[0] != "MALE")
name_map = {0: "bracket", 2: 2000, 3: 2001, 4: 2002, 5: 2003, 6: 2004, 7: 2005, 8: 2006, 9: 2007, 10: 2008, 11: 2009}
temp_male = male_pop_bracket[cond].drop(columns=cols_to_remove).rename(columns=name_map).reset_index(drop=True)
temp_male

Unnamed: 0,bracket,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009
0,.Under 5 years,150609,151410,150856,150594,150699,150960,151442,153128,155061,155463
1,.5 to 9 years,160685,157513,154832,152874,151948,152574,155157,156345,156770,157145
2,.10 to 14 years,165170,166253,166796,167376,167198,165333,164608,163819,163445,163165
3,.15 to 19 years,165156,163598,163527,164178,165836,169052,172295,174268,176205,176744
4,.20 to 24 years,152937,157924,160193,163064,163013,163055,163368,163868,164488,165830
5,.25 to 29 years,148063,141826,138866,138346,139913,143069,148916,151122,153665,154238
6,.30 to 34 years,148363,148924,149479,149716,147796,145535,141715,140442,140890,144437
7,.35 to 39 years,165784,161913,156961,152711,149728,148720,151475,153426,153863,153311
8,.40 to 44 years,168611,169104,168292,167519,167409,165646,163182,159582,155950,154308
9,.45 to 49 years,153919,157109,160859,163830,165310,167466,169420,169469,169523,170289


#### we remove the brackets that have duplicates

In [11]:
temp_male = temp_male.drop_duplicates(ignore_index=True)
temp_male

Unnamed: 0,bracket,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009
0,.Under 5 years,150609,151410,150856,150594,150699,150960,151442,153128,155061,155463
1,.5 to 9 years,160685,157513,154832,152874,151948,152574,155157,156345,156770,157145
2,.10 to 14 years,165170,166253,166796,167376,167198,165333,164608,163819,163445,163165
3,.15 to 19 years,165156,163598,163527,164178,165836,169052,172295,174268,176205,176744
4,.20 to 24 years,152937,157924,160193,163064,163013,163055,163368,163868,164488,165830
5,.25 to 29 years,148063,141826,138866,138346,139913,143069,148916,151122,153665,154238
6,.30 to 34 years,148363,148924,149479,149716,147796,145535,141715,140442,140890,144437
7,.35 to 39 years,165784,161913,156961,152711,149728,148720,151475,153426,153863,153311
8,.40 to 44 years,168611,169104,168292,167519,167409,165646,163182,159582,155950,154308
9,.45 to 49 years,153919,157109,160859,163830,165310,167466,169420,169469,169523,170289


In [12]:
temp_male.index = temp_male["bracket"]
temp_male

Unnamed: 0_level_0,bracket,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009
bracket,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
.Under 5 years,.Under 5 years,150609,151410,150856,150594,150699,150960,151442,153128,155061,155463
.5 to 9 years,.5 to 9 years,160685,157513,154832,152874,151948,152574,155157,156345,156770,157145
.10 to 14 years,.10 to 14 years,165170,166253,166796,167376,167198,165333,164608,163819,163445,163165
.15 to 19 years,.15 to 19 years,165156,163598,163527,164178,165836,169052,172295,174268,176205,176744
.20 to 24 years,.20 to 24 years,152937,157924,160193,163064,163013,163055,163368,163868,164488,165830
.25 to 29 years,.25 to 29 years,148063,141826,138866,138346,139913,143069,148916,151122,153665,154238
.30 to 34 years,.30 to 34 years,148363,148924,149479,149716,147796,145535,141715,140442,140890,144437
.35 to 39 years,.35 to 39 years,165784,161913,156961,152711,149728,148720,151475,153426,153863,153311
.40 to 44 years,.40 to 44 years,168611,169104,168292,167519,167409,165646,163182,159582,155950,154308
.45 to 49 years,.45 to 49 years,153919,157109,160859,163830,165310,167466,169420,169469,169523,170289


In [13]:
del temp_male["bracket"]
temp_male

Unnamed: 0_level_0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009
bracket,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
.Under 5 years,150609,151410,150856,150594,150699,150960,151442,153128,155061,155463
.5 to 9 years,160685,157513,154832,152874,151948,152574,155157,156345,156770,157145
.10 to 14 years,165170,166253,166796,167376,167198,165333,164608,163819,163445,163165
.15 to 19 years,165156,163598,163527,164178,165836,169052,172295,174268,176205,176744
.20 to 24 years,152937,157924,160193,163064,163013,163055,163368,163868,164488,165830
.25 to 29 years,148063,141826,138866,138346,139913,143069,148916,151122,153665,154238
.30 to 34 years,148363,148924,149479,149716,147796,145535,141715,140442,140890,144437
.35 to 39 years,165784,161913,156961,152711,149728,148720,151475,153426,153863,153311
.40 to 44 years,168611,169104,168292,167519,167409,165646,163182,159582,155950,154308
.45 to 49 years,153919,157109,160859,163830,165310,167466,169420,169469,169523,170289


In [14]:
temp_male.shape

(29, 10)

#### in order to achieve the ff:
![modelling table from population data by sex and age 2000 to 2009.png](./figures%20&%20images/modelling%20table%20from%20population%20data%20by%20sex%20and%20age%202000%20to%202009.png)
#### we need to somehow at least make our age brackets our index so that when each row is stacked vertically and the column becomes now the row index, that we are able to still keep track of our original row indeces which are our age brackets so that when the dataframe is stacked later and it becomes a multi index dataframe we can just reset the index so that our multi index of our age brackets and years now become columns themselves

In [15]:
temp_male = temp_male.stack().reset_index()
temp_male

Unnamed: 0,bracket,level_1,0
0,.Under 5 years,2000,150609
1,.Under 5 years,2001,151410
2,.Under 5 years,2002,150856
3,.Under 5 years,2003,150594
4,.Under 5 years,2004,150699
...,...,...,...
285,.15 to 44 years,2005,935077
286,.15 to 44 years,2006,940951
287,.15 to 44 years,2007,942708
288,.15 to 44 years,2008,945061


In [16]:
temp_male = temp_male.rename(columns={"level_1": "year", 0: "population"})
temp_male

Unnamed: 0,bracket,year,population
0,.Under 5 years,2000,150609
1,.Under 5 years,2001,151410
2,.Under 5 years,2002,150856
3,.Under 5 years,2003,150594
4,.Under 5 years,2004,150699
...,...,...,...
285,.15 to 44 years,2005,935077
286,.15 to 44 years,2006,940951
287,.15 to 44 years,2007,942708
288,.15 to 44 years,2008,945061


#### we also apply transformations to the `bracket` column by splitting say `.5 to 9 years` to 5 and 9 and have separate columns named `age_start` and `age_end` to take in these values

In [17]:
def helper(bracket: str | None):
    bracket = bracket.lower()
    keyword = re.search(r"(under|to|and over|\+)", bracket)
    keyword = np.nan if not keyword else keyword[0]
    numbers = re.findall(r"\d+", bracket)
    numbers = [ast.literal_eval(number) for number in numbers]
    # print(keyword)
    # print(numbers)

    # e.g. "under 5" becomes "_under_5"
    if keyword == "under":
        return (0, numbers[-1])
    
    # e.g. "5 to 9" becomes "_5_to_9"
    elif keyword == "to":
        return (numbers[0], numbers[-1])
    
    # e.g. "9 and over" becomes "_9_and_over"
    elif keyword == "and over" or keyword == "+": 
        return (numbers[-1], float('inf'))
    
    # if it is a single number just return that number
    return (np.nan, numbers[-1])

In [18]:
age_ranges = temp_male["bracket"].apply(helper).to_list()
age_ranges

[(0, 5),
 (0, 5),
 (0, 5),
 (0, 5),
 (0, 5),
 (0, 5),
 (0, 5),
 (0, 5),
 (0, 5),
 (0, 5),
 (5, 9),
 (5, 9),
 (5, 9),
 (5, 9),
 (5, 9),
 (5, 9),
 (5, 9),
 (5, 9),
 (5, 9),
 (5, 9),
 (10, 14),
 (10, 14),
 (10, 14),
 (10, 14),
 (10, 14),
 (10, 14),
 (10, 14),
 (10, 14),
 (10, 14),
 (10, 14),
 (15, 19),
 (15, 19),
 (15, 19),
 (15, 19),
 (15, 19),
 (15, 19),
 (15, 19),
 (15, 19),
 (15, 19),
 (15, 19),
 (20, 24),
 (20, 24),
 (20, 24),
 (20, 24),
 (20, 24),
 (20, 24),
 (20, 24),
 (20, 24),
 (20, 24),
 (20, 24),
 (25, 29),
 (25, 29),
 (25, 29),
 (25, 29),
 (25, 29),
 (25, 29),
 (25, 29),
 (25, 29),
 (25, 29),
 (25, 29),
 (30, 34),
 (30, 34),
 (30, 34),
 (30, 34),
 (30, 34),
 (30, 34),
 (30, 34),
 (30, 34),
 (30, 34),
 (30, 34),
 (35, 39),
 (35, 39),
 (35, 39),
 (35, 39),
 (35, 39),
 (35, 39),
 (35, 39),
 (35, 39),
 (35, 39),
 (35, 39),
 (40, 44),
 (40, 44),
 (40, 44),
 (40, 44),
 (40, 44),
 (40, 44),
 (40, 44),
 (40, 44),
 (40, 44),
 (40, 44),
 (45, 49),
 (45, 49),
 (45, 49),
 (45, 49),
 (45, 

In [19]:
temp_male["age_start"], temp_male["age_end"] = list(zip(*age_ranges))
temp_male

Unnamed: 0,bracket,year,population,age_start,age_end
0,.Under 5 years,2000,150609,0,5.0
1,.Under 5 years,2001,151410,0,5.0
2,.Under 5 years,2002,150856,0,5.0
3,.Under 5 years,2003,150594,0,5.0
4,.Under 5 years,2004,150699,0,5.0
...,...,...,...,...,...
285,.15 to 44 years,2005,935077,15,44.0
286,.15 to 44 years,2006,940951,15,44.0
287,.15 to 44 years,2007,942708,15,44.0
288,.15 to 44 years,2008,945061,15,44.0


#### delete the bracket column for the last time

In [20]:
del temp_male["bracket"]
temp_male

Unnamed: 0,year,population,age_start,age_end
0,2000,150609,0,5.0
1,2001,151410,0,5.0
2,2002,150856,0,5.0
3,2003,150594,0,5.0
4,2004,150699,0,5.0
...,...,...,...,...
285,2005,935077,15,44.0
286,2006,940951,15,44.0
287,2007,942708,15,44.0
288,2008,945061,15,44.0


In [21]:
temp_male["sex"] = "Male"

In [22]:
temp_male["state"] = "Alabama"

In [23]:
final_male_pop_bracket = temp_male
final_male_pop_bracket

Unnamed: 0,year,population,age_start,age_end,sex,state
0,2000,150609,0,5.0,Male,Alabama
1,2001,151410,0,5.0,Male,Alabama
2,2002,150856,0,5.0,Male,Alabama
3,2003,150594,0,5.0,Male,Alabama
4,2004,150699,0,5.0,Male,Alabama
...,...,...,...,...,...,...
285,2005,935077,15,44.0,Male,Alabama
286,2006,940951,15,44.0,Male,Alabama
287,2007,942708,15,44.0,Male,Alabama
288,2008,945061,15,44.0,Male,Alabama


In [24]:
(final_male_pop_bracket["population"] <= 0).sum()

np.int64(0)

In [25]:
column_summary(final_male_pop_bracket)

Unnamed: 0,col_name,col_dtype,num_of_nulls,num_of_non_nulls,num_of_distinct_values,distinct_values_counts
0,year,int64,0,290,10,"{2000: 29, 2001: 29, 2002: 29, 2003: 29, 2004:..."
1,population,object,0,290,290,"{948868: 1, 150609: 1, 1609934: 1, 1594431: 1,..."
2,age_start,int64,0,290,21,"{18: 30, 5: 20, 15: 20, 25: 20, 0: 20, 65: 20,..."
3,age_end,float64,0,290,21,"{inf: 40, 44.0: 30, 64.0: 30, 24.0: 20, 5.0: 1..."
4,sex,object,0,290,1,{'Male': 290}
5,state,object,0,290,1,{'Alabama': 290}


#### We've done our preprocessing on the male population age brackets now we have to this same preprocessing on the female demographic. We can achieve this by writing a function that implements our above prototype that not only does it to the male population but also that of the female one, adn combines the resulting dataframes into one single dataframe for easy collation

In [26]:
model_population_table(test_df, "Alabama", cols_to_remove, year_range="2000-2009")

Unnamed: 0,bracket,year,population,age_start,age_end,sex,state
0,under 5 years,2000,150609,0,5.0,male,Alabama
1,under 5 years,2001,151410,0,5.0,male,Alabama
2,under 5 years,2002,150856,0,5.0,male,Alabama
3,under 5 years,2003,150594,0,5.0,male,Alabama
4,under 5 years,2004,150699,0,5.0,male,Alabama
...,...,...,...,...,...,...,...
575,15 to 44 years,2005,950989,15,44.0,female,Alabama
576,15 to 44 years,2006,956058,15,44.0,female,Alabama
577,15 to 44 years,2007,958472,15,44.0,female,Alabama
578,15 to 44 years,2008,960680,15,44.0,female,Alabama


In [None]:
def concur_model_pop_tables(file, cols_to_remove, year_range, callback_fn=model_population_table):
    FILE_PATH = os.path.join(DATA_DIR, file)
    state = re.search(r"(^[A-Za-z]+)", file)
    state = "Unknown" if not state else state[0]

    # print(cols_to_remove)
    # print(year_range)
    # read excel file
    df = pd.read_excel(FILE_PATH, dtype=object, header=None)
    
    state_population = callback_fn(df, state, cols_to_remove, year_range=year_range)
    return state_population

In [28]:
with ThreadPoolExecutor() as exe:
    state_populations_by_sex_age_00_09 = list(exe.map(
        concur_model_pop_tables, 
        populations_by_sex_age_00_10, 
        [cols_to_remove] * len(populations_by_sex_age_00_10),
        ["2000-2009"] * len(populations_by_sex_age_00_10)
    ))

state_populations_by_sex_age_df_00_09 = pd.concat(state_populations_by_sex_age_00_09, axis=0, ignore_index=True)

In [29]:
state_populations_by_sex_age_df_00_09

Unnamed: 0,bracket,year,population,age_start,age_end,sex,state
0,under 5 years,2000,150609,0,5.0,male,Alabama
1,under 5 years,2001,151410,0,5.0,male,Alabama
2,under 5 years,2002,150856,0,5.0,male,Alabama
3,under 5 years,2003,150594,0,5.0,male,Alabama
4,under 5 years,2004,150699,0,5.0,male,Alabama
...,...,...,...,...,...,...,...
29575,15 to 44 years,2005,101425,15,44.0,female,Wyoming
29576,15 to 44 years,2006,101849,15,44.0,female,Wyoming
29577,15 to 44 years,2007,103196,15,44.0,female,Wyoming
29578,15 to 44 years,2008,104431,15,44.0,female,Wyoming


#### we don't save this modelled dataset as we will instead be uploading automatically using an orchestration tool like airflow to a data warehouse like databricks

In [31]:
column_summary(state_populations_by_sex_age_df_00_09)

Unnamed: 0,col_name,col_dtype,num_of_nulls,num_of_non_nulls,num_of_distinct_values,distinct_values_counts
0,bracket,object,0,29580,29,"{'under 5 years': 1020, '5 to 9 years': 1020, ..."
1,year,int64,0,29580,10,"{2000: 2958, 2001: 2958, 2002: 2958, 2003: 295..."
2,population,int64,0,29580,28128,"{149728: 4, 19485: 4, 82886: 4, 73095: 3, 1255..."
3,age_start,int64,0,29580,21,"{18: 3060, 5: 2040, 15: 2040, 25: 2040, 0: 204..."
4,age_end,float64,0,29580,21,"{inf: 4080, 44.0: 3060, 64.0: 3060, 24.0: 2040..."
5,sex,object,0,29580,2,"{'male': 14790, 'female': 14790}"
6,state,object,0,29580,46,"{'New': 2320, 'North': 1160, 'South': 1160, 'A..."


In [32]:
# take note this is just hte below five age bracket, 
# if we include all other age brackets we might have
# a bigger total population value per year
test = state_populations_by_sex_age_df_00_09.groupby(by=["year", "bracket", "sex", "state"]).agg(total_population=("population", "sum"))
test

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,total_population
year,bracket,sex,state,Unnamed: 4_level_1
2000,10 to 14 years,female,Alabama,156202
2000,10 to 14 years,female,Alaska,156202
2000,10 to 14 years,female,Arizona,27698
2000,10 to 14 years,female,Arkansas,185914
2000,10 to 14 years,female,California,94030
...,...,...,...,...
2009,under 5 years,male,Virginia,259590
2009,under 5 years,male,Washington,223929
2009,under 5 years,male,West,53543
2009,under 5 years,male,Wisconsin,183374


# Reading sample excel file for year 2010-2019

In [33]:
test_df_10_19 = pd.read_excel(os.path.join(DATA_DIR, "Alabama_pop_by_sex_and_age_2010-2019.xlsx"), dtype=object, header=None)
test_df_10_19

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,36
0,table with row headers in column A and column ...,,,,,,,,,,...,,,,,,,,,,
1,Annual Estimates of the Resident Population by...,,,,,,,,,,...,,,,,,,,,,
2,.Age,2010-04-01 00:00:00,,,,,,Population Estimate (as of July 1),,,...,,,,,,,,,,
3,,Census,,,Estimates Base,,,2010,,,...,,2017,,,2018,,,2019,,
4,,Total\nPopulation,Male,Female,Total\nPopulation,Male,Female,Total\nPopulation,Male,Female,...,Female,Total\nPopulation,Male,Female,Total\nPopulation,Male,Female,Total\nPopulation,Male,Female
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,Note: The estimates are based on the 2010 Cens...,,,,,,,,,,...,,,,,,,,,,
95,Suggested Citation:,,,,,,,,,,...,,,,,,,,,,
96,Annual Estimates of the Resident Population by...,,,,,,,,,,...,,,,,,,,,,
97,"Source: U.S. Census Bureau, Population Division",,,,,,,,,,...,,,,,,,,,,


In [34]:
start_index = test_df_10_19[test_df_10_19[0] == ".0"].index.to_list()[0]
start_index

6

In [35]:
end_index = test_df_10_19[test_df_10_19[0] == ".Median Age (years)"].index.to_list()[0]
end_index

93

#### Extract necessary rows

In [36]:
pop_brackets_10_19 = test_df_10_19.iloc[start_index: end_index]
pop_brackets_10_19

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,36
6,.0,60056,30465,29591,60056,30465,29591,59271,30156,29115,...,28978,58460,29701,28759,57853,29542,28311,56901,29080,27821
7,.1,59832,30587,29245,59832,30587,29245,59765,30481,29284,...,29268,59091,29940,29151,58664,29775,28889,58290,29711,28579
8,.2,62283,31607,30676,62283,31607,30676,62054,31545,30509,...,28909,59753,30298,29455,59507,30189,29318,59073,29954,29119
9,.3,61996,31725,30271,61998,31727,30271,62087,31737,30350,...,28496,59131,30103,29028,59981,30465,29516,59799,30366,29433
10,.4,60790,30881,29909,60791,30881,29910,61080,31090,29990,...,29070,58169,29636,28533,59290,30197,29093,60294,30604,29690
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,.82,18069,6760,11309,18077,6764,11313,17979,6775,11204,...,10904,19441,7867,11574,19627,7987,11640,19594,8020,11574
89,.83,16401,6056,10345,16401,6056,10345,16523,6088,10435,...,10066,16997,6832,10165,18047,7243,10804,18222,7337,10885
90,.84,14530,5200,9330,14537,5202,9335,14651,5266,9385,...,9539,15426,6016,9410,15689,6179,9510,16660,6559,10101
91,.85+,75684,22859,52825,75715,22864,52851,76243,23110,53133,...,58765,89262,29812,59450,90410,30582,59828,91543,31322,60221


#### remove duplicates

In [37]:
temp = pop_brackets_10_19.drop_duplicates()
temp

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,36
6,.0,60056,30465,29591,60056,30465,29591,59271,30156,29115,...,28978,58460,29701,28759,57853,29542,28311,56901,29080,27821
7,.1,59832,30587,29245,59832,30587,29245,59765,30481,29284,...,29268,59091,29940,29151,58664,29775,28889,58290,29711,28579
8,.2,62283,31607,30676,62283,31607,30676,62054,31545,30509,...,28909,59753,30298,29455,59507,30189,29318,59073,29954,29119
9,.3,61996,31725,30271,61998,31727,30271,62087,31737,30350,...,28496,59131,30103,29028,59981,30465,29516,59799,30366,29433
10,.4,60790,30881,29909,60791,30881,29910,61080,31090,29990,...,29070,58169,29636,28533,59290,30197,29093,60294,30604,29690
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,.82,18069,6760,11309,18077,6764,11313,17979,6775,11204,...,10904,19441,7867,11574,19627,7987,11640,19594,8020,11574
89,.83,16401,6056,10345,16401,6056,10345,16523,6088,10435,...,10066,16997,6832,10165,18047,7243,10804,18222,7337,10885
90,.84,14530,5200,9330,14537,5202,9335,14651,5266,9385,...,9539,15426,6016,9410,15689,6179,9510,16660,6559,10101
91,.85+,75684,22859,52825,75715,22864,52851,76243,23110,53133,...,58765,89262,29812,59450,90410,30582,59828,91543,31322,60221


#### remove rows with at least 5 nan values

In [38]:
temp = temp.dropna(thresh=5, axis=0)
temp

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,36
6,.0,60056,30465,29591,60056,30465,29591,59271,30156,29115,...,28978,58460,29701,28759,57853,29542,28311,56901,29080,27821
7,.1,59832,30587,29245,59832,30587,29245,59765,30481,29284,...,29268,59091,29940,29151,58664,29775,28889,58290,29711,28579
8,.2,62283,31607,30676,62283,31607,30676,62054,31545,30509,...,28909,59753,30298,29455,59507,30189,29318,59073,29954,29119
9,.3,61996,31725,30271,61998,31727,30271,62087,31737,30350,...,28496,59131,30103,29028,59981,30465,29516,59799,30366,29433
10,.4,60790,30881,29909,60791,30881,29910,61080,31090,29990,...,29070,58169,29636,28533,59290,30197,29093,60294,30604,29690
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,.81,18920,7346,11574,18925,7348,11577,19011,7420,11591,...,12383,21075,8660,12415,20996,8668,12328,22580,9291,13289
88,.82,18069,6760,11309,18077,6764,11313,17979,6775,11204,...,10904,19441,7867,11574,19627,7987,11640,19594,8020,11574
89,.83,16401,6056,10345,16401,6056,10345,16523,6088,10435,...,10066,16997,6832,10165,18047,7243,10804,18222,7337,10885
90,.84,14530,5200,9330,14537,5202,9335,14651,5266,9385,...,9539,15426,6016,9410,15689,6179,9510,16660,6559,10101


#### remove columns 1 to 7, then increment by 3

In [39]:
cols_to_remove = [1, 2, 3, 4, 5, 6] + list(range(7, temp.shape[1], 3))
cols_to_remove

[1, 2, 3, 4, 5, 6, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34]

In [40]:
temp = temp.drop(columns=cols_to_remove)
temp

Unnamed: 0,0,8,9,11,12,14,15,17,18,20,...,23,24,26,27,29,30,32,33,35,36
6,.0,30156,29115,30443,28723,29730,28787,29176,28220,29715,...,30028,29090,29856,28978,29701,28759,29542,28311,29080,27821
7,.1,30481,29284,30183,29203,30522,28724,29737,28879,29304,...,29846,28846,30152,29268,29940,29151,29775,28889,29711,28579
8,.2,31545,30509,30578,29399,30260,29164,30541,28832,29770,...,29463,28489,30047,28909,30298,29455,30189,29318,29954,29119
9,.3,31737,30350,31646,30543,30539,29354,30207,29249,30592,...,29896,28929,29530,28496,30103,29028,30465,29516,30366,29433
10,.4,31090,29990,31751,30360,31589,30566,30496,29358,30257,...,30557,28826,29903,29070,29636,28533,30197,29093,30604,29690
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,.81,7420,11591,7826,12017,7681,11847,7914,11684,7944,...,8018,11611,8528,12383,8660,12415,8668,12328,9291,13289
88,.82,6775,11204,6819,10824,7206,11257,7057,11064,7343,...,7257,10734,7409,10904,7867,11574,7987,11640,8020,11574
89,.83,6088,10435,6145,10485,6201,10099,6524,10500,6434,...,6677,10200,6666,10066,6832,10165,7243,10804,7337,10885
90,.84,5266,9385,5474,9704,5537,9761,5591,9376,5888,...,5855,9658,6054,9539,6016,9410,6179,9510,6559,10101


In [41]:
temp.index = temp[0]
temp

Unnamed: 0_level_0,0,8,9,11,12,14,15,17,18,20,...,23,24,26,27,29,30,32,33,35,36
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
.0,.0,30156,29115,30443,28723,29730,28787,29176,28220,29715,...,30028,29090,29856,28978,29701,28759,29542,28311,29080,27821
.1,.1,30481,29284,30183,29203,30522,28724,29737,28879,29304,...,29846,28846,30152,29268,29940,29151,29775,28889,29711,28579
.2,.2,31545,30509,30578,29399,30260,29164,30541,28832,29770,...,29463,28489,30047,28909,30298,29455,30189,29318,29954,29119
.3,.3,31737,30350,31646,30543,30539,29354,30207,29249,30592,...,29896,28929,29530,28496,30103,29028,30465,29516,30366,29433
.4,.4,31090,29990,31751,30360,31589,30566,30496,29358,30257,...,30557,28826,29903,29070,29636,28533,30197,29093,30604,29690
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
.81,.81,7420,11591,7826,12017,7681,11847,7914,11684,7944,...,8018,11611,8528,12383,8660,12415,8668,12328,9291,13289
.82,.82,6775,11204,6819,10824,7206,11257,7057,11064,7343,...,7257,10734,7409,10904,7867,11574,7987,11640,8020,11574
.83,.83,6088,10435,6145,10485,6201,10099,6524,10500,6434,...,6677,10200,6666,10066,6832,10165,7243,10804,7337,10885
.84,.84,5266,9385,5474,9704,5537,9761,5591,9376,5888,...,5855,9658,6054,9539,6016,9410,6179,9510,6559,10101


In [42]:
del temp[0]
temp

Unnamed: 0_level_0,8,9,11,12,14,15,17,18,20,21,23,24,26,27,29,30,32,33,35,36
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
.0,30156,29115,30443,28723,29730,28787,29176,28220,29715,28606,30028,29090,29856,28978,29701,28759,29542,28311,29080,27821
.1,30481,29284,30183,29203,30522,28724,29737,28879,29304,28392,29846,28846,30152,29268,29940,29151,29775,28889,29711,28579
.2,31545,30509,30578,29399,30260,29164,30541,28832,29770,28837,29463,28489,30047,28909,30298,29455,30189,29318,29954,29119
.3,31737,30350,31646,30543,30539,29354,30207,29249,30592,28835,29896,28929,29530,28496,30103,29028,30465,29516,30366,29433
.4,31090,29990,31751,30360,31589,30566,30496,29358,30257,29251,30557,28826,29903,29070,29636,28533,30197,29093,30604,29690
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
.81,7420,11591,7826,12017,7681,11847,7914,11684,7944,11471,8018,11611,8528,12383,8660,12415,8668,12328,9291,13289
.82,6775,11204,6819,10824,7206,11257,7057,11064,7343,10915,7257,10734,7409,10904,7867,11574,7987,11640,8020,11574
.83,6088,10435,6145,10485,6201,10099,6524,10500,6434,10360,6677,10200,6666,10066,6832,10165,7243,10804,7337,10885
.84,5266,9385,5474,9704,5537,9761,5591,9376,5888,9816,5855,9658,6054,9539,6016,9410,6179,9510,6559,10101


In [43]:
# generate and create multi index for columns
years = sorted(list(range(2010, 2020)) * 2)
genders = ["male", "female"] * 10
multi_index_list = list(zip(years, genders))
multi_index_list

[(2010, 'male'),
 (2010, 'female'),
 (2011, 'male'),
 (2011, 'female'),
 (2012, 'male'),
 (2012, 'female'),
 (2013, 'male'),
 (2013, 'female'),
 (2014, 'male'),
 (2014, 'female'),
 (2015, 'male'),
 (2015, 'female'),
 (2016, 'male'),
 (2016, 'female'),
 (2017, 'male'),
 (2017, 'female'),
 (2018, 'male'),
 (2018, 'female'),
 (2019, 'male'),
 (2019, 'female')]

In [44]:
multi_index = pd.MultiIndex.from_tuples(multi_index_list)
multi_index

MultiIndex([(2010,   'male'),
            (2010, 'female'),
            (2011,   'male'),
            (2011, 'female'),
            (2012,   'male'),
            (2012, 'female'),
            (2013,   'male'),
            (2013, 'female'),
            (2014,   'male'),
            (2014, 'female'),
            (2015,   'male'),
            (2015, 'female'),
            (2016,   'male'),
            (2016, 'female'),
            (2017,   'male'),
            (2017, 'female'),
            (2018,   'male'),
            (2018, 'female'),
            (2019,   'male'),
            (2019, 'female')],
           )

In [45]:
multi_index[0]

(np.int64(2010), 'male')

#### set multi indexed columns and delete index name of rows

In [46]:
temp.columns = multi_index
temp.index.name = "bracket"
temp

Unnamed: 0_level_0,2010,2010,2011,2011,2012,2012,2013,2013,2014,2014,2015,2015,2016,2016,2017,2017,2018,2018,2019,2019
Unnamed: 0_level_1,male,female,male,female,male,female,male,female,male,female,male,female,male,female,male,female,male,female,male,female
bracket,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
.0,30156,29115,30443,28723,29730,28787,29176,28220,29715,28606,30028,29090,29856,28978,29701,28759,29542,28311,29080,27821
.1,30481,29284,30183,29203,30522,28724,29737,28879,29304,28392,29846,28846,30152,29268,29940,29151,29775,28889,29711,28579
.2,31545,30509,30578,29399,30260,29164,30541,28832,29770,28837,29463,28489,30047,28909,30298,29455,30189,29318,29954,29119
.3,31737,30350,31646,30543,30539,29354,30207,29249,30592,28835,29896,28929,29530,28496,30103,29028,30465,29516,30366,29433
.4,31090,29990,31751,30360,31589,30566,30496,29358,30257,29251,30557,28826,29903,29070,29636,28533,30197,29093,30604,29690
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
.81,7420,11591,7826,12017,7681,11847,7914,11684,7944,11471,8018,11611,8528,12383,8660,12415,8668,12328,9291,13289
.82,6775,11204,6819,10824,7206,11257,7057,11064,7343,10915,7257,10734,7409,10904,7867,11574,7987,11640,8020,11574
.83,6088,10435,6145,10485,6201,10099,6524,10500,6434,10360,6677,10200,6666,10066,6832,10165,7243,10804,7337,10885
.84,5266,9385,5474,9704,5537,9761,5591,9376,5888,9816,5855,9658,6054,9539,6016,9410,6179,9510,6559,10101


#### now we will have to stack each row vertically on each other and because we have multi indexed columns we will need to stack it twice in order to make these column indeces now be the row indeces

In [47]:
temp = temp.stack().stack()
temp

  temp = temp.stack().stack()


bracket              
.0       female  2010    29115
                 2011    28723
                 2012    28787
                 2013    28220
                 2014    28606
                         ...  
.85+     male    2015    27676
                 2016    28611
                 2017    29812
                 2018    30582
                 2019    31322
Length: 1720, dtype: object

#### now we can reset the index such that these multi index rows now become columns of our new dataframe

In [48]:
temp = temp.reset_index()
temp

Unnamed: 0,bracket,level_1,level_2,0
0,.0,female,2010,29115
1,.0,female,2011,28723
2,.0,female,2012,28787
3,.0,female,2013,28220
4,.0,female,2014,28606
...,...,...,...,...
1715,.85+,male,2015,27676
1716,.85+,male,2016,28611
1717,.85+,male,2017,29812
1718,.85+,male,2018,30582


In [49]:
# rename the newly converted columns to bracket, sex, year, and population respectively
temp = temp.rename(columns={"level_1": "sex", "level_2": "year", 0: "population"})
temp

Unnamed: 0,bracket,sex,year,population
0,.0,female,2010,29115
1,.0,female,2011,28723
2,.0,female,2012,28787
3,.0,female,2013,28220
4,.0,female,2014,28606
...,...,...,...,...
1715,.85+,male,2015,27676
1716,.85+,male,2016,28611
1717,.85+,male,2017,29812
1718,.85+,male,2018,30582


In [50]:
age_ranges_00_10 = temp["bracket"].apply(helper).to_list()
age_ranges_00_10

[(nan, 0),
 (nan, 0),
 (nan, 0),
 (nan, 0),
 (nan, 0),
 (nan, 0),
 (nan, 0),
 (nan, 0),
 (nan, 0),
 (nan, 0),
 (nan, 0),
 (nan, 0),
 (nan, 0),
 (nan, 0),
 (nan, 0),
 (nan, 0),
 (nan, 0),
 (nan, 0),
 (nan, 0),
 (nan, 0),
 (nan, 1),
 (nan, 1),
 (nan, 1),
 (nan, 1),
 (nan, 1),
 (nan, 1),
 (nan, 1),
 (nan, 1),
 (nan, 1),
 (nan, 1),
 (nan, 1),
 (nan, 1),
 (nan, 1),
 (nan, 1),
 (nan, 1),
 (nan, 1),
 (nan, 1),
 (nan, 1),
 (nan, 1),
 (nan, 1),
 (nan, 2),
 (nan, 2),
 (nan, 2),
 (nan, 2),
 (nan, 2),
 (nan, 2),
 (nan, 2),
 (nan, 2),
 (nan, 2),
 (nan, 2),
 (nan, 2),
 (nan, 2),
 (nan, 2),
 (nan, 2),
 (nan, 2),
 (nan, 2),
 (nan, 2),
 (nan, 2),
 (nan, 2),
 (nan, 2),
 (nan, 3),
 (nan, 3),
 (nan, 3),
 (nan, 3),
 (nan, 3),
 (nan, 3),
 (nan, 3),
 (nan, 3),
 (nan, 3),
 (nan, 3),
 (nan, 3),
 (nan, 3),
 (nan, 3),
 (nan, 3),
 (nan, 3),
 (nan, 3),
 (nan, 3),
 (nan, 3),
 (nan, 3),
 (nan, 3),
 (nan, 4),
 (nan, 4),
 (nan, 4),
 (nan, 4),
 (nan, 4),
 (nan, 4),
 (nan, 4),
 (nan, 4),
 (nan, 4),
 (nan, 4),
 (nan, 4),

In [51]:
temp["age_start"], temp["age_end"] = list(zip(*age_ranges_00_10))
temp

Unnamed: 0,bracket,sex,year,population,age_start,age_end
0,.0,female,2010,29115,,0.0
1,.0,female,2011,28723,,0.0
2,.0,female,2012,28787,,0.0
3,.0,female,2013,28220,,0.0
4,.0,female,2014,28606,,0.0
...,...,...,...,...,...,...
1715,.85+,male,2015,27676,85.0,inf
1716,.85+,male,2016,28611,85.0,inf
1717,.85+,male,2017,29812,85.0,inf
1718,.85+,male,2018,30582,85.0,inf


In [52]:
temp["state"] = "Alabama"
temp

Unnamed: 0,bracket,sex,year,population,age_start,age_end,state
0,.0,female,2010,29115,,0.0,Alabama
1,.0,female,2011,28723,,0.0,Alabama
2,.0,female,2012,28787,,0.0,Alabama
3,.0,female,2013,28220,,0.0,Alabama
4,.0,female,2014,28606,,0.0,Alabama
...,...,...,...,...,...,...,...
1715,.85+,male,2015,27676,85.0,inf,Alabama
1716,.85+,male,2016,28611,85.0,inf,Alabama
1717,.85+,male,2017,29812,85.0,inf,Alabama
1718,.85+,male,2018,30582,85.0,inf,Alabama


In [53]:
model_population_table(test_df_10_19, "Alabama", cols_to_remove, year_range="2010-2019")

Unnamed: 0,bracket,sex,year,population,age_start,age_end,state
0,0,male,2010,30156.0,,0.0,Alabama
1,0,male,2011,30443.0,,0.0,Alabama
2,0,male,2012,29730.0,,0.0,Alabama
3,0,male,2013,29176.0,,0.0,Alabama
4,0,male,2014,29715.0,,0.0,Alabama
...,...,...,...,...,...,...,...
1715,85+,female,2015,57723.0,85.0,inf,Alabama
1716,85+,female,2016,58765.0,85.0,inf,Alabama
1717,85+,female,2017,59450.0,85.0,inf,Alabama
1718,85+,female,2018,59828.0,85.0,inf,Alabama


In [54]:
with ThreadPoolExecutor() as exe:
    state_populations_by_sex_age_10_19 = list(exe.map(
        concur_model_pop_tables, 
        populations_by_sex_age_10_19, 
        [cols_to_remove] * len(populations_by_sex_age_10_19),
        ["2010-2019"] * len(populations_by_sex_age_10_19)
    ))

state_populations_by_sex_age_df_10_19 = pd.concat(state_populations_by_sex_age_10_19, axis=0, ignore_index=True)

In [55]:
state_populations_by_sex_age_df_10_19

Unnamed: 0,bracket,sex,year,population,age_start,age_end,state
0,0,male,2010,30156.0,,0.0,Alabama
1,0,male,2011,30443.0,,0.0,Alabama
2,0,male,2012,29730.0,,0.0,Alabama
3,0,male,2013,29176.0,,0.0,Alabama
4,0,male,2014,29715.0,,0.0,Alabama
...,...,...,...,...,...,...,...
87715,85+,female,2015,6371.0,85.0,inf,Alabama
87716,85+,female,2016,6543.0,85.0,inf,Alabama
87717,85+,female,2017,6741.0,85.0,inf,Alabama
87718,85+,female,2018,6801.0,85.0,inf,Alabama


In [56]:
column_summary(state_populations_by_sex_age_df_10_19)

Unnamed: 0,col_name,col_dtype,num_of_nulls,num_of_non_nulls,num_of_distinct_values,distinct_values_counts
0,bracket,object,0,87720,86,"{'0': 1020, '1': 1020, '2': 1020, '3': 1020, '..."
1,sex,object,0,87720,2,"{'male': 43860, 'female': 43860}"
2,year,int64,0,87720,10,"{2010: 8772, 2011: 8772, 2012: 8772, 2013: 877..."
3,population,float64,0,87720,50663,"{5774.0: 12, 6118.0: 12, 5761.0: 12, 6145.0: 1..."
4,age_start,float64,86700,1020,1,{85.0: 1020}
5,age_end,float64,0,87720,86,"{0.0: 1020, 1.0: 1020, 2.0: 1020, 3.0: 1020, 4..."
6,state,object,0,87720,1,{'Alabama': 87720}


#### again we don't save this modelled dataset as we will instead be uploading automatically using an orchestration tool like airflow to a data warehouse like databricks

# reading sample excel file from year 2020-2023

In [58]:
test_df_20_23 = pd.read_excel(os.path.join(DATA_DIR, "Alabama_pop_by_sex_and_age_2020-2023.xlsx"), dtype=object, header=None)
test_df_20_23

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,table with row headers in column A and column ...,,,,,,,,,,,,,,,
1,Annual Estimates of the Resident Population by...,,,,,,,,,,,,,,,
2,Age,"April 1, 2020 Estimates Base",,,Population Estimate (as of July 1),,,,,,,,,,,
3,,,,,2020,,,2021,,,2022,,,2023,,
4,,Total\nPopulation,Male,Female,Total\nPopulation,Male,Female,Total\nPopulation,Male,Female,Total\nPopulation,Male,Female,Total\nPopulation,Male,Female
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Notes: The estimates are developed from a base...,,,,,,,,,,,,,,,
96,Suggested Citation:,,,,,,,,,,,,,,,
97,Annual Estimates of the Resident Population by...,,,,,,,,,,,,,,,
98,"Source: U.S. Census Bureau, Population Division",,,,,,,,,,,,,,,


#### clearly we now know we can discard columns 1, 2, 3, 4, 7, 10, and 13

In [59]:
cols_to_remove = [1, 2, 3, 4] + list(range(7, test_df_20_23.shape[1], 3))
cols_to_remove

[1, 2, 3, 4, 7, 10, 13]

In [60]:
model_population_table(test_df_20_23, "Alabama", cols_to_remove, year_range="2020-2023")

Unnamed: 0,bracket,sex,year,population,age_start,age_end,state
0,0,male,2020,29226.0,,0.0,Alabama
1,0,male,2021,28809.0,,0.0,Alabama
2,0,male,2022,29383.0,,0.0,Alabama
3,0,male,2023,29459.0,,0.0,Alabama
4,0,female,2020,27808.0,,0.0,Alabama
...,...,...,...,...,...,...,...
683,85+,male,2023,31941.0,85.0,inf,Alabama
684,85+,female,2020,57072.0,85.0,inf,Alabama
685,85+,female,2021,56638.0,85.0,inf,Alabama
686,85+,female,2022,56924.0,85.0,inf,Alabama


In [61]:
with ThreadPoolExecutor() as exe:
    state_populations_by_sex_age_20_23 = list(exe.map(
        concur_model_pop_tables, 
        populations_by_sex_age_20_23, 
        [cols_to_remove] * len(populations_by_sex_age_20_23),
        ["2020-2023"] * len(populations_by_sex_age_20_23)
    ))

state_populations_by_sex_age_df_20_23 = pd.concat(state_populations_by_sex_age_20_23, axis=0, ignore_index=True)

In [62]:
state_populations_by_sex_age_df_20_23

Unnamed: 0,bracket,sex,year,population,age_start,age_end,state
0,0,male,2020,29226.0,,0.0,Alabama
1,0,male,2021,28809.0,,0.0,Alabama
2,0,male,2022,29383.0,,0.0,Alabama
3,0,male,2023,29459.0,,0.0,Alabama
4,0,female,2020,27808.0,,0.0,Alabama
...,...,...,...,...,...,...,...
35083,85+,male,2023,4158.0,85.0,inf,Alabama
35084,85+,female,2020,6228.0,85.0,inf,Alabama
35085,85+,female,2021,6201.0,85.0,inf,Alabama
35086,85+,female,2022,6131.0,85.0,inf,Alabama


#### we don't save this modelled dataset as we will instead be uploading automatically using an orchestration tool like airflow to a data warehouse like databricks

# Modelling excel spreadsheets with population values based on sex, race, and hispanic origin 2000 - 2009

![modelling table from population data by sex race and ethnicity 2000 to 2009.png](./figures%20&%20images/modelling%20table%20from%20population%20data%20by%20sex%20race%20and%20ethnicity%202000%20to%202009.png)

In [64]:
test_df_00_10 = pd.read_excel(os.path.join(DATA_DIR, "Alabama_pop_by_sex_race_and_ho_2000-2010.xls"), dtype=object, header=None)
test_df_00_10

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,table with row headers in column A and column ...,,,,,,,,,,,,,
1,Table 3. Intercensal Estimates of the Resident...,,,,,,,,,,,,,
2,"Sex, Race, and Hispanic Origin","April 1, 20001",Intercensal Estimates (as of July 1),,,,,,,,,,"April 1, 20102","July 1, 20103"
3,,,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,,
4,BOTH SEXES,4447207,4452173,4467634,4480089,4503491,4530729,4569805,4628981,4672840,4718206,4757938,4779736,4785298
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,Note: Hispanic origin is considered an ethnici...,,,,,,,,,,,,,
80,Suggested Citation:,,,,,,,,,,,,,
81,Table 3. Intercensal Estimates of the Resident...,,,,,,,,,,,,,
82,"Source: U.S. Census Bureau, Population Division",,,,,,,,,,,,,


#### delete columns 1 and 12

In [65]:
cols_to_remove = [1, 12]
temp = test_df_00_10.drop(columns=cols_to_remove)
temp = temp.rename(columns={0: "ethnicity", 2: 2000, 3: 2001, 4: 2002, 5: 2003, 6: 2004, 7: 2005, 8: 2006, 9: 2007, 10: 2008, 11: 2009, 13: 2010})
temp

Unnamed: 0,ethnicity,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010
0,table with row headers in column A and column ...,,,,,,,,,,,
1,Table 3. Intercensal Estimates of the Resident...,,,,,,,,,,,
2,"Sex, Race, and Hispanic Origin",Intercensal Estimates (as of July 1),,,,,,,,,,"July 1, 20103"
3,,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,
4,BOTH SEXES,4452173,4467634,4480089,4503491,4530729,4569805,4628981,4672840,4718206,4757938,4785298
...,...,...,...,...,...,...,...,...,...,...,...,...
79,Note: Hispanic origin is considered an ethnici...,,,,,,,,,,,
80,Suggested Citation:,,,,,,,,,,,
81,Table 3. Intercensal Estimates of the Resident...,,,,,,,,,,,
82,"Source: U.S. Census Bureau, Population Division",,,,,,,,,,,


In [66]:
temp["ethnicity"] = temp["ethnicity"].apply(lambda string: np.nan if pd.isna(string) else string.strip(".").lower())
temp

Unnamed: 0,ethnicity,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010
0,table with row headers in column a and column ...,,,,,,,,,,,
1,table 3. intercensal estimates of the resident...,,,,,,,,,,,
2,"sex, race, and hispanic origin",Intercensal Estimates (as of July 1),,,,,,,,,,"July 1, 20103"
3,,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,
4,both sexes,4452173,4467634,4480089,4503491,4530729,4569805,4628981,4672840,4718206,4757938,4785298
...,...,...,...,...,...,...,...,...,...,...,...,...
79,note: hispanic origin is considered an ethnici...,,,,,,,,,,,
80,suggested citation:,,,,,,,,,,,
81,table 3. intercensal estimates of the resident...,,,,,,,,,,,
82,"source: u.s. census bureau, population division",,,,,,,,,,,


#### start partitioning the spreadsheet by its important rows like the sex, and whether or not it is of hispanic origin

In [67]:
male_start = temp.index[temp["ethnicity"] == "male"].to_list()[0]
male_start

28

In [68]:
female_start = temp.index[temp["ethnicity"] == "female"].to_list()[0]
female_start

52

In [69]:
temp.iloc[75:]

Unnamed: 0,ethnicity,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010
75,two or more races,695.0,873.0,1032.0,1271.0,1439.0,1674.0,1894.0,2129.0,2358.0,2583.0,2871.0
76,"1 the april 1, 2000 population estimates base ...",,,,,,,,,,,
77,"2 the data source for april 1, 2010 is the 201...",,,,,,,,,,,
78,"3 the values for july 1, 2010 were produced by...",,,,,,,,,,,
79,note: hispanic origin is considered an ethnici...,,,,,,,,,,,
80,suggested citation:,,,,,,,,,,,
81,table 3. intercensal estimates of the resident...,,,,,,,,,,,
82,"source: u.s. census bureau, population division",,,,,,,,,,,
83,release date: october 2012,,,,,,,,,,,


In [70]:
# since there are multiple indeces with the two 
# or more races value we need to pick out the last value
female_end = temp.index[temp["ethnicity"] == "two or more races"].to_list()[-1]
female_end

75

In [71]:
male_pop_bracket = temp.iloc[male_start:female_start].reset_index(drop=True)
male_pop_bracket

Unnamed: 0,ethnicity,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010
0,male,2149338,2158138,2165719,2179422,2192872,2213382,2243501,2265565,2287949,2309779,2323317
1,one race,2131735,2139363,2145750,2158203,2170339,2189399,2218038,2238614,2259304,2279448,2291585
2,white,1564949,1568320,1570843,1577854,1584510,1596161,1613829,1627055,1639481,1651374,1656912
3,black,538795,541736,543961,547635,551123,556584,565542,571152,577562,583504,588483
4,aian,11750,12178,12671,13154,13566,14031,14571,15008,15633,16196,16854
5,asian,15375,16047,17020,18080,19472,20771,21981,23122,24086,25653,26365
6,nhpi,866,1082,1255,1480,1668,1852,2115,2277,2542,2721,2971
7,two or more races,17603,18775,19969,21219,22533,23983,25463,26951,28645,30331,31732
8,not hispanic,2105469,2109450,2112343,2120877,2128979,2143111,2166023,2181290,2196499,2210995,2218813
9,one race,2088598,2091638,2093575,2101107,2108101,2121100,2142783,2156794,2170625,2183729,2190418


In [72]:
male_non_hisp_start = male_pop_bracket.index[male_pop_bracket["ethnicity"] == "not hispanic"].to_list()[-1]
male_non_hisp_start

8

In [73]:
male_hisp_start = male_pop_bracket.index[male_pop_bracket["ethnicity"] == "hispanic"].to_list()[-1]
male_hisp_start

16

In [74]:
male_hisp_end = male_pop_bracket.index[male_pop_bracket["ethnicity"] == "two or more races"].to_list()[-1]
male_hisp_end

23

In [75]:
male_non_hisp_pop_bracket = male_pop_bracket.iloc[male_non_hisp_start + 2:male_hisp_start].reset_index(drop=True)
male_non_hisp_pop_bracket

Unnamed: 0,ethnicity,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010
0,white,1526743,1526185,1524987,1527793,1530095,1536529,1548231,1555756,1562282,1568357,1569557
1,black,535274,538069,540038,543496,546710,551811,560325,565599,571562,577049,581540
2,aian,10993,11143,11336,11530,11664,11839,12057,12198,12439,12573,12848
3,asian,15059,15663,16605,17609,18923,20177,21333,22373,23384,24794,25418
4,nhpi,529,578,609,679,709,744,837,868,958,956,1055
5,two or more races,16871,17812,18768,19770,20878,22011,23240,24496,25874,27266,28395


In [76]:
male_non_hisp_pop_bracket["origin"] = "non-hispanic"
male_non_hisp_pop_bracket

Unnamed: 0,ethnicity,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,origin
0,white,1526743,1526185,1524987,1527793,1530095,1536529,1548231,1555756,1562282,1568357,1569557,non-hispanic
1,black,535274,538069,540038,543496,546710,551811,560325,565599,571562,577049,581540,non-hispanic
2,aian,10993,11143,11336,11530,11664,11839,12057,12198,12439,12573,12848,non-hispanic
3,asian,15059,15663,16605,17609,18923,20177,21333,22373,23384,24794,25418,non-hispanic
4,nhpi,529,578,609,679,709,744,837,868,958,956,1055,non-hispanic
5,two or more races,16871,17812,18768,19770,20878,22011,23240,24496,25874,27266,28395,non-hispanic


In [77]:
male_non_hisp_pop_bracket["sex"] = "male"
male_non_hisp_pop_bracket

Unnamed: 0,ethnicity,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,origin,sex
0,white,1526743,1526185,1524987,1527793,1530095,1536529,1548231,1555756,1562282,1568357,1569557,non-hispanic,male
1,black,535274,538069,540038,543496,546710,551811,560325,565599,571562,577049,581540,non-hispanic,male
2,aian,10993,11143,11336,11530,11664,11839,12057,12198,12439,12573,12848,non-hispanic,male
3,asian,15059,15663,16605,17609,18923,20177,21333,22373,23384,24794,25418,non-hispanic,male
4,nhpi,529,578,609,679,709,744,837,868,958,956,1055,non-hispanic,male
5,two or more races,16871,17812,18768,19770,20878,22011,23240,24496,25874,27266,28395,non-hispanic,male


In [78]:
male_non_hisp_pop_bracket = male_non_hisp_pop_bracket.set_index(keys=["ethnicity", "origin", "sex"])
male_non_hisp_pop_bracket

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010
ethnicity,origin,sex,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
white,non-hispanic,male,1526743,1526185,1524987,1527793,1530095,1536529,1548231,1555756,1562282,1568357,1569557
black,non-hispanic,male,535274,538069,540038,543496,546710,551811,560325,565599,571562,577049,581540
aian,non-hispanic,male,10993,11143,11336,11530,11664,11839,12057,12198,12439,12573,12848
asian,non-hispanic,male,15059,15663,16605,17609,18923,20177,21333,22373,23384,24794,25418
nhpi,non-hispanic,male,529,578,609,679,709,744,837,868,958,956,1055
two or more races,non-hispanic,male,16871,17812,18768,19770,20878,22011,23240,24496,25874,27266,28395


In [79]:
male_non_hisp_pop_bracket = male_non_hisp_pop_bracket.stack().reset_index()
male_non_hisp_pop_bracket

Unnamed: 0,ethnicity,origin,sex,level_3,0
0,white,non-hispanic,male,2000,1526743
1,white,non-hispanic,male,2001,1526185
2,white,non-hispanic,male,2002,1524987
3,white,non-hispanic,male,2003,1527793
4,white,non-hispanic,male,2004,1530095
...,...,...,...,...,...
61,two or more races,non-hispanic,male,2006,23240
62,two or more races,non-hispanic,male,2007,24496
63,two or more races,non-hispanic,male,2008,25874
64,two or more races,non-hispanic,male,2009,27266


In [80]:
male_non_hisp_pop_bracket = male_non_hisp_pop_bracket.rename(columns={"level_3": "year", 0: "population"})
male_non_hisp_pop_bracket

Unnamed: 0,ethnicity,origin,sex,year,population
0,white,non-hispanic,male,2000,1526743
1,white,non-hispanic,male,2001,1526185
2,white,non-hispanic,male,2002,1524987
3,white,non-hispanic,male,2003,1527793
4,white,non-hispanic,male,2004,1530095
...,...,...,...,...,...
61,two or more races,non-hispanic,male,2006,23240
62,two or more races,non-hispanic,male,2007,24496
63,two or more races,non-hispanic,male,2008,25874
64,two or more races,non-hispanic,male,2009,27266


In [81]:
male_non_hisp_pop_bracket["population"] = male_non_hisp_pop_bracket["population"].astype(int)
male_non_hisp_pop_bracket

Unnamed: 0,ethnicity,origin,sex,year,population
0,white,non-hispanic,male,2000,1526743
1,white,non-hispanic,male,2001,1526185
2,white,non-hispanic,male,2002,1524987
3,white,non-hispanic,male,2003,1527793
4,white,non-hispanic,male,2004,1530095
...,...,...,...,...,...
61,two or more races,non-hispanic,male,2006,23240
62,two or more races,non-hispanic,male,2007,24496
63,two or more races,non-hispanic,male,2008,25874
64,two or more races,non-hispanic,male,2009,27266


In [82]:
column_summary(male_non_hisp_pop_bracket)

Unnamed: 0,col_name,col_dtype,num_of_nulls,num_of_non_nulls,num_of_distinct_values,distinct_values_counts
0,ethnicity,object,0,66,6,"{'white': 11, 'black': 11, 'aian': 11, 'asian'..."
1,origin,object,0,66,1,{'non-hispanic': 66}
2,sex,object,0,66,1,{'male': 66}
3,year,int64,0,66,11,"{2000: 6, 2001: 6, 2002: 6, 2003: 6, 2004: 6, ..."
4,population,int64,0,66,66,"{1526743: 1, 1526185: 1, 1524987: 1, 1527793: ..."


In [83]:
male_hisp_pop_bracket = male_pop_bracket.iloc[male_hisp_start + 2:].reset_index(drop=True)
male_hisp_pop_bracket

Unnamed: 0,ethnicity,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010
0,white,38206,42135,45856,50061,54415,59632,65598,71299,77199,83017,87355
1,black,3521,3667,3923,4139,4413,4773,5217,5553,6000,6455,6943
2,aian,757,1035,1335,1624,1902,2192,2514,2810,3194,3623,4006
3,asian,316,384,415,471,549,594,648,749,702,859,947
4,nhpi,337,504,646,801,959,1108,1278,1409,1584,1765,1916
5,two or more races,732,963,1201,1449,1655,1972,2223,2455,2771,3065,3337


In [84]:
male_hisp_pop_bracket["origin"] = "hispanic"
male_hisp_pop_bracket

Unnamed: 0,ethnicity,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,origin
0,white,38206,42135,45856,50061,54415,59632,65598,71299,77199,83017,87355,hispanic
1,black,3521,3667,3923,4139,4413,4773,5217,5553,6000,6455,6943,hispanic
2,aian,757,1035,1335,1624,1902,2192,2514,2810,3194,3623,4006,hispanic
3,asian,316,384,415,471,549,594,648,749,702,859,947,hispanic
4,nhpi,337,504,646,801,959,1108,1278,1409,1584,1765,1916,hispanic
5,two or more races,732,963,1201,1449,1655,1972,2223,2455,2771,3065,3337,hispanic


In [85]:
male_hisp_pop_bracket["sex"] = "male"
male_hisp_pop_bracket

Unnamed: 0,ethnicity,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,origin,sex
0,white,38206,42135,45856,50061,54415,59632,65598,71299,77199,83017,87355,hispanic,male
1,black,3521,3667,3923,4139,4413,4773,5217,5553,6000,6455,6943,hispanic,male
2,aian,757,1035,1335,1624,1902,2192,2514,2810,3194,3623,4006,hispanic,male
3,asian,316,384,415,471,549,594,648,749,702,859,947,hispanic,male
4,nhpi,337,504,646,801,959,1108,1278,1409,1584,1765,1916,hispanic,male
5,two or more races,732,963,1201,1449,1655,1972,2223,2455,2771,3065,3337,hispanic,male


In [86]:
male_hisp_pop_bracket = male_hisp_pop_bracket.set_index(keys=["ethnicity", "origin", "sex"])
male_hisp_pop_bracket

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010
ethnicity,origin,sex,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
white,hispanic,male,38206,42135,45856,50061,54415,59632,65598,71299,77199,83017,87355
black,hispanic,male,3521,3667,3923,4139,4413,4773,5217,5553,6000,6455,6943
aian,hispanic,male,757,1035,1335,1624,1902,2192,2514,2810,3194,3623,4006
asian,hispanic,male,316,384,415,471,549,594,648,749,702,859,947
nhpi,hispanic,male,337,504,646,801,959,1108,1278,1409,1584,1765,1916
two or more races,hispanic,male,732,963,1201,1449,1655,1972,2223,2455,2771,3065,3337


In [87]:
male_hisp_pop_bracket = male_hisp_pop_bracket.stack().reset_index()
male_hisp_pop_bracket

Unnamed: 0,ethnicity,origin,sex,level_3,0
0,white,hispanic,male,2000,38206
1,white,hispanic,male,2001,42135
2,white,hispanic,male,2002,45856
3,white,hispanic,male,2003,50061
4,white,hispanic,male,2004,54415
...,...,...,...,...,...
61,two or more races,hispanic,male,2006,2223
62,two or more races,hispanic,male,2007,2455
63,two or more races,hispanic,male,2008,2771
64,two or more races,hispanic,male,2009,3065


In [88]:
male_hisp_pop_bracket = male_hisp_pop_bracket.rename(columns={"level_3": "year", 0: "population"})
male_hisp_pop_bracket

Unnamed: 0,ethnicity,origin,sex,year,population
0,white,hispanic,male,2000,38206
1,white,hispanic,male,2001,42135
2,white,hispanic,male,2002,45856
3,white,hispanic,male,2003,50061
4,white,hispanic,male,2004,54415
...,...,...,...,...,...
61,two or more races,hispanic,male,2006,2223
62,two or more races,hispanic,male,2007,2455
63,two or more races,hispanic,male,2008,2771
64,two or more races,hispanic,male,2009,3065


In [89]:
male_hisp_pop_bracket["population"] = male_hisp_pop_bracket["population"].astype(int)
male_hisp_pop_bracket

Unnamed: 0,ethnicity,origin,sex,year,population
0,white,hispanic,male,2000,38206
1,white,hispanic,male,2001,42135
2,white,hispanic,male,2002,45856
3,white,hispanic,male,2003,50061
4,white,hispanic,male,2004,54415
...,...,...,...,...,...
61,two or more races,hispanic,male,2006,2223
62,two or more races,hispanic,male,2007,2455
63,two or more races,hispanic,male,2008,2771
64,two or more races,hispanic,male,2009,3065


In [90]:
column_summary(male_hisp_pop_bracket)

Unnamed: 0,col_name,col_dtype,num_of_nulls,num_of_non_nulls,num_of_distinct_values,distinct_values_counts
0,ethnicity,object,0,66,6,"{'white': 11, 'black': 11, 'aian': 11, 'asian'..."
1,origin,object,0,66,1,{'hispanic': 66}
2,sex,object,0,66,1,{'male': 66}
3,year,int64,0,66,11,"{2000: 6, 2001: 6, 2002: 6, 2003: 6, 2004: 6, ..."
4,population,int64,0,66,66,"{38206: 1, 42135: 1, 45856: 1, 50061: 1, 54415..."


In [91]:
female_pop_bracket = temp.iloc[female_start:female_end + 1].reset_index(drop=True)
female_pop_bracket

Unnamed: 0,ethnicity,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010
0,female,2302835,2309496,2314370,2324069,2337857,2356423,2385480,2407275,2430257,2448159,2461981
1,one race,2284008,2289445,2293249,2301748,2314258,2331408,2358905,2379127,2400298,2416471,2428941
2,white,1631926,1633067,1633436,1637225,1643262,1653282,1669227,1680950,1692660,1701870,1707553
3,black,622659,625667,627711,630763,635252,640478,649939,656468,664051,668803,673858
4,aian,11512,11906,12295,12766,13189,13575,14095,14658,15089,15629,16154
5,asian,17169,17924,18790,19849,21240,22635,24049,25332,26597,28094,29145
6,nhpi,742,881,1017,1145,1315,1438,1595,1719,1901,2075,2231
7,two or more races,18827,20051,21121,22321,23599,25015,26575,28148,29959,31688,33040
8,not hispanic,2269688,2272462,2273567,2279142,2288466,2301872,2325116,2340898,2357998,2370425,2379638
9,one race,2251556,2253284,2253478,2258092,2266306,2278531,2300435,2314879,2330397,2341320,2349469


In [92]:
female_non_hisp_start = female_pop_bracket.index[female_pop_bracket["ethnicity"] == "not hispanic"].to_list()[-1]
female_non_hisp_start

8

In [93]:
female_hisp_start = female_pop_bracket.index[female_pop_bracket["ethnicity"] == "hispanic"].to_list()[-1]
female_hisp_start

16

In [94]:
female_non_hisp_pop_bracket = female_pop_bracket.iloc[female_non_hisp_start + 2:female_hisp_start].reset_index(drop=True)
female_non_hisp_pop_bracket

Unnamed: 0,ethnicity,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010
0,white,1604285,1602254,1599464,1599903,1602170,1607868,1618826,1625403,1632261,1637009,1639059
1,black,618835,621680,623616,626494,630827,635793,645025,651326,658530,662967,667757
2,aian,10963,11140,11313,11572,11784,11960,12246,12588,12775,12996,13262
3,asian,16919,17627,18461,19476,20810,22185,23557,24771,25987,27450,28423
4,nhpi,554,583,624,647,715,725,781,791,844,898,968
5,two or more races,18132,19178,20089,21050,22160,23341,24681,26019,27601,29105,30169


In [95]:
female_non_hisp_pop_bracket["origin"] = "non-hispanic"
female_non_hisp_pop_bracket

Unnamed: 0,ethnicity,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,origin
0,white,1604285,1602254,1599464,1599903,1602170,1607868,1618826,1625403,1632261,1637009,1639059,non-hispanic
1,black,618835,621680,623616,626494,630827,635793,645025,651326,658530,662967,667757,non-hispanic
2,aian,10963,11140,11313,11572,11784,11960,12246,12588,12775,12996,13262,non-hispanic
3,asian,16919,17627,18461,19476,20810,22185,23557,24771,25987,27450,28423,non-hispanic
4,nhpi,554,583,624,647,715,725,781,791,844,898,968,non-hispanic
5,two or more races,18132,19178,20089,21050,22160,23341,24681,26019,27601,29105,30169,non-hispanic


In [96]:
female_non_hisp_pop_bracket["sex"] = "female"
female_non_hisp_pop_bracket

Unnamed: 0,ethnicity,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,origin,sex
0,white,1604285,1602254,1599464,1599903,1602170,1607868,1618826,1625403,1632261,1637009,1639059,non-hispanic,female
1,black,618835,621680,623616,626494,630827,635793,645025,651326,658530,662967,667757,non-hispanic,female
2,aian,10963,11140,11313,11572,11784,11960,12246,12588,12775,12996,13262,non-hispanic,female
3,asian,16919,17627,18461,19476,20810,22185,23557,24771,25987,27450,28423,non-hispanic,female
4,nhpi,554,583,624,647,715,725,781,791,844,898,968,non-hispanic,female
5,two or more races,18132,19178,20089,21050,22160,23341,24681,26019,27601,29105,30169,non-hispanic,female


In [97]:
female_non_hisp_pop_bracket = female_non_hisp_pop_bracket.set_index(keys=["ethnicity", "origin", "sex"])
female_non_hisp_pop_bracket

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010
ethnicity,origin,sex,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
white,non-hispanic,female,1604285,1602254,1599464,1599903,1602170,1607868,1618826,1625403,1632261,1637009,1639059
black,non-hispanic,female,618835,621680,623616,626494,630827,635793,645025,651326,658530,662967,667757
aian,non-hispanic,female,10963,11140,11313,11572,11784,11960,12246,12588,12775,12996,13262
asian,non-hispanic,female,16919,17627,18461,19476,20810,22185,23557,24771,25987,27450,28423
nhpi,non-hispanic,female,554,583,624,647,715,725,781,791,844,898,968
two or more races,non-hispanic,female,18132,19178,20089,21050,22160,23341,24681,26019,27601,29105,30169


In [98]:
female_non_hisp_pop_bracket = female_non_hisp_pop_bracket.stack().reset_index()
female_non_hisp_pop_bracket

Unnamed: 0,ethnicity,origin,sex,level_3,0
0,white,non-hispanic,female,2000,1604285
1,white,non-hispanic,female,2001,1602254
2,white,non-hispanic,female,2002,1599464
3,white,non-hispanic,female,2003,1599903
4,white,non-hispanic,female,2004,1602170
...,...,...,...,...,...
61,two or more races,non-hispanic,female,2006,24681
62,two or more races,non-hispanic,female,2007,26019
63,two or more races,non-hispanic,female,2008,27601
64,two or more races,non-hispanic,female,2009,29105


In [99]:
female_non_hisp_pop_bracket = female_non_hisp_pop_bracket.rename(columns={"level_3": "year", 0: "population"})
female_non_hisp_pop_bracket

Unnamed: 0,ethnicity,origin,sex,year,population
0,white,non-hispanic,female,2000,1604285
1,white,non-hispanic,female,2001,1602254
2,white,non-hispanic,female,2002,1599464
3,white,non-hispanic,female,2003,1599903
4,white,non-hispanic,female,2004,1602170
...,...,...,...,...,...
61,two or more races,non-hispanic,female,2006,24681
62,two or more races,non-hispanic,female,2007,26019
63,two or more races,non-hispanic,female,2008,27601
64,two or more races,non-hispanic,female,2009,29105


In [100]:
female_non_hisp_pop_bracket["population"] = female_non_hisp_pop_bracket["population"].astype(int)
female_non_hisp_pop_bracket

Unnamed: 0,ethnicity,origin,sex,year,population
0,white,non-hispanic,female,2000,1604285
1,white,non-hispanic,female,2001,1602254
2,white,non-hispanic,female,2002,1599464
3,white,non-hispanic,female,2003,1599903
4,white,non-hispanic,female,2004,1602170
...,...,...,...,...,...
61,two or more races,non-hispanic,female,2006,24681
62,two or more races,non-hispanic,female,2007,26019
63,two or more races,non-hispanic,female,2008,27601
64,two or more races,non-hispanic,female,2009,29105


In [101]:
female_hisp_pop_bracket = female_pop_bracket.iloc[female_hisp_start + 2:].reset_index(drop=True)
female_hisp_pop_bracket

Unnamed: 0,ethnicity,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010
0,white,27641,30813,33972,37322,41092,45414,50401,55547,60399,64861,68494
1,black,3824,3987,4095,4269,4425,4685,4914,5142,5521,5836,6101
2,aian,549,766,982,1194,1405,1615,1849,2070,2314,2633,2892
3,asian,250,297,329,373,430,450,492,561,610,644,722
4,nhpi,188,298,393,498,600,713,814,928,1057,1177,1263
5,two or more races,695,873,1032,1271,1439,1674,1894,2129,2358,2583,2871


In [102]:
female_hisp_pop_bracket["origin"] = "hispanic"
female_hisp_pop_bracket

Unnamed: 0,ethnicity,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,origin
0,white,27641,30813,33972,37322,41092,45414,50401,55547,60399,64861,68494,hispanic
1,black,3824,3987,4095,4269,4425,4685,4914,5142,5521,5836,6101,hispanic
2,aian,549,766,982,1194,1405,1615,1849,2070,2314,2633,2892,hispanic
3,asian,250,297,329,373,430,450,492,561,610,644,722,hispanic
4,nhpi,188,298,393,498,600,713,814,928,1057,1177,1263,hispanic
5,two or more races,695,873,1032,1271,1439,1674,1894,2129,2358,2583,2871,hispanic


In [103]:
female_hisp_pop_bracket["sex"] = "female"
female_hisp_pop_bracket

Unnamed: 0,ethnicity,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,origin,sex
0,white,27641,30813,33972,37322,41092,45414,50401,55547,60399,64861,68494,hispanic,female
1,black,3824,3987,4095,4269,4425,4685,4914,5142,5521,5836,6101,hispanic,female
2,aian,549,766,982,1194,1405,1615,1849,2070,2314,2633,2892,hispanic,female
3,asian,250,297,329,373,430,450,492,561,610,644,722,hispanic,female
4,nhpi,188,298,393,498,600,713,814,928,1057,1177,1263,hispanic,female
5,two or more races,695,873,1032,1271,1439,1674,1894,2129,2358,2583,2871,hispanic,female


In [104]:
female_hisp_pop_bracket = female_hisp_pop_bracket.set_index(keys=["ethnicity", "origin", "sex"])
female_hisp_pop_bracket

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010
ethnicity,origin,sex,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
white,hispanic,female,27641,30813,33972,37322,41092,45414,50401,55547,60399,64861,68494
black,hispanic,female,3824,3987,4095,4269,4425,4685,4914,5142,5521,5836,6101
aian,hispanic,female,549,766,982,1194,1405,1615,1849,2070,2314,2633,2892
asian,hispanic,female,250,297,329,373,430,450,492,561,610,644,722
nhpi,hispanic,female,188,298,393,498,600,713,814,928,1057,1177,1263
two or more races,hispanic,female,695,873,1032,1271,1439,1674,1894,2129,2358,2583,2871


In [105]:
female_hisp_pop_bracket = female_hisp_pop_bracket.stack().reset_index()
female_hisp_pop_bracket

Unnamed: 0,ethnicity,origin,sex,level_3,0
0,white,hispanic,female,2000,27641
1,white,hispanic,female,2001,30813
2,white,hispanic,female,2002,33972
3,white,hispanic,female,2003,37322
4,white,hispanic,female,2004,41092
...,...,...,...,...,...
61,two or more races,hispanic,female,2006,1894
62,two or more races,hispanic,female,2007,2129
63,two or more races,hispanic,female,2008,2358
64,two or more races,hispanic,female,2009,2583


In [106]:
female_hisp_pop_bracket = female_hisp_pop_bracket.rename(columns={"level_3": "year", 0: "population"})
female_hisp_pop_bracket

Unnamed: 0,ethnicity,origin,sex,year,population
0,white,hispanic,female,2000,27641
1,white,hispanic,female,2001,30813
2,white,hispanic,female,2002,33972
3,white,hispanic,female,2003,37322
4,white,hispanic,female,2004,41092
...,...,...,...,...,...
61,two or more races,hispanic,female,2006,1894
62,two or more races,hispanic,female,2007,2129
63,two or more races,hispanic,female,2008,2358
64,two or more races,hispanic,female,2009,2583


In [107]:
female_hisp_pop_bracket["population"] = female_hisp_pop_bracket["population"].astype(int)
female_hisp_pop_bracket

Unnamed: 0,ethnicity,origin,sex,year,population
0,white,hispanic,female,2000,27641
1,white,hispanic,female,2001,30813
2,white,hispanic,female,2002,33972
3,white,hispanic,female,2003,37322
4,white,hispanic,female,2004,41092
...,...,...,...,...,...
61,two or more races,hispanic,female,2006,1894
62,two or more races,hispanic,female,2007,2129
63,two or more races,hispanic,female,2008,2358
64,two or more races,hispanic,female,2009,2583


In [108]:
final = pd.concat([male_non_hisp_pop_bracket, male_hisp_pop_bracket, female_non_hisp_pop_bracket, female_hisp_pop_bracket], axis=0, ignore_index=True)
final

Unnamed: 0,ethnicity,origin,sex,year,population
0,white,non-hispanic,male,2000,1526743
1,white,non-hispanic,male,2001,1526185
2,white,non-hispanic,male,2002,1524987
3,white,non-hispanic,male,2003,1527793
4,white,non-hispanic,male,2004,1530095
...,...,...,...,...,...
259,two or more races,hispanic,female,2006,1894
260,two or more races,hispanic,female,2007,2129
261,two or more races,hispanic,female,2008,2358
262,two or more races,hispanic,female,2009,2583


In [109]:
final = model_population_by_sex_race_ho_table(test_df_00_10, "Alabama", cols_to_remove, year_range="2000-2009")
final

Unnamed: 0,ethnicity,origin,sex,year,population,state
0,white,not hispanic,male,2000,1526743,Alabama
1,white,not hispanic,male,2001,1526185,Alabama
2,white,not hispanic,male,2002,1524987,Alabama
3,white,not hispanic,male,2003,1527793,Alabama
4,white,not hispanic,male,2004,1530095,Alabama
...,...,...,...,...,...,...
259,two or more races,hispanic,female,2006,1894,Alabama
260,two or more races,hispanic,female,2007,2129,Alabama
261,two or more races,hispanic,female,2008,2358,Alabama
262,two or more races,hispanic,female,2009,2583,Alabama


In [None]:
with ThreadPoolExecutor() as exe:
    state_populations_by_sex_age_20_23 = list(exe.map(
        concur_model_pop_tables, 
        populations_by_sex_age_20_23, 
        [cols_to_remove] * len(populations_by_sex_age_20_23),
        ["2020-2023"] * len(populations_by_sex_age_20_23)
    ))

state_populations_by_sex_age_df_20_23 = pd.concat(state_populations_by_sex_age_20_23, axis=0, ignore_index=True)

In [110]:
column_summary(final)

Unnamed: 0,col_name,col_dtype,num_of_nulls,num_of_non_nulls,num_of_distinct_values,distinct_values_counts
0,ethnicity,object,0,264,6,"{'white': 44, 'black': 44, 'aian': 44, 'asian'..."
1,origin,object,0,264,2,"{'not hispanic': 132, 'hispanic': 132}"
2,sex,object,0,264,2,"{'male': 132, 'female': 132}"
3,year,int64,0,264,11,"{2000: 24, 2001: 24, 2002: 24, 2003: 24, 2004:..."
4,population,int64,0,264,263,"{549: 2, 713: 1, 1555756: 1, 2871: 1, 1526743:..."
5,state,object,0,264,1,{'Alabama': 264}


# Modelling excel spreadsheets with population values based on sex, race, and hispanic origin 2010 - 2019

![modelling table from population data by sex race and ethnicity 2010 to 2019.png](./figures%20&%20images/modelling%20table%20from%20population%20data%20by%20sex%20race%20and%20ethnicity%202010%20to%202019.png)