In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re
import ast

from concurrent.futures import ThreadPoolExecutor

from utilities.preprocessors import column_summary, model_population_table, model_population_by_sex_race_ho_table, get_state_populations
from utilities.visualizers import disp_cat_feat, view_feat_outliers


%load_ext autoreload
%autoreload 2

In [2]:
DATA_DIR = './data/population-data'
EXCLUSIONS = ["us_populations_per_state_2001_to_2021.csv"]

# os.listdir returns names in arbitrary, filesystem-dependent order. Sort them so
# every list below (and any row order / id derived from it) is reproducible.
files = sorted(file for file in os.listdir(DATA_DIR) if file not in EXCLUSIONS)

# one list of workbook names per (table kind, year range), matched on file name
populations_by_sex_age_00_10 = [file for file in files if "2000-2010" in file and "by_sex_and_age" in file]
populations_by_sex_race_ho_00_10 = [file for file in files if "2000-2010" in file and "by_sex_race_and_ho" in file]
populations_by_sex_age_10_19 = [file for file in files if "2010-2019" in file and "by_sex_and_age" in file]
populations_by_sex_race_ho_10_19 = [file for file in files if "2010-2019" in file and "by_sex_race_and_ho" in file]
populations_by_sex_age_20_23 = [file for file in files if "2020-2023" in file and "by_sex_and_age" in file]
populations_by_sex_race_ho_20_23 = [file for file in files if "2020-2023" in file and "by_sex_race_and_ho" in file]
len(populations_by_sex_age_00_10), len(populations_by_sex_age_10_19), len(populations_by_sex_age_20_23)

(51, 51, 51)

In [3]:
len(populations_by_sex_race_ho_00_10), len(populations_by_sex_race_ho_10_19), len(populations_by_sex_race_ho_20_23),

(51, 51, 51)

# Read sample excel sheet

In [4]:
test_df = pd.read_excel(os.path.join(DATA_DIR, "Alabama_pop_by_sex_and_age_2000-2010.xls"), dtype=object, header=None)
test_df.head(40)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,table with row headers in column A and column ...,,,,,,,,,,,,,
1,Table 2. Intercensal Estimates of the Resident...,,,,,,,,,,,,,
2,Sex and Age,"April 1, 20001",Intercensal Estimates (as of July 1),,,,,,,,,,"April 1, 20102","July 1, 20103"
3,,,2000,2001.0,2002.0,2003.0,2004.0,2005.0,2006.0,2007.0,2008.0,2009.0,,
4,BOTH SEXES,4447207,4452173,4467634.0,4480089.0,4503491.0,4530729.0,4569805.0,4628981.0,4672840.0,4718206.0,4757938.0,4779736,4785298
5,.Under 5 years,296000,295185,296624.0,296046.0,295204.0,295970.0,296441.0,297222.0,300300.0,304842.0,305412.0,304957,304840
6,.5 to 9 years,315369,313178,307526.0,302632.0,299148.0,297554.0,298450.0,303581.0,306013.0,306682.0,307864.0,308229,308125
7,.10 to 14 years,320266,321372,323615.0,325008.0,326642.0,326228.0,323028.0,321867.0,320407.0,319503.0,319072.0,319655,319314
8,.15 to 19 years,324583,325612,321866.0,320749.0,321655.0,325095.0,330753.0,337003.0,341279.0,345580.0,346611.0,343471,341504
9,.20 to 24 years,306876,309170,318741.0,322812.0,326983.0,326749.0,326727.0,326239.0,327293.0,328751.0,332117.0,335322,336601


In [5]:
male_start = test_df[test_df[0] == "MALE"].index.to_list()[0]
male_start

39

In [6]:
pop_brackets = test_df.iloc[male_start:]
pop_brackets

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
39,MALE,2146560,2149338,2158138,2165719,2179422,2192872,2213382,2243501,2265565,2287949,2309779,2320188,2323317
40,.Under 5 years,151071,150609,151410,150856,150594,150699,150960,151442,153128,155061,155463,155265,155196
41,.5 to 9 years,161798,160685,157513,154832,152874,151948,152574,155157,156345,156770,157145,157340,157294
42,.10 to 14 years,164637,165170,166253,166796,167376,167198,165333,164608,163819,163445,163165,163417,163222
43,.15 to 19 years,164416,165156,163598,163527,164178,165836,169052,172295,174268,176205,176744,175151,174172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112,Note: Median age is calculated based on single...,,,,,,,,,,,,,
113,Suggested Citation:,,,,,,,,,,,,,
114,Table 2. Intercensal Estimates of the Resident...,,,,,,,,,,,,,
115,"Source: U.S. Census Bureau, Population Division",,,,,,,,,,,,,


In [7]:
female_start = pop_brackets[pop_brackets[0] == "FEMALE"].index.to_list()[0]
male_end, female_end = pop_brackets[pop_brackets[0] == ".Median age (years)"].index.to_list()
male_end, female_end

(73, 108)

# split the excel spreadsheet into the male and female population brackets

In [None]:
male_pop_bracket = test_df.iloc[male_start:male_end]
male_pop_bracket

In [None]:
female_pop_bracket = test_df.iloc[female_start:female_end]
female_pop_bracket

#### Remove the following
* column `1`, column `12`, and column `13` (columns `1` and `12` contain the April 1 estimates rather than the July 1 estimates we want, and column `13` is the year 2010, which already exists in the next set of population years)
* rows that are mostly NaN with only a dot symbol in column `0`, i.e. `[. NaN NaN NaN NaN ... NaN]`
* and the `MALE` header row

#### we also rename the columns to be `bracket`, `2000`, `2001`, `2002`, `2003`, `2004`, `2005`, `2006`, `2007`, `2008`, `2009`

In [None]:
cols_to_remove = [1, 12, 13]
# keep every row except the "." spacer rows and the "MALE" header row
cond = ~male_pop_bracket[0].isin([".", "MALE"])
# column 0 holds the bracket label; columns 2..11 become the years 2000..2009
name_map = {0: "bracket", **dict(zip(range(2, 12), range(2000, 2010)))}
temp_male = (
    male_pop_bracket.loc[cond]
    .drop(columns=cols_to_remove)
    .rename(columns=name_map)
    .reset_index(drop=True)
)
temp_male

#### we remove the brackets that have duplicates

In [None]:
temp_male = temp_male.drop_duplicates(ignore_index=True)
temp_male

In [None]:
temp_male.index = temp_male["bracket"]
temp_male

In [None]:
del temp_male["bracket"]
temp_male

In [None]:
temp_male.shape

#### in order to achieve the following:
![modelling table from population data by sex and age 2000 to 2009.png](./figures%20&%20images/modelling%20table%20from%20population%20data%20by%20sex%20and%20age%202000%20to%202009.png)
#### we need to make the age brackets the row index. That way, when the year columns are stacked into rows, every value keeps track of its original age bracket: stacking produces a multi-index of (age bracket, year), and resetting that index turns both the age brackets and the years into ordinary columns

In [None]:
temp_male = temp_male.stack().reset_index()
temp_male

In [None]:
temp_male = temp_male.rename(columns={"level_1": "year", 0: "population"})
temp_male

#### we also apply transformations to the `bracket` column by splitting say `.5 to 9 years` to 5 and 9 and have separate columns named `age_start` and `age_end` to take in these values

In [None]:
def helper(bracket: str | None):
    bracket = bracket.lower()
    keyword = re.search(r"(under|to|and over|\+)", bracket)
    keyword = np.nan if not keyword else keyword[0]
    numbers = re.findall(r"\d+", bracket)
    numbers = [ast.literal_eval(number) for number in numbers]
    # print(keyword)
    # print(numbers)

    # e.g. "under 5" becomes "_under_5"
    if keyword == "under":
        return (0, numbers[-1])
    
    # e.g. "5 to 9" becomes "_5_to_9"
    elif keyword == "to":
        return (numbers[0], numbers[-1])
    
    # e.g. "9 and over" becomes "_9_and_over"
    elif keyword == "and over" or keyword == "+": 
        return (numbers[-1], float('inf'))
    
    # if it is a single number just return that number
    return (np.nan, numbers[-1])

In [None]:
age_ranges = temp_male["bracket"].apply(helper).to_list()
age_ranges

In [None]:
temp_male["age_start"], temp_male["age_end"] = list(zip(*age_ranges))
temp_male

#### delete the bracket column for the last time

In [None]:
del temp_male["bracket"]
temp_male

In [None]:
temp_male["sex"] = "Male"

In [None]:
temp_male["state"] = "Alabama"

In [None]:
final_male_pop_bracket = temp_male
final_male_pop_bracket

In [None]:
(final_male_pop_bracket["population"] <= 0).sum()

In [None]:
column_summary(final_male_pop_bracket)

#### We've done our preprocessing on the male population age brackets; now we have to do the same preprocessing on the female demographic. We can achieve this by writing a function that implements the above prototype for both the male and the female populations, and combines the resulting dataframes into one single dataframe for easy collation

In [None]:
model_population_table(test_df, "Alabama", cols_to_remove, year_range="2000-2009")

In [None]:
def concur_model_pop_tables(file, cols_to_remove, year_range, callback_fn=model_population_table):
    """Read one workbook from DATA_DIR and hand it to a modelling callback.

    Args:
        file: workbook file name, relative to DATA_DIR.
        cols_to_remove: column labels forwarded unchanged to ``callback_fn``.
        year_range: year-range string forwarded as the ``year_range`` keyword.
        callback_fn: callable invoked as
            ``callback_fn(df, state, cols_to_remove, year_range=year_range)``.

    Returns:
        Whatever ``callback_fn`` returns for this workbook.
    """
    # the leading run of letters/whitespace in the file name is used as the state
    state_match = re.search(r"(^[A-Za-z\s]+)", file)
    state = state_match[0] if state_match else "Unknown"

    # load the sheet raw: no header row, every cell kept as a Python object
    raw_sheet = pd.read_excel(os.path.join(DATA_DIR, file), dtype=object, header=None)

    return callback_fn(raw_sheet, state, cols_to_remove, year_range=year_range)

In [None]:
# model every 2000-2010 sex-and-age workbook on worker threads;
# Executor.map yields the results in the same order as the input list
with ThreadPoolExecutor() as exe:
    state_populations_by_sex_age_00_09 = list(exe.map(
        lambda file: concur_model_pop_tables(file, cols_to_remove, "2000-2009"),
        populations_by_sex_age_00_10,
    ))

# stack the per-file tables into one frame with a fresh 0..n-1 index
state_populations_by_sex_age_df_00_09 = pd.concat(
    state_populations_by_sex_age_00_09,
    axis=0,
    ignore_index=True,
)
# 1-based id taken from the position in the concatenated frame
state_populations_by_sex_age_df_00_09["id"] = state_populations_by_sex_age_df_00_09.index + 1

In [None]:
state_populations_by_sex_age_df_00_09

#### we don't save this modelled dataset as we will instead be uploading automatically using an orchestration tool like airflow to a data warehouse like databricks

In [None]:
column_summary(state_populations_by_sex_age_df_00_09)

In [None]:
# Sum the population per (year, bracket, sex, state) group.
# Author's note: this is just the under-five age bracket;
# if we included all the other age brackets we might get
# a bigger total population value per year.
test = state_populations_by_sex_age_df_00_09.groupby(by=["year", "bracket", "sex", "state"]).agg(total_population=("population", "sum"))
test

# Reading sample excel file for year 2010-2019

In [None]:
test_df_10_19 = pd.read_excel(os.path.join(DATA_DIR, "Alabama_pop_by_sex_and_age_2010-2019.xlsx"), dtype=object, header=None)
test_df_10_19

In [None]:
start_index = test_df_10_19[test_df_10_19[0] == ".0"].index.to_list()[0]
start_index

In [None]:
end_index = test_df_10_19[test_df_10_19[0] == ".Median Age (years)"].index.to_list()[0]
end_index

#### Extract necessary rows

In [None]:
pop_brackets_10_19 = test_df_10_19.iloc[start_index: end_index]
pop_brackets_10_19

#### remove duplicates

In [None]:
temp = pop_brackets_10_19.drop_duplicates()
temp

#### remove rows that have fewer than 5 non-NaN values

In [None]:
temp = temp.dropna(thresh=5, axis=0)
temp

#### remove columns 1 to 6, then every third column starting from column 7

In [None]:
cols_to_remove = [1, 2, 3, 4, 5, 6] + list(range(7, temp.shape[1], 3))
cols_to_remove

In [None]:
temp = temp.drop(columns=cols_to_remove)
temp

In [None]:
temp.index = temp[0]
temp

In [None]:
del temp[0]
temp

In [None]:
# build the (year, sex) pairs used as column labels:
# (2010, male), (2010, female), (2011, male), ...
years = [year for year in range(2010, 2020) for _copy in range(2)]
genders = [gender for _year in range(10) for gender in ("male", "female")]
multi_index_list = [(year, gender) for year, gender in zip(years, genders)]
multi_index_list

In [None]:
multi_index = pd.MultiIndex.from_tuples(multi_index_list)
multi_index

In [None]:
multi_index[0]

#### set multi indexed columns and delete index name of rows

In [None]:
temp.columns = multi_index
temp.index.name = "bracket"
temp

#### now we will have to stack each row vertically on top of each other, and because we have multi-indexed columns we will need to stack twice in order to turn both column index levels into row index levels

In [None]:
temp = temp.stack().stack()
temp

#### now we can reset the index such that these multi index rows now become columns of our new dataframe

In [None]:
temp = temp.reset_index()
temp

In [None]:
# rename the newly converted columns to bracket, sex, year, and population respectively
temp = temp.rename(columns={"level_1": "sex", "level_2": "year", 0: "population"})
temp

In [None]:
age_ranges_00_10 = temp["bracket"].apply(helper).to_list()
age_ranges_00_10

In [None]:
temp["age_start"], temp["age_end"] = list(zip(*age_ranges_00_10))
temp

In [None]:
temp["state"] = "Alabama"
temp

In [None]:
model_population_table(test_df_10_19, "Alabama", cols_to_remove, year_range="2010-2019")

In [None]:
with ThreadPoolExecutor() as exe:
    state_populations_by_sex_age_10_19 = list(exe.map(
        concur_model_pop_tables, 
        populations_by_sex_age_10_19, 
        [cols_to_remove] * len(populations_by_sex_age_10_19),
        ["2010-2019"] * len(populations_by_sex_age_10_19)
    ))

state_populations_by_sex_age_df_10_19 = pd.concat(state_populations_by_sex_age_10_19, axis=0, ignore_index=True)
state_populations_by_sex_age_df_10_19["id"] = state_populations_by_sex_age_df_10_19.index + 1

In [None]:
state_populations_by_sex_age_df_10_19

In [None]:
column_summary(state_populations_by_sex_age_df_10_19)

#### again we don't save this modelled dataset as we will instead be uploading automatically using an orchestration tool like airflow to a data warehouse like databricks

# reading sample excel file from year 2020-2023

In [None]:
test_df_20_23 = pd.read_excel(os.path.join(DATA_DIR, "Alabama_pop_by_sex_and_age_2020-2023.xlsx"), dtype=object, header=None)
test_df_20_23

#### clearly we now know we can discard columns 1, 2, 3, 4, 7, 10, and 13

In [None]:
cols_to_remove = [1, 2, 3, 4] + list(range(7, test_df_20_23.shape[1], 3))
cols_to_remove

In [None]:
model_population_table(test_df_20_23, "Alabama", cols_to_remove, year_range="2020-2023")

In [None]:
with ThreadPoolExecutor() as exe:
    state_populations_by_sex_age_20_23 = list(exe.map(
        concur_model_pop_tables, 
        populations_by_sex_age_20_23, 
        [cols_to_remove] * len(populations_by_sex_age_20_23),
        ["2020-2023"] * len(populations_by_sex_age_20_23)
    ))

state_populations_by_sex_age_df_20_23 = pd.concat(state_populations_by_sex_age_20_23, axis=0, ignore_index=True)
state_populations_by_sex_age_df_20_23["id"] = state_populations_by_sex_age_df_20_23.index + 1

In [None]:
state_populations_by_sex_age_df_20_23

#### we don't save this modelled dataset as we will instead be uploading automatically using an orchestration tool like airflow to a data warehouse like databricks

# Modelling excel spreadsheets with population values based on sex, race, and hispanic origin 2000 - 2009

![modelling table from population data by sex race and ethnicity 2000 to 2009.png](./figures%20&%20images/modelling%20table%20from%20population%20data%20by%20sex%20race%20and%20ethnicity%202000%20to%202009.png)

In [None]:
test_df_00_10 = pd.read_excel(os.path.join(DATA_DIR, "Alabama_pop_by_sex_race_and_ho_2000-2010.xls"), dtype=object, header=None)
test_df_00_10

#### delete columns 1, 12, and 13

In [None]:
cols_to_remove = [1, 12, 13]
# Column 13 is dropped here, so the rename below only covers columns 2..11
# (2000..2009); a mapping for 13 would never match any column.
temp = (
    test_df_00_10
    .drop(columns=cols_to_remove)
    .rename(columns={0: "ethnicity", 2: 2000, 3: 2001, 4: 2002, 5: 2003, 6: 2004, 7: 2005, 8: 2006, 9: 2007, 10: 2008, 11: 2009})
)
temp

In [None]:
temp["ethnicity"] = temp["ethnicity"].apply(lambda string: np.nan if pd.isna(string) else string.strip(".").lower())
temp

#### start partitioning the spreadsheet by its important rows like the sex, and whether or not it is of hispanic origin

In [None]:
male_start = temp.index[temp["ethnicity"] == "male"].to_list()[0]
male_start

In [None]:
female_start = temp.index[temp["ethnicity"] == "female"].to_list()[0]
female_start

In [None]:
temp.iloc[75:]

In [None]:
# since there are multiple indeces with the two 
# or more races value we need to pick out the last value
female_end = temp.index[temp["ethnicity"] == "two or more races"].to_list()[-1]
female_end

In [None]:
male_pop_bracket = temp.iloc[male_start:female_start].reset_index(drop=True)
male_pop_bracket

In [None]:
male_non_hisp_start = male_pop_bracket.index[male_pop_bracket["ethnicity"] == "not hispanic"].to_list()[-1]
male_non_hisp_start

In [None]:
male_hisp_start = male_pop_bracket.index[male_pop_bracket["ethnicity"] == "hispanic"].to_list()[-1]
male_hisp_start

In [None]:
male_hisp_end = male_pop_bracket.index[male_pop_bracket["ethnicity"] == "two or more races"].to_list()[-1]
male_hisp_end

In [None]:
male_non_hisp_pop_bracket = male_pop_bracket.iloc[male_non_hisp_start + 2:male_hisp_start].reset_index(drop=True)
male_non_hisp_pop_bracket

In [None]:
male_non_hisp_pop_bracket["origin"] = "non-hispanic"
male_non_hisp_pop_bracket

In [None]:
male_non_hisp_pop_bracket["sex"] = "male"
male_non_hisp_pop_bracket

In [None]:
male_non_hisp_pop_bracket = male_non_hisp_pop_bracket.set_index(keys=["ethnicity", "origin", "sex"])
male_non_hisp_pop_bracket

In [None]:
male_non_hisp_pop_bracket = male_non_hisp_pop_bracket.stack().reset_index()
male_non_hisp_pop_bracket

In [None]:
male_non_hisp_pop_bracket = male_non_hisp_pop_bracket.rename(columns={"level_3": "year", 0: "population"})
male_non_hisp_pop_bracket

In [None]:
male_non_hisp_pop_bracket["population"] = male_non_hisp_pop_bracket["population"].astype(int)
male_non_hisp_pop_bracket

In [None]:
column_summary(male_non_hisp_pop_bracket)

In [None]:
# Skip the "hispanic" header row and the row after it, and stop at the last
# "two or more races" row (male_hisp_end) instead of running to the end of the
# male partition, so that any trailing rows cannot leak into the hispanic table.
male_hisp_pop_bracket = male_pop_bracket.iloc[male_hisp_start + 2:male_hisp_end + 1].reset_index(drop=True)
male_hisp_pop_bracket

In [None]:
male_hisp_pop_bracket["origin"] = "hispanic"
male_hisp_pop_bracket

In [None]:
male_hisp_pop_bracket["sex"] = "male"
male_hisp_pop_bracket

In [None]:
male_hisp_pop_bracket = male_hisp_pop_bracket.set_index(keys=["ethnicity", "origin", "sex"])
male_hisp_pop_bracket

In [None]:
male_hisp_pop_bracket = male_hisp_pop_bracket.stack().reset_index()
male_hisp_pop_bracket

In [None]:
male_hisp_pop_bracket = male_hisp_pop_bracket.rename(columns={"level_3": "year", 0: "population"})
male_hisp_pop_bracket

In [None]:
male_hisp_pop_bracket["population"] = male_hisp_pop_bracket["population"].astype(int)
male_hisp_pop_bracket

In [None]:
column_summary(male_hisp_pop_bracket)

In [None]:
female_pop_bracket = temp.iloc[female_start:female_end + 1].reset_index(drop=True)
female_pop_bracket

In [None]:
female_non_hisp_start = female_pop_bracket.index[female_pop_bracket["ethnicity"] == "not hispanic"].to_list()[-1]
female_non_hisp_start

In [None]:
female_hisp_start = female_pop_bracket.index[female_pop_bracket["ethnicity"] == "hispanic"].to_list()[-1]
female_hisp_start

In [None]:
female_non_hisp_pop_bracket = female_pop_bracket.iloc[female_non_hisp_start + 2:female_hisp_start].reset_index(drop=True)
female_non_hisp_pop_bracket

In [None]:
female_non_hisp_pop_bracket["origin"] = "non-hispanic"
female_non_hisp_pop_bracket

In [None]:
female_non_hisp_pop_bracket["sex"] = "female"
female_non_hisp_pop_bracket

In [None]:
female_non_hisp_pop_bracket = female_non_hisp_pop_bracket.set_index(keys=["ethnicity", "origin", "sex"])
female_non_hisp_pop_bracket

In [None]:
female_non_hisp_pop_bracket = female_non_hisp_pop_bracket.stack().reset_index()
female_non_hisp_pop_bracket

In [None]:
female_non_hisp_pop_bracket = female_non_hisp_pop_bracket.rename(columns={"level_3": "year", 0: "population"})
female_non_hisp_pop_bracket

In [None]:
female_non_hisp_pop_bracket["population"] = female_non_hisp_pop_bracket["population"].astype(int)
female_non_hisp_pop_bracket

In [None]:
female_hisp_pop_bracket = female_pop_bracket.iloc[female_hisp_start + 2:].reset_index(drop=True)
female_hisp_pop_bracket

In [None]:
female_hisp_pop_bracket["origin"] = "hispanic"
female_hisp_pop_bracket

In [None]:
female_hisp_pop_bracket["sex"] = "female"
female_hisp_pop_bracket

In [None]:
female_hisp_pop_bracket = female_hisp_pop_bracket.set_index(keys=["ethnicity", "origin", "sex"])
female_hisp_pop_bracket

In [None]:
female_hisp_pop_bracket = female_hisp_pop_bracket.stack().reset_index()
female_hisp_pop_bracket

In [None]:
female_hisp_pop_bracket = female_hisp_pop_bracket.rename(columns={"level_3": "year", 0: "population"})
female_hisp_pop_bracket

In [None]:
female_hisp_pop_bracket["population"] = female_hisp_pop_bracket["population"].astype(int)
female_hisp_pop_bracket

In [None]:
final = pd.concat([male_non_hisp_pop_bracket, male_hisp_pop_bracket, female_non_hisp_pop_bracket, female_hisp_pop_bracket], axis=0, ignore_index=True)
final

In [None]:
final = model_population_by_sex_race_ho_table(test_df_00_10, "Alabama", cols_to_remove, year_range="2000-2009")
final

In [None]:
cols_to_remove

In [None]:
with ThreadPoolExecutor() as exe:
    state_populations_by_sex_race_ho_00_09 = list(exe.map(
        concur_model_pop_tables, 
        populations_by_sex_race_ho_00_10, 
        [cols_to_remove] * len(populations_by_sex_race_ho_00_10),
        ["2000-2009"] * len(populations_by_sex_race_ho_00_10),
        [model_population_by_sex_race_ho_table] * len(populations_by_sex_race_ho_00_10)
    ))

state_populations_by_sex_race_ho_df_00_09 = pd.concat(state_populations_by_sex_race_ho_00_09, axis=0, ignore_index=True)
state_populations_by_sex_race_ho_df_00_09["id"] = state_populations_by_sex_race_ho_df_00_09.index + 1

In [None]:
state_populations_by_sex_race_ho_df_00_09

In [None]:
column_summary(final)

# Modelling excel spreadsheets with population values based on sex, race, and hispanic origin 2010 - 2019

![modelling table from population data by sex race and ethnicity 2010 to 2019.png](./figures%20&%20images/modelling%20table%20from%20population%20data%20by%20sex%20race%20and%20ethnicity%202010%20to%202019.png)

In [None]:
test_df_10_19 = pd.read_excel(os.path.join(DATA_DIR, "Alabama_pop_by_sex_race_and_ho_2010-2019.xlsx"), dtype=object, header=None)
test_df_10_19

#### remove columns 1 and 2 and rename remaining columns to ethnicity and years 2010 to 2019

In [None]:
cols_to_remove = [1, 2]
temp = test_df_10_19.drop(columns=cols_to_remove)
temp = temp.rename(columns={0: "ethnicity", 3: 2010, 4: 2011, 5: 2012, 6: 2013, 7: 2014, 8: 2015, 9: 2016, 10: 2017, 11: 2018, 12: 2019})
temp

#### instead of always hardcoding the new column names for each year, we can use a set difference to find the remaining columns and a dictionary comprehension to build the rename mapping

In [None]:
# same columns that were dropped from this 2010-2019 workbook above, so the
# remaining columns line up with the years 2010..2019
test_cols_to_remove = [1, 2]

In [None]:
lo_year = 2010
hi_year = 2019

In [None]:
years_list = list(range(lo_year, hi_year + 1)) * 2
years_list

In [None]:
# a set has no guaranteed iteration order; sort the remaining column labels so
# that pairing them positionally with years_list is deterministic
new_cols = sorted(set(test_df_10_19.columns) - set(test_cols_to_remove + [0]))
new_cols

In [None]:
{new_col: years_list[i] for i, new_col in enumerate(new_cols)}
# {new_col: "ethnicity" if new_col == 0 else 2 for i, new_col in enumerate(new_cols)}

In [None]:
temp["ethnicity"] = temp["ethnicity"].apply(lambda string: np.nan if pd.isna(string) else string.strip(".").lower())
temp

#### start partitioning the spreadsheet by its important rows like the sex, and whether or not it is of hispanic origin

In [None]:
male_start = temp.index[temp["ethnicity"] == "male"].to_list()[0]
male_start

In [None]:
temp.iloc[male_start]

In [None]:
female_start = temp.index[temp["ethnicity"] == "female"].to_list()[0]
female_start

In [None]:
# since there are multiple indeces with the two 
# or more races value we need to pick out the last value
female_end = temp.index[temp["ethnicity"] == "two or more races"].to_list()[-1]
female_end

In [None]:
temp.iloc[female_start: female_end + 1]

In [None]:
male_pop_bracket = temp.iloc[male_start:female_start].reset_index(drop=True)
male_pop_bracket

In [None]:
male_non_hisp_start = male_pop_bracket.index[male_pop_bracket["ethnicity"] == "not hispanic"].to_list()[0]
male_non_hisp_start

In [None]:
# get the first occurence of the index as we are 
# not looking for multiple occurences until it reaches 
# the last occurence
male_non_hisp_end = male_pop_bracket.loc[male_non_hisp_start:, :] \
.index[male_pop_bracket.loc[male_non_hisp_start:, "ethnicity"].str.contains("race alone or in combination")].to_list()[0]
male_non_hisp_end

In [None]:
male_hisp_start = male_pop_bracket.index[male_pop_bracket["ethnicity"] == "hispanic"].to_list()[-1]
male_hisp_start

In [None]:
male_hisp_end = male_pop_bracket.index[male_pop_bracket["ethnicity"] == "two or more races"].to_list()[-1]
male_hisp_end

#### once the table is partitioned by hispanic origin and sex we will now add the origin and sex columns and do the typical stacking afterwards

In [None]:
male_non_hisp_pop_bracket = male_pop_bracket.iloc[male_non_hisp_start + 2:male_non_hisp_end].reset_index(drop=True)
male_non_hisp_pop_bracket

In [None]:
male_hisp_pop_bracket = male_pop_bracket.iloc[male_hisp_start + 2:male_hisp_end + 1].reset_index(drop=True)
male_hisp_pop_bracket

In [None]:
male_non_hisp_pop_bracket["origin"] = "non-hispanic"
male_hisp_pop_bracket["origin"] = "hispanic"
male_non_hisp_pop_bracket

In [None]:
male_non_hisp_pop_bracket["sex"] = "male"
male_hisp_pop_bracket["sex"] = "male"
male_non_hisp_pop_bracket

In [None]:
male_non_hisp_pop_bracket = male_non_hisp_pop_bracket.set_index(keys=["ethnicity", "origin", "sex"])
male_hisp_pop_bracket = male_hisp_pop_bracket.set_index(keys=["ethnicity", "origin", "sex"])
male_hisp_pop_bracket

In [None]:
male_non_hisp_pop_bracket = male_non_hisp_pop_bracket.stack().reset_index()
male_hisp_pop_bracket = male_hisp_pop_bracket.stack().reset_index()
male_hisp_pop_bracket

In [None]:
male_non_hisp_pop_bracket = male_non_hisp_pop_bracket.rename(columns={"level_3": "year", 0: "population"})
male_hisp_pop_bracket = male_hisp_pop_bracket.rename(columns={"level_3": "year", 0: "population"})
male_non_hisp_pop_bracket

In [None]:
male_non_hisp_pop_bracket["population"] = male_non_hisp_pop_bracket["population"].astype(int)
male_hisp_pop_bracket["population"] = male_hisp_pop_bracket["population"].astype(int)
column_summary(male_hisp_pop_bracket)

In [None]:
female_pop_bracket = temp.iloc[female_start:female_end + 1].reset_index(drop=True)
female_pop_bracket

In [None]:
# Compute the slice boundaries of the non-hispanic rows in the female partition.
# Start: the last row whose ethnicity is exactly "not hispanic".
# NOTE(review): the male counterpart takes the first match ([0]) while this
# takes the last ([-1]) - confirm both resolve to the intended row.
female_non_hisp_start = female_pop_bracket.index[female_pop_bracket["ethnicity"] == "not hispanic"].to_list()[-1]

# End: the first row at or after the start whose ethnicity contains
# "race alone or in combination"; only the first occurrence is wanted,
# not any later one.
female_non_hisp_end = female_pop_bracket.loc[female_non_hisp_start:, :] \
.index[female_pop_bracket.loc[female_non_hisp_start:, "ethnicity"].str.contains("race alone or in combination")].to_list()[0]

female_non_hisp_start, female_non_hisp_end

In [None]:
with ThreadPoolExecutor() as exe:
    state_populations_by_sex_race_ho_10_19 = list(exe.map(
        concur_model_pop_tables, 
        populations_by_sex_race_ho_10_19, 
        [cols_to_remove] * len(populations_by_sex_race_ho_10_19),
        ["2010-2019"] * len(populations_by_sex_race_ho_10_19),
        [model_population_by_sex_race_ho_table] * len(populations_by_sex_race_ho_10_19)
    ))

state_populations_by_sex_race_ho_df_10_19 = pd.concat(state_populations_by_sex_race_ho_10_19, axis=0, ignore_index=True)
state_populations_by_sex_race_ho_df_10_19["id"] = state_populations_by_sex_race_ho_df_10_19.index + 1

In [None]:
state_populations_by_sex_race_ho_df_10_19

In [None]:
state_populations_by_sex_race_ho_df_00_09["ethnicity"].value_counts()

# Modelling population table by sex, race, hispanic origin years 2020 to 2023

In [None]:
cols_to_remove = [1]

In [None]:
with ThreadPoolExecutor() as exe:
    state_populations_by_sex_race_ho_20_23 = list(exe.map(
        concur_model_pop_tables, 
        populations_by_sex_race_ho_20_23, 
        [cols_to_remove] * len(populations_by_sex_race_ho_20_23),
        ["2020-2023"] * len(populations_by_sex_race_ho_20_23),
        [model_population_by_sex_race_ho_table] * len(populations_by_sex_race_ho_20_23)
    ))

state_populations_by_sex_race_ho_df_20_23 = pd.concat(state_populations_by_sex_race_ho_20_23, axis=0, ignore_index=True)
state_populations_by_sex_race_ho_df_20_23["id"] = state_populations_by_sex_race_ho_df_20_23.index + 1

In [None]:
state_populations_by_sex_race_ho_df_20_23

In [None]:
# column positions to drop for each year range of the sex-and-age workbooks
cols_to_remove_00_09 = [1, 12, 13]
# columns 1..6, then every third column from 7 up to 34
cols_to_remove_10_19 = list(range(1, 7)) + list(range(7, 35, 3))
# columns 1..4, then every third column from 7 up to 13
cols_to_remove_20_23 = list(range(1, 5)) + list(range(7, 14, 3))

In [None]:
state_populations_by_sex_age_df_00_09 = get_state_populations(
    DATA_DIR, 
    cols_to_remove=cols_to_remove_00_09, 
    populations=populations_by_sex_age_00_10, 
    year_range="2000-2009",
    by="sex and age")
state_populations_by_sex_age_df_00_09

In [None]:
state_populations_by_sex_age_df_10_19 = get_state_populations(
    DATA_DIR, 
    cols_to_remove=cols_to_remove_10_19, 
    populations=populations_by_sex_age_10_19, 
    year_range="2010-2019",
    by="sex and age")
state_populations_by_sex_age_df_10_19

In [None]:
state_populations_by_sex_age_df_20_23 = get_state_populations(
    DATA_DIR, 
    cols_to_remove=cols_to_remove_20_23, 
    populations=populations_by_sex_age_20_23, 
    year_range="2020-2023",
    by="sex and age")
state_populations_by_sex_age_df_20_23

In [None]:
state_populations_by_sex_age_df_00_23 = pd.concat([state_populations_by_sex_age_df_00_09, state_populations_by_sex_age_df_10_19, state_populations_by_sex_age_df_20_23], axis=0, ignore_index=True)
state_populations_by_sex_age_df_00_23

In [None]:
state_populations_by_sex_race_ho_df_00_09 = get_state_populations(
    DATA_DIR, 
    cols_to_remove=[1, 12, 13], 
    populations=populations_by_sex_race_ho_00_10, 
    year_range="2000-2009",
    by="sex race and ho")
state_populations_by_sex_race_ho_df_00_09

In [None]:
state_populations_by_sex_race_ho_df_10_19 = get_state_populations(
    DATA_DIR, 
    cols_to_remove=[1, 2], 
    populations=populations_by_sex_race_ho_10_19, 
    year_range="2010-2019",
    by="sex race and ho")
state_populations_by_sex_race_ho_df_10_19

In [None]:
state_populations_by_sex_race_ho_df_20_23 = get_state_populations(
    DATA_DIR, 
    cols_to_remove=[1], 
    populations=populations_by_sex_race_ho_20_23, 
    year_range="2020-2023",
    by="sex race and ho")
state_populations_by_sex_race_ho_df_20_23

In [None]:
state_populations_by_sex_race_ho_df_00_23 = pd.concat([state_populations_by_sex_race_ho_df_00_09, state_populations_by_sex_race_ho_df_10_19, state_populations_by_sex_race_ho_df_20_23], axis=0, ignore_index=True)
state_populations_by_sex_race_ho_df_00_23

In [None]:
state_populations_by_sex_race_ho_df_00_23["id"] = state_populations_by_sex_race_ho_df_00_23.index + 1
state_populations_by_sex_age_df_00_23["id"] = state_populations_by_sex_age_df_00_23.index + 1

# Export to image

In [None]:
import dataframe_image as dfi

In [None]:
state_populations_by_sex_age_df_00_23

In [None]:
state_populations_by_sex_age_df_00_23[:100].dfi.export("./figures & images/state_populations_by_sex_age_00_23.png")
state_populations_by_sex_age_df_00_23[-100:].dfi.export("./figures & images/state_populations_by_sex_age_00_23_last.png")

In [None]:
state_populations_by_sex_race_ho_df_00_23

In [None]:
state_populations_by_sex_race_ho_df_00_23[:100].dfi.export("./figures & images/state_populations_by_sex_race_ho_00_23.png")
state_populations_by_sex_race_ho_df_00_23[-100:].dfi.export("./figures & images/state_populations_by_sex_race_ho_00_23_last.png")

# Converting all code to pyspark for faster processing

In [None]:
# note that pyarrow 4.0.0 is a dependency of pyspark pandas api
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.types import StringType, StructField, StructType

In [None]:
pyspark.__file__

In [None]:
path = os.path.join(DATA_DIR, "Alabama_pop_by_sex_and_age_2000-2010.xls")
path

In [None]:
test_df_00_10 = pd.read_excel(path, dtype=object, header=None)
test_df_00_10

In [None]:
conf = SparkConf()
conf.set("spark.jars.packages", "com.crealytics:spark-excel_2.12:3.5.1_0.20.4")

In [None]:
spark = SparkSession.builder.appName('test')\
    .config(conf=conf)\
    .getOrCreate()

In [None]:
test_spark_df_00_10 = spark.read.format("com.crealytics.spark.excel")\
    .option("header", "false")\
    .option("inferSchema", "true")\
    .load(path)