In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re
import ast

from concurrent.futures import ThreadPoolExecutor

from utilities.preprocessors import column_summary, model_population_table, model_population_by_sex_race_ho_table, get_state_populations
from utilities.visualizers import disp_cat_feat, view_feat_outliers


%load_ext autoreload
%autoreload 2

In [2]:
DATA_DIR = './data/population-data'
EXCLUSIONS = ["us_populations_per_state_2001_to_2021.csv"]

# os.listdir returns entries in arbitrary order; sorting keeps the file lists
# (and any row ids later derived from the concatenation order) stable across
# runs and machines
files = sorted(file for file in os.listdir(DATA_DIR) if file not in EXCLUSIONS)

# bucket the spreadsheets by the period and the breakdown encoded in their file names
populations_by_sex_age_00_10 = [file for file in files if "2000-2010" in file and "by_sex_and_age" in file]
populations_by_sex_race_ho_00_10 = [file for file in files if "2000-2010" in file and "by_sex_race_and_ho" in file]
populations_by_sex_age_10_19 = [file for file in files if "2010-2019" in file and "by_sex_and_age" in file]
populations_by_sex_race_ho_10_19 = [file for file in files if "2010-2019" in file and "by_sex_race_and_ho" in file]
populations_by_sex_age_20_23 = [file for file in files if "2020-2023" in file and "by_sex_and_age" in file]
populations_by_sex_race_ho_20_23 = [file for file in files if "2020-2023" in file and "by_sex_race_and_ho" in file]

# number of "by sex and age" files found per period
len(populations_by_sex_age_00_10), len(populations_by_sex_age_10_19), len(populations_by_sex_age_20_23),

(51, 51, 51)

In [None]:
len(populations_by_sex_race_ho_00_10), len(populations_by_sex_race_ho_10_19), len(populations_by_sex_race_ho_20_23),

# Read sample excel sheet

In [None]:
test_df = pd.read_excel(os.path.join(DATA_DIR, "Alabama_pop_by_sex_and_age_2000-2010.xls"), dtype=object, header=None)
test_df.head(40)

In [None]:
male_start = test_df[test_df[0] == "MALE"].index.to_list()[0]
male_start

In [None]:
pop_brackets = test_df.iloc[male_start:]
pop_brackets

In [None]:
female_start = pop_brackets[pop_brackets[0] == "FEMALE"].index.to_list()[0]
male_end, female_end = pop_brackets[pop_brackets[0] == ".Median age (years)"].index.to_list()
male_end, female_end

# split the excel spreadsheet into the male and female population brackets

In [None]:
male_pop_bracket = test_df.iloc[male_start:male_end]
male_pop_bracket

In [None]:
female_pop_bracket = test_df.iloc[female_start:female_end]
female_pop_bracket

#### Remove the following
* column `1`, column `12`, and column `13` (the reasoning is that these columns contain only the April 1 population estimates and not the more recent July 1 estimates, and that column `13` is the year 2010, which already exists in the next population years)
* rows that are mostly NaN and contain only a dot symbol in column `0`, i.e. `[. NaN NaN NaN NaN NaN ... NaN]`
* and the row that holds the `MALE` label

#### we also rename the columns to be `bracket`, `2000`, `2001`, `2002`, `2003`, `2004`, `2005`, `2006`, `2007`, `2008`, `2009`

In [None]:
cols_to_remove = [1, 12, 13]
cond = (male_pop_bracket[0] != ".") & (male_pop_bracket[0] != "MALE")
name_map = {0: "bracket", 2: 2000, 3: 2001, 4: 2002, 5: 2003, 6: 2004, 7: 2005, 8: 2006, 9: 2007, 10: 2008, 11: 2009}
temp_male = male_pop_bracket[cond].drop(columns=cols_to_remove).rename(columns=name_map).reset_index(drop=True)
temp_male

#### we remove the brackets that have duplicates

In [None]:
temp_male = temp_male.drop_duplicates(ignore_index=True)
temp_male

In [None]:
temp_male.index = temp_male["bracket"]
temp_male

In [None]:
del temp_male["bracket"]
temp_male

In [None]:
temp_male.shape

#### in order to achieve the following:
![modelling table from population data by sex and age 2000 to 2009.png](./figures%20&%20images/modelling%20table%20from%20population%20data%20by%20sex%20and%20age%202000%20to%202009.png)
#### we need to make our age brackets the row index, so that when the year columns are stacked vertically into rows we can still keep track of the age bracket each value came from. Stacking turns the dataframe into a multi-index dataframe (age bracket and year), and we can then simply reset the index so that the age brackets and years become columns themselves

In [None]:
temp_male = temp_male.stack().reset_index()
temp_male

In [None]:
temp_male = temp_male.rename(columns={"level_1": "year", 0: "population"})
temp_male

#### we also apply transformations to the `bracket` column by splitting say `.5 to 9 years` to 5 and 9 and have separate columns named `age_start` and `age_end` to take in these values

In [None]:
def helper(bracket: str | None):
    bracket = bracket.lower()
    keyword = re.search(r"(under|to|and over|\+)", bracket)
    keyword = np.nan if not keyword else keyword[0]
    numbers = re.findall(r"\d+", bracket)
    numbers = [ast.literal_eval(number) for number in numbers]
    # print(keyword)
    # print(numbers)

    # e.g. "under 5" becomes "_under_5"
    if keyword == "under":
        return (0, numbers[-1])
    
    # e.g. "5 to 9" becomes "_5_to_9"
    elif keyword == "to":
        return (numbers[0], numbers[-1])
    
    # e.g. "9 and over" becomes "_9_and_over"
    elif keyword == "and over" or keyword == "+": 
        return (numbers[-1], float('inf'))
    
    # if it is a single number just return that number
    return (np.nan, numbers[-1])

In [None]:
age_ranges = temp_male["bracket"].apply(helper).to_list()
age_ranges

In [None]:
temp_male["age_start"], temp_male["age_end"] = list(zip(*age_ranges))
temp_male

#### delete the bracket column for the last time

In [None]:
del temp_male["bracket"]
temp_male

In [None]:
temp_male["sex"] = "Male"

In [None]:
temp_male["state"] = "Alabama"

In [None]:
final_male_pop_bracket = temp_male
final_male_pop_bracket

In [None]:
(final_male_pop_bracket["population"] <= 0).sum()

In [None]:
column_summary(final_male_pop_bracket)

#### We've done our preprocessing on the male population age brackets; now we have to do this same preprocessing on the female demographic. We can achieve this by writing a function that implements our above prototype, applies it not only to the male population but also to the female one, and combines the resulting dataframes into one single dataframe for easy collation

In [None]:
model_population_table(test_df, "Alabama", cols_to_remove, year_range="2000-2009")

In [None]:
def concur_model_pop_tables(file, cols_to_remove, year_range, callback_fn=model_population_table):
    """Read one spreadsheet from ``DATA_DIR`` and model it with ``callback_fn``.

    Args:
        file: name of the excel file inside ``DATA_DIR``.
        cols_to_remove: column labels forwarded to ``callback_fn``.
        year_range: year range string forwarded to ``callback_fn`` as a keyword.
        callback_fn: modelling function called as
            ``callback_fn(df, state, cols_to_remove, year_range=year_range)``.

    Returns:
        Whatever ``callback_fn`` returns for this file.
    """
    # the state is the leading run of letters/whitespace in the file name
    state_match = re.search(r"(^[A-Za-z\s]+)", file)
    state = state_match[0] if state_match else "Unknown"

    # read the sheet raw: no header row, every cell kept as a python object
    raw_sheet = pd.read_excel(os.path.join(DATA_DIR, file), dtype=object, header=None)

    return callback_fn(raw_sheet, state, cols_to_remove, year_range=year_range)

In [None]:
with ThreadPoolExecutor() as exe:
    state_populations_by_sex_age_00_09 = list(exe.map(
        concur_model_pop_tables, 
        populations_by_sex_age_00_10, 
        [cols_to_remove] * len(populations_by_sex_age_00_10),
        ["2000-2009"] * len(populations_by_sex_age_00_10)
    ))

state_populations_by_sex_age_df_00_09 = pd.concat(state_populations_by_sex_age_00_09, axis=0, ignore_index=True)
state_populations_by_sex_age_df_00_09["id"] = state_populations_by_sex_age_df_00_09.index + 1

In [None]:
state_populations_by_sex_age_df_00_09

#### we don't save this modelled dataset as we will instead be uploading automatically using an orchestration tool like airflow to a data warehouse like databricks

In [None]:
column_summary(state_populations_by_sex_age_df_00_09)

In [None]:
# Sum the population per (year, bracket, sex, state) combination.
# Take note: each total here covers a single age bracket only; if we include
# all other age brackets we might have a bigger total population value per year.
# NOTE(review): the original note said this is just the below-five age bracket,
# but no bracket filter is applied in this cell - confirm the intent.
test = state_populations_by_sex_age_df_00_09.groupby(by=["year", "bracket", "sex", "state"]).agg(total_population=("population", "sum"))
test

# Reading sample excel file for year 2010-2019

In [None]:
test_df_10_19 = pd.read_excel(os.path.join(DATA_DIR, "Alabama_pop_by_sex_and_age_2010-2019.xlsx"), dtype=object, header=None)
test_df_10_19

In [None]:
start_index = test_df_10_19[test_df_10_19[0] == ".0"].index.to_list()[0]
start_index

In [None]:
end_index = test_df_10_19[test_df_10_19[0] == ".Median Age (years)"].index.to_list()[0]
end_index

#### Extract necessary rows

In [None]:
pop_brackets_10_19 = test_df_10_19.iloc[start_index: end_index]
pop_brackets_10_19

#### remove duplicates

In [None]:
temp = pop_brackets_10_19.drop_duplicates()
temp

#### keep only the rows that have at least 5 non-NaN values (i.e. remove the mostly empty rows)

In [None]:
temp = temp.dropna(thresh=5, axis=0)
temp

#### remove columns 1 to 7, and then every third column after that (10, 13, 16, ...)

In [None]:
cols_to_remove = [1, 2, 3, 4, 5, 6] + list(range(7, temp.shape[1], 3))
cols_to_remove

In [None]:
temp = temp.drop(columns=cols_to_remove)
temp

In [None]:
temp.index = temp[0]
temp

In [None]:
del temp[0]
temp

In [None]:
# column labels for the multi index: each year from 2010 to 2019 appears
# twice in a row, paired first with "male" and then with "female"
years = [year for year in range(2010, 2020) for _ in range(2)]
genders = ["male", "female"] * 10
multi_index_list = [(year, gender) for year, gender in zip(years, genders)]
multi_index_list

In [None]:
multi_index = pd.MultiIndex.from_tuples(multi_index_list)
multi_index

In [None]:
multi_index[0]

#### set the multi indexed columns and name the row index `bracket`

In [None]:
temp.columns = multi_index
temp.index.name = "bracket"
temp

#### now we will have to stack each row vertically on each other, and because we have multi indexed columns we will need to stack twice in order to turn these column indices into row indices

In [None]:
temp = temp.stack().stack()
temp

#### now we can reset the index such that these multi index rows now become columns of our new dataframe

In [None]:
temp = temp.reset_index()
temp

In [None]:
# rename the newly converted columns to bracket, sex, year, and population respectively
temp = temp.rename(columns={"level_1": "sex", "level_2": "year", 0: "population"})
temp

In [None]:
age_ranges_00_10 = temp["bracket"].apply(helper).to_list()
age_ranges_00_10

In [None]:
temp["age_start"], temp["age_end"] = list(zip(*age_ranges_00_10))
temp

In [None]:
temp["state"] = "Alabama"
temp

In [None]:
model_population_table(test_df_10_19, "Alabama", cols_to_remove, year_range="2010-2019")

In [None]:
with ThreadPoolExecutor() as exe:
    state_populations_by_sex_age_10_19 = list(exe.map(
        concur_model_pop_tables, 
        populations_by_sex_age_10_19, 
        [cols_to_remove] * len(populations_by_sex_age_10_19),
        ["2010-2019"] * len(populations_by_sex_age_10_19)
    ))

state_populations_by_sex_age_df_10_19 = pd.concat(state_populations_by_sex_age_10_19, axis=0, ignore_index=True)
state_populations_by_sex_age_df_10_19["id"] = state_populations_by_sex_age_df_10_19.index + 1

In [None]:
state_populations_by_sex_age_df_10_19

In [None]:
column_summary(state_populations_by_sex_age_df_10_19)

#### again we don't save this modelled dataset as we will instead be uploading automatically using an orchestration tool like airflow to a data warehouse like databricks

# reading sample excel file from year 2020-2023

In [None]:
test_df_20_23 = pd.read_excel(os.path.join(DATA_DIR, "Alabama_pop_by_sex_and_age_2020-2023.xlsx"), dtype=object, header=None)
test_df_20_23

#### clearly we now know we can discard columns 1, 2, 3, 4, 7, 10, and 13

In [None]:
cols_to_remove = [1, 2, 3, 4] + list(range(7, test_df_20_23.shape[1], 3))
cols_to_remove

In [None]:
model_population_table(test_df_20_23, "Alabama", cols_to_remove, year_range="2020-2023")

In [None]:
with ThreadPoolExecutor() as exe:
    state_populations_by_sex_age_20_23 = list(exe.map(
        concur_model_pop_tables, 
        populations_by_sex_age_20_23, 
        [cols_to_remove] * len(populations_by_sex_age_20_23),
        ["2020-2023"] * len(populations_by_sex_age_20_23)
    ))

state_populations_by_sex_age_df_20_23 = pd.concat(state_populations_by_sex_age_20_23, axis=0, ignore_index=True)
state_populations_by_sex_age_df_20_23["id"] = state_populations_by_sex_age_df_20_23.index + 1

In [None]:
state_populations_by_sex_age_df_20_23

#### we don't save this modelled dataset as we will instead be uploading automatically using an orchestration tool like airflow to a data warehouse like databricks

# Modelling excel spreadsheets with population values based on sex, race, and hispanic origin 2000 - 2009

![modelling table from population data by sex race and ethnicity 2000 to 2009.png](./figures%20&%20images/modelling%20table%20from%20population%20data%20by%20sex%20race%20and%20ethnicity%202000%20to%202009.png)

In [None]:
test_df_00_10 = pd.read_excel(os.path.join(DATA_DIR, "Alabama_pop_by_sex_race_and_ho_2000-2010.xls"), dtype=object, header=None)
test_df_00_10

#### delete columns 1, 12, and 13

In [None]:
cols_to_remove = [1, 12, 13]
temp = test_df_00_10.drop(columns=cols_to_remove)
temp = temp.rename(columns={0: "ethnicity", 2: 2000, 3: 2001, 4: 2002, 5: 2003, 6: 2004, 7: 2005, 8: 2006, 9: 2007, 10: 2008, 11: 2009, 13: 2010})
temp

In [None]:
temp["ethnicity"] = temp["ethnicity"].apply(lambda string: np.nan if pd.isna(string) else string.strip(".").lower())
temp

#### start partitioning the spreadsheet by its important rows like the sex, and whether or not it is of hispanic origin

In [None]:
male_start = temp.index[temp["ethnicity"] == "male"].to_list()[0]
male_start

In [None]:
female_start = temp.index[temp["ethnicity"] == "female"].to_list()[0]
female_start

In [None]:
temp.iloc[75:]

In [None]:
# since there are multiple indeces with the two 
# or more races value we need to pick out the last value
female_end = temp.index[temp["ethnicity"] == "two or more races"].to_list()[-1]
female_end

In [None]:
male_pop_bracket = temp.iloc[male_start:female_start].reset_index(drop=True)
male_pop_bracket

In [None]:
male_non_hisp_start = male_pop_bracket.index[male_pop_bracket["ethnicity"] == "not hispanic"].to_list()[-1]
male_non_hisp_start

In [None]:
male_hisp_start = male_pop_bracket.index[male_pop_bracket["ethnicity"] == "hispanic"].to_list()[-1]
male_hisp_start

In [None]:
male_hisp_end = male_pop_bracket.index[male_pop_bracket["ethnicity"] == "two or more races"].to_list()[-1]
male_hisp_end

In [None]:
male_non_hisp_pop_bracket = male_pop_bracket.iloc[male_non_hisp_start + 2:male_hisp_start].reset_index(drop=True)
male_non_hisp_pop_bracket

In [None]:
male_non_hisp_pop_bracket["origin"] = "non-hispanic"
male_non_hisp_pop_bracket

In [None]:
male_non_hisp_pop_bracket["sex"] = "male"
male_non_hisp_pop_bracket

In [None]:
male_non_hisp_pop_bracket = male_non_hisp_pop_bracket.set_index(keys=["ethnicity", "origin", "sex"])
male_non_hisp_pop_bracket

In [None]:
male_non_hisp_pop_bracket = male_non_hisp_pop_bracket.stack().reset_index()
male_non_hisp_pop_bracket

In [None]:
male_non_hisp_pop_bracket = male_non_hisp_pop_bracket.rename(columns={"level_3": "year", 0: "population"})
male_non_hisp_pop_bracket

In [None]:
male_non_hisp_pop_bracket["population"] = male_non_hisp_pop_bracket["population"].astype(int)
male_non_hisp_pop_bracket

In [None]:
column_summary(male_non_hisp_pop_bracket)

In [None]:
male_hisp_pop_bracket = male_pop_bracket.iloc[male_hisp_start + 2:].reset_index(drop=True)
male_hisp_pop_bracket

In [None]:
male_hisp_pop_bracket["origin"] = "hispanic"
male_hisp_pop_bracket

In [None]:
male_hisp_pop_bracket["sex"] = "male"
male_hisp_pop_bracket

In [None]:
male_hisp_pop_bracket = male_hisp_pop_bracket.set_index(keys=["ethnicity", "origin", "sex"])
male_hisp_pop_bracket

In [None]:
male_hisp_pop_bracket = male_hisp_pop_bracket.stack().reset_index()
male_hisp_pop_bracket

In [None]:
male_hisp_pop_bracket = male_hisp_pop_bracket.rename(columns={"level_3": "year", 0: "population"})
male_hisp_pop_bracket

In [None]:
male_hisp_pop_bracket["population"] = male_hisp_pop_bracket["population"].astype(int)
male_hisp_pop_bracket

In [None]:
column_summary(male_hisp_pop_bracket)

In [None]:
female_pop_bracket = temp.iloc[female_start:female_end + 1].reset_index(drop=True)
female_pop_bracket

In [None]:
female_non_hisp_start = female_pop_bracket.index[female_pop_bracket["ethnicity"] == "not hispanic"].to_list()[-1]
female_non_hisp_start

In [None]:
female_hisp_start = female_pop_bracket.index[female_pop_bracket["ethnicity"] == "hispanic"].to_list()[-1]
female_hisp_start

In [None]:
female_non_hisp_pop_bracket = female_pop_bracket.iloc[female_non_hisp_start + 2:female_hisp_start].reset_index(drop=True)
female_non_hisp_pop_bracket

In [None]:
female_non_hisp_pop_bracket["origin"] = "non-hispanic"
female_non_hisp_pop_bracket

In [None]:
female_non_hisp_pop_bracket["sex"] = "female"
female_non_hisp_pop_bracket

In [None]:
female_non_hisp_pop_bracket = female_non_hisp_pop_bracket.set_index(keys=["ethnicity", "origin", "sex"])
female_non_hisp_pop_bracket

In [None]:
female_non_hisp_pop_bracket = female_non_hisp_pop_bracket.stack().reset_index()
female_non_hisp_pop_bracket

In [None]:
female_non_hisp_pop_bracket = female_non_hisp_pop_bracket.rename(columns={"level_3": "year", 0: "population"})
female_non_hisp_pop_bracket

In [None]:
female_non_hisp_pop_bracket["population"] = female_non_hisp_pop_bracket["population"].astype(int)
female_non_hisp_pop_bracket

In [None]:
female_hisp_pop_bracket = female_pop_bracket.iloc[female_hisp_start + 2:].reset_index(drop=True)
female_hisp_pop_bracket

In [None]:
female_hisp_pop_bracket["origin"] = "hispanic"
female_hisp_pop_bracket

In [None]:
female_hisp_pop_bracket["sex"] = "female"
female_hisp_pop_bracket

In [None]:
female_hisp_pop_bracket = female_hisp_pop_bracket.set_index(keys=["ethnicity", "origin", "sex"])
female_hisp_pop_bracket

In [None]:
female_hisp_pop_bracket = female_hisp_pop_bracket.stack().reset_index()
female_hisp_pop_bracket

In [None]:
female_hisp_pop_bracket = female_hisp_pop_bracket.rename(columns={"level_3": "year", 0: "population"})
female_hisp_pop_bracket

In [None]:
female_hisp_pop_bracket["population"] = female_hisp_pop_bracket["population"].astype(int)
female_hisp_pop_bracket

In [None]:
final = pd.concat([male_non_hisp_pop_bracket, male_hisp_pop_bracket, female_non_hisp_pop_bracket, female_hisp_pop_bracket], axis=0, ignore_index=True)
final

In [None]:
final = model_population_by_sex_race_ho_table(test_df_00_10, "Alabama", cols_to_remove, year_range="2000-2009")
final

In [None]:
cols_to_remove

In [None]:
with ThreadPoolExecutor() as exe:
    state_populations_by_sex_race_ho_00_09 = list(exe.map(
        concur_model_pop_tables, 
        populations_by_sex_race_ho_00_10, 
        [cols_to_remove] * len(populations_by_sex_race_ho_00_10),
        ["2000-2009"] * len(populations_by_sex_race_ho_00_10),
        [model_population_by_sex_race_ho_table] * len(populations_by_sex_race_ho_00_10)
    ))

state_populations_by_sex_race_ho_df_00_09 = pd.concat(state_populations_by_sex_race_ho_00_09, axis=0, ignore_index=True)
state_populations_by_sex_race_ho_df_00_09["id"] = state_populations_by_sex_race_ho_df_00_09.index + 1

In [None]:
state_populations_by_sex_race_ho_df_00_09

In [None]:
column_summary(final)

# Modelling excel spreadsheets with population values based on sex, race, and hispanic origin 2010 - 2019

![modelling table from population data by sex race and ethnicity 2010 to 2019.png](./figures%20&%20images/modelling%20table%20from%20population%20data%20by%20sex%20race%20and%20ethnicity%202010%20to%202019.png)

In [None]:
test_df_10_19 = pd.read_excel(os.path.join(DATA_DIR, "Alabama_pop_by_sex_race_and_ho_2010-2019.xlsx"), dtype=object, header=None)
test_df_10_19

#### remove columns 1 and 2 and rename remaining columns to ethnicity and years 2010 to 2019

In [None]:
cols_to_remove = [1, 2]
temp = test_df_10_19.drop(columns=cols_to_remove)
temp = temp.rename(columns={0: "ethnicity", 3: 2010, 4: 2011, 5: 2012, 6: 2013, 7: 2014, 8: 2015, 9: 2016, 10: 2017, 11: 2018, 12: 2019})
temp

#### we can use set theory and a dictionary comprehension to build the new names for the columns, instead of always hardcoding the new names of the columns based on the years

In [None]:
test_cols_to_remove = [1, 12]

In [None]:
lo_year = 2010
hi_year = 2019

In [None]:
years_list = list(range(lo_year, hi_year + 1)) * 2
years_list

In [None]:
# columns left once the removed columns and the label column (0) are excluded;
# sorted because set iteration order is not guaranteed and the columns are
# paired positionally with the years afterwards
new_cols = sorted(set(test_df_10_19.columns) - set(test_cols_to_remove + [0]))
new_cols

In [None]:
{new_col: years_list[i] for i, new_col in enumerate(new_cols)}
# {new_col: "ethnicity" if new_col == 0 else 2 for i, new_col in enumerate(new_cols)}

In [None]:
temp["ethnicity"] = temp["ethnicity"].apply(lambda string: np.nan if pd.isna(string) else string.strip(".").lower())
temp

#### start partitioning the spreadsheet by its important rows like the sex, and whether or not it is of hispanic origin

In [None]:
male_start = temp.index[temp["ethnicity"] == "male"].to_list()[0]
male_start

In [None]:
temp.iloc[male_start]

In [None]:
female_start = temp.index[temp["ethnicity"] == "female"].to_list()[0]
female_start

In [None]:
# since there are multiple indeces with the two 
# or more races value we need to pick out the last value
female_end = temp.index[temp["ethnicity"] == "two or more races"].to_list()[-1]
female_end

In [None]:
temp.iloc[female_start: female_end + 1]

In [None]:
male_pop_bracket = temp.iloc[male_start:female_start].reset_index(drop=True)
male_pop_bracket

In [None]:
male_non_hisp_start = male_pop_bracket.index[male_pop_bracket["ethnicity"] == "not hispanic"].to_list()[0]
male_non_hisp_start

In [None]:
# ethnicity labels from the "not hispanic" row onwards
male_labels_after_non_hisp = male_pop_bracket.loc[male_non_hisp_start:, "ethnicity"]

# take the FIRST "race alone or in combination" row after that point; any
# later occurrences are ignored.
# na=False counts rows with a missing label as non-matches, so the mask stays
# strictly boolean and can be used to index
male_is_combination_row = male_labels_after_non_hisp.str.contains("race alone or in combination", na=False)
male_non_hisp_end = male_labels_after_non_hisp.index[male_is_combination_row].to_list()[0]
male_non_hisp_end

In [None]:
male_hisp_start = male_pop_bracket.index[male_pop_bracket["ethnicity"] == "hispanic"].to_list()[-1]
male_hisp_start

In [None]:
male_hisp_end = male_pop_bracket.index[male_pop_bracket["ethnicity"] == "two or more races"].to_list()[-1]
male_hisp_end

#### once the table is partitioned by hispanic origin and sex we will now add the origin and sex columns and do the typical stacking afterwards 

In [None]:
male_non_hisp_pop_bracket = male_pop_bracket.iloc[male_non_hisp_start + 2:male_non_hisp_end].reset_index(drop=True)
male_non_hisp_pop_bracket

In [None]:
male_hisp_pop_bracket = male_pop_bracket.iloc[male_hisp_start + 2:male_hisp_end + 1].reset_index(drop=True)
male_hisp_pop_bracket

In [None]:
male_non_hisp_pop_bracket["origin"] = "non-hispanic"
male_hisp_pop_bracket["origin"] = "hispanic"
male_non_hisp_pop_bracket

In [None]:
male_non_hisp_pop_bracket["sex"] = "male"
male_hisp_pop_bracket["sex"] = "male"
male_non_hisp_pop_bracket

In [None]:
male_non_hisp_pop_bracket = male_non_hisp_pop_bracket.set_index(keys=["ethnicity", "origin", "sex"])
male_hisp_pop_bracket = male_hisp_pop_bracket.set_index(keys=["ethnicity", "origin", "sex"])
male_hisp_pop_bracket

In [None]:
male_non_hisp_pop_bracket = male_non_hisp_pop_bracket.stack().reset_index()
male_hisp_pop_bracket = male_hisp_pop_bracket.stack().reset_index()
male_hisp_pop_bracket

In [None]:
male_non_hisp_pop_bracket = male_non_hisp_pop_bracket.rename(columns={"level_3": "year", 0: "population"})
male_hisp_pop_bracket = male_hisp_pop_bracket.rename(columns={"level_3": "year", 0: "population"})
male_non_hisp_pop_bracket

In [None]:
male_non_hisp_pop_bracket["population"] = male_non_hisp_pop_bracket["population"].astype(int)
male_hisp_pop_bracket["population"] = male_hisp_pop_bracket["population"].astype(int)
column_summary(male_hisp_pop_bracket)

In [None]:
female_pop_bracket = temp.iloc[female_start:female_end + 1].reset_index(drop=True)
female_pop_bracket

In [None]:
# calculate the slice boundaries of the non-hispanic block here
female_non_hisp_start = female_pop_bracket.index[female_pop_bracket["ethnicity"] == "not hispanic"].to_list()[-1]

# ethnicity labels from the "not hispanic" row onwards
female_labels_after_non_hisp = female_pop_bracket.loc[female_non_hisp_start:, "ethnicity"]

# take the FIRST "race alone or in combination" row after that point; any
# later occurrences are ignored.
# na=False counts rows with a missing label as non-matches, so the mask stays
# strictly boolean and can be used to index
female_is_combination_row = female_labels_after_non_hisp.str.contains("race alone or in combination", na=False)
female_non_hisp_end = female_labels_after_non_hisp.index[female_is_combination_row].to_list()[0]

female_non_hisp_start, female_non_hisp_end

In [None]:
with ThreadPoolExecutor() as exe:
    state_populations_by_sex_race_ho_10_19 = list(exe.map(
        concur_model_pop_tables, 
        populations_by_sex_race_ho_10_19, 
        [cols_to_remove] * len(populations_by_sex_race_ho_10_19),
        ["2010-2019"] * len(populations_by_sex_race_ho_10_19),
        [model_population_by_sex_race_ho_table] * len(populations_by_sex_race_ho_10_19)
    ))

state_populations_by_sex_race_ho_df_10_19 = pd.concat(state_populations_by_sex_race_ho_10_19, axis=0, ignore_index=True)
state_populations_by_sex_race_ho_df_10_19["id"] = state_populations_by_sex_race_ho_df_10_19.index + 1

In [None]:
state_populations_by_sex_race_ho_df_10_19

In [None]:
state_populations_by_sex_race_ho_df_00_09["ethnicity"].value_counts()

# Modelling population table by sex, race, hispanic origin years 2020 to 2023

In [None]:
cols_to_remove = [1]

In [None]:
with ThreadPoolExecutor() as exe:
    state_populations_by_sex_race_ho_20_23 = list(exe.map(
        concur_model_pop_tables, 
        populations_by_sex_race_ho_20_23, 
        [cols_to_remove] * len(populations_by_sex_race_ho_20_23),
        ["2020-2023"] * len(populations_by_sex_race_ho_20_23),
        [model_population_by_sex_race_ho_table] * len(populations_by_sex_race_ho_20_23)
    ))

state_populations_by_sex_race_ho_df_20_23 = pd.concat(state_populations_by_sex_race_ho_20_23, axis=0, ignore_index=True)
state_populations_by_sex_race_ho_df_20_23["id"] = state_populations_by_sex_race_ho_df_20_23.index + 1

In [None]:
state_populations_by_sex_race_ho_df_20_23

In [None]:
# columns to drop from the spreadsheets of each period
cols_to_remove_00_09 = [1, 12, 13]
# columns 1 to 6, then every third column from 7 up to 34
cols_to_remove_10_19 = list(range(1, 7)) + list(range(7, 35, 3))
# columns 1 to 4, then every third column from 7 up to 13
cols_to_remove_20_23 = list(range(1, 5)) + list(range(7, 14, 3))

In [None]:
state_populations_by_sex_age_df_00_09 = get_state_populations(
    DATA_DIR, 
    cols_to_remove=cols_to_remove_00_09, 
    populations=populations_by_sex_age_00_10, 
    year_range="2000-2009",
    by="sex and age")
state_populations_by_sex_age_df_00_09

In [None]:
state_populations_by_sex_age_df_10_19 = get_state_populations(
    DATA_DIR, 
    cols_to_remove=cols_to_remove_10_19, 
    populations=populations_by_sex_age_10_19, 
    year_range="2010-2019",
    by="sex and age")
state_populations_by_sex_age_df_10_19

In [None]:
state_populations_by_sex_age_df_20_23 = get_state_populations(
    DATA_DIR, 
    cols_to_remove=cols_to_remove_20_23, 
    populations=populations_by_sex_age_20_23, 
    year_range="2020-2023",
    by="sex and age")
state_populations_by_sex_age_df_20_23

In [None]:
state_populations_by_sex_race_ho_df_00_09 = get_state_populations(
    DATA_DIR, 
    cols_to_remove=[1, 12, 13], 
    populations=populations_by_sex_race_ho_00_10, 
    year_range="2000-2009",
    by="sex race and ho")
state_populations_by_sex_race_ho_df_00_09

In [None]:
state_populations_by_sex_race_ho_df_10_19 = get_state_populations(
    DATA_DIR, 
    cols_to_remove=[1, 2], 
    populations=populations_by_sex_race_ho_10_19, 
    year_range="2010-2019",
    by="sex race and ho")
state_populations_by_sex_race_ho_df_10_19

In [None]:
state_populations_by_sex_race_ho_df_20_23 = get_state_populations(
    DATA_DIR, 
    cols_to_remove=[1], 
    populations=populations_by_sex_race_ho_20_23, 
    year_range="2020-2023",
    by="sex race and ho")
state_populations_by_sex_race_ho_df_20_23

# Converting all code to pyspark for faster processing

In [3]:
# note that pyarrow 4.0.0 is a dependency of pyspark pandas api
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.types import StringType, StructField, StructType

In [4]:
# Display the file path the imported pyspark package was loaded from.
pyspark.__file__

'c:\\Users\\LARRY\\anaconda3\\envs\\tech-interview\\Lib\\site-packages\\pyspark\\__init__.py'

#### If pyspark is not yet on our path after installing it (in our environment or globally), we need to locate the `bin` directory inside the pyspark directory and add its path to our `PATH` environment variable. We do this so that we can run `spark-submit` and other Spark-related commands from the command line.

* if an error 
```
25/04/22 12:52:59 WARN Shell: Did not find winutils.exe: java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset. -see https://wiki.apache.org/hadoop/WindowsProblems
Python was not found; run without arguments to install from the Microsoft Store, or disable this shortcut from Settings > Apps > Advanced app settings > App execution aliases.
25/04/22 12:52:59 INFO ShutdownHookManager: Shutdown hook called
25/04/22 12:52:59 INFO ShutdownHookManager: Deleting directory C:\Users\LARRY\AppData\Local\Temp\spark-b0654aae-f91c-442d-b27a-66b287ffd557
```
occurs, this means that we have to install winutils via pip in our conda environment or globally on our local machine.

* another solution is going to "Manage app execution aliases" in the Windows settings and turning off the `python` and `python3` aliases: https://stackoverflow.com/questions/65348890/python-was-not-found-run-without-arguments-to-install-from-the-microsoft-store

* another error connected to the above is...
```
Missing Python executable 'python3', defaulting to 'C:\Users\LARRY\anaconda3\envs\tech-interview\Scripts\..' for SPARK_HOME environment variable. Please install Python or specify the correct Python executable in PYSPARK_DRIVER_PYTHON or PYSPARK_PYTHON environment variable to detect SPARK_HOME safely.
The system cannot find the path specified.
The system cannot find the path specified.
```
this may be due to dependency errors, certain values not being added to the `PATH` system environment variable, or system environment variables such as `SPARK_HOME`, `HADOOP_HOME`, and `JAVA_HOME` (each containing the installation location of the corresponding software) not being defined

Take note that spark 3.5.4 requires java 8 or 17 and later. The downloads page also indicates that it is prebuilt for hadoop 3.3 and later, meaning we have to install hadoop 3.3.0 or a later release (specifically the winutils executable file, as spark requires winutils), and the winutils build must come from one of these release versions.

steps for setting up apache spark from scratch
- java development kit 17: https://www.oracle.com/java/technologies/javase/jdk17-archive-downloads.html
- apache spark: https://spark.apache.org/downloads.html
- hadoop winutils: https://github.com/kontext-tech/winutils/blob/master/hadoop-3.3.0/bin/winutils.exe
- once downloaded extract the `spark-3.x.x-bin-hadoop3.tgz`
- rename the extracted folder `spark-3.x.x-bin-hadoop3` to just `spark-3.x.x`
- once jdk17 is downloaded, run the executable file to install the JDK and keep track of the installation location, which is commonly `C:\Program Files\Java\jdk-17`
- create a folder named `hadoop`, create a subdirectory named `bin` inside it, and move the downloaded hadoop `winutils.exe` file into that `bin` folder
- move the spark and hadoop folders in any directory or perhaps the `C:\Program Files` directory
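- copy the `C:\Program Files\spark-3.5.5`, `C:\Program Files\hadoop`, `C:\Program Files\Java\jdk-17` paths, i.e. the folders that contain the `bin` folders of spark, hadoop, and jdk 17 (copy these parent folders, not the `bin` folders themselves: pointing `HADOOP_HOME` at `...\hadoop\bin` makes Spark look for `...\hadoop\bin\bin`, which is the `Hadoop bin directory does not exist` error shown further down in this notebook)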
- add new system environment variables named `SPARK_HOME`, `HADOOP_HOME`, and `JAVA_HOME`, with these values respectively. The same software can also be installed in a docker container by replicating this process of copying the installation paths and creating environment variables through `export JAVA_HOME="installation/dir/of/jdk"`, `export SPARK_HOME="installation/dir/of/spark"`, `export HADOOP_HOME="installation/dir/of/hadoop"` (however note this only applies to the current shell and the processes started from it; to set them globally as system environment variables you need to edit `/etc/environment`, e.g. with `sudo -H gedit /etc/environment`)
- in windows we can reference these system environment variables as `%<name of env var>%`, e.g. `%SPARK_HOME%` gives the value we assigned to that variable, and we append backslashes to it to reference sub directories, e.g. `%SPARK_HOME%\bin` will be `C:\Program Files\spark-3.5.5\bin`. In linux we use `$<name of env var>`. Next we add new values to the system path environment variable that reference these newly created system environment variables: `%SPARK_HOME%\bin`, `%HADOOP_HOME%\bin`, and `%JAVA_HOME%\bin`
- restart the command line and run `javac --version` and `spark-shell` to check that the software has been installed and that its commands can be run from the command line. `spark-shell` is a CLI for spark. Now we can use `spark-submit` for our python scripts containing spark sessions
- we also need to add an environment variable containing the full path to our global python interpreter, i.e. the install directory `C:\Users\LARRY\AppData\Local\Programs\Python\Python312\` with `python.exe` appended, e.g. `C:\Users\LARRY\AppData\Local\Programs\Python\Python312\python.exe`, as this string is needed in order for `spark-submit` to run our python scripts containing spark commands (note: the variable Spark reads is `PYSPARK_PYTHON`, as named in the `Missing Python executable` error above, not `PYSPARK_HOME`)

```
C:\Users\LARRY>spark-shell
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/22 13:50:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/04/22 13:50:22 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
Spark context Web UI available at http://LAPTOP-3GL266K9.bbrouter:4041
Spark context available as 'sc' (master = local[*], app id = local-1745301022738).
Spark session available as 'spark'.
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 3.5.5
      /_/

Using Scala version 2.12.18 (Java HotSpot(TM) 64-Bit Server VM, Java 17.0.12)
Type in expressions to have them evaluated.
Type :help for more information.

scala>
```

In [5]:
# Path to one sample workbook (Alabama, sex and age, 2000-2010) inside DATA_DIR;
# the bare `path` on the last line displays the joined string.
path = os.path.join(DATA_DIR, "Alabama_pop_by_sex_and_age_2000-2010.xls")
path

'./data/population-data\\Alabama_pop_by_sex_and_age_2000-2010.xls'

In [6]:
# Load the sample workbook with pandas: header=None keeps every sheet row as
# data under integer column labels, dtype=object leaves cell values unconverted.
test_df_00_10 = pd.read_excel(
    path,
    header=None,
    dtype=object,
)
test_df_00_10

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,table with row headers in column A and column ...,,,,,,,,,,,,,
1,Table 2. Intercensal Estimates of the Resident...,,,,,,,,,,,,,
2,Sex and Age,"April 1, 20001",Intercensal Estimates (as of July 1),,,,,,,,,,"April 1, 20102","July 1, 20103"
3,,,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,,
4,BOTH SEXES,4447207,4452173,4467634,4480089,4503491,4530729,4569805,4628981,4672840,4718206,4757938,4779736,4785298
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112,Note: Median age is calculated based on single...,,,,,,,,,,,,,
113,Suggested Citation:,,,,,,,,,,,,,
114,Table 2. Intercensal Estimates of the Resident...,,,,,,,,,,,,,
115,"Source: U.S. Census Bureau, Population Division",,,,,,,,,,,,,


Run the command `spark-submit --packages com.crealytics:spark-excel_2.12:3.5.1_0.20.4 test_submit.py` to execute this spark script. It is imperative to add the `--packages` argument, as it indicates the dependency that needs to be installed when running this script, which transforms excel files.

`com.crealytics:spark-excel_2.12:3.5.1_0.20.4` is actually the package we need to read these excel files using spark, where `com.crealytics` is the group id, `spark-excel_2.12` is the artifact id, and `3.5.1_0.20.4` is the release version

**TODO:** why does this work when using `spark-submit`, while in a jupyter notebook the extra packages are not downloaded?

In [7]:
# Spark configuration requesting the spark-excel package; the value is a
# Maven coordinate in group:artifact:version form.
conf = SparkConf()
# Left as the cell's final expression, so the object returned by set() is displayed.
conf.set(
    "spark.jars.packages",
    "com.crealytics:spark-excel_2.12:3.5.1_0.20.4",
)

<pyspark.conf.SparkConf at 0x274f9a83b30>

In [8]:
# Get (or create) a SparkSession with app name 'test', configured from `conf`.
spark = (
    SparkSession.builder
    .appName('test')
    .config(conf=conf)
    .getOrCreate()
)

Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext.
: java.lang.RuntimeException: java.io.FileNotFoundException: Hadoop bin directory does not exist: C:\ProgramData\hadoop\bin\bin -see https://wiki.apache.org/hadoop/WindowsProblems
	at org.apache.hadoop.util.Shell.getWinUtilsPath(Shell.java:735)
	at org.apache.hadoop.util.Shell.getSetPermissionCommand(Shell.java:270)
	at org.apache.hadoop.fs.FileUtil.chmod(FileUtil.java:1139)
	at org.apache.hadoop.fs.FileUtil.chmod(FileUtil.java:1125)
	at org.apache.spark.util.Utils$.fetchFile(Utils.scala:489)
	at org.apache.spark.SparkContext.addFile(SparkContext.scala:1790)
	at org.apache.spark.SparkContext.$anonfun$new$16(SparkContext.scala:528)
	at org.apache.spark.SparkContext.$anonfun$new$16$adapted(SparkContext.scala:528)
	at scala.collection.immutable.List.foreach(List.scala:431)
	at org.apache.spark.SparkContext.<init>(SparkContext.scala:528)
	at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
	at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
	at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
	at java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:499)
	at java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:480)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:238)
	at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
	at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:842)
Caused by: java.io.FileNotFoundException: Hadoop bin directory does not exist: C:\ProgramData\hadoop\bin\bin -see https://wiki.apache.org/hadoop/WindowsProblems
	at org.apache.hadoop.util.Shell.getQualifiedBinInner(Shell.java:607)
	at org.apache.hadoop.util.Shell.getQualifiedBin(Shell.java:591)
	at org.apache.hadoop.util.Shell.<clinit>(Shell.java:688)
	at org.apache.hadoop.util.StringUtils.<clinit>(StringUtils.java:79)
	at org.apache.hadoop.conf.Configuration.getTimeDurationHelper(Configuration.java:1907)
	at org.apache.hadoop.conf.Configuration.getTimeDuration(Configuration.java:1867)
	at org.apache.hadoop.conf.Configuration.getTimeDuration(Configuration.java:1840)
	at org.apache.hadoop.util.ShutdownHookManager.getShutdownTimeout(ShutdownHookManager.java:183)
	at org.apache.hadoop.util.ShutdownHookManager$HookEntry.<init>(ShutdownHookManager.java:207)
	at org.apache.hadoop.util.ShutdownHookManager.addShutdownHook(ShutdownHookManager.java:304)
	at org.apache.spark.util.SparkShutdownHookManager.install(ShutdownHookManager.scala:181)
	at org.apache.spark.util.ShutdownHookManager$.shutdownHooks$lzycompute(ShutdownHookManager.scala:50)
	at org.apache.spark.util.ShutdownHookManager$.shutdownHooks(ShutdownHookManager.scala:48)
	at org.apache.spark.util.ShutdownHookManager$.addShutdownHook(ShutdownHookManager.scala:153)
	at org.apache.spark.util.ShutdownHookManager$.<init>(ShutdownHookManager.scala:58)
	at org.apache.spark.util.ShutdownHookManager$.<clinit>(ShutdownHookManager.scala)
	at org.apache.spark.util.Utils$.createTempDir(Utils.scala:242)
	at org.apache.spark.util.SparkFileUtils.createTempDir(SparkFileUtils.scala:103)
	at org.apache.spark.util.SparkFileUtils.createTempDir$(SparkFileUtils.scala:102)
	at org.apache.spark.util.Utils$.createTempDir(Utils.scala:94)
	at org.apache.spark.deploy.SparkSubmit.prepareSubmitEnvironment(SparkSubmit.scala:377)
	at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:969)
	at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:199)
	at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:222)
	at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:91)
	at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1125)
	at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1134)
	at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)


In [None]:
# Read the sample workbook through the spark-excel data source, treating no
# row as a header and letting the reader infer column types.
test_spark_df_00_10 = (
    spark.read
    .format("com.crealytics.spark.excel")
    .option("header", "false")
    .option("inferSchema", "true")
    .load(path)
)

Py4JJavaError: An error occurred while calling o238.load.
: org.apache.spark.SparkClassNotFoundException: [DATA_SOURCE_NOT_FOUND] Failed to find the data source: com.crealytics.spark.excel. Please find packages at `https://spark.apache.org/third-party-projects.html`.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.dataSourceNotFoundError(QueryExecutionErrors.scala:725)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:647)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:697)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:208)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:186)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:842)
Caused by: java.lang.ClassNotFoundException: com.crealytics.spark.excel.DefaultSource
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:445)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:592)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:525)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$5(DataSource.scala:633)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$4(DataSource.scala:633)
	at scala.util.Failure.orElse(Try.scala:224)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:633)
	... 15 more
