In [99]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so any pandas warnings raised by
# later cells are hidden as well — consider narrowing it; TODO confirm intent.
warnings.filterwarnings('ignore')

In [100]:
# Folder with the census CSV exports, relative to this notebook.
# Kept as a plain string (with trailing slash) so it can be prefixed to file names.
PATH = "../data/census/"

# One DataFrame per exported census table.
race_df = pd.read_csv(f"{PATH}SW_race.csv")
sw_finance_df = pd.read_csv(f"{PATH}SW_finance_char.csv")
food_stamps_df = pd.read_csv(f"{PATH}SW_food_stamps.csv")
mean_income_df = pd.read_csv(f"{PATH}SW_mean_income.csv")
poverty_df = pd.read_csv(f"{PATH}SW_poverty.csv")

In [101]:
def fix_labels(df):
    """Strip leading non-breaking spaces from the "Label (Grouping)" column.

    The column is overwritten on ``df`` itself (the caller's frame is
    modified) and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains a "Label (Grouping)" column.

    Notes
    -----
    Entries that are not strings (for example a missing label read as NaN)
    are left unchanged instead of raising ``AttributeError``.
    """
    def _strip_nbsp(label):
        # Only strings have ``lstrip``; pass anything else through untouched.
        return label.lstrip('\xa0') if isinstance(label, str) else label

    df["Label (Grouping)"] = df["Label (Grouping)"].map(_strip_nbsp)

    
def get_topics_idcs(df):
    """Return the index labels of all rows holding at least one missing value.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.Index
        Index labels of the rows where any column is NaN/None, in row order.
    """
    has_missing = df.isna().any(axis=1)
    return df.loc[has_missing].index


def get_topics(df):
    """Locate the grouping (topic) rows of ``df`` and pair them with their labels.

    A grouping row is any row reported by ``get_topics_idcs`` (a row with at
    least one missing value).  As a side effect the "Label (Grouping)" column
    of ``df`` is cleaned in place by ``fix_labels``.

    Returns
    -------
    list of tuple
        One ``(bounds, label)`` entry per grouping row.  ``bounds`` is
        ``(row, next_grouping_row)`` for every grouping row except the last,
        whose bounds are the 1-tuple ``(row,)``.

    Notes
    -----
    The index labels returned by ``get_topics_idcs`` are passed to ``iloc``,
    i.e. used as positions, so a default 0..n-1 index is assumed.
    """
    # Index labels of the grouping rows
    topic_rows = get_topics_idcs(df)

    # Strip the label column before reading the topic names from it
    fix_labels(df)

    bounds = []
    last = len(topic_rows) - 1
    for pos in range(len(topic_rows)):
        if pos == last:
            # The final topic has no following grouping row to close it
            bounds.append((topic_rows[pos],))
        else:
            bounds.append((topic_rows[pos], topic_rows[pos + 1]))

    names = df.iloc[topic_rows]["Label (Grouping)"].values
    return list(zip(bounds, names))


def get_all_sub_dfs(df):
    """Split a grouped census table into one DataFrame per topic.

    Topic header rows are located with ``get_topics``.  The rows between one
    header and the next (or the end of the table, for the last header) form
    that topic's sub-table.  If the very first row of ``df`` has no missing
    values it is prepended to every sub-table.  In each sub-table the
    "Label (Grouping)" column is renamed to the topic label and set as index.

    Side effects: ``get_topics`` cleans the label column of ``df`` in place,
    and the topic labels found are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a "Label (Grouping)" column.  Header row labels are used
        as ``iloc`` positions, so a default 0..n-1 index is assumed.

    Returns
    -------
    dict
        Maps each topic label to its sub-table.  If a label occurs more than
        once, the last occurrence overwrites the earlier ones.
    """
    all_subtopics = get_topics(df)

    # Whether the first row carries data does not depend on the topic, so it
    # is checked once (and guarded so an empty table does not raise).
    has_leading_row = len(df) > 0 and not df.iloc[0].isna().any()
    leading_row = df.iloc[[0]] if has_leading_row else None

    topic = {}
    subtopics = []

    for subtopic_idcs, subtopic in all_subtopics:
        start = subtopic_idcs[0] + 1
        # The last topic has a 1-tuple of bounds and runs to the end of df.
        stop = subtopic_idcs[1] if len(subtopic_idcs) == 2 else None
        sub_df = df.iloc[start:stop]

        if has_leading_row:
            # Build a new frame instead of enlarging a slice of ``df`` in
            # place, which avoids chained assignment on a possible view.
            sub_df = pd.concat([leading_row, sub_df])

        # rename/set_index return new frames, so ``df`` itself is not altered here.
        topic[subtopic] = (
            sub_df
            .rename(columns={"Label (Grouping)": subtopic})
            .set_index(subtopic)
        )
        subtopics.append(subtopic)

    print("The subtopics are:")
    print('\n'.join(subtopics))

    return topic

In [102]:
# Split the poverty table into one DataFrame per topic, keyed by topic label
# (get_all_sub_dfs also prints the topic labels it found).
poverty_dict = get_all_sub_dfs(poverty_df)

The subtopics are:
AGE
SEX
RACE AND HISPANIC OR LATINO ORIGIN
EDUCATIONAL ATTAINMENT
EMPLOYMENT STATUS
WORK EXPERIENCE
ALL INDIVIDUALS WITH INCOME BELOW THE FOLLOWING POVERTY RATIOS


In [103]:
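# Good for finance datasets: escape the literal "$" in the index labels of the household-income table.
# NOTE: assumes `sw_finance_dict` has already been built (presumably with get_all_sub_dfs(sw_finance_df)).
# sw_finance_dict["HOUSEHOLD INCOME IN THE PAST 12 MONTHS (IN 2021 INFLATION-ADJUSTED DOLLARS)"].index = sw_finance_dict["HOUSEHOLD INCOME IN THE PAST 12 MONTHS (IN 2021 INFLATION-ADJUSTED DOLLARS)"].index.str.replace("$", r"\$", regex=False)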

In [105]:
# Display the sub-table for one topic; the key is one of the topic labels
# printed by get_all_sub_dfs.
poverty_dict["RACE AND HISPANIC OR LATINO ORIGIN"]

Unnamed: 0_level_0,"Census Tract 64, District of Columbia, District of Columbia!!Total!!Estimate","Census Tract 64, District of Columbia, District of Columbia!!Below poverty level!!Estimate","Census Tract 64, District of Columbia, District of Columbia!!Percent below poverty level!!Estimate","Census Tract 102.01, District of Columbia, District of Columbia!!Total!!Estimate","Census Tract 102.01, District of Columbia, District of Columbia!!Below poverty level!!Estimate","Census Tract 102.01, District of Columbia, District of Columbia!!Percent below poverty level!!Estimate","Census Tract 102.02, District of Columbia, District of Columbia!!Total!!Estimate","Census Tract 102.02, District of Columbia, District of Columbia!!Below poverty level!!Estimate","Census Tract 102.02, District of Columbia, District of Columbia!!Percent below poverty level!!Estimate","Census Tract 105, District of Columbia, District of Columbia!!Total!!Estimate","Census Tract 105, District of Columbia, District of Columbia!!Below poverty level!!Estimate","Census Tract 105, District of Columbia, District of Columbia!!Percent below poverty level!!Estimate","Census Tract 110.01, District of Columbia, District of Columbia!!Total!!Estimate","Census Tract 110.01, District of Columbia, District of Columbia!!Below poverty level!!Estimate","Census Tract 110.01, District of Columbia, District of Columbia!!Percent below poverty level!!Estimate","Census Tract 110.02, District of Columbia, District of Columbia!!Total!!Estimate","Census Tract 110.02, District of Columbia, District of Columbia!!Below poverty level!!Estimate","Census Tract 110.02, District of Columbia, District of Columbia!!Percent below poverty level!!Estimate"
RACE AND HISPANIC OR LATINO ORIGIN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Population for whom poverty status is determined,2418,1046,43.3%,2522,153,6.1%,1930,138,7.2%,4027,841,20.9%,2385,64,2.7%,1469,90,6.1%
White alone,549,41,7.5%,1313,46,3.5%,951,76,8.0%,2051,66,3.2%,1420,0,0.0%,1051,90,8.6%
Black or African American alone,1769,983,55.6%,695,74,10.6%,628,25,4.0%,1460,667,45.7%,791,22,2.8%,237,0,0.0%
American Indian and Alaska Native alone,0,0,-,0,0,-,0,0,-,0,0,-,0,0,-,0,0,-
Asian alone,38,0,0.0%,157,33,21.0%,71,9,12.7%,137,20,14.6%,58,0,0.0%,60,0,0.0%
Native Hawaiian and Other Pacific Islander alone,0,0,-,0,0,-,0,0,-,14,0,0.0%,0,0,-,0,0,-
Some other race alone,21,11,52.4%,37,0,0.0%,127,0,0.0%,38,0,0.0%,0,0,-,25,0,0.0%
Two or more races,41,11,26.8%,320,0,0.0%,153,28,18.3%,327,88,26.9%,116,42,36.2%,96,0,0.0%
Hispanic or Latino origin (of any race),21,11,52.4%,201,0,0.0%,142,29,20.4%,360,0,0.0%,436,0,0.0%,57,0,0.0%
"White alone, not Hispanic or Latino",549,41,7.5%,1175,46,3.9%,906,47,5.2%,1928,66,3.4%,1016,0,0.0%,1019,90,8.8%
