In [14]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [42]:
# Relative base path that is prepended to every CSV filename below.
PATH = "../data/census/"
# Read each CSV under PATH into its own DataFrame with pandas' default parsing options.
race_df = pd.read_csv(PATH + "race.csv")
sw_finance_df = pd.read_csv(PATH + "SW_finance_char.csv")
financial_characteristics_df = pd.read_csv(PATH + "financial_characteristics.csv")
age_and_sex_df = pd.read_csv(PATH + "age_and_sex.csv")
food_stamps_df = pd.read_csv(PATH + "food_stamps_snap.csv")
mean_income_df = pd.read_csv(PATH + "mean_income.csv")
poverty_status_df = pd.read_csv(PATH + "poverty_status.csv")

In [115]:
def fix_labels(df):
    """Strip leading non-breaking spaces from the "Label (Grouping)" column, in place.

    The column is overwritten on ``df`` itself and nothing is returned, so
    callers see the cleaned labels on the frame they passed in.

    Values that are not strings (for example NaN from an empty CSV cell) are
    left untouched instead of raising ``AttributeError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a "Label (Grouping)" column.

    Raises
    ------
    KeyError
        If ``df`` has no "Label (Grouping)" column.
    """
    df["Label (Grouping)"] = df["Label (Grouping)"].map(
        # '\xa0' is the non-breaking space; presumably export indentation -- only
        # the left side is stripped, matching the original behaviour.
        lambda label: label.lstrip('\xa0') if isinstance(label, str) else label
    )

def get_topics_idcs(df):
    """Return the index labels of every row of ``df`` that has a missing value.

    A row qualifies as soon as any one of its columns is NaN/None.
    """
    row_has_gap = df.isna().any(axis=1)
    rows_with_gap = df.loc[row_has_gap]
    return rows_with_gap.index

def get_topics(df):
    """Pair each grouping row of ``df`` with its row span and its label.

    Grouping rows are the ones reported by ``get_topics_idcs`` (rows with at
    least one missing value). Each result item is ``(span, label)`` where
    ``span`` is ``(this_grouping_row, next_grouping_row)``, or the 1-tuple
    ``(this_grouping_row,)`` for the last grouping row.

    Side effect: ``fix_labels`` is applied to ``df``, so its
    "Label (Grouping)" column is cleaned in place.
    """
    # Rows that act as topic headers.
    topic_rows = get_topics_idcs(df)
    final_slot = len(topic_rows) - 1

    spans = []
    for slot in range(len(topic_rows)):
        if slot < final_slot:
            # Closed span: runs up to the next topic header.
            spans.append((topic_rows[slot], topic_rows[slot + 1]))
        else:
            # Last topic has no following header, so only its start is known.
            spans.append((topic_rows[slot],))

    # Clean the labels before reading them off the header rows.
    fix_labels(df)

    header_rows = df.iloc[topic_rows]
    topic_names = header_rows["Label (Grouping)"].values
    return list(zip(spans, topic_names))


In [151]:
# Build the (row-span, label) pairs for the poverty table.
# Note: get_topics also cleans poverty_status_df's "Label (Grouping)" column in place.
all_poverty_subtopics = get_topics(poverty_status_df)

In [152]:
# Display the pairs; the final span holds only a start row because no grouping row follows it.
all_poverty_subtopics

[((1, 11), 'AGE'),
 ((11, 14), 'SEX'),
 ((14, 24), 'RACE AND HISPANIC OR LATINO ORIGIN'),
 ((24, 30), 'EDUCATIONAL ATTAINMENT'),
 ((30, 38), 'EMPLOYMENT STATUS'),
 ((38, 43), 'WORK EXPERIENCE'),
 ((43,), 'ALL INDIVIDUALS WITH INCOME BELOW THE FOLLOWING POVERTY RATIOS')]

In [145]:
def get_all_sub_dfs(df, topics=None):
    """Split a table into one DataFrame per subtopic and print the subtopic names.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a "Label (Grouping)" column.
    topics : iterable of (tuple, str), optional
        ``(row_span, label)`` pairs in the shape produced by ``get_topics``:
        ``row_span`` is ``(start, stop)`` or, for an open-ended final topic,
        ``(start,)``. The positions are applied with ``df.iloc``.
        When omitted, the pairs are derived from ``df`` itself through
        ``get_topics(df)`` rather than from a notebook-level variable that was
        computed for one specific table. ``get_topics`` also cleans ``df``'s
        label column in place.

    Returns
    -------
    dict
        Maps each subtopic label to the rows after its grouping row and before
        the next one (or to the end of ``df``), with the "Label (Grouping)"
        column renamed to the subtopic label.
    """
    if topics is None:
        topics = get_topics(df)
    # Materialise once: the pairs are walked twice (slicing, then printing).
    topics = list(topics)

    sub_dfs = {}
    for span, label in topics:
        first_row = span[0] + 1  # skip the grouping row itself
        # A 1-tuple span means "until the end of the frame".
        stop_row = span[1] if len(span) == 2 else None
        sub_dfs[label] = df.iloc[first_row:stop_row].rename(
            columns={"Label (Grouping)": label}
        )

    print("The subtopics are:")
    print('\n'.join(label for _, label in topics))

    return sub_dfs

In [146]:
# Split the poverty table into a dict of per-subtopic DataFrames keyed by subtopic label.
poverty_dfs = get_all_sub_dfs(poverty_status_df)

The subtopics are:
AGE
SEX
RACE AND HISPANIC OR LATINO ORIGIN
EDUCATIONAL ATTAINMENT
EMPLOYMENT STATUS
WORK EXPERIENCE
ALL INDIVIDUALS WITH INCOME BELOW THE FOLLOWING POVERTY RATIOS


In [148]:
# Inspect one subtopic frame; its label column carries the subtopic name as its header.
poverty_dfs["EMPLOYMENT STATUS"]

Unnamed: 0,EMPLOYMENT STATUS,District of Columbia!!Total!!Estimate,District of Columbia!!Below poverty level!!Estimate,District of Columbia!!Percent below poverty level!!Estimate
31,Civilian labor force 16 years and over,379074,29698,7.8%
32,Employed,348505,17164,4.9%
33,Male,169026,7448,4.4%
34,Female,179479,9716,5.4%
35,Unemployed,30569,12534,41.0%
36,Male,16414,5330,32.5%
37,Female,14155,7204,50.9%
