This cleaning file produces a single combined file (poverty.csv) containing poverty data. It merges the files from the SAIPE surveys conducted between 1998 and 2021. The original datasets were too large to be included here.

Data source: https://www.census.gov/programs-surveys/saipe/data/datasets.html

In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
# Input directories: folder_pathA is scanned for '.dat' files and
# folder_pathB for '.xls' files by the loading loops in this file.
folder_pathA = 'poverty/dats'
folder_pathB = 'poverty'
# NOTE(review): output_path is not referenced by the code visible in this
# file (the CSVs are written to hard-coded paths) -- confirm whether it is
# still needed.
output_path = 'poverty/merged/poverty.csv'

# Per-file DataFrames collected by each loading loop before concatenation.
result_listA, result_listB = [], []


def extract_year(filename, pivot=50):
    """Return the four-digit year encoded at characters 3-4 of *filename*.

    The two characters at ``filename[3:5]`` are parsed as a two-digit year.
    Values greater than or equal to *pivot* are placed in the 1900s and
    smaller values in the 2000s, so that ``98`` becomes 1998 rather than
    2098 (the data set is described as covering 1998-2021).

    Args:
        filename: File name whose characters at index 3 and 4 are digits
            (presumably names such as ``est98all.dat`` -- verify against the
            actual input files).
        pivot: Two-digit years at or above this value map to 19xx; years
            below it map to 20xx.

    Returns:
        The year as an ``int``.

    Raises:
        ValueError: If ``filename[3:5]`` cannot be parsed as an integer.
    """
    two_digit_year = int(filename[3:5])
    century = 1900 if two_digit_year >= pivot else 2000
    return century + two_digit_year


# Load every fixed-width '.dat' file found in folder_pathA, reduce it to the
# shared poverty schema, tag it with its year and collect it in result_listA.
columns_to_keep = ['GeoFIPS', 'povertyAll', 'povertyAllpr',
                   'povertyUnder18', 'povertyUnder18pr',
                   'medianHouseholdIncome']

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the merged output reproducible between runs and machines.
for filename in sorted(os.listdir(folder_pathA)):
    if not filename.endswith('.dat'):
        continue

    file_path = os.path.join(folder_pathA, filename)
    # No colspecs are given, so read_fwf infers the column boundaries.
    # Columns 0 and 1 are read as strings so that leading zeros survive.
    df = pd.read_fwf(file_path, header=None, dtype={0: str, 1: str})

    # Give the positional columns that are kept meaningful names.
    df = df.rename(columns={
        0: 'stateFIPS',
        1: 'countyFIPS',
        2: 'povertyAll',
        5: 'povertyAllpr',
        8: 'povertyUnder18',
        11: 'povertyUnder18pr',
        20: 'medianHouseholdIncome',
    })

    # Build the combined code: state code followed by the county code
    # left-padded with zeros to three characters.
    df['countyFIPS'] = df['countyFIPS'].str.zfill(3)
    df['GeoFIPS'] = df['stateFIPS'] + df['countyFIPS']

    # Explicit copy so the assignments below act on an independent frame
    # rather than on a selection of the parsed one.
    df = df.loc[:, columns_to_keep].copy()

    df['GeoFIPS'] = df['GeoFIPS'].astype(int)
    df['Year'] = extract_year(filename)

    result_listA.append(df)

    print(filename, 'from A done and ready to roll!')


# Stack all per-file frames and store the intermediate result on disk.
merged_df = pd.concat(result_listA, ignore_index=True)
merged_df.to_csv('poverty/merged_data.csv', index=False)




In [None]:
# Load every '.xls' workbook found in folder_pathB, reduce it to the shared
# poverty schema, tag it with its year and collect it in result_listB.
fips_converters = {'State FIPS': str, 'County FIPS': str,
                   'State FIPS Code': str, 'County FIPS Code': str}
columns_to_keep = ['GeoFIPS', 'povertyAll', 'povertyAllpr',
                   'povertyUnder18', 'povertyUnder18pr',
                   'medianHouseholdIncome']

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the merged output reproducible between runs and machines.
for filename in sorted(os.listdir(folder_pathB)):
    if not filename.endswith('.xls'):
        continue

    file_path = os.path.join(folder_pathB, filename)

    # The header row is probed at rows 1, 2 and 3 in turn.
    dfTY = pd.read_excel(file_path, header=1,
                         converters={'State FIPS': str, 'County FIPS': str})

    value = dfTY.columns[0]

    # A first column label longer than 12 characters is treated as "not the
    # real header" (presumably a title line -- verify against the workbooks),
    # so the sheet is re-read one row further down.
    if len(value) > 12:
        df1 = pd.read_excel(file_path, header=2, converters=fips_converters)
    else:
        df1 = dfTY

    # A fifth column labelled 'All Ages' likewise triggers a re-read with the
    # header taken from row 3.
    if df1.columns[4] == 'All Ages':
        df = pd.read_excel(file_path, header=3, converters=fips_converters)
    else:
        df = df1

    # Normalise the names of the first two columns. A new label list is
    # assigned instead of writing into ``df.columns.values``: an Index is
    # immutable and mutating its backing array is unsupported and can leave
    # label lookups out of sync.
    column_names = list(df.columns)
    column_names[:2] = ['State FIPS', 'County FIPS']
    df.columns = column_names

    # Keep only rows whose state code is present and at most 10 characters
    # long. The explicit copy makes the following assignments act on an
    # independent frame.
    valid_rows = df['State FIPS'].notna() & (df['State FIPS'].str.len() <= 10)
    df = df.loc[valid_rows].copy()

    # Zero-pad the codes (2 characters for state, 3 for county) and combine
    # them into one integer identifier.
    df['State FIPS'] = df['State FIPS'].astype(str).str.zfill(2)
    df['County FIPS'] = df['County FIPS'].astype(str).str.zfill(3)
    df['GeoFIPS'] = (df['State FIPS'] + df['County FIPS']).astype(int)

    # Strip commas from the column labels before matching them below.
    df.columns = df.columns.str.replace(',', '')

    # Unify the singular 'Age 0-17' spelling with the plural one. rename
    # ignores labels that are absent, so no membership test is needed.
    df = df.rename(columns={
        'Poverty Percent Age 0-17': 'Poverty Percent Ages 0-17',
        'Poverty Estimate Age 0-17': 'Poverty Estimate Ages 0-17',
    })

    # The under-18 columns appear under one of two naming schemes; choose the
    # scheme by the presence of the 'Ages 0-17' estimate column.
    if 'Poverty Estimate Ages 0-17' in df.columns:
        under18_estimate = 'Poverty Estimate Ages 0-17'
        under18_percent = 'Poverty Percent Ages 0-17'
    else:
        under18_estimate = 'Poverty Estimate Under Age 18'
        under18_percent = 'Poverty Percent Under Age 18'

    df = df.rename(columns={
        'Poverty Estimate All Ages': 'povertyAll',
        'Poverty Percent All Ages': 'povertyAllpr',
        under18_estimate: 'povertyUnder18',
        under18_percent: 'povertyUnder18pr',
        'Median Household Income': 'medianHouseholdIncome',
    })

    df = df.loc[:, columns_to_keep].copy()

    df['Year'] = extract_year(filename)

    result_listB.append(df)

    print(filename, 'from B done and ready to roll!')


# Stack all per-file frames from the Excel workbooks.
merged_df2 = pd.concat(result_listB, ignore_index=True)

In [None]:
# Append the Excel-derived rows (merged_df2) below the fixed-width-derived
# rows (merged_df), renumber the index, and write the combined table to disk
# without the index column.
allMerged = pd.concat(objs=[merged_df, merged_df2], ignore_index=True)
allMerged.to_csv(path_or_buf='cities/data/raw/poverty.csv', index=False)