This notebook extracts population-related variables from the SEER dataset. The variables are:

* Total Population
* % Black population
* % White population
* % Age 0-19
* % Age 20-39
* % Age 40-64
* % Age 65+
* % Female


In [42]:
import os
# Make the project root the working directory so that the relative
# 'Data/...' paths used in this notebook resolve.
# NOTE(review): this absolute path is machine-specific; adjust it when running elsewhere.
os.chdir('/Users/babak.jfard/projects/NASA_IMERG_Respiratory/')

In [43]:
import pandas as pd

def read_SEER_population_file(file_path):
    """Load a SEER fixed-width population file into a DataFrame.

    Every record is sliced at the fixed character positions listed in
    ``layout``. All fields are first read as text, so the zero-padded FIPS
    codes keep their digits until they are joined, and the coded/count
    fields are then converted to numbers.

    Args:
        file_path: Path of the fixed-width text file, passed to
            ``pandas.read_fwf``.

    Returns:
        DataFrame with the columns 'Year', 'State Abbreviation', 'Registry',
        'Race', 'Origin', 'Sex', 'Age', 'Population' and 'County_FIPS'.
        'County_FIPS' is the state FIPS text followed by the county FIPS
        text, converted to an integer (so a leading zero is not kept).
        Numeric fields that cannot be parsed become NaN.
    """
    # Field name -> (start, end) character offsets; end is exclusive.
    layout = {
        'Year': (0, 4),
        'State Abbreviation': (4, 6),
        'State FIPS Code': (6, 8),
        'County FIPS Code': (8, 11),
        'Registry': (11, 13),
        'Race': (13, 14),
        'Origin': (14, 15),
        'Sex': (15, 16),
        'Age': (16, 18),
        'Population': (18, 26),
    }

    frame = pd.read_fwf(
        file_path,
        colspecs=list(layout.values()),
        names=list(layout),
        header=None,
        dtype=str,
    )

    # Everything except the state abbreviation and the two FIPS parts is numeric.
    for field in ('Year', 'Registry', 'Race', 'Origin', 'Sex', 'Age', 'Population'):
        frame[field] = pd.to_numeric(frame[field], errors='coerce')

    # Join the FIPS parts while they are still text, then store as an integer.
    combined_fips = frame['State FIPS Code'] + frame['County FIPS Code']
    frame['County_FIPS'] = combined_fips.astype('int')

    return frame.drop(['State FIPS Code', 'County FIPS Code'], axis=1)

In [59]:
# States covered by the study and the folder holding their raw SEER files.
states = ['CA', 'GA', 'OR', 'SC']
path = 'Data/raw/'

# The notebook handles one state per run. Set `state` to another entry of
# `states` and re-run the cells below to produce that state's output file.
state = 'SC'

raw_file = path + state.lower() + '.1969_2022.19ages.txt'
pops = read_SEER_population_file(raw_file)
    


In [60]:
# Keep only the records of the study period: years 2003 through 2019 inclusive.
in_study_period = pops['Year'].isin(range(2003, 2020))
pops = pops[in_study_period]

In [61]:
# Build one table per variable, each holding the summed population per
# (County_FIPS, Year). The ten tables are collected in `calculated_vars`
# in this order: four age bands, three race groups, two sexes, and the
# overall total. They are merged into a single table in a later step.
group_keys = ['County_FIPS', 'Year']


def _sum_population(rows, label):
    """Sum 'Population' per county-year over `rows`; name the result column `label`."""
    totals = rows.groupby(group_keys)['Population'].sum().reset_index()
    return totals.rename(columns={'Population': label})


# Output column -> 'Age' codes summed into it.
# NOTE(review): presumably the SEER 19-age-group codes (the input file name
# contains '19ages'); confirm the band boundaries against the data dictionary.
age_bands = {
    'Age 0-19': range(0, 5),
    'Age 20-39': range(5, 9),
    'Age 40-64': range(9, 14),
    'Age 65+': range(14, 19),
}

# Output column -> (source column, code) selecting the White, Black and
# Other race groups and the Male and Female populations.
subgroups = {
    'White': ('Race', 1),
    'Black': ('Race', 2),
    'Other': ('Race', 3),
    'Male': ('Sex', 1),
    'Female': ('Sex', 2),
}

calculated_vars = [
    _sum_population(pops[pops['Age'].isin(codes)], label)
    for label, codes in age_bands.items()
]
calculated_vars += [
    _sum_population(pops[pops[column] == code], label)
    for label, (column, code) in subgroups.items()
]
calculated_vars.append(_sum_population(pops, 'Total Population'))

In [62]:
import functools

# Combine the per-variable tables into one row per (County_FIPS, Year).
# An outer join is used on purpose: with the default inner join, a
# county-year that is absent from any single table (for example, no records
# for one race group) would be dropped from the result entirely. A
# county-year missing from a subgroup table had no rows to sum for that
# subgroup, so the gap is filled with 0.
pop_vars = functools.reduce(
    lambda left, right: pd.merge(
        left, right, on=['County_FIPS', 'Year'], how='outer'
    ),
    calculated_vars,
).fillna(0)

In [63]:
# Display the column names of the merged table as a quick check of the merge.
pop_vars.columns

Index(['County_FIPS', 'Year', 'Age 0-19', 'Age 20-39', 'Age 40-64', 'Age 65+',
       'White', 'Black', 'Other', 'Male', 'Female', 'Total Population'],
      dtype='object')

In [64]:
# Count columns that are replaced by their share of the total population.
columns_to_calculate = ['Age 0-19', 'Age 20-39', 'Age 40-64', 'Age 65+',
                         'White', 'Black', 'Other', 'Male', 'Female']

# Swap each count column for a '<name> (%)' column holding count / total,
# rounded to 3 decimals. The values are fractions (e.g. 0.125), not
# percentages: nothing is multiplied by 100 despite the '(%)' label.
total_population = pop_vars['Total Population']
for name in columns_to_calculate:
    share = round(pop_vars[name] / total_population, 3)
    del pop_vars[name]
    pop_vars[name + ' (%)'] = share

In [65]:
# Save this state's population variables; the file name starts with the
# state code held in `state`, and the row index is not written.
output_file = 'Data/interim/' + state + '_population_vars.csv'
pop_vars.to_csv(output_file, index=False)