In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the 2020-2022 Census state population estimates and reshape them to long
# form: one row per (state, fy) with the population in 'pop'.
# Source: https://www.census.gov/data/tables/time-series/demo/popest/2020s-state-total.html

pop1 = pd.read_excel('NST-EST2022-POP.xlsx', skiprows=3)
pop1 = pop1.rename(columns={'Unnamed: 0': 'state'})
pop1 = pop1.drop(labels='Unnamed: 1', axis=1)

# var_name is documented as a scalar; recent pandas releases reject a list here.
pop1 = pop1.melt(id_vars=['state'], var_name='fy', value_name='pop')

# Remove literal '.' characters from the names. regex=False matters: as a regular
# expression '.' matches every character, and pandas >= 2.0 no longer special-cases
# single-character patterns, so regex=True would blank out every state name.
pop1['state'] = pop1['state'].str.replace('.', '', regex=False)

# Use the same national label as the other population tables in this notebook.
pop1['state'] = np.where(pop1['state'] == 'United States', 'U.S.', pop1['state'])
pop1.head()

Unnamed: 0,state,fy,pop
0,U.S.,2020,331511512.0
1,Northeast,2020,57448898.0
2,Midwest,2020,68961043.0
3,South,2020,126450613.0
4,West,2020,78650958.0


In [3]:
# Load the 2010-2019 Census state population estimates and reshape them to long
# form: one row per (state, fy) with the population in 'pop'.
# Source: https://www.census.gov/data/datasets/time-series/demo/popest/2010s-state-total.html

years_2010s = [str(year) for year in range(2010, 2020)]

pop2 = pd.read_csv('nst-est2019-alldata.csv')

# POPESTIMATE2010 ... POPESTIMATE2019 -> '2010' ... '2019'
pop2 = pop2.rename(
    columns={'NAME': 'state', **{f'POPESTIMATE{year}': year for year in years_2010s}}
)
pop2 = pop2[['state', *years_2010s]]

# var_name is documented as a scalar; recent pandas releases reject a list here.
pop2 = pop2.melt(id_vars=['state'], var_name='fy', value_name='pop')

# Use the same national label as the other population tables in this notebook.
pop2['state'] = np.where(pop2['state'] == 'United States', 'U.S.', pop2['state'])
pop2.head()

Unnamed: 0,state,fy,pop
0,U.S.,2010,309321666
1,Northeast Region,2010,55380134
2,Midwest Region,2010,66974416
3,South Region,2010,114866680
4,West Region,2010,72100436


In [4]:
# Load the 2000-2009 intercensal state population estimates and reshape them to
# long form: one row per (state, fy) with the population in 'pop'.
# Source: https://www.census.gov/data/tables/time-series/demo/popest/intercensal-2000-2010-state.html

pop3 = pd.read_excel('st-est00int-01.xls', skiprows=3)
pop3 = pop3.rename(columns={'Unnamed: 0': 'state'})

# The year columns are selected by their integer labels 2000..2009.
pop3 = pop3[['state', *range(2000, 2010)]]

# var_name is documented as a scalar; recent pandas releases reject a list here.
pop3 = pop3.melt(id_vars=['state'], var_name='fy', value_name='pop')

# Remove literal '.' characters from the names. regex=False matters: as a regular
# expression '.' matches every character, and pandas >= 2.0 no longer special-cases
# single-character patterns, so regex=True would blank out every state name.
pop3['state'] = pop3['state'].str.replace('.', '', regex=False)

# Use the same national label as the other population tables in this notebook.
pop3['state'] = np.where(pop3['state'] == 'United States', 'U.S.', pop3['state'])
pop3.head()

Unnamed: 0,state,fy,pop
0,U.S.,2000,282162411.0
1,Northeast,2000,53666295.0
2,Midwest,2000,64491431.0
3,South,2000,100565549.0
4,West,2000,63439136.0


In [5]:
# Load the 1990-1999 intercensal state population estimates and reshape them to
# long form: one row per (state, fy) with the population in 'pop'.
# Source: https://www.stats.indiana.edu/population/PopTotals/intercensal90s_states.asp

pop4 = pd.read_csv('IndianPublicUtilityData.csv')

# Name the first column 'state' by position. Assigning a new column list is the
# supported way to do this; writing into `columns.values` mutates the Index's
# backing array in place, which pandas does not guarantee to work.
pop4.columns = ['state', *pop4.columns[1:]]

# '1-Jul-90 Estimate' ... '1-Jul-99 Estimate' -> '1990' ... '1999'
pop4 = pop4.rename(columns={f'1-Jul-{yy} Estimate': f'19{yy}' for yy in range(90, 100)})
pop4 = pop4[['state', *(f'19{yy}' for yy in range(90, 100))]]

# var_name is documented as a scalar; recent pandas releases reject a list here.
pop4 = pop4.melt(id_vars=['state'], var_name='fy', value_name='pop')

# Use the same national label as the other population tables in this notebook.
pop4['state'] = np.where(pop4['state'] == 'USA', 'U.S.', pop4['state'])

# cleaning rows: multi-word names in this file use non-breaking spaces (\xa0).
# Replace them with ordinary spaces for every name, rather than via a hand-kept
# per-state mapping, so the names match the other tables.
pop4['state'] = pop4['state'].str.replace('\xa0', ' ', regex=False)
pop4.head()

Unnamed: 0,state,fy,pop
0,U.S.,1990,249622814
1,Alabama,1990,4050055
2,Alaska,1990,553290
3,Arizona,1990,3684097
4,Arkansas,1990,2356586


In [6]:
# Scrape the 1980 US census state populations from Wikipedia and append a
# national 'U.S.' row.

all_tables = pd.read_html('https://en.wikipedia.org/wiki/1980_United_States_census')

# Pick the table by the columns we need rather than by its position on the page,
# so that tables added or removed elsewhere in the article do not break the cell.
source_cols = {'State': 'state', 'Population as of1980 census': 'pop'}
pop1980 = next(
    (table for table in all_tables if set(source_cols).issubset(table.columns)),
    None,
)
if pop1980 is None:
    raise ValueError(
        f'No table with columns {list(source_cols)} found on the 1980 census page'
    )

pop1980 = pop1980.rename(columns=source_cols)[['state', 'pop']].copy()

# The national figure is the sum of every row in the scraped table.
# NOTE(review): this includes any non-state rows the table carries - confirm.
pop1980.loc[len(pop1980.index)] = ['U.S.', pop1980['pop'].sum()]
pop1980['fy'] = 1980
pop1980.head()

Unnamed: 0,state,pop,fy
0,California,23667902,1980
1,New York,17558072,1980
2,Texas,14229191,1980
3,Pennsylvania,11863895,1980
4,Illinois,11426518,1980


In [7]:
# Estimate 1981-1989 populations by state from the 1980 census (pop1980) and the
# 1990 estimate (pop4). Each year adds one tenth of the 1980 -> 1990 change, i.e.
# a constant absolute step per year (linear interpolation) throughout the decade.

# pair each state's 1980 value (pop_x) with its 1990 value (pop_y);
# states whose names do not match in both tables are dropped by the inner merge
pop1990 = pop4[pop4['fy'] == '1990']
popcalc = pd.merge(pop1980, pop1990, on='state')
popcalc['pop_x'] = popcalc['pop_x'].astype(int)
popcalc['pop_y'] = popcalc['pop_y'].astype(int)

# decade change as a fraction of the 1980 population, and the tenth applied per year
popcalc['pct_change'] = popcalc[['pop_x', 'pop_y']].pct_change(axis=1)['pop_y']
popcalc['pct_change_mult'] = popcalc['pct_change'] / 10

# add the same yearly step to the previous year's value, starting from 1980
interp_years = [str(year) for year in range(1981, 1990)]
yearly_step = popcalc['pop_x'] * popcalc['pct_change_mult']
running = popcalc['pop_x']
for year in interp_years:
    running = yearly_step + running
    popcalc[year] = running

# final formatting for merge into final census data
# (astype(int) truncates the fractional part of each interpolated value)
popcalc[interp_years] = popcalc[interp_years].astype(int)
popcalc = popcalc.rename(columns={'pop_x': '1980', 'pop_y': '1990'})
popcalc = popcalc[['state', '1980', *interp_years]]

# var_name is documented as a scalar; recent pandas releases reject a list here.
popcalc = popcalc.melt(id_vars=['state'], var_name='fy', value_name='pop')
popcalc.head(5)

Unnamed: 0,state,fy,pop
0,California,1980,23667902
1,New York,1980,17558072
2,Texas,1980,14229191
3,Pennsylvania,1980,11863895
4,Illinois,1980,11426518


In [8]:
# Stack every population table into one long frame, drop incomplete and
# non-state rows, and cast the year and population columns to integers.
population = pd.concat([pop1, pop2, pop3, pop4, popcalc])

# Diagnostics kept for inspection: null count per column, the rows with a
# missing population (summarised per state), and the rows with a missing state.
null_counts = population.isnull().sum()
missing_pop = population[population['pop'].isnull()]
missing_pop_by_state = missing_pop.groupby('state')[['pop']].mean().reset_index()
missing_state = population[population['state'].isnull()]

# A row is only usable when both the state and the population are present.
population = population.dropna(subset=['state', 'pop'])

# Labels to exclude from the final table: regional aggregates (both spellings
# used by the source tables), District of Columbia (with ordinary and with
# non-breaking spaces) and Puerto Rico.
st_removal_list = [
    'District of Columbia',
    'Midwest',
    'South',
    'West',
    'Northeast',
    'South Region',
    'District\xa0of\xa0Columbia',
    'West Region',
    'Midwest Region',
    'Northeast Region',
    'Puerto Rico',
]
is_removed = population['state'].isin(st_removal_list)
population = population[~is_removed]

# The source tables mix string/int years and float/int populations.
population = population.astype({'fy': int, 'pop': int})
population

Unnamed: 0,state,fy,pop
0,U.S.,2020,331511512
5,Alabama,2020,5031362
6,Alaska,2020,732923
7,Arizona,2020,7179943
8,Arkansas,2020,3014195
...,...,...,...
505,Delaware,1989,662044
506,Vermont,1989,559463
507,Wyoming,1989,455276
508,Alaska,1989,538146


In [9]:
# Quality-assurance checks on the combined population table. The first two
# checks are asserted so that a bad table stops the notebook instead of passing
# unnoticed; the last expression is displayed as the cell output.

# rows per state - every state must appear the same number of times
check = population['state'].value_counts().reset_index()
assert population['state'].value_counts().nunique() == 1, \
    'states do not all have the same number of rows'

# null count per column - must be zero everywhere
check1 = population.isnull().sum()
assert not check1.any(), f'unexpected null values:\n{check1}'

# number of unique states (51)
check2 = population['state'].nunique()

# expected total rows: one row per state for every year between the first and
# last fy inclusive (43 years of census data)
check3 = check2 * (population['fy'].max() - population['fy'].min() + 1)

# confirming expectations with final df
check3 == len(population)

True

In [10]:
# Persist the combined table; the DataFrame index is written as the first column.
population.to_csv(path_or_buf='census.csv')