In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Import the CSV file
census_data_csv = Path("Data/censusdata.csv")
census_data_df = pd.read_csv(census_data_csv, encoding='latin-1')

In [3]:
census_data_df.columns

Index(['SUMLEV', 'REGION', 'DIVISION', 'STATE', 'COUNTY', 'STNAME', 'CTYNAME',
       'CENSUS2010POP', 'ESTIMATESBASE2010', 'POPESTIMATE2010',
       ...
       'RNETMIG2011', 'RNETMIG2012', 'RNETMIG2013', 'RNETMIG2014',
       'RNETMIG2015', 'RNETMIG2016', 'RNETMIG2017', 'RNETMIG2018',
       'RNETMIG2019', 'RNETMIG2020'],
      dtype='object', length=180)

In [4]:
# Create a new dataframe with columns E, F, G, J through T, which shows the total population for each state 
# and county based on the following formula: Population base + Birth - deaths + Migration 
census_df = census_data_df.loc[:, ['COUNTY', 'STNAME', 'CTYNAME', 'POPESTIMATE2010', 'POPESTIMATE2011', \
                                   'POPESTIMATE2012', 'POPESTIMATE2013', 'POPESTIMATE2014', 'POPESTIMATE2015', \
                                   'POPESTIMATE2016', 'POPESTIMATE2017', 'POPESTIMATE2018', 'POPESTIMATE2019', \
                                   'POPESTIMATE2020']]
census_df.head()

Unnamed: 0,COUNTY,STNAME,CTYNAME,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015,POPESTIMATE2016,POPESTIMATE2017,POPESTIMATE2018,POPESTIMATE2019,POPESTIMATE2020
0,0,Alabama,Alabama,4785514,4799642,4816632,4831586,4843737,4854803,4866824,4877989,4891628,4907965,4921532
1,1,Alabama,Autauga County,54761,55229,54970,54747,54922,54903,55302,55448,55533,55769,56145
2,3,Alabama,Baldwin County,183121,186579,190203,194978,199306,203101,207787,212737,218071,223565,229287
3,5,Alabama,Barbour County,27325,27344,27172,26946,26768,26300,25828,25169,24887,24657,24589
4,7,Alabama,Bibb County,22858,22736,22657,22510,22541,22553,22590,22532,22300,22313,22136


In [5]:
# Create another dataframe with rows that have "0" in column E, which only looks at the states (no counties)
state_df = census_df[census_df['COUNTY'] == 0]
state_df.head()

Unnamed: 0,COUNTY,STNAME,CTYNAME,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015,POPESTIMATE2016,POPESTIMATE2017,POPESTIMATE2018,POPESTIMATE2019,POPESTIMATE2020
0,0,Alabama,Alabama,4785514,4799642,4816632,4831586,4843737,4854803,4866824,4877989,4891628,4907965,4921532
68,0,Alaska,Alaska,713982,722349,730810,737626,737075,738430,742575,740983,736624,733603,731158
99,0,Arizona,Arizona,6407342,6473416,6556344,6634690,6732873,6832810,6944767,7048088,7164228,7291843,7421401
115,0,Arkansas,Arkansas,2921998,2941038,2952876,2960459,2968759,2979732,2991815,3003855,3012161,3020985,3030522
191,0,California,California,37319550,37636311,37944551,38253768,38586706,38904296,39149186,39337785,39437463,39437610,39368078


In [6]:
#convert strings to numeric type for calculation of %
state_df[['POPESTIMATE2010', 'POPESTIMATE2011', 'POPESTIMATE2012', 'POPESTIMATE2013', \
                 'POPESTIMATE2014', 'POPESTIMATE2015', 'POPESTIMATE2016', 'POPESTIMATE2017', \
                 'POPESTIMATE2018', 'POPESTIMATE2019', 'POPESTIMATE2020']] = \
state_df[['POPESTIMATE2010', 'POPESTIMATE2011', 'POPESTIMATE2012', \
                 'POPESTIMATE2013', 'POPESTIMATE2014', 'POPESTIMATE2015', \
                 'POPESTIMATE2016', 'POPESTIMATE2017', 'POPESTIMATE2018', \
                 'POPESTIMATE2019', 'POPESTIMATE2020']].apply(pd.to_numeric)
state_df.head()
state_df.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df[['POPESTIMATE2010', 'POPESTIMATE2011', 'POPESTIMATE2012', 'POPESTIMATE2013', \


COUNTY              int64
STNAME             object
CTYNAME            object
POPESTIMATE2010     int64
POPESTIMATE2011     int64
POPESTIMATE2012     int64
POPESTIMATE2013     int64
POPESTIMATE2014     int64
POPESTIMATE2015     int64
POPESTIMATE2016     int64
POPESTIMATE2017     int64
POPESTIMATE2018     int64
POPESTIMATE2019     int64
POPESTIMATE2020     int64
dtype: object

In [7]:
state_df["Popestimate Yearly Change 2011(%)"] = state_df["POPESTIMATE2011"] / \
                                        state_df["POPESTIMATE2010"]

state_df["Popestimate Yearly Change 2012(%)"] = state_df["POPESTIMATE2012"] / \
                                        state_df["POPESTIMATE2011"]
state_df["Popestimate Yearly Change 2013(%)"] = state_df["POPESTIMATE2013"] / \
                                        state_df["POPESTIMATE2012"]
state_df["Popestimate Yearly Change 2014(%)"] = state_df["POPESTIMATE2014"] / \
                                        state_df["POPESTIMATE2013"]
state_df["Popestimate Yearly Change 2015(%)"] = state_df["POPESTIMATE2015"] / \
                                        state_df["POPESTIMATE2014"]
state_df["Popestimate Yearly Change 2016(%)"] = state_df["POPESTIMATE2016"] / \
                                        state_df["POPESTIMATE2015"]
state_df["Popestimate Yearly Change 2017(%)"] = state_df["POPESTIMATE2017"] / \
                                        state_df["POPESTIMATE2016"]
state_df["Popestimate Yearly Change 2018(%)"] = state_df["POPESTIMATE2018"] / \
                                        state_df["POPESTIMATE2017"]
state_df["Popestimate Yearly Change 2019(%)"] = state_df["POPESTIMATE2019"] / \
                                        state_df["POPESTIMATE2018"]
state_df["Popestimate Yearly Change 2020(%)"] = state_df["POPESTIMATE2020"] / \
                                        state_df["POPESTIMATE2019"]
state_df["Popestimate 10-Year Change 2020(%)"] = state_df["POPESTIMATE2020"] / \
                                        state_df["POPESTIMATE2010"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Popestimate Yearly Change 2011(%)"] = state_df["POPESTIMATE2011"] / \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Popestimate Yearly Change 2012(%)"] = state_df["POPESTIMATE2012"] / \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df["Popestimate Yearly Change 2013(%)"]

In [8]:
state_pct_change_df = state_df.loc[:, ['STNAME','CTYNAME', 'Popestimate Yearly Change 2011(%)', \
                                             'Popestimate Yearly Change 2012(%)', \
                                             'Popestimate Yearly Change 2013(%)', \
                                             'Popestimate Yearly Change 2014(%)',
                                             'Popestimate Yearly Change 2015(%)',
                                             'Popestimate Yearly Change 2016(%)',
                                             'Popestimate Yearly Change 2017(%)',
                                             'Popestimate Yearly Change 2018(%)',
                                             'Popestimate Yearly Change 2019(%)',
                                             'Popestimate Yearly Change 2020(%)',
                                             'Popestimate 10-Year Change 2020(%)']]
state_pct_change_df.head()
                           

Unnamed: 0,STNAME,CTYNAME,Popestimate Yearly Change 2011(%),Popestimate Yearly Change 2012(%),Popestimate Yearly Change 2013(%),Popestimate Yearly Change 2014(%),Popestimate Yearly Change 2015(%),Popestimate Yearly Change 2016(%),Popestimate Yearly Change 2017(%),Popestimate Yearly Change 2018(%),Popestimate Yearly Change 2019(%),Popestimate Yearly Change 2020(%),Popestimate 10-Year Change 2020(%)
0,Alabama,Alabama,1.002952,1.00354,1.003105,1.002515,1.002285,1.002476,1.002294,1.002796,1.00334,1.002764,1.028423
68,Alaska,Alaska,1.011719,1.011713,1.009327,0.999253,1.001838,1.005613,0.997856,0.994117,0.995899,0.996667,1.024057
99,Arizona,Arizona,1.010312,1.012811,1.01195,1.014798,1.014843,1.016385,1.014878,1.016478,1.017813,1.017768,1.158265
115,Arkansas,Arkansas,1.006516,1.004025,1.002568,1.002804,1.003696,1.004055,1.004024,1.002765,1.002929,1.003157,1.03714
191,California,California,1.008488,1.00819,1.008149,1.008703,1.008231,1.006295,1.004817,1.002534,1.000004,0.998237,1.054892


In [9]:
# Sort the data by 10-Year Change by State, Highest to Lowest
state_sorted_df = state_pct_change_df.sort_values(["Popestimate 10-Year Change 2020(%)"],
                                           ascending=False)

# Reset Index
state_sorted_df = state_sorted_df.reset_index(drop=True)
state_sorted_df.head()

Unnamed: 0,STNAME,CTYNAME,Popestimate Yearly Change 2011(%),Popestimate Yearly Change 2012(%),Popestimate Yearly Change 2013(%),Popestimate Yearly Change 2014(%),Popestimate Yearly Change 2015(%),Popestimate Yearly Change 2016(%),Popestimate Yearly Change 2017(%),Popestimate Yearly Change 2018(%),Popestimate Yearly Change 2019(%),Popestimate Yearly Change 2020(%),Popestimate 10-Year Change 2020(%)
0,District of Columbia,District of Columbia,1.024795,1.024903,1.024888,1.018485,1.020209,1.015601,1.013821,1.010139,1.005831,1.006443,1.177659
1,Utah,Utah,1.01419,1.013979,1.015636,1.013645,1.015417,1.020316,1.019479,1.01663,1.015286,1.014515,1.170953
2,Texas,Texas,1.01599,1.017103,1.015163,1.018257,1.018746,1.01622,1.013504,1.01179,1.012655,1.012901,1.163176
3,Idaho,Idaho,1.008564,1.007346,1.010115,1.012528,1.012404,1.019087,1.021204,1.018799,1.02111,1.021158,1.163032
4,Nevada,Nevada,1.003934,1.011631,1.011763,1.015117,1.017594,1.017788,1.017997,1.019726,1.019812,1.015364,1.16125


In [10]:
# Print out the data for the state or territory with the highest 10-Year Change
highest_10y_change = state_sorted_df.loc[0, :]
highest_10y_change

STNAME                                District of Columbia
CTYNAME                               District of Columbia
Popestimate Yearly Change 2011(%)                 1.024795
Popestimate Yearly Change 2012(%)                 1.024903
Popestimate Yearly Change 2013(%)                 1.024888
Popestimate Yearly Change 2014(%)                 1.018485
Popestimate Yearly Change 2015(%)                 1.020209
Popestimate Yearly Change 2016(%)                 1.015601
Popestimate Yearly Change 2017(%)                 1.013821
Popestimate Yearly Change 2018(%)                 1.010139
Popestimate Yearly Change 2019(%)                 1.005831
Popestimate Yearly Change 2020(%)                 1.006443
Popestimate 10-Year Change 2020(%)                1.177659
Name: 0, dtype: object

In [11]:
# Print out the data for the state or territory with the lowest 10-Year Change
lowest_10y_change = state_sorted_df.loc[len(state_sorted_df)-1, :]
lowest_10y_change

STNAME                                West Virginia
CTYNAME                               West Virginia
Popestimate Yearly Change 2011(%)          1.001262
Popestimate Yearly Change 2012(%)          1.000452
Popestimate Yearly Change 2013(%)          0.998558
Popestimate Yearly Change 2014(%)          0.997736
Popestimate Yearly Change 2015(%)          0.996089
Popestimate Yearly Change 2016(%)          0.994088
Popestimate Yearly Change 2017(%)          0.992495
Popestimate Yearly Change 2018(%)             0.993
Popestimate Yearly Change 2019(%)          0.994081
Popestimate Yearly Change 2020(%)          0.994165
Popestimate 10-Year Change 2020(%)         0.962531
Name: 50, dtype: object

In [12]:
# Create another dataframe, which only looks at the counties
county_df = census_df[census_df['COUNTY'] != 0]
county_df.head()

Unnamed: 0,COUNTY,STNAME,CTYNAME,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015,POPESTIMATE2016,POPESTIMATE2017,POPESTIMATE2018,POPESTIMATE2019,POPESTIMATE2020
1,1,Alabama,Autauga County,54761,55229,54970,54747,54922,54903,55302,55448,55533,55769,56145
2,3,Alabama,Baldwin County,183121,186579,190203,194978,199306,203101,207787,212737,218071,223565,229287
3,5,Alabama,Barbour County,27325,27344,27172,26946,26768,26300,25828,25169,24887,24657,24589
4,7,Alabama,Bibb County,22858,22736,22657,22510,22541,22553,22590,22532,22300,22313,22136
5,9,Alabama,Blount County,57372,57561,57585,57630,57536,57535,57487,57801,57770,57840,57879


In [13]:
#convert strings to numeric type for calculation of %
#convert strings to numeric type for calculation of %
county_df[['POPESTIMATE2010', 'POPESTIMATE2011', 'POPESTIMATE2012', 'POPESTIMATE2013', \
                 'POPESTIMATE2014', 'POPESTIMATE2015', 'POPESTIMATE2016', 'POPESTIMATE2017', \
                 'POPESTIMATE2018', 'POPESTIMATE2019', 'POPESTIMATE2020']] = \
county_df[['POPESTIMATE2010', 'POPESTIMATE2011', 'POPESTIMATE2012', \
                 'POPESTIMATE2013', 'POPESTIMATE2014', 'POPESTIMATE2015', \
                 'POPESTIMATE2016', 'POPESTIMATE2017', 'POPESTIMATE2018', \
                 'POPESTIMATE2019', 'POPESTIMATE2020']].apply(pd.to_numeric)
county_df.head()
county_df.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  county_df[['POPESTIMATE2010', 'POPESTIMATE2011', 'POPESTIMATE2012', 'POPESTIMATE2013', \


COUNTY              int64
STNAME             object
CTYNAME            object
POPESTIMATE2010     int64
POPESTIMATE2011     int64
POPESTIMATE2012     int64
POPESTIMATE2013     int64
POPESTIMATE2014     int64
POPESTIMATE2015     int64
POPESTIMATE2016     int64
POPESTIMATE2017     int64
POPESTIMATE2018     int64
POPESTIMATE2019     int64
POPESTIMATE2020     int64
dtype: object

In [14]:
county_df["Popestimate Yearly Change 2011(%)"] = county_df["POPESTIMATE2011"] / \
                                        county_df["POPESTIMATE2010"]

county_df["Popestimate Yearly Change 2012(%)"] = county_df["POPESTIMATE2012"] / \
                                        county_df["POPESTIMATE2011"]
county_df["Popestimate Yearly Change 2013(%)"] = county_df["POPESTIMATE2013"] / \
                                        county_df["POPESTIMATE2012"]
county_df["Popestimate Yearly Change 2014(%)"] = county_df["POPESTIMATE2014"] / \
                                        county_df["POPESTIMATE2013"]
county_df["Popestimate Yearly Change 2015(%)"] = county_df["POPESTIMATE2015"] / \
                                        county_df["POPESTIMATE2014"]
county_df["Popestimate Yearly Change 2016(%)"] = county_df["POPESTIMATE2016"] / \
                                        county_df["POPESTIMATE2015"]
county_df["Popestimate Yearly Change 2017(%)"] = county_df["POPESTIMATE2017"] / \
                                        county_df["POPESTIMATE2016"]
county_df["Popestimate Yearly Change 2018(%)"] = county_df["POPESTIMATE2018"] / \
                                        county_df["POPESTIMATE2017"]
county_df["Popestimate Yearly Change 2019(%)"] = county_df["POPESTIMATE2019"] / \
                                        county_df["POPESTIMATE2018"]
county_df["Popestimate Yearly Change 2020(%)"] = county_df["POPESTIMATE2020"] / \
                                        county_df["POPESTIMATE2019"]
county_df["Popestimate 10-Year Change 2020(%)"] = county_df["POPESTIMATE2020"] / \
                                        county_df["POPESTIMATE2010"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  county_df["Popestimate Yearly Change 2011(%)"] = county_df["POPESTIMATE2011"] / \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  county_df["Popestimate Yearly Change 2012(%)"] = county_df["POPESTIMATE2012"] / \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  county_df["Popestimate Yearly Change 2013

In [15]:
county_pct_change_df = county_df.loc[:, ['STNAME','CTYNAME','Popestimate Yearly Change 2011(%)', \
                                             'Popestimate Yearly Change 2012(%)', \
                                             'Popestimate Yearly Change 2013(%)', \
                                             'Popestimate Yearly Change 2014(%)',
                                             'Popestimate Yearly Change 2015(%)',
                                             'Popestimate Yearly Change 2016(%)',
                                             'Popestimate Yearly Change 2017(%)',
                                             'Popestimate Yearly Change 2018(%)',
                                             'Popestimate Yearly Change 2019(%)',
                                             'Popestimate Yearly Change 2020(%)',
                                             'Popestimate 10-Year Change 2020(%)']]
county_pct_change_df.head()
    

Unnamed: 0,STNAME,CTYNAME,Popestimate Yearly Change 2011(%),Popestimate Yearly Change 2012(%),Popestimate Yearly Change 2013(%),Popestimate Yearly Change 2014(%),Popestimate Yearly Change 2015(%),Popestimate Yearly Change 2016(%),Popestimate Yearly Change 2017(%),Popestimate Yearly Change 2018(%),Popestimate Yearly Change 2019(%),Popestimate Yearly Change 2020(%),Popestimate 10-Year Change 2020(%)
1,Alabama,Autauga County,1.008546,0.99531,0.995943,1.003197,0.999654,1.007267,1.00264,1.001533,1.00425,1.006742,1.025273
2,Alabama,Baldwin County,1.018884,1.019423,1.025105,1.022197,1.019041,1.023072,1.023822,1.025073,1.025194,1.025594,1.252107
3,Alabama,Barbour County,1.000695,0.99371,0.991683,0.993394,0.982516,0.982053,0.974485,0.988796,0.990758,0.997242,0.899872
4,Alabama,Bibb County,0.994663,0.996525,0.993512,1.001377,1.000532,1.001641,0.997432,0.989704,1.000583,0.992067,0.968414
5,Alabama,Blount County,1.003294,1.000417,1.000781,0.998369,0.999983,0.999166,1.005462,0.999464,1.001212,1.000674,1.008837


In [16]:
# Sort the data by 10-Year Change by County, Highest to Lowest
county_sorted_df = county_pct_change_df.sort_values(["Popestimate 10-Year Change 2020(%)"],
                                           ascending=False)

# Reset Index
county_sorted_df = county_sorted_df.reset_index(drop=True)
county_sorted_df.head()

Unnamed: 0,STNAME,CTYNAME,Popestimate Yearly Change 2011(%),Popestimate Yearly Change 2012(%),Popestimate Yearly Change 2013(%),Popestimate Yearly Change 2014(%),Popestimate Yearly Change 2015(%),Popestimate Yearly Change 2016(%),Popestimate Yearly Change 2017(%),Popestimate Yearly Change 2018(%),Popestimate Yearly Change 2019(%),Popestimate Yearly Change 2020(%),Popestimate 10-Year Change 2020(%)
0,North Dakota,McKenzie County,1.09451,1.138359,1.163475,1.183109,1.1655,0.983381,1.008807,1.068109,1.107282,1.013566,2.377105
1,Texas,Loving County,1.130952,0.905263,1.232558,0.839623,1.337079,0.983193,1.136752,1.120301,1.107383,1.09697,2.154762
2,North Dakota,Williams County,1.080829,1.095753,1.107195,1.085845,1.099888,0.969106,0.981129,1.061209,1.0616,1.021189,1.713072
3,Texas,Hays County,1.032103,1.032146,1.044696,1.050195,1.053171,1.051355,1.050259,1.037536,1.033518,1.047591,1.526796
4,Texas,Comal County,1.025771,1.024695,1.032777,1.038807,1.045446,1.042974,1.046969,1.053361,1.055353,1.053549,1.507735


In [17]:
# Print out the data for the county with the highest 10-Year Change
highest_10y_change = county_sorted_df.loc[0, :]
highest_10y_change

STNAME                                   North Dakota
CTYNAME                               McKenzie County
Popestimate Yearly Change 2011(%)             1.09451
Popestimate Yearly Change 2012(%)            1.138359
Popestimate Yearly Change 2013(%)            1.163475
Popestimate Yearly Change 2014(%)            1.183109
Popestimate Yearly Change 2015(%)              1.1655
Popestimate Yearly Change 2016(%)            0.983381
Popestimate Yearly Change 2017(%)            1.008807
Popestimate Yearly Change 2018(%)            1.068109
Popestimate Yearly Change 2019(%)            1.107282
Popestimate Yearly Change 2020(%)            1.013566
Popestimate 10-Year Change 2020(%)           2.377105
Name: 0, dtype: object

In [18]:
# Print out the data for the county with the lowest 10-Year Change
lowest_10y_change = county_sorted_df.loc[len(county_sorted_df)-1, :]
lowest_10y_change

STNAME                                        Illinois
CTYNAME                               Alexander County
Popestimate Yearly Change 2011(%)             0.974162
Popestimate Yearly Change 2012(%)              0.96597
Popestimate Yearly Change 2013(%)             0.937961
Popestimate Yearly Change 2014(%)             0.978183
Popestimate Yearly Change 2015(%)             0.954687
Popestimate Yearly Change 2016(%)             0.953423
Popestimate Yearly Change 2017(%)             0.973325
Popestimate Yearly Change 2018(%)             0.962874
Popestimate Yearly Change 2019(%)             0.958299
Popestimate Yearly Change 2020(%)             0.949232
Popestimate 10-Year Change 2020(%)            0.669957
Name: 3142, dtype: object