In [1]:
# import dependencies
import pandas as pd
import mitosheet

## Cleaning data on total population

In [2]:
# load total county population data into dataframe
county_pop_df = pd.read_csv('Resources/County_population_totals_2010_2019_co-est2019-alldata.csv')
county_pop_df.head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,State and County name,CENSUS2010POP,ESTIMATESBASE2010,...,RDOMESTICMIG2019,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015,RNETMIG2016,RNETMIG2017,RNETMIG2018,RNETMIG2019
0,40,3,6,1,0,Alabama,Alabama,"Alabama, Alabama",4779736,4780125,...,1.917501,0.578434,1.186314,1.522549,0.563489,0.626357,0.745172,1.090366,1.773786,2.483744
1,50,3,6,1,1,Alabama,Autauga County,"Autauga County, Alabama",54571,54597,...,4.84731,6.018182,-6.226119,-3.902226,1.970443,-1.712875,4.777171,0.849656,0.540916,4.560062
2,50,3,6,1,3,Alabama,Baldwin County,"Baldwin County, Alabama",182265,182265,...,24.017829,16.64187,17.488579,22.751474,20.184334,17.725964,21.279291,22.398256,24.727215,24.380567
3,50,3,6,1,5,Alabama,Barbour County,"Barbour County, Alabama",27457,27455,...,-5.690302,0.292676,-6.897817,-8.132185,-5.140431,-15.724575,-18.238016,-24.998528,-8.754922,-5.165664
4,50,3,6,1,7,Alabama,Bibb County,"Bibb County, Alabama",22915,22915,...,1.385134,-4.998356,-3.787545,-5.797999,1.331144,1.329817,-0.708717,-3.234669,-6.857092,1.831952


In [3]:
# create dataframe with state, county, 2015 population, and 2019 population
county_pop_15_19_df = county_pop_df[['STNAME', 'CTYNAME',
       'State and County name','POPESTIMATE2015','POPESTIMATE2019']]
county_pop_15_19_df.head()

Unnamed: 0,STNAME,CTYNAME,State and County name,POPESTIMATE2015,POPESTIMATE2019
0,Alabama,Alabama,"Alabama, Alabama",4852347,4903185
1,Alabama,Autauga County,"Autauga County, Alabama",54864,55869
2,Alabama,Baldwin County,"Baldwin County, Alabama",202939,223234
3,Alabama,Barbour County,"Barbour County, Alabama",26283,24686
4,Alabama,Bibb County,"Bibb County, Alabama",22566,22394


In [4]:
from mitosheet import *; register_analysis('UUID-179d7662-be95-47c5-885f-749412f02246')

# Reordered State and County name in county_pop_15_19_df
county_pop_15_19_df_columns = [col for col in county_pop_15_19_df.columns if col != 'State and County name']
county_pop_15_19_df_columns.insert(4, 'State and County name')
county_pop_15_19_df = county_pop_15_19_df[county_pop_15_19_df_columns]

# Reordered State and County name in county_pop_15_19_df
county_pop_15_19_df_columns = [col for col in county_pop_15_19_df.columns if col != 'State and County name']
county_pop_15_19_df_columns.insert(2, 'State and County name')
county_pop_15_19_df = county_pop_15_19_df[county_pop_15_19_df_columns]

county_pop_15_19_df.head()

Unnamed: 0,STNAME,CTYNAME,State and County name,POPESTIMATE2015,POPESTIMATE2019
0,Alabama,Alabama,"Alabama, Alabama",4852347,4903185
1,Alabama,Autauga County,"Autauga County, Alabama",54864,55869
2,Alabama,Baldwin County,"Baldwin County, Alabama",202939,223234
3,Alabama,Barbour County,"Barbour County, Alabama",26283,24686
4,Alabama,Bibb County,"Bibb County, Alabama",22566,22394


## Cleaning data on migration

### Reading in data from text file that doesn't hav column names

In [5]:
# create column names for datafame
col_names = ['State Name of Geography A','County Name of Geography A',
            'State/U.S. Island Area/Foreign Region of Geography B', 'County Name of Geography B', 
            'Flow from Geography B to Geography A - Est', 'Flow from Geography B to Geography A - MOE',
            'Counterflow from Geography A to Geography B - Est', 'Counterflow from Geography A to Geography B - MOE',
            'Net Migration from Geography B to Geography A - Est', 'Net Migration from Geography B to Geography A - MOE',
            'Gross Migration between Geography A and Geography B - Est', 'Gross Migration between Geography A and Geography B - MOE']

In [6]:
# load population migration data
migration_data = pd.read_csv('Resources/Net_Gross_US.txt',sep=' ', names = col_names, delimiter=r'\s{3,}')
migration_df = pd.DataFrame(migration_data,index=None)
migration_df.head()

  migration_data = pd.read_csv('Resources/Net_Gross_US.txt',sep=' ', names = col_names, delimiter=r'\s{3,}')


Unnamed: 0,State Name of Geography A,County Name of Geography A,State/U.S. Island Area/Foreign Region of Geography B,County Name of Geography B,Flow from Geography B to Geography A - Est,Flow from Geography B to Geography A - MOE,Counterflow from Geography A to Geography B - Est,Counterflow from Geography A to Geography B - MOE,Net Migration from Geography B to Geography A - Est,Net Migration from Geography B to Geography A - MOE,Gross Migration between Geography A and Geography B - Est,Gross Migration between Geography A and Geography B - MOE
0,001001001003 Alabama,Autauga County,Alabama,Baldwin County,25,26,448,461.0,-423.0,461.0,473.0,463.0
1,001001001005 Alabama,Autauga County,Alabama,Barbour County,4,8,0,22.0,4.0,8.0,4.0,8.0
2,001001001007 Alabama,Autauga County,Alabama,Bibb County,10,16,34,52.0,-24.0,54.0,44.0,54.0
3,001001001009 Alabama,Autauga County,Alabama,Blount County,3,6,0,28.0,3.0,6.0,3.0,6.0
4,001001001015 Alabama,Autauga County,Alabama,Calhoun County,2,4,42,46.0,-40.0,47.0,44.0,46.0


In [7]:
# create cleaned dataframe for with just state and county name and migration
cleaned_migration_df = migration_df[['State Name of Geography A','County Name of Geography A', 'Net Migration from Geography B to Geography A - Est']]
cleaned_migration_df.head()

Unnamed: 0,State Name of Geography A,County Name of Geography A,Net Migration from Geography B to Geography A - Est
0,001001001003 Alabama,Autauga County,-423.0
1,001001001005 Alabama,Autauga County,4.0
2,001001001007 Alabama,Autauga County,-24.0
3,001001001009 Alabama,Autauga County,3.0
4,001001001015 Alabama,Autauga County,-40.0


In [8]:
cleaned_migration_df['State Name of Geography A']=cleaned_migration_df['State Name of Geography A'].str.replace('\d+', '')
cleaned_migration_df.head()

  cleaned_migration_df['State Name of Geography A']=cleaned_migration_df['State Name of Geography A'].str.replace('\d+', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_migration_df['State Name of Geography A']=cleaned_migration_df['State Name of Geography A'].str.replace('\d+', '')


Unnamed: 0,State Name of Geography A,County Name of Geography A,Net Migration from Geography B to Geography A - Est
0,Alabama,Autauga County,-423.0
1,Alabama,Autauga County,4.0
2,Alabama,Autauga County,-24.0
3,Alabama,Autauga County,3.0
4,Alabama,Autauga County,-40.0


In [9]:
# rename columns
cleaned_migration_df.rename(columns = 
                        {'State Name of Geography A':'State',
                        'County Name of Geography A':'County',
                        'Net Migration from Geography B to Geography A - Est':'Net Migration 2015-19'},
                       inplace = True)
cleaned_migration_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Unnamed: 0,State,County,Net Migration 2015-19
0,Alabama,Autauga County,-423.0
1,Alabama,Autauga County,4.0
2,Alabama,Autauga County,-24.0
3,Alabama,Autauga County,3.0
4,Alabama,Autauga County,-40.0


In [10]:
# Create column that combines County and State
cleaned_migration_df['State and County name'] = cleaned_migration_df[['County', 'State']].agg(','.join, axis=1)
cleaned_migration_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_migration_df['State and County name'] = cleaned_migration_df[['County', 'State']].agg(','.join, axis=1)


Unnamed: 0,State,County,Net Migration 2015-19,State and County name
0,Alabama,Autauga County,-423.0,"Autauga County, Alabama"
1,Alabama,Autauga County,4.0,"Autauga County, Alabama"
2,Alabama,Autauga County,-24.0,"Autauga County, Alabama"
3,Alabama,Autauga County,3.0,"Autauga County, Alabama"
4,Alabama,Autauga County,-40.0,"Autauga County, Alabama"


In [11]:
# drop null values
cleaned_migration_df=cleaned_migration_df.dropna()
cleaned_migration_df.head()

Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
NumExpr defaulting to 8 threads.


Unnamed: 0,State,County,Net Migration 2015-19,State and County name
0,Alabama,Autauga County,-423.0,"Autauga County, Alabama"
1,Alabama,Autauga County,4.0,"Autauga County, Alabama"
2,Alabama,Autauga County,-24.0,"Autauga County, Alabama"
3,Alabama,Autauga County,3.0,"Autauga County, Alabama"
4,Alabama,Autauga County,-40.0,"Autauga County, Alabama"


In [12]:
# remove state and county columns
cleaned_migration_df = cleaned_migration_df[['State and County name','Net Migration 2015-19']]
cleaned_migration_df.head()

Unnamed: 0,State and County name,Net Migration 2015-19
0,"Autauga County, Alabama",-423.0
1,"Autauga County, Alabama",4.0
2,"Autauga County, Alabama",-24.0
3,"Autauga County, Alabama",3.0
4,"Autauga County, Alabama",-40.0


In [13]:
# group by county to find net migration for each county
net_migration_df = cleaned_migration_df.groupby(['State and County name'], as_index=False).sum()
net_migration_df.head()

Unnamed: 0,State and County name,Net Migration 2015-19
0,"Abbeville County, South Carolina",110.0
1,"Acadia Parish, Louisiana",-1701.0
2,"Accomack County, Virginia",-953.0
3,"Ada County, Idaho",4908.0
4,"Adair County, Iowa",-408.0


## Merge DataFrames

In [14]:
# merge dataframes
pop_migration_df = county_pop_15_19_df.merge(net_migration_df, on='State and County name')
pop_migration_df.head()

Unnamed: 0,STNAME,CTYNAME,State and County name,POPESTIMATE2015,POPESTIMATE2019,Net Migration 2015-19
0,Alabama,Autauga County,"Autauga County, Alabama",54864,55869,-2535.0
1,Alabama,Baldwin County,"Baldwin County, Alabama",202939,223234,2295.0
2,Alabama,Barbour County,"Barbour County, Alabama",26283,24686,-287.0
3,Alabama,Bibb County,"Bibb County, Alabama",22566,22394,156.0
4,Alabama,Blount County,"Blount County, Alabama",57526,57826,-753.0


In [15]:
from mitosheet import *; register_analysis('UUID-b7812c44-6066-4ec1-9b2b-962204249c11')

# Reordered State and County name in pop_migration_df
pop_migration_df_columns = [col for col in pop_migration_df.columns if col != 'State and County name']
pop_migration_df_columns.insert(2, 'State and County name')
pop_migration_df = pop_migration_df[pop_migration_df_columns]

pop_migration_df.head()

Unnamed: 0,STNAME,CTYNAME,State and County name,POPESTIMATE2015,POPESTIMATE2019,Net Migration 2015-19
0,Alabama,Autauga County,"Autauga County, Alabama",54864,55869,-2535.0
1,Alabama,Baldwin County,"Baldwin County, Alabama",202939,223234,2295.0
2,Alabama,Barbour County,"Barbour County, Alabama",26283,24686,-287.0
3,Alabama,Bibb County,"Bibb County, Alabama",22566,22394,156.0
4,Alabama,Blount County,"Blount County, Alabama",57526,57826,-753.0


## Create pivot table showing population change due to population for each county

In [16]:
# add column showing net migration as percentage of 2015 population
pop_migration_df['% Change Due to Migration'] = pop_migration_df['Net Migration 2015-19']/pop_migration_df['POPESTIMATE2015']
pop_migration_df.head()

Unnamed: 0,STNAME,CTYNAME,State and County name,POPESTIMATE2015,POPESTIMATE2019,Net Migration 2015-19,% Change Due to Migration
0,Alabama,Autauga County,"Autauga County, Alabama",54864,55869,-2535.0,-0.046205
1,Alabama,Baldwin County,"Baldwin County, Alabama",202939,223234,2295.0,0.011309
2,Alabama,Barbour County,"Barbour County, Alabama",26283,24686,-287.0,-0.01092
3,Alabama,Bibb County,"Bibb County, Alabama",22566,22394,156.0,0.006913
4,Alabama,Blount County,"Blount County, Alabama",57526,57826,-753.0,-0.01309


In [17]:
# rename columns
pop_migration_df.rename(columns = 
                        {'STNAME':'State',
                        'CTYNAME':'County',
                        'State and County name':'County and State',
                        'POPESTIMATE2015':'2015 Population',
                        'POPESTIMATE2019':'2019 Population'},
                       inplace = True)

In [18]:
# create pivot table
pop_migration_piv = pd.pivot_table(pop_migration_df, 
               index = ['State','County'], 
               values = ['2015 Population','Net Migration 2015-19','% Change Due to Migration'])

In [19]:
pop_migration_piv

Unnamed: 0_level_0,Unnamed: 1_level_0,% Change Due to Migration,2015 Population,Net Migration 2015-19
State,County,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Alabama,Autauga County,-0.046205,54864,-2535.0
Alabama,Baldwin County,0.011309,202939,2295.0
Alabama,Barbour County,-0.010920,26283,-287.0
Alabama,Bibb County,0.006913,22566,156.0
Alabama,Blount County,-0.013090,57526,-753.0
...,...,...,...,...
Wyoming,Sweetwater County,0.006373,44719,285.0
Wyoming,Teton County,0.012800,23047,295.0
Wyoming,Uinta County,-0.002890,20763,-60.0
Wyoming,Washakie County,-0.001450,8278,-12.0


In [20]:
# save final cleaned dataframe to csv
pop_migration_df.to_csv('Resources/county_level_migration_15-19.csv', index = False)