# Data
- Occupations by State and Likelihood of Automation: https://data.world/wnedds/occupations-by-state-and-likelihood-of-automation
- Employment by state data: https://www.bea.gov/data/employment/employment-by-state
- State abbreviations: https://gist.github.com/JeffPaine/3083347

Graphing code from https://plotly.com/python/choropleth-maps/

# Data Handling (ingestion/cleaning/creation)

In [1]:
#Uncomment and run to install package for graph
#!pip install plotly

In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
import patsy
import statsmodels.api as sm
import plotly.graph_objects as go

In [3]:
# Adding state data for handling
# Full names of the 50 states plus the District of Columbia, alphabetical by name.
# The order of this list defines the order of every per-state result computed below.
states = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware',
          'District of Columbia', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas',
          'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi',
          'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
          'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island',
          'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
          'West Virginia', 'Wisconsin', 'Wyoming']

# Postal abbreviations, position-for-position with `states`: the two lists are
# paired by index when plotting, so 'DE' (Delaware) must precede 'DC'
# (District of Columbia) here even though 'DC' sorts first alphabetically.
states_abbv = ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA',
               'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD',
               'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ',
               'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC',
               'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY']

In [4]:
# Load the two raw inputs from disk:
#   - the occupation-by-state table with automation probabilities (CSV, cp1252-encoded)
#   - the employment-by-state workbook (legacy .xls)
df_occ = pd.read_csv(
    filepath_or_buffer='../datasets/raw_state_automation_data.csv',
    encoding='cp1252',
)
df_employment = pd.read_excel(io='../datasets/employmentbystate.xls')

In [5]:
# Clean employment data
# Skip the first five rows of the sheet and discard rows with any missing cell.
# Non-inplace calls return fresh frames, which avoids mutating a slice of the
# original frame (the source of pandas' SettingWithCopy warnings).
df_employment = df_employment.iloc[5:].dropna()

# The second and third sheet columns are renamed to 'State' and 'Employment';
# every other column is dropped.
df_employment = df_employment.rename(
    columns={df_employment.columns[1]: 'State', df_employment.columns[2]: 'Employment'}
)
df_employment = df_employment[['State', 'Employment']].reset_index(drop=True)
df_employment['Employment'] = df_employment['Employment'].astype(int)

# Reshape data to easily apply later transformation:
# a single row labelled 'Employment' with one column per state, so that
# df_employment[state] yields that state's total. Indexing by 'State' before
# transposing keeps the values as integers rather than generic objects.
df_employment = df_employment.set_index('State').T

In [6]:
# The analysis uses only the automation probability and the per-state columns,
# so the occupation identifier columns are removed from the frame in place.
unused_columns = ['SOC', 'Occupation']
df_occ.drop(labels=unused_columns, axis='columns', inplace=True)

In [7]:
# Transform each state's per-occupation figures into a fraction of that
# state's total employment (the divisor comes from df_employment, which holds
# one column per state).
for state in states:
    # df_employment[state] is a one-element column; take the scalar so the
    # division is a single vectorised operation producing a float column.
    state_total = df_employment[state].iloc[0]
    df_occ[state] = df_occ[state] / state_total

In [8]:
# Observe the data and see that it is chillin'
# (displays the first rows of df_occ after the per-state normalisation above)
df_occ.head()

Unnamed: 0,Probability,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District of Columbia,...,South Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming
0,0.015,0.000393,0.001662,0.001577,0.001663,0.001345,0.00024,0.000613,0.000588,0.003186,...,0.000937,0.001382,0.000353,0.001888,0.000645,0.001229,0.001371,0.001102,0.001023,0.000402
1,0.16,0.010282,0.01419,0.011874,0.012693,0.0113,0.011325,0.014469,0.007052,0.02985,...,0.006244,0.011239,0.010106,0.018721,0.006362,0.010186,0.010153,0.011466,0.008844,0.012153
2,0.039,1.9e-05,8.7e-05,0.000129,6.8e-05,0.000162,0.000131,0.00013,0.0,0.000247,...,0.0,0.00017,7.3e-05,0.000197,9.2e-05,4.7e-05,0.000148,4.5e-05,5.5e-05,0.0
3,0.014,0.000202,0.000437,0.001314,0.000669,0.001441,0.000834,0.002161,0.00102,0.001436,...,0.0001,0.000861,0.000574,0.0012,0.000876,0.000747,0.001353,0.000292,0.000815,7.5e-05
4,0.013,0.000958,0.000875,0.002921,0.001627,0.002986,0.001246,0.003061,0.001486,0.001324,...,0.000502,0.00225,0.001337,0.001738,0.001106,0.001042,0.001715,0.000674,0.001567,0.000452


In [9]:
# Build a composite automation score per state: for every occupation row,
# multiply its automation probability by the state's value for that row, then
# add the products up. The resulting list follows the order of `states`.
probability = df_occ['Probability']
state_likelihood = [
    # Row-aligned product, so this works for any index, not just 0..n-1.
    # skipna=False keeps a missing value visible as NaN in the total, exactly
    # as an explicit running sum would.
    (probability * df_occ[state]).sum(skipna=False)
    for state in states
]

# Graphing

In [11]:
# Choropleth trace: one colour-coded value per state location.
automation_trace = go.Choropleth(
    locations=states_abbv,        # two-letter state codes locating each value on the map
    z=state_likelihood,           # values that drive the colour scale
    locationmode='USA-states',    # interpret `locations` as US state codes
    colorscale='Blues',
    colorbar_title="Automation Probability",
)

# Wrap the trace in a figure, title it, and restrict the map to the USA.
fig = go.Figure(data=automation_trace)
fig.update_layout(
    title_text='Likelihood of Job Automation by State',
    geo_scope='usa',              # limit map scope to USA
)

fig.show()