In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load the wide unemployment table: a State column plus one column per year.
work_df = pd.read_csv(filepath_or_buffer='resources/data/unemployment-by-state.csv')

In [3]:
# Preview the first five rows of the raw table.
work_df.head(n=5)

Unnamed: 0,State,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
0,United States,4.6,5.8,9.3,9.6,8.9,8.1,7.4,6.2,5.3,4.9,4.4
1,Alabama,4.0,5.7,11.0,10.5,9.6,8.0,7.2,6.8,6.1,5.9,4.4
2,Alaska,6.3,6.7,7.7,7.9,7.6,7.1,7.0,6.9,6.5,6.9,7.2
3,Arizona,3.9,6.2,9.9,10.4,9.5,8.3,7.7,6.8,6.1,5.4,4.9
4,Arkansas,5.3,5.5,7.8,8.2,8.3,7.6,7.2,6.0,5.0,3.9,3.7


In [4]:
# Inspect column dtypes and non-null counts of the raw table.
work_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52 entries, 0 to 51
Data columns (total 12 columns):
State    52 non-null object
2007     52 non-null float64
2008     52 non-null float64
2009     52 non-null float64
2010     52 non-null float64
2011     52 non-null float64
2012     52 non-null float64
2013     52 non-null float64
2014     52 non-null float64
2015     52 non-null float64
2016     52 non-null float64
2017     52 non-null float64
dtypes: float64(11), object(1)
memory usage: 5.0+ KB


In [5]:
# List the distinct values in the State column to spot rows that are not states.
work_df['State'].unique()

array(['United States', 'Alabama', 'Alaska', 'Arizona', 'Arkansas',
       'California', 'Colorado', 'Connecticut', 'Delaware',
       'District of Columbia', 'Florida', 'Georgia', 'Hawaii', 'Idaho',
       'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana',
       'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
       'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
       'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
       'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
       'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
       'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
       'West Virginia', 'Wisconsin', 'Wyoming'], dtype=object)

In [6]:
# Look up the District of Columbia row (and its index label).
work_df.query('State == "District of Columbia"')

Unnamed: 0,State,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
9,District of Columbia,5.5,6.5,9.3,9.4,10.2,9.0,8.5,7.8,6.9,6.1,6.1


Drop the rows that are not states: the national "United States" aggregate and the District of Columbia.

In [7]:
# Remove the rows that are not states: the national aggregate and the
# District of Columbia.
# Filtering by name (rather than dropping index labels 0 and 9 in place) keeps
# this cell re-runnable and independent of the row order in the CSV; dropping
# by label again after the reshape below would silently delete unrelated rows.
non_states = ['United States', 'District of Columbia']
work_df = work_df[~work_df['State'].isin(non_states)]

In [8]:
# Sanity check: (rows, columns) after removing the non-state rows.
work_df.shape

(50, 12)

Convert from wide format (one column per year) to long format (one row per state and year).

In [9]:
# Reshape wide -> long: each year column becomes rows of (State, year, unemployment).
# The year labels are strings here because they come from the CSV header.
year_columns = ['2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017']
work_df = work_df.melt(
    id_vars='State',
    value_vars=year_columns,
    var_name='year',
    value_name='unemployment',
)

In [10]:
# Preview the first five rows of the long-format table.
work_df.head(n=5)

Unnamed: 0,State,year,unemployment
0,Alabama,2007,4.0
1,Alaska,2007,6.3
2,Arizona,2007,3.9
3,Arkansas,2007,5.3
4,California,2007,5.4


Change the column names to lower case.

In [11]:
# Lower-case the one capitalised label. Renaming by name (instead of
# overwriting every label by position) cannot mislabel a column if the column
# order ever changes, and re-running the cell is a no-op.
work_df = work_df.rename(columns={'State': 'state'})
# Keep the resulting label list available under the original name.
cols = list(work_df.columns)

In [12]:
# Confirm the column labels are now lower case.
work_df.head(n=5)

Unnamed: 0,state,year,unemployment
0,Alabama,2007,4.0
1,Alaska,2007,6.3
2,Arizona,2007,3.9
3,Arkansas,2007,5.3
4,California,2007,5.4


In [13]:
# Inspect dtypes of the long table; note the dtype reported for `year`.
work_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550 entries, 0 to 549
Data columns (total 3 columns):
state           550 non-null object
year            550 non-null object
unemployment    550 non-null float64
dtypes: float64(1), object(2)
memory usage: 13.0+ KB


In [14]:
# Persist the long-format table. index=False leaves out the default integer
# index, which carries no information and would otherwise be written as an
# unnamed first column (read back as "Unnamed: 0").
work_df.to_csv('datasets/unemployment.csv', index=False)

In [18]:
# List the distinct state names that remain, for comparison with the join key.
work_df['state'].unique()

array(['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
       'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia',
       'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas',
       'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts',
       'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana',
       'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico',
       'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma',
       'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
       'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont',
       'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming'],
      dtype=object)

### Join to the Tableau data

In [22]:
# Load the dataset that the unemployment figures will be joined onto.
df = pd.read_csv(filepath_or_buffer='datasets/tableau-data.csv')
df.head(n=5)

Unnamed: 0,year,total,individual,family,youth,child,yadult,yparent,yparent_u18,yparent_18to24,ypchild,state,population,per_capita,individual_rent,family_rent
0,2017,1845.0,1354.0,491.0,162.0,15.0,147.0,22.0,0.0,22.0,39.0,Alaska,739795.0,2.493934,878.625,1642.0
1,2017,3793.0,2985.0,808.0,294.0,36.0,258.0,29.0,6.0,23.0,39.0,Alabama,4874747.0,0.778092,625.791667,1027.611111
2,2017,2467.0,2068.0,399.0,208.0,17.0,191.0,10.0,0.0,10.0,13.0,Arkansas,3004279.0,0.821162,544.416667,942.305556
3,2017,8947.0,6488.0,2459.0,578.0,55.0,523.0,81.0,0.0,81.0,112.0,Arizona,7016270.0,1.275179,744.416667,1399.694444
4,2017,134278.0,112756.0,21522.0,15458.0,1649.0,13809.0,890.0,16.0,874.0,1058.0,California,39536653.0,3.396292,1308.875,2402.277778


Convert `work_df.year` from string to int so that it matches the integer `year` column of `df` when joining.

In [20]:
# Cast the year labels from strings to integers so they can be used as a join key.
work_df['year'] = work_df['year'].astype(int)

In [23]:
# Left-join unemployment onto df by (state, year); rows of df without a match
# keep NaN in `unemployment`.
# validate='many_to_one' makes the merge raise if work_df ever holds duplicate
# (state, year) keys, instead of silently multiplying rows of df.
# NOTE: this cell overwrites df, so re-running it without re-loading df first
# would merge a second, suffixed copy of the unemployment column.
df = pd.merge(
    left=df,
    right=work_df,
    how='left',
    on=['state', 'year'],
    validate='many_to_one',
)
df.head(n=5)

Unnamed: 0,year,total,individual,family,youth,child,yadult,yparent,yparent_u18,yparent_18to24,ypchild,state,population,per_capita,individual_rent,family_rent,unemployment
0,2017,1845.0,1354.0,491.0,162.0,15.0,147.0,22.0,0.0,22.0,39.0,Alaska,739795.0,2.493934,878.625,1642.0,7.2
1,2017,3793.0,2985.0,808.0,294.0,36.0,258.0,29.0,6.0,23.0,39.0,Alabama,4874747.0,0.778092,625.791667,1027.611111,4.4
2,2017,2467.0,2068.0,399.0,208.0,17.0,191.0,10.0,0.0,10.0,13.0,Arkansas,3004279.0,0.821162,544.416667,942.305556,3.7
3,2017,8947.0,6488.0,2459.0,578.0,55.0,523.0,81.0,0.0,81.0,112.0,Arizona,7016270.0,1.275179,744.416667,1399.694444,4.9
4,2017,134278.0,112756.0,21522.0,15458.0,1649.0,13809.0,890.0,16.0,874.0,1058.0,California,39536653.0,3.396292,1308.875,2402.277778,4.8


In [24]:
# Inspect the merged frame; compare the non-null count of `unemployment`
# with the total number of rows to see how many rows found no match.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 561 entries, 0 to 560
Data columns (total 17 columns):
year               561 non-null int64
total              561 non-null float64
individual         561 non-null float64
family             561 non-null float64
youth              153 non-null float64
child              153 non-null float64
yadult             153 non-null float64
yparent            153 non-null float64
yparent_u18        153 non-null float64
yparent_18to24     153 non-null float64
ypchild            153 non-null float64
state              561 non-null object
population         550 non-null float64
per_capita         550 non-null float64
individual_rent    200 non-null float64
family_rent        200 non-null float64
unemployment       550 non-null float64
dtypes: float64(15), int64(1), object(1)
memory usage: 78.9+ KB
