## Datasets
- Employment information (employed persons by detailed occupation), table 11b: https://www.bls.gov/cps/tables.htm

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import requests
import os
from datetime import date

In [2]:
# Create the data folder that the notebook reads its workbooks from.
# The folder that is created must be the same one that is tested for,
# and os.makedirs works on every platform without a shell escape.
if not os.path.isdir('../blsdata'):
    os.makedirs('../blsdata')
    print('directory created')

In [3]:
# Function tidy-izes the data
def organize(df, year):
    """Tidy one raw yearly table into an Occupation / Total<year> frame.

    The first seven rows are discarded, the first two columns are renamed
    to 'Occupation' and 'Total<year>', rows containing a missing value in
    any column are removed, and only the two renamed columns are returned
    under a fresh 0-based index.
    """
    total_label = 'Total{}'.format(year)
    new_names = {df.columns[0]: 'Occupation', df.columns[1]: total_label}

    # dropna runs before the column selection, so a gap in any of the
    # other columns also removes the row.
    cleaned = df[7:].rename(columns=new_names).dropna().reset_index()

    return cleaned[['Occupation', total_label]]

In [4]:
def download_file(year, file_prefix, url, directory='', timeout=30):
    """Download the table 11b workbook for ``year`` and save it to disk.

    The workbook is written to ``<directory>/<file_prefix><year>.xlsx``.

    Args:
        year: Year of the table to fetch.
        file_prefix: Leading part of the saved file name.
        url: Base address for archived years; ``str(year)`` and
            ``'/cpsaat11b.xlsx'`` are appended to it.
        directory: Folder to save into. An empty string means the
            current working directory.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.exceptions.RequestException: On connection failure or
            timeout (raised by ``requests.get``).
    """
    # Last year's table (the most recent one) uses a different link structure.
    if year != (date.today().year - 1):
        source = url + str(year) + '/cpsaat11b.xlsx'
    else:
        source = 'https://www.bls.gov/cps/cpsaat11b.xlsx'

    r = requests.get(source, allow_redirects=True, timeout=timeout)
    # Fail loudly instead of saving an error page under an .xlsx name.
    r.raise_for_status()

    # os.path.join keeps an empty directory relative to the current folder
    # rather than producing '/<file>' at the filesystem root.
    target = os.path.join(directory, '{}{}.xlsx'.format(file_prefix, year))
    # The context manager guarantees the file is flushed and closed.
    with open(target, 'wb') as f:
        f.write(r.content)

In [5]:
# Downloads each year's workbook unless it is already on disk.
# The folder is the one the notebook later reads the workbooks from.
directory = '../blsdata'
file_prefix = 'cpsaat11b'
# Make sure the target folder exists before anything is written into it.
os.makedirs(directory, exist_ok=True)
for year in range(2011, 2020):
    # Saved files carry the .xlsx extension, so the check must include it.
    if not os.path.isfile('{}/{}{}.xlsx'.format(directory, file_prefix, year)):
        download_file(year, file_prefix, 'https://www.bls.gov/cps/aa', directory)
        print('{} data downloaded'.format(year))

In [6]:
# Read each year's workbook (2013-2019) and tidy it with organize()
def _load_employment(year):
    """Read one year's workbook from ../blsdata and pass it to organize()."""
    return organize(pd.read_excel('../blsdata/cpsaat11b{}.xlsx'.format(year)), year)


employment2013 = _load_employment(2013)
employment2014 = _load_employment(2014)
employment2015 = _load_employment(2015)
employment2016 = _load_employment(2016)
employment2017 = _load_employment(2017)
employment2018 = _load_employment(2018)
employment2019 = _load_employment(2019)

In [7]:
# Preview the first five tidied rows for 2016
employment2016.head(n=5)

Unnamed: 0,Occupation,Total2016
0,"Management, professional, and related occupations",59438
1,"Management, business, and financial operations...",24941
2,Management occupations,17418
3,Chief executives,1649
4,General and operations managers,949


In [8]:
# Preview the first five tidied rows for 2017
employment2017.head(n=5)

Unnamed: 0,Occupation,Total2017
0,"Management, professional, and related occupations",60901
1,"Management, business, and financial operations...",25379
2,Management occupations,17804
3,Chief executives,1639
4,General and operations managers,1005


In [9]:
# Combine the yearly frames into one wide frame. Each merge joins on the
# columns the two frames share, using the default join type.
employment = employment2013.merge(
    employment2014.merge(employment2015)
).merge(
    employment2016.merge(employment2017).merge(
        employment2018.merge(employment2019)
    )
)

In [10]:
# Preview the first five rows of the merged frame
employment.head(n=5)

Unnamed: 0,Occupation,Total2013,Total2014,Total2015,Total2016,Total2017,Total2018,Total2019
0,"Management, professional, and related occupations",54712,56050,57960,59438,60901,62436,64218
1,"Management, business, and financial operations...",22794,23171,24108,24941,25379,25850,26981
2,Management occupations,16037,16199,16994,17418,17804,18263,18985
3,Chief executives,1520,1603,1517,1649,1639,1573,1602
4,General and operations managers,1075,887,899,949,1005,1037,1058
