In [None]:
import pandas as pd
import urllib.request
import zipfile
import os

In [None]:
# Data Sources
# One zip archive per survey year; the two-digit year is embedded in the file
# name. Listed newest first (17 down to 08).
urls = ['https://www.bls.gov/oes/special.requests/oesm%02dma.zip' % two_digit_year
        for two_digit_year in range(17, 7, -1)]

# Local destination for each downloaded archive, in the same newest-first
# order as the download URLs so the two lists can be paired positionally.
oeszip = ['data/raw/oesm%02dma.zip' % two_digit_year
          for two_digit_year in range(17, 7, -1)]

# Per-year processing plan, keyed by survey year (as a string).
#   'inFiles': numbered workbooks to read, each with its path and sheet name
#   'outFile': CSV written for that year after preprocessing
# Every workbook path uses the lowercase ./data/raw directory that the
# archives are downloaded and extracted into; a mixed-case "./Data/raw"
# spelling only resolves on case-insensitive filesystems.
# NOTE(review): the 2008 part-1 name has a double underscore ("MSA__M2008");
# presumably that matches the file inside the archive - verify.
oesData = {
    '2008': {
        'inFiles': {
            1: {'filename': "./data/raw/MSA__M2008_dl_1.xls", 'sheet': "MSA_dl_1"},
            2: {'filename': "./data/raw/MSA_M2008_dl_2.xls", 'sheet': "MSA_dl_2"},
            3: {'filename': "./data/raw/MSA_M2008_dl_3.xls", 'sheet': "MSA_dl_3"},
        },
        'outFile': 'data/preprocessed/oes2008.csv',
    },
    '2009': {
        'inFiles': {
            1: {'filename': "./data/raw/MSA_dl_1.xls", 'sheet': "MSA_dl_1"},
            2: {'filename': "./data/raw/MSA_dl_2.xls", 'sheet': "MSA_dl_2"},
            3: {'filename': "./data/raw/MSA_dl_3.xls", 'sheet': "MSA_dl_3"},
        },
        'outFile': 'data/preprocessed/oes2009.csv',
    },
    '2010': {
        'inFiles': {
            1: {'filename': "./data/raw/MSA_M2010_dl_1.xls", 'sheet': "MSA_dl_1"},
            2: {'filename': "./data/raw/MSA_M2010_dl_2.xls", 'sheet': "MSA_dl_2"},
            3: {'filename': "./data/raw/MSA_M2010_dl_3.xls", 'sheet': "MSA_dl_3"},
        },
        'outFile': 'data/preprocessed/oes2010.csv',
    },
    '2011': {
        'inFiles': {
            1: {'filename': "./data/raw/MSA_M2011_dl_1_AK_IN.xls", 'sheet': "MSA_dl_1"},
            2: {'filename': "./data/raw/MSA_M2011_dl_2_KS_NY.xls", 'sheet': "MSA_dl_2"},
            3: {'filename': "./data/raw/MSA_M2011_dl_3_OH_WY.xls", 'sheet': "MSA_dl_3"},
        },
        'outFile': 'data/preprocessed/oes2011.csv',
    },
    '2012': {
        'inFiles': {
            1: {'filename': "./data/raw/MSA_M2012_dl_1_AK_IN.xls", 'sheet': "MSA_dl_1"},
            2: {'filename': "./data/raw/MSA_M2012_dl_2_KS_NY.xls", 'sheet': "MSA_dl_2"},
            3: {'filename': "./data/raw/MSA_M2012_dl_3_OH_WY.xls", 'sheet': "MSA_dl_3"},
        },
        'outFile': 'data/preprocessed/oes2012.csv',
    },
    '2013': {
        'inFiles': {
            1: {'filename': "./data/raw/MSA_M2013_dl_1_AK_IN.xls", 'sheet': "MSA_dl_1"},
            2: {'filename': "./data/raw/MSA_M2013_dl_2_KS_NY.xls", 'sheet': "MSA_dl_2"},
            3: {'filename': "./data/raw/MSA_M2013_dl_3_OH_WY.xls", 'sheet': "MSA_dl_3"},
        },
        'outFile': 'data/preprocessed/oes2013.csv',
    },
    # From 2014 on each archive unpacks into its own sub-folder and holds a
    # single workbook with a single "MSA_dl" sheet.
    '2014': {
        'inFiles': {
            1: {'filename': "./data/raw/oesm14ma/MSA_M2014_dl.xls", 'sheet': "MSA_dl"},
        },
        'outFile': 'data/preprocessed/oes2014.csv',
    },
    '2015': {
        'inFiles': {
            1: {'filename': "./data/raw/oesm15ma/MSA_M2015_dl.xls", 'sheet': "MSA_dl"},
        },
        'outFile': 'data/preprocessed/oes2015.csv',
    },
    '2016': {
        'inFiles': {
            1: {'filename': "./data/raw/oesm16ma/MSA_M2016_dl.xls", 'sheet': "MSA_dl"},
        },
        'outFile': 'data/preprocessed/oes2016.csv',
    },
    '2017': {
        'inFiles': {
            1: {'filename': "./data/raw/oesm17ma/MSA_M2017_dl.xls", 'sheet': "MSA_dl"},
        },
        'outFile': 'data/preprocessed/oes2017.csv',
    },
}
        

In [None]:
def getData(urls, files, extract_to='data/raw/'):
    """Download each zip archive and unpack it.

    Parameters
    ----------
    urls : iterable of str
        Archive URLs to download.
    files : iterable of str
        Local path each archive is saved to, paired positionally with
        ``urls`` (``zip`` stops at the shorter of the two).
    extract_to : str, optional
        Directory every archive is extracted into. Defaults to 'data/raw/'.

    Returns
    -------
    None
    """
    for url, archive_path in zip(urls, files):
        # urlretrieve opens the destination for writing and fails if its
        # folder is missing, so create it first (e.g. on a fresh checkout).
        parent = os.path.dirname(archive_path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        urllib.request.urlretrieve(url, archive_path)
        with zipfile.ZipFile(archive_path) as archive:
            archive.extractall(path=extract_to)


# Disabled call - uncomment to download and extract the archives.
#getData(urls, oeszip)

In [None]:
def _to_numeric_where_possible(column):
    """Return ``column`` converted by ``pd.to_numeric``, or unchanged if that fails.

    Stand-in for ``pd.to_numeric(..., errors='ignore')``, which recent pandas
    releases deprecate.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


def _clean_and_add_jobs_1000(df):
    """Clean a stacked 2008 frame and derive its JOBS_1000 column.

    The '00-0000' rows supply each AREA's overall employment (TOT_EMP_ALL).
    Placeholder markers are turned into NaN, rows missing any of TOT_EMP,
    EMP_PRSE, MEAN_PRSE or A_MEDIAN are dropped, columns are made numeric
    where possible, and JOBS_1000 = TOT_EMP / (TOT_EMP_ALL / 1000).
    """
    area_totals = (df.loc[df.OCC_CODE == '00-0000', ['AREA', 'TOT_EMP']]
                     .rename(columns={'TOT_EMP': 'TOT_EMP_ALL'}))
    return (df.merge(area_totals, how='left', on='AREA')
              .replace(["**", "*", "#"], float('nan'))
              .dropna(axis=0, how='any',
                      subset=['TOT_EMP', 'EMP_PRSE', 'MEAN_PRSE', 'A_MEDIAN'])
              .apply(_to_numeric_where_possible)
              .assign(JOBS_1000=lambda x: x.TOT_EMP / (x.TOT_EMP_ALL / 1000)))


def prepData(data=None):
    """Build one filtered CSV per survey year from the extracted workbooks.

    For every year in ``data`` the listed workbook sheets are read and
    stacked, a YEAR column is added, columns outside the working set are
    dropped, and only rows with OCC_CODE '15-0000' are kept. Each year's
    frame is written to its ``outFile`` and collected for the caller.

    Parameters
    ----------
    data : dict, optional
        Mapping shaped like the module-level ``oesData``:
        year -> {'inFiles': {n: {'filename': ..., 'sheet': ...}}, 'outFile': ...}.
        Defaults to ``oesData``.

    Returns
    -------
    list of pandas.DataFrame
        One filtered frame per year, in the iteration order of ``data``.
    """
    if data is None:
        data = oesData
    # Each name needs its own comma: adjacent string literals are silently
    # concatenated ('YEAR' 'PRIM_STATE' becomes 'YEARPRIM_STATE').
    dfcols = ['YEAR', 'PRIM_STATE', 'AREA', 'AREA_NAME', 'OCC_CODE', 'OCC_TITLE', 'TOT_EMP',
              'EMP_PRSE', 'JOBS_1000', 'A_MEAN', 'MEAN_PRSE', 'A_MEDIAN']
    allData = []
    for year, info in data.items():
        sheets = []
        for spec in info['inFiles'].values():
            # Context manager closes the workbook handle after parsing.
            with pd.ExcelFile(spec['filename']) as workbook:
                sheets.append(workbook.parse(spec['sheet']))
        df = pd.concat(sheets)

        # Years are string keys, so compare as text (an int 2008 never matches).
        # NOTE(review): only 2008 gets marker cleaning and a computed
        # JOBS_1000, presumably because later workbooks already carry that
        # column - confirm whether other years need the same cleaning.
        if str(year) == '2008':
            df = _clean_and_add_jobs_1000(df)

        # YEAR is part of dfcols, so tag every year's rows, not just 2008's.
        df = df.assign(YEAR=year)
        # drop() returns a new frame; the result has to be kept.
        df = df.drop(df.columns.difference(dfcols), axis=1)
        df = df[df.OCC_CODE == '15-0000']

        out_dir = os.path.dirname(info['outFile'])
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        df.to_csv(path_or_buf=info['outFile'])
        allData.append(df)
    return allData
# prepData must be called (not just referenced); it returns a list with one
# frame per year, so stack them into a single frame before summarising.
preprocessed = pd.concat(prepData(), ignore_index=True)
preprocessed.info()
            