In [1]:
import os
import urllib2

import pandas as pd
from sqlalchemy import create_engine

In [2]:
# LODES7 server directory and one sample file from it, used to prototype the load
base_url = 'https://lehd.ces.census.gov/data/lodes/LODES7/mo/wac/'
test_name = 'mo_wac_S000_JT00_2015.csv.gz'

# test reading directly with pandas: read_csv is handed the full URL
df = pd.read_csv('{}{}'.format(base_url, test_name))

In [3]:
# preview the first rows of the sample table to confirm it parsed
df.head()

Unnamed: 0,w_geocode,C000,CA01,CA02,CA03,CE01,CE02,CE03,CNS01,CNS02,...,CFA02,CFA03,CFA04,CFA05,CFS01,CFS02,CFS03,CFS04,CFS05,createdate
0,290019501001019,4,0,2,2,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,20170919
1,290019501001022,1,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,20170919
2,290019501001025,2,0,1,1,0,1,1,2,0,...,0,0,0,0,0,0,0,0,0,20170919
3,290019501001047,43,12,22,9,8,26,9,0,0,...,0,0,0,0,0,0,0,0,0,20170919
4,290019501001055,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,20170919


In [4]:
# (rows, columns) of the sample table
df.shape

(53112, 53)

In [5]:
# table name: everything before the first '.' of the file name, lower-cased
test_name.partition('.')[0].lower()

'mo_wac_s000_jt00_2015'

In [6]:
# database connection
# Each setting can be overridden through the corresponding libpq environment
# variable so real credentials do not have to be saved in the notebook; the
# fallbacks are the values this notebook was originally run with.
DB = os.environ.get('PGDATABASE', 'template_postgis')
USER = os.environ.get('PGUSER', 'postgres')
HOST = os.environ.get('PGHOST', 'localhost')
PORT = os.environ.get('PGPORT', '5432')
PW = os.environ.get('PGPASSWORD', 'postgres')

# SQLAlchemy database URL assembled from the settings above
DB_CONNECTION_STRING = 'postgresql://{username}:{password}@{host}:{port}/{database}'.format(
    username=USER, password=PW, host=HOST, port=PORT, database=DB)

# engine shared by every to_sql / read_sql call further down
engine = create_engine(DB_CONNECTION_STRING)

In [7]:
# check column names and types as parsed by read_csv, before any cleanup:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53112 entries, 0 to 53111
Data columns (total 53 columns):
w_geocode     53112 non-null int64
C000          53112 non-null int64
CA01          53112 non-null int64
CA02          53112 non-null int64
CA03          53112 non-null int64
CE01          53112 non-null int64
CE02          53112 non-null int64
CE03          53112 non-null int64
CNS01         53112 non-null int64
CNS02         53112 non-null int64
CNS03         53112 non-null int64
CNS04         53112 non-null int64
CNS05         53112 non-null int64
CNS06         53112 non-null int64
CNS07         53112 non-null int64
CNS08         53112 non-null int64
CNS09         53112 non-null int64
CNS10         53112 non-null int64
CNS11         53112 non-null int64
CNS12         53112 non-null int64
CNS13         53112 non-null int64
CNS14         53112 non-null int64
CNS15         53112 non-null int64
CNS16         53112 non-null int64
CNS17         53112 non-null int64
CNS18         53

In [9]:
# snapshot of the column names as they came out of the CSV
original_names = df.columns.tolist()

# {original name: lower-case name} mapping used for the rename
new_names = {name: name.lower() for name in original_names}

# lower-case names, in column order
col_names = [new_names[name] for name in original_names]

df.rename(columns=new_names, inplace=True)

# look right?
print(df.columns.tolist())

['w_geocode', 'c000', 'ca01', 'ca02', 'ca03', 'ce01', 'ce02', 'ce03', 'cns01', 'cns02', 'cns03', 'cns04', 'cns05', 'cns06', 'cns07', 'cns08', 'cns09', 'cns10', 'cns11', 'cns12', 'cns13', 'cns14', 'cns15', 'cns16', 'cns17', 'cns18', 'cns19', 'cns20', 'cr01', 'cr02', 'cr03', 'cr04', 'cr05', 'cr07', 'ct01', 'ct02', 'cd01', 'cd02', 'cd03', 'cd04', 'cs01', 'cs02', 'cfa01', 'cfa02', 'cfa03', 'cfa04', 'cfa05', 'cfs01', 'cfs02', 'cfs03', 'cfs04', 'cfs05', 'createdate']


In [10]:
# w_geocode and createdate should be stored as strings rather than numbers
for text_col in ('w_geocode', 'createdate'):
    df[text_col] = df[text_col].astype(str)

In [11]:
# re-check the dtypes after the casts: w_geocode should now be object (string)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53112 entries, 0 to 53111
Data columns (total 53 columns):
w_geocode     53112 non-null object
c000          53112 non-null int64
ca01          53112 non-null int64
ca02          53112 non-null int64
ca03          53112 non-null int64
ce01          53112 non-null int64
ce02          53112 non-null int64
ce03          53112 non-null int64
cns01         53112 non-null int64
cns02         53112 non-null int64
cns03         53112 non-null int64
cns04         53112 non-null int64
cns05         53112 non-null int64
cns06         53112 non-null int64
cns07         53112 non-null int64
cns08         53112 non-null int64
cns09         53112 non-null int64
cns10         53112 non-null int64
cns11         53112 non-null int64
cns12         53112 non-null int64
cns13         53112 non-null int64
cns14         53112 non-null int64
cns15         53112 non-null int64
cns16         53112 non-null int64
cns17         53112 non-null int64
cns18         5

In [15]:
# test write to DB:
# The table is named after the file (text before the first '.', lower-cased).
# if_exists='replace' keeps this cell re-runnable: with the default ('fail')
# to_sql raises ValueError as soon as the table exists from an earlier run.
df.to_sql(test_name.split('.')[0].lower(), engine, index=False, if_exists='replace')

In [14]:
# read some values back from the table written above to confirm the round trip
sample_table = test_name.split('.')[0].lower()
sample_query = 'SELECT w_geocode, c000, cfs05 FROM {} LIMIT 10;'.format(sample_table)
pd.read_sql(sample_query, engine)

Unnamed: 0,w_geocode,c000,cfs05
0,290019501001019,4,0
1,290019501001022,1,0
2,290019501001025,2,0
3,290019501001047,43,0
4,290019501001055,1,0
5,290019501001066,1,0
6,290019501001127,1,0
7,290019501001129,12,0
8,290019501001140,50,0
9,290019501001151,3,0


In [18]:
# generate possible table names:
# one file name per (segment, job type, year) combination; the year varies
# fastest, then the job type, then the segment
years = range(2002, 2016)
segs = [
    'S000',
    'SA01', 'SA02', 'SA03',
    'SE01', 'SE02', 'SE03',
    'SI01', 'SI02', 'SI03',
]
jtyps = ['JT00', 'JT01', 'JT02', 'JT03', 'JT04', 'JT05']

table_names = [
    'mo_wac_{segment}_{jobtype}_{yr}.csv.gz'.format(segment=seg, jobtype=jtyp, yr=year)
    for seg in segs
    for jtyp in jtyps
    for year in years
]

In [19]:
# spot-check the last ten generated file names
table_names[-10:]

['mo_wac_SI03_JT05_2006.csv.gz',
 'mo_wac_SI03_JT05_2007.csv.gz',
 'mo_wac_SI03_JT05_2008.csv.gz',
 'mo_wac_SI03_JT05_2009.csv.gz',
 'mo_wac_SI03_JT05_2010.csv.gz',
 'mo_wac_SI03_JT05_2011.csv.gz',
 'mo_wac_SI03_JT05_2012.csv.gz',
 'mo_wac_SI03_JT05_2013.csv.gz',
 'mo_wac_SI03_JT05_2014.csv.gz',
 'mo_wac_SI03_JT05_2015.csv.gz']

In [24]:
# handle error
# Request one of the generated names and make sure an HTTP error from the
# server can be caught instead of stopping the notebook.
probe_name = 'mo_wac_SI03_JT05_2009.csv.gz'
try:
    df = pd.read_csv(base_url + probe_name)
except urllib2.HTTPError:
    print('moving on')

moving on


In [25]:
# number of candidate files: 10 segments x 6 job types x 14 years
len(table_names)

840

In [26]:
# put it all together:
#
# For every candidate file name, download the CSV from the server and write
# it to the database as its own table (named after the file: text before the
# first '.', lower-cased).  `loaded` receives one boolean per entry of
# `table_names`, in the same order, recording whether that table was written.

loaded = []

for table_name in table_names:
    # try to read the dataframe
    try:
        # w_geocode and createdate columns should be string.  Parsing them as
        # text up front (instead of int64 followed by a cast) means any
        # leading zeros in the raw values are kept.
        df = pd.read_csv(base_url + table_name,
                         dtype={'w_geocode': str, 'createdate': str})
    except urllib2.HTTPError:
        # not every generated segment/job type/year combination is answered
        # by the server
        print('skipping {}, likely does not exist'.format(table_name))
        loaded.append(False)
        continue

    # make column names lower case
    df.columns = [c.lower() for c in df.columns]

    # write table to database
    # if_exists='replace' keeps the loop re-runnable: with the default
    # ('fail') to_sql raises ValueError on the first table that already
    # exists, e.g. one written by an earlier run or by a test write.
    df.to_sql(table_name.split('.')[0].lower(), engine, index=False, if_exists='replace')
    loaded.append(True)
    print('saved table {}'.format(table_name))

saved table mo_wac_S000_JT00_2002.csv.gz
saved table mo_wac_S000_JT00_2003.csv.gz
saved table mo_wac_S000_JT00_2004.csv.gz
saved table mo_wac_S000_JT00_2005.csv.gz
saved table mo_wac_S000_JT00_2006.csv.gz
saved table mo_wac_S000_JT00_2007.csv.gz
saved table mo_wac_S000_JT00_2008.csv.gz
saved table mo_wac_S000_JT00_2009.csv.gz
saved table mo_wac_S000_JT00_2010.csv.gz
saved table mo_wac_S000_JT00_2011.csv.gz
saved table mo_wac_S000_JT00_2012.csv.gz
saved table mo_wac_S000_JT00_2013.csv.gz
saved table mo_wac_S000_JT00_2014.csv.gz
saved table mo_wac_S000_JT00_2015.csv.gz
saved table mo_wac_S000_JT01_2002.csv.gz
saved table mo_wac_S000_JT01_2003.csv.gz
saved table mo_wac_S000_JT01_2004.csv.gz
saved table mo_wac_S000_JT01_2005.csv.gz
saved table mo_wac_S000_JT01_2006.csv.gz
saved table mo_wac_S000_JT01_2007.csv.gz
saved table mo_wac_S000_JT01_2008.csv.gz
saved table mo_wac_S000_JT01_2009.csv.gz
saved table mo_wac_S000_JT01_2010.csv.gz
saved table mo_wac_S000_JT01_2011.csv.gz
saved table mo_w

saved table mo_wac_SA02_JT01_2005.csv.gz
saved table mo_wac_SA02_JT01_2006.csv.gz
saved table mo_wac_SA02_JT01_2007.csv.gz
saved table mo_wac_SA02_JT01_2008.csv.gz
saved table mo_wac_SA02_JT01_2009.csv.gz
saved table mo_wac_SA02_JT01_2010.csv.gz
saved table mo_wac_SA02_JT01_2011.csv.gz
saved table mo_wac_SA02_JT01_2012.csv.gz
saved table mo_wac_SA02_JT01_2013.csv.gz
saved table mo_wac_SA02_JT01_2014.csv.gz
saved table mo_wac_SA02_JT01_2015.csv.gz
saved table mo_wac_SA02_JT02_2002.csv.gz
saved table mo_wac_SA02_JT02_2003.csv.gz
saved table mo_wac_SA02_JT02_2004.csv.gz
saved table mo_wac_SA02_JT02_2005.csv.gz
saved table mo_wac_SA02_JT02_2006.csv.gz
saved table mo_wac_SA02_JT02_2007.csv.gz
saved table mo_wac_SA02_JT02_2008.csv.gz
saved table mo_wac_SA02_JT02_2009.csv.gz
saved table mo_wac_SA02_JT02_2010.csv.gz
saved table mo_wac_SA02_JT02_2011.csv.gz
saved table mo_wac_SA02_JT02_2012.csv.gz
saved table mo_wac_SA02_JT02_2013.csv.gz
saved table mo_wac_SA02_JT02_2014.csv.gz
saved table mo_w

saved table mo_wac_SE01_JT02_2008.csv.gz
saved table mo_wac_SE01_JT02_2009.csv.gz
saved table mo_wac_SE01_JT02_2010.csv.gz
saved table mo_wac_SE01_JT02_2011.csv.gz
saved table mo_wac_SE01_JT02_2012.csv.gz
saved table mo_wac_SE01_JT02_2013.csv.gz
saved table mo_wac_SE01_JT02_2014.csv.gz
saved table mo_wac_SE01_JT02_2015.csv.gz
saved table mo_wac_SE01_JT03_2002.csv.gz
saved table mo_wac_SE01_JT03_2003.csv.gz
saved table mo_wac_SE01_JT03_2004.csv.gz
saved table mo_wac_SE01_JT03_2005.csv.gz
saved table mo_wac_SE01_JT03_2006.csv.gz
saved table mo_wac_SE01_JT03_2007.csv.gz
saved table mo_wac_SE01_JT03_2008.csv.gz
saved table mo_wac_SE01_JT03_2009.csv.gz
saved table mo_wac_SE01_JT03_2010.csv.gz
saved table mo_wac_SE01_JT03_2011.csv.gz
saved table mo_wac_SE01_JT03_2012.csv.gz
saved table mo_wac_SE01_JT03_2013.csv.gz
saved table mo_wac_SE01_JT03_2014.csv.gz
saved table mo_wac_SE01_JT03_2015.csv.gz
skipping mo_wac_SE01_JT04_2002.csv.gz, likely does not exist
skipping mo_wac_SE01_JT04_2003.csv.gz

saved table mo_wac_SE03_JT03_2011.csv.gz
saved table mo_wac_SE03_JT03_2012.csv.gz
saved table mo_wac_SE03_JT03_2013.csv.gz
saved table mo_wac_SE03_JT03_2014.csv.gz
saved table mo_wac_SE03_JT03_2015.csv.gz
skipping mo_wac_SE03_JT04_2002.csv.gz, likely does not exist
skipping mo_wac_SE03_JT04_2003.csv.gz, likely does not exist
skipping mo_wac_SE03_JT04_2004.csv.gz, likely does not exist
skipping mo_wac_SE03_JT04_2005.csv.gz, likely does not exist
skipping mo_wac_SE03_JT04_2006.csv.gz, likely does not exist
skipping mo_wac_SE03_JT04_2007.csv.gz, likely does not exist
skipping mo_wac_SE03_JT04_2008.csv.gz, likely does not exist
skipping mo_wac_SE03_JT04_2009.csv.gz, likely does not exist
saved table mo_wac_SE03_JT04_2010.csv.gz
saved table mo_wac_SE03_JT04_2011.csv.gz
saved table mo_wac_SE03_JT04_2012.csv.gz
saved table mo_wac_SE03_JT04_2013.csv.gz
saved table mo_wac_SE03_JT04_2014.csv.gz
saved table mo_wac_SE03_JT04_2015.csv.gz
skipping mo_wac_SE03_JT05_2002.csv.gz, likely does not exist


saved table mo_wac_SI02_JT04_2011.csv.gz
saved table mo_wac_SI02_JT04_2012.csv.gz
saved table mo_wac_SI02_JT04_2013.csv.gz
saved table mo_wac_SI02_JT04_2014.csv.gz
saved table mo_wac_SI02_JT04_2015.csv.gz
skipping mo_wac_SI02_JT05_2002.csv.gz, likely does not exist
skipping mo_wac_SI02_JT05_2003.csv.gz, likely does not exist
skipping mo_wac_SI02_JT05_2004.csv.gz, likely does not exist
skipping mo_wac_SI02_JT05_2005.csv.gz, likely does not exist
skipping mo_wac_SI02_JT05_2006.csv.gz, likely does not exist
skipping mo_wac_SI02_JT05_2007.csv.gz, likely does not exist
skipping mo_wac_SI02_JT05_2008.csv.gz, likely does not exist
skipping mo_wac_SI02_JT05_2009.csv.gz, likely does not exist
saved table mo_wac_SI02_JT05_2010.csv.gz
saved table mo_wac_SI02_JT05_2011.csv.gz
saved table mo_wac_SI02_JT05_2012.csv.gz
saved table mo_wac_SI02_JT05_2013.csv.gz
saved table mo_wac_SI02_JT05_2014.csv.gz
saved table mo_wac_SI02_JT05_2015.csv.gz
saved table mo_wac_SI03_JT00_2002.csv.gz
saved table mo_wac_S

In [27]:
# one row per candidate file: its name and whether it was written to the database
load_status = pd.DataFrame(dict(table=table_names, loaded=loaded))

In [28]:
# how many candidate files were loaded (True) versus skipped (False)
load_status['loaded'].value_counts()

True     680
False    160
Name: loaded, dtype: int64