# Data Preparation

## Python Setup

In [None]:
import pandas as pd
import numpy as np
import psycopg2

In [None]:
# Connection settings for the PostgreSQL database queried in this notebook.
db_name = "appliedda"
hostname = "10.10.2.10"

# Shared connection; the cells below create their cursors from it.
conn = psycopg2.connect(host=hostname, database=db_name)

## SQL Query

### Total Version

In [None]:
# SQL that (re)builds ada_18_uchi.dashboard_data_il_jobs.
#   1. Drops any previous copy of the target table and of the temp table.
#   2. Left-joins the wage records to the QCEW employer records on
#      (ein, seinunit, empr_no, year, quarter) into a session temp table.
#   3. Left-joins the temp table to ada_18_uchi.ein_lookup on ein to add the
#      employer start year/quarter, and stores the result as the target table.
#   4. Hands ownership to ada_18_uchi_admin and commits.
#
# The temp table lives for the whole database session, so it is dropped
# explicitly first; otherwise re-running this script on the same connection
# fails with "relation already exists". The drop is qualified with pg_temp so
# it can never hit a permanent table of the same name.
script = '''
drop table if exists ada_18_uchi.dashboard_data_il_jobs;
drop table if exists pg_temp.dashboard_data_il_temp;

create temp table dashboard_data_il_temp as
select 
    a.ssn
    , a.wage
    , a.ein
    , a.seinunit
    , a.empr_no
    , a.year
    , a.quarter as qtr
    , b.address_city
    , b.other_address_zip5
    , b.county as cnty
    , b.naics
from (
    select ssn, wage, ein, seinunit, empr_no, year, quarter 
    from il_des_kcmo.il_wage
) as a
left join (
    select ein, seinunit, empr_no, address_city, other_address_zip5, county, naics, year, quarter 
    from il_des_kcmo.il_qcew_employers
) as b
on a.ein = b.ein and a.seinunit = b.seinunit and a.empr_no = b.empr_no 
and a.year = b.year and a.quarter = b.quarter;

create table ada_18_uchi.dashboard_data_il_jobs as
select 
    a.*
    , b.start_year as ein_start_year
    , b.start_qtr as ein_start_qtr
from dashboard_data_il_temp as a
left join (select ein, start_year, start_qtr from ada_18_uchi.ein_lookup) as b
on a.ein = b.ein;

alter table ada_18_uchi.dashboard_data_il_jobs owner to ada_18_uchi_admin;
commit;
'''

In [None]:
# Set to True to make the next cell execute `script` even when
# ada_18_uchi.dashboard_data_il_jobs already exists.
overwrite = False

In [None]:
# Run `script` only when ada_18_uchi.dashboard_data_il_jobs is missing, or
# when `overwrite` forces a rebuild.
try:
    # The context manager closes the cursor even if a statement fails.
    with conn.cursor() as cursor:
        cursor.execute('''
select * from information_schema.tables 
where table_name = 'dashboard_data_il_jobs' and table_schema = 'ada_18_uchi'
''')
        table_exists = cursor.fetchone() is not None
        if overwrite or not table_exists:
            cursor.execute(script)
    # psycopg2 opens a transaction on the first execute; end it here so the
    # connection is not left idle in a transaction when the script is skipped.
    conn.commit()
except psycopg2.Error:
    # Without a rollback the connection stays in an aborted transaction and
    # every later statement on it fails.
    conn.rollback()
    raise

### Random Sample

In [None]:
# SQL that (re)builds ada_18_uchi.dashboard_data_il_jobs_rs: drops any
# previous copy, then keeps at most 10,000,000 rows of
# ada_18_uchi.dashboard_data_il_jobs taken in random() order, hands ownership
# to ada_18_uchi_admin and commits.
script = '''
drop table if exists ada_18_uchi.dashboard_data_il_jobs_rs;

create table ada_18_uchi.dashboard_data_il_jobs_rs as
select *
from ada_18_uchi.dashboard_data_il_jobs
order by random()
limit 10000000;

alter table ada_18_uchi.dashboard_data_il_jobs_rs owner to ada_18_uchi_admin;
commit;
'''

In [None]:
# Set to True to make the next cell execute `script` even when
# ada_18_uchi.dashboard_data_il_jobs_rs already exists.
overwrite = False

In [None]:
# Run `script` only when ada_18_uchi.dashboard_data_il_jobs_rs is missing, or
# when `overwrite` forces a rebuild.
try:
    # The context manager closes the cursor even if a statement fails.
    with conn.cursor() as cursor:
        cursor.execute('''
select * from information_schema.tables 
where table_name = 'dashboard_data_il_jobs_rs' and table_schema = 'ada_18_uchi'
''')
        table_exists = cursor.fetchone() is not None
        if overwrite or not table_exists:
            cursor.execute(script)
    # psycopg2 opens a transaction on the first execute; end it here so the
    # connection is not left idle in a transaction when the script is skipped.
    conn.commit()
except psycopg2.Error:
    # Without a rollback the connection stays in an aborted transaction and
    # every later statement on it fails.
    conn.rollback()
    raise

### Bucket Version

In [None]:
# SQL that (re)builds ada_18_uchi.dashboard_data_il_buckets: drops any
# previous copy, then aggregates ada_18_uchi.dashboard_data_il_jobs by
# year, quarter, county and wage bucket, storing the job count and average
# wage per group. Ownership goes to ada_18_uchi_admin and the work is committed.
# NOTE(review): (wage/1000)*1000 only floors to a multiple of 1000 if wage is
# an integer column (integer division) -- confirm the column type.
script = '''
drop table if exists ada_18_uchi.dashboard_data_il_buckets;

create table ada_18_uchi.dashboard_data_il_buckets as 
select
    year
    , qtr
    , cnty
    , (wage/1000)*1000 as wage_bucket
    , count(*) as jobs
    , avg(wage) as avg_wage
from ada_18_uchi.dashboard_data_il_jobs
group by year, qtr, cnty, (wage/1000)*1000
order by year, qtr, cnty, (wage/1000)*1000;

alter table ada_18_uchi.dashboard_data_il_buckets owner to ada_18_uchi_admin;
commit;
'''

In [None]:
# Set to True to make the next cell execute `script` even when
# ada_18_uchi.dashboard_data_il_buckets already exists.
overwrite = False

In [None]:
# Run `script` only when ada_18_uchi.dashboard_data_il_buckets is missing, or
# when `overwrite` forces a rebuild.
try:
    # The context manager closes the cursor even if a statement fails.
    with conn.cursor() as cursor:
        cursor.execute('''
select * from information_schema.tables 
where table_name = 'dashboard_data_il_buckets' and table_schema = 'ada_18_uchi'
''')
        table_exists = cursor.fetchone() is not None
        if overwrite or not table_exists:
            cursor.execute(script)
    # psycopg2 opens a transaction on the first execute; end it here so the
    # connection is not left idle in a transaction when the script is skipped.
    conn.commit()
except psycopg2.Error:
    # Without a rollback the connection stays in an aborted transaction and
    # every later statement on it fails.
    conn.rollback()
    raise

### Bucket Version with Additional Statistics

In [None]:
# SQL that (re)builds ada_18_uchi.dashboard_data_il: the bucketed job counts
# and average wages, additionally broken down by the first two characters of
# the NAICS code.
#
# Fixes relative to the earlier version of this script:
#   * reads from ada_18_uchi.dashboard_data_il_jobs (the job-level table)
#     instead of selecting from the very table being created;
#   * the ownership change targets ada_18_uchi.dashboard_data_il, the table
#     created here, rather than a "_all" table this script never creates;
#   * drops any previous copy first, like the other build scripts, so that
#     running it again actually rebuilds the table.
# NOTE(review): (wage/1000)*1000 only floors to a multiple of 1000 if wage is
# an integer column (integer division) -- confirm the column type.
script = '''
drop table if exists ada_18_uchi.dashboard_data_il;

create table ada_18_uchi.dashboard_data_il as 
select
    year
    , qtr
    , cnty
    , left(naics, 2) as naics
    , (wage/1000)*1000 as wage_bucket
    , count(*) as jobs
    , avg(wage) as avg_wage
from ada_18_uchi.dashboard_data_il_jobs
group by year, qtr, cnty, left(naics, 2), (wage/1000)*1000
order by year, qtr, cnty, left(naics, 2), (wage/1000)*1000;

alter table ada_18_uchi.dashboard_data_il owner to ada_18_uchi_admin;
commit;
'''

In [None]:
# Set to True to make the next cell execute `script` even when
# ada_18_uchi.dashboard_data_il already exists.
overwrite = False

In [None]:
# Run `script` only when ada_18_uchi.dashboard_data_il is missing, or when
# `overwrite` asks for it to be executed anyway.
try:
    # The context manager closes the cursor even if a statement fails.
    with conn.cursor() as cursor:
        cursor.execute('''
select * from information_schema.tables 
where table_name = 'dashboard_data_il' and table_schema = 'ada_18_uchi'
''')
        table_exists = cursor.fetchone() is not None
        if overwrite or not table_exists:
            cursor.execute(script)
    # psycopg2 opens a transaction on the first execute; end it here so the
    # connection is not left idle in a transaction when the script is skipped.
    conn.commit()
except psycopg2.Error:
    # Without a rollback the connection stays in an aborted transaction and
    # every later statement on it fails.
    conn.rollback()
    raise

## MO Border Data

In [None]:
# script = '''
# create table if not exists ada_kcmo.dashboard_wages_il as
# select 
#     a.ssn
#     , a.wage
#     , a.ein
#     , a.seinunit
#     , a.empr_no
#     , a.year
#     , a.quarter as qtr
#     , b.address_street1
#     , b.address_city
#     , b.address_zip5
#     , b.county as cnty
#     , b.naics
#     , case when b.flag = 1 then 1 else 0 end as merge_status
# from il_des_kcmo.il_wage as a
# left join (select *, 1 as flag from il_des_kcmo.il_qcew_employers) as b
# on a.ein = b.ein and a.seinunit = b.seinunit and a.empr_no = b.empr_no 
# and a.year = b.year and a.quarter = b.quarter
# where b.county in ('067', '001', '149', '013', '083', '119', '163', '133', '157', '077', '181', '003');

# alter table ada_kcmo.dashboard_wages_il owner to ada_kcmo_admin;
# commit;
# '''

In [None]:
# c = conn.cursor()
# c.execute(script)

In [None]:
# script = '''
# create table if not exists ada_kcmo.dashboard_wage_buckets_il as 
# select
#     year
#     , qtr
#     , cnty
#     , left(naics, 2) as naics
#     , (wage/1000)*1000 as wage_bucket
#     , count(*) as nb_jobs
#     , avg(wage) as avg_wage
# from ada_kcmo.dashboard_wages_il
# group by year, qtr, cnty, left(naics, 2), (wage/1000)*1000
# order by year, qtr, cnty, left(naics, 2), (wage/1000)*1000;

# alter table ada_kcmo.dashboard_wage_buckets_il owner to ada_kcmo_admin;
# commit;
# '''

In [None]:
# c = conn.cursor()
# c.execute(script)