# Measurement error: impute missing wage

Our research question is to determine the quarterly wage of 2009 Missouri university and college graduates who hold full-term employment in the quarter falling one year after graduation. Our quarterly wage estimates are biased when the data contain wage values for quarter t-1 and quarter t+1 but no value for quarter t. In this notebook, we will explore the effects of addressing this missing-value bias through imputation.

## Python Setup

Before we begin, run the code cell below to initialize the libraries we'll be using in this assignment. We're already familiar with `matplotlib`, `pandas`, and `psycopg2` from previous tutorials.

In [None]:
%pylab inline
import pandas as pd
import psycopg2
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
import time

In [None]:
# connection settings: the server to reach and the database to open on it
hostname = "10.10.2.10"
db_name = "appliedda"

In [None]:
# open a raw psycopg2 connection; its cursor lets us run statements that
# return no rows (e.g. a series of CREATE queries) without going through pandas
conn = psycopg2.connect(host=hostname, database=db_name)
cursor = conn.cursor()

## Define the study cohort
2009 graduates of Missouri public colleges/universities

In [None]:
# create temp table of all unique 2009 graduates
# Builds the temp table cohort_2009 from mo_dhe.completions rows with
# calyear = 2009, keeping one row per deident_id together with gender, calyear
# and a numeric `quarter` derived from the `acterm` code (any acterm value
# other than '31', '41', '11', '21' yields a NULL quarter).
# NOTE(review): DISTINCT ON is used without ORDER BY, so for a deident_id with
# several 2009 completion rows the row that is kept is not pinned down by the
# query -- confirm that is acceptable.
# NOTE(review): acterm '11' and '21' both map to quarter 3 and nothing maps to
# quarter 4 -- this looks like a possible typo; verify against the acterm codebook.
start_time = time.time()
sql = '''
drop table if exists cohort_2009;

create temp table cohort_2009 AS
select distinct on (deident_id) deident_id, gender, calyear,
    case when acterm = '31' then 1 when acterm = '41' then 2
        when acterm = '11' then 3 when acterm = '21' then 3 else null end as quarter
from mo_dhe.completions
where calyear = 2009;

commit;
'''

# run the whole multi-statement script through the shared cursor
cursor.execute(sql)

# report wall-clock time spent on the query
print('query complete in {:.2f} seconds'.format(time.time()-start_time))

In [None]:
# Add a yr_q column to cohort_2009 holding the first day of each graduate's
# completion quarter (quarter 1 -> month 1, 2 -> month 4, 3 -> month 7,
# 4 -> month 10). The value is built as text, cast to date, and stored in a
# text column; later cells cast it back with yr_q::date.
# NOTE(review): rows whose quarter is NULL feed a NULL into format(); confirm
# the resulting string still casts to date, or that no such rows exist.
start_time = time.time()
sql = '''
alter table cohort_2009
    add column yr_q text;
commit;

update cohort_2009 
    set yr_q = format('%s-%s-1', calyear, quarter*3-2)::date;
commit;
'''

cursor.execute(sql)

# report elapsed time like every other query cell (start_time was previously
# recorded here but never used)
print('query complete in {:.2f} seconds'.format(time.time()-start_time))

## Locate grads with employment within one year after graduation in Missouri's and other states' wage data

Above we defined our graduate population.
Here we start to look at their post-graduation earnings. We start with Missouri and then add Illinois, Ohio, and Indiana. 

In [None]:
# first up: Missouri employment during quarters t+1, t+2, t+3, t+4 following graduation in quarter t of 2009
# Builds temp table cohort_2009_mo_jobs_1yr with the cohort's Missouri wage
# records for 2009 Q2-Q4 and all of 2010; j_yr_q is the first day of the
# wage quarter.
start_time = time.time()

# The period test is wrapped in its own parentheses: AND binds tighter than OR,
# so without them the cohort filter only applied to the 2010 rows and every
# 2009 Q2-Q4 wage record in the source table was copied.
sql = '''
drop table if exists cohort_2009_mo_jobs_1yr;

create temp table cohort_2009_mo_jobs_1yr as
select ssn, ein, state, format('%s-%s-1', year, quarter*3-2)::date j_yr_q, wage
from kcmo_lehd.mo_wage
where ((year = 2009 and quarter <> 1)
    or year = 2010)
    and ssn in (select deident_id from cohort_2009);

commit;
'''
cursor.execute(sql)

print('query complete in {:.2f} seconds'.format(time.time()-start_time))

In [None]:
# next up: workers in Illinois
# Builds temp table cohort_2009_il_jobs_1yr with the cohort's Illinois wage
# records for 2009 Q2-Q4 and all of 2010 (all source columns are kept).
start_time = time.time()

# The period test is wrapped in its own parentheses: AND binds tighter than OR,
# so without them the cohort filter only applied to the 2010 rows and every
# 2009 Q2-Q4 wage record in the source table was copied.
sql = '''
drop table if exists cohort_2009_il_jobs_1yr;

create temp table cohort_2009_il_jobs_1yr AS
select *
from il_des_kcmo.il_wage
where ((year = 2009 and quarter <> 1)
    or year = 2010)
    and ssn in (select deident_id from cohort_2009);

commit;
'''
cursor.execute(sql)

print('query complete in {:.2f} seconds'.format(time.time()-start_time))

In [None]:
# next up: workers in Ohio
# Builds temp table cohort_2009_oh_jobs_1yr with the cohort's Ohio wage records
# for 2009 Q2-Q4 and all of 2010. The wage table is joined to oh_person on
# key_id so that ssn_hash can be exposed as `ssn`, matching the other states.

start_time = time.time()

# The period test is wrapped in its own parentheses: AND binds tighter than OR,
# so without them the cohort filter only applied to the 2010 rows and every
# 2009 Q2-Q4 wage record in the source table was copied.
sql = '''
drop table if exists cohort_2009_oh_jobs_1yr;

create temp table cohort_2009_oh_jobs_1yr as
select a.*, b.ssn_hash as ssn
from data_ohio_olda_2018.oh_ui_wage_by_employer a
join data_ohio_olda_2018.oh_person b
on a.key_id = b.key_id
where ((year = 2009 and quarter <> 1)
    or year = 2010)
    and b.ssn_hash in (select deident_id from cohort_2009);

commit;
'''
cursor.execute(sql)

print('query complete in {:.2f} seconds'.format(time.time()-start_time))

In [None]:
# last up: workers in Indiana
# Builds temp table cohort_2009_in_jobs_1yr with the cohort's Indiana wage
# records for 2009 Q2-Q4 and all of 2010 (all source columns are kept).

start_time = time.time()

# The period test is wrapped in its own parentheses: AND binds tighter than OR,
# so without them the cohort filter only applied to the 2010 rows and every
# 2009 Q2-Q4 wage record in the source table was copied.
sql = '''
drop table if exists cohort_2009_in_jobs_1yr;

create temp table cohort_2009_in_jobs_1yr as
select *
from in_data_2019.wages_by_employer
where ((year = 2009 and quarter <> 1)
    or year = 2010)
    and ssn in (select deident_id from cohort_2009);

commit;
'''
cursor.execute(sql)

print('query complete in {:.2f} seconds'.format(time.time()-start_time))

In [None]:
# compile cohort jobs from the other states (Illinois, Ohio, Indiana) into a single table
# Each branch is reduced to the same three columns -- ssn, j_yr_q (first day of
# the wage quarter) and wage -- renaming `wages` to `wage` for Ohio and Indiana.

# restart the timer here; previously this cell reused start_time from an
# earlier cell, so the printed duration included idle time between cells
start_time = time.time()

sql = """ 
drop table if exists cohort_2009_jobs_1yr;

create temp table cohort_2009_jobs_1yr as
select ssn, format('%s-%s-1', year, quarter*3-2)::date j_yr_q, wage
FROM cohort_2009_il_jobs_1yr
union all
select ssn, format('%s-%s-1', year, quarter*3-2)::date j_yr_q, wages as wage
FROM cohort_2009_oh_jobs_1yr
union all
select ssn, format('%s-%s-1', year, quarter*3-2)::date j_yr_q, wages as wage
from cohort_2009_in_jobs_1yr;

commit;
"""
cursor.execute(sql)

print('query complete in {:.2f} seconds'.format(time.time()-start_time))

In [None]:
# preview a handful of rows from the Missouri wage table built above
sql = '''
select *
from cohort_2009_mo_jobs_1yr
limit 5
'''
df = pd.read_sql(sql=sql, con=conn)
df.head()

## Isolate cases where a grad has wages for a preceding and subsequent quarter, but not the quarter in between
* we first look at Missouri jobs
* we then look at other states

In [None]:
# create a table for wages earned in t+1, t+2, t+3, t+4
# Builds temp table cohort_2009_link_mo from the Missouri wage records.
# Each CTE (t_plus_1 .. t_plus_4) matches a graduate to wage rows whose quarter
# start date is 3, 6, 9 or 12 months after the graduate's yr_q. The final
# select left-joins all four onto the cohort and keeps only "gap" rows:
#   * wage present in t+1 and t+3 but missing in t+2, or
#   * wage present in t+2 and t+4 but missing in t+3.
# NOTE(review): a CTE returns one row per matching wage record, and the
# Missouri wage table carries an ein column, so a graduate with several wage
# rows in one quarter would presumably be repeated by the chained left joins --
# confirm whether wages should be aggregated per person-quarter first.
cohort_2009_mo_q = """
drop table if exists cohort_2009_link_mo;

create temp table cohort_2009_link_mo as

with t_plus_1 as (select a.deident_id, a.yr_q, e.ssn, e.wage, e.j_yr_q
    from cohort_2009 a
    join cohort_2009_mo_jobs_1yr e
        on a.deident_id = e.ssn
        and a.yr_q::date = (e.j_yr_q::date -'3 month'::interval)::date),
    
    t_plus_2 as (select a.deident_id, a.yr_q, d.ssn, d.wage, d.j_yr_q
    from cohort_2009 a
    join cohort_2009_mo_jobs_1yr d
        on a.deident_id = d.ssn
        and a.yr_q::date = (d.j_yr_q::date -'6 month'::interval)::date),

    t_plus_3 as (select a.deident_id, a.yr_q, c.ssn, c.wage, c.j_yr_q
        from cohort_2009 a
        join cohort_2009_mo_jobs_1yr c
            on a.deident_id = c.ssn
            and a.yr_q::date = (c.j_yr_q::date - '9 month'::interval)::date),
    
    t_plus_4 as (select a.deident_id, a.yr_q, b.ssn, b.wage, b.j_yr_q
        from cohort_2009 a
        join cohort_2009_mo_jobs_1yr b
            on a.deident_id = b.ssn
            and a.yr_q::date = (b.j_yr_q::date - '1 year'::interval)::date)
                  
select a.deident_id, t_plus_1.j_yr_q as t_plus_1, t_plus_2.j_yr_q as t_plus_2, 
        t_plus_3.j_yr_q as t_plus_3, t_plus_4.j_yr_q as t_plus_4,
        t_plus_1.wage as wage_t_plus_1, t_plus_2.wage as wage_t_plus_2,
        t_plus_3.wage as wage_t_plus_3, t_plus_4.wage as wage_t_plus_4
from cohort_2009 a
    left join t_plus_1 on a.deident_id = t_plus_1.ssn
    left join t_plus_2 on a.deident_id = t_plus_2.ssn
    left join t_plus_3 on a.deident_id = t_plus_3.ssn
    left join t_plus_4 on a.deident_id = t_plus_4.ssn
where (t_plus_1.wage is not null and t_plus_2.wage is null and t_plus_3.wage is not null) or
      (t_plus_2.wage is not null and t_plus_3.wage is null and t_plus_4.wage is not null)

order by a.deident_id;

commit;
"""
# run the whole multi-statement script through the shared cursor
cursor.execute(cohort_2009_mo_q)

In [None]:
# create the same table for other states
# Builds temp table cohort_2009_link_full from cohort_2009_jobs_1yr (the
# combined Illinois / Ohio / Indiana wage records). The logic mirrors the
# Missouri table: each CTE (t_plus_1 .. t_plus_4) matches a graduate to wage
# rows whose quarter start date is 3, 6, 9 or 12 months after yr_q, and the
# final select keeps only "gap" rows:
#   * wage present in t+1 and t+3 but missing in t+2, or
#   * wage present in t+2 and t+4 but missing in t+3.
# NOTE(review): the source table is a UNION ALL across states, so a graduate
# may have several wage rows in one quarter and would presumably be repeated by
# the chained left joins -- confirm whether wages should be aggregated per
# person-quarter first.
cohort_2009_full_q = """
drop table if exists cohort_2009_link_full;

create temp table cohort_2009_link_full as

with t_plus_1 as (select a.deident_id, a.yr_q, e.ssn, e.wage, e.j_yr_q
    from cohort_2009 a
    join cohort_2009_jobs_1yr e
        on a.deident_id = e.ssn
        and a.yr_q::date = (e.j_yr_q::date -'3 month'::interval)::date),
    
    t_plus_2 as (select a.deident_id, a.yr_q, d.ssn, d.wage, d.j_yr_q
    from cohort_2009 a
    join cohort_2009_jobs_1yr d
        on a.deident_id = d.ssn
        and a.yr_q::date = (d.j_yr_q::date -'6 month'::interval)::date),

    t_plus_3 as (select a.deident_id, a.yr_q, c.ssn, c.wage, c.j_yr_q
        from cohort_2009 a
        join cohort_2009_jobs_1yr c
            on a.deident_id = c.ssn
            and a.yr_q::date = (c.j_yr_q::date - '9 month'::interval)::date),
    
    t_plus_4 as (select a.deident_id, a.yr_q, b.ssn, b.wage, b.j_yr_q
        from cohort_2009 a
        join cohort_2009_jobs_1yr b
            on a.deident_id = b.ssn
            and a.yr_q::date = (b.j_yr_q::date - '1 year'::interval)::date)
                  
select a.deident_id, t_plus_1.j_yr_q as t_plus_1, t_plus_2.j_yr_q as t_plus_2, 
        t_plus_3.j_yr_q as t_plus_3, t_plus_4.j_yr_q as t_plus_4,
        t_plus_1.wage as wage_t_plus_1, t_plus_2.wage as wage_t_plus_2,
        t_plus_3.wage as wage_t_plus_3, t_plus_4.wage as wage_t_plus_4
from cohort_2009 a
    left join t_plus_1 on a.deident_id = t_plus_1.ssn
    left join t_plus_2 on a.deident_id = t_plus_2.ssn
    left join t_plus_3 on a.deident_id = t_plus_3.ssn
    left join t_plus_4 on a.deident_id = t_plus_4.ssn
where (t_plus_1.wage is not null and t_plus_2.wage is null and t_plus_3.wage is not null) or
      (t_plus_2.wage is not null and t_plus_3.wage is null and t_plus_4.wage is not null)

order by a.deident_id;

commit;
"""
# run the whole multi-statement script through the shared cursor
cursor.execute(cohort_2009_full_q)

In [None]:
# pull the Missouri gap-quarter table into pandas and preview the first rows
q = """
select * from cohort_2009_link_mo
"""
dfmo = pd.read_sql(sql=q, con=conn)
dfmo.head()

In [None]:
# pull the other-states gap-quarter table into pandas and preview the first rows
q = """
select * from cohort_2009_link_full
"""
df = pd.read_sql(sql=q, con=conn)
df.head()

## impute wage values and explore resulting wage estimate distributions

In [None]:
# summary statistics for the observed Missouri quarter t+2 wages, before any imputation
dfmo.loc[:, 'wage_t_plus_2'].describe()

In [None]:
# fill each missing quarter t+2 wage with the mean of that row's
# t+1, t+3 and t+4 wages (NaN values are skipped by the row-wise mean)
mo_row_mean = dfmo[['wage_t_plus_1', 'wage_t_plus_3', 'wage_t_plus_4']].mean(axis=1)
dfmo['t_2_imp_mean'] = dfmo['wage_t_plus_2'].fillna(mo_row_mean)
dfmo['t_2_imp_mean'].describe()

In [None]:
# alternative strategy: treat every missing quarter t+2 wage as zero earnings
mo_zero_filled = dfmo['wage_t_plus_2'].fillna(value=0)
dfmo['t_2_imp_zero'] = mo_zero_filled
dfmo['t_2_imp_zero'].describe()

In [None]:
# compare the observed wages with both imputation strategies in one horizontal boxplot
wage_cols = ['wage_t_plus_2', 't_2_imp_mean', 't_2_imp_zero']
wage_labels = ['no imputation', 'imputed rowwise mean', 'imputed zero']
fig, ax = plt.subplots(figsize=(10, 10))
# draw explicitly on the axes created above
dfmo[wage_cols].boxplot(ax=ax, grid=False, vert=False)
ax.set(
    title='distribution of wage values',
    yticklabels=wage_labels,
    xlim=(-500, 20000),
    xticks=np.arange(0, 20000, 1500),
);

In [None]:
# summary statistics for the observed other-states quarter t+2 wages, before any imputation
df.loc[:, 'wage_t_plus_2'].describe()

In [None]:
# fill each missing quarter t+2 wage with the mean of that row's
# t+1, t+3 and t+4 wages (NaN values are skipped by the row-wise mean)
row_mean = df[['wage_t_plus_1', 'wage_t_plus_3', 'wage_t_plus_4']].mean(axis=1)
df['t_2_imp_mean'] = df['wage_t_plus_2'].fillna(row_mean)
df['t_2_imp_mean'].describe()

In [None]:
# alternative strategy: treat every missing quarter t+2 wage as zero earnings
zero_filled = df['wage_t_plus_2'].fillna(value=0)
df['t_2_imp_zero'] = zero_filled
df['t_2_imp_zero'].describe()

In [None]:
# compare the observed wages with both imputation strategies in one horizontal boxplot
wage_cols = ['wage_t_plus_2', 't_2_imp_mean', 't_2_imp_zero']
wage_labels = ['no imputation', 'imputed rowwise mean', 'imputed zero']
fig, ax = plt.subplots(figsize=(10, 10))
# draw explicitly on the axes created above
df[wage_cols].boxplot(ax=ax, grid=False, vert=False)
ax.set(
    title='distribution of wage values',
    yticklabels=wage_labels,
    xlim=(-500, 20000),
    xticks=np.arange(0, 20000, 1500),
);