In [None]:
%pylab inline
import datetime

import pandas as pd
import psycopg2
import psycopg2.extras
import sklearn
import seaborn as sns
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier,
                              GradientBoostingClassifier,
                              AdaBoostClassifier)
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import precision_recall_curve, auc
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
import sqlalchemy
sns.set_style("white")

In [None]:
# Database object names that later cells reuse. Keeping them in variables means
# every cell that builds a statement from them refers to the same schema and table.

# schema name, then work table name
schema, table = "class2", "for_inference_example"

# Echo a timestamp so it is visible when this cell was last executed.
initialized_at = str( datetime.datetime.now() )
print( "Database variables initialized at " + initialized_at )

In [None]:
# Connection settings: only the database name and host are passed to connect().
db_name = "appliedda"
hostname = "10.10.2.10"

# Open the connection and the default cursor that the following cells share.
conn = psycopg2.connect(host=hostname, database=db_name)
cursor = conn.cursor()

First, select Head of Household members together with their benefit spells that ended in 2013. Then join on member info, case geocodes, and assistance-case details.

In [None]:
##DO NOT RERUN UNLESS CHANGING VARIABLES/PARAMETERS BELOW
# Builds a chain of session-local temp tables (hhspells -> hhinfo -> hhinfo2 -> hhinfo3),
# each one adding columns from another idhs table to the previous step.

# Clear any open or aborted transaction on the shared connection first.
conn.rollback()

# The temp tables are committed below, so they survive for the rest of the session and a
# second run of this cell would fail with "relation already exists". Drop leftovers first.
# The pg_temp qualifier restricts the drop to this session's temp tables, so a permanent
# table that happens to share one of these names is never touched.
sql_droptemps = """DROP TABLE IF EXISTS pg_temp.hhinfo3, pg_temp.hhinfo2,
                pg_temp.hhinfo, pg_temp.hhspells;"""

# Head-of-household members joined to their spells that ended during 2013.
# The WHERE clause filters on the spell's end_date, so members with no matching spell
# are dropped even though the join is written as a LEFT JOIN.
sql_hhspells = """CREATE TEMP TABLE hhspells AS 
                SELECT a.*, b.start_date, b.end_date, b.benefit_type, b.ch_dpa_caseid
                FROM idhs.hh_member a LEFT JOIN idhs.hh_indcase_spells b ON a.recptno = b.recptno 
                WHERE end_date BETWEEN '2013-01-01' AND '2013-12-31';"""
# Add member-level attributes (matched on case id and recipient number).
sql_hhselect1 = """CREATE TEMP TABLE hhinfo AS 
                SELECT a.*, b.edlevel, b.health, b.martlst, b.workexp
                FROM hhspells a LEFT JOIN idhs.member_info b 
                ON a.ch_dpa_caseid=b.ch_dpa_caseid AND a.recptno=b.recptno;"""
# Add the case geocode columns (matched on case id).
sql_hhselect2 = """CREATE TEMP TABLE hhinfo2 AS 
                SELECT a.*, b.lng_x, b.lat_y, b.geom, b.geom_2163, b.county_fips_10_nbr, 
                b.tract_fips_10_nbr, b.place_10_nm
                FROM hhinfo a LEFT JOIN idhs.case_geocode b 
                ON a.ch_dpa_caseid=b.ch_dpa_caseid;"""
# Add assistance-case columns (matched on case id).
sql_hhselect3 = """CREATE TEMP TABLE hhinfo3 AS 
                SELECT a.*, b.district, b.case_group, b.homeless
                FROM hhinfo2 a LEFT JOIN idhs.assistance_case b 
                 ON a.ch_dpa_caseid=b.ch_dpa_caseid;"""

# Order matters: each table is created from the one before it.
for statement in (sql_droptemps, sql_hhspells, sql_hhselect1, sql_hhselect2, sql_hhselect3):
    cursor.execute(statement)

conn.commit()


In [None]:
#DO NOT RERUN UNLESS RERUNNING THE ABOVE - IF SO, DROP TABLE FIRST!
# Materialize hhinfo3 as the permanent work table and add the (initially NULL) outcome
# columns that the update loops fill in, plus a serial primary key used to address rows.

# Build the target from the schema/table config variables so this cell creates the same
# table that the update loops later write to.
target_table = schema + "." + table

sql_createtable = """CREATE TABLE {target} AS 
                SELECT * FROM hhinfo3;
                
                ALTER TABLE {target}
                ADD COLUMN new_spell_win1yr INTEGER,
                ADD COLUMN new_spell_win1yr_benefit INTEGER,
                ADD COLUMN has_job_q1 INTEGER,
                ADD COLUMN has_job_q2 INTEGER,
                ADD COLUMN has_job_q3 INTEGER,
                ADD COLUMN has_job_q4 INTEGER,
                ADD COLUMN has_job_win1yr INTEGER,
                ADD COLUMN lose_job_win1yr INTEGER,
                ADD COLUMN wage_q1 REAL,
                ADD COLUMN wage_q2 REAL,
                ADD COLUMN wage_q3 REAL,
                ADD COLUMN wage_q4 REAL,
                ADD COLUMN total_wage_1yr REAL,
                ADD COLUMN new_id SERIAL PRIMARY KEY;
                
                ALTER TABLE {target}
                OWNER TO class2_admin;
                GRANT ALL PRIVILEGES ON TABLE {target} TO class2_admin;
                GRANT SELECT ON TABLE {target} TO class2_select;
                """.format(target=target_table)
# NOTE(review): the role names stay literal; confirm they still apply if `schema` changes.

# All statements are sent in a single execute() call and committed together.
cursor.execute(sql_createtable)
conn.commit()

In [None]:
# Roll back whatever transaction is currently open on the shared connection.
conn.rollback()

Now loop through all rows and update the table with the job variables.

In [None]:
# Fill in the employment / wage columns for every spell that has not been processed yet.
# A row counts as unprocessed while total_wage_1yr is NULL, so this cell can simply be
# re-run to resume after an interruption.
select_statement = "SELECT * FROM " + schema + "." + table + " WHERE total_wage_1yr IS NULL;"
row_cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
qtr_cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
update_cur = conn.cursor()

# One UPDATE per spell writes the four quarterly columns and the one-year summary
# together, so a row is never left half-filled. Values are passed as query parameters.
update_stmt = "UPDATE " + schema + "." + table + " SET"
for qtr_count in range(1, 5):
    update_stmt += " has_job_q" + str(qtr_count) + " = %s,"
    update_stmt += " wage_q" + str(qtr_count) + " = %s,"
update_stmt += " has_job_win1yr = %s, lose_job_win1yr = %s, total_wage_1yr = %s"
update_stmt += " WHERE new_id = %s;"

row_cur.execute(select_statement)
for row in row_cur:  #loop through each row (spell) of participation

    #initialize per-spell summary variables
    has_job_win1yr = 0
    lose_job_win1yr = 0
    total_wage_1yr = 0
    qtr_values = []  # has_job_q1, wage_q1, ..., has_job_q4, wage_q4 (same order as update_stmt)

    # calendar year and quarter (1-4) of the spell's end_date
    end_year = row['end_date'].year
    end_qtr = (row['end_date'].month - 1) // 3 + 1

    # walk four consecutive quarters, starting with the quarter that contains end_date
    for i in range(0, 4):
        quarter = end_qtr + i
        year = end_year
        if quarter >= 5:  # rolled over into the following calendar year
            quarter = quarter - 4
            year = end_year + 1

        #select any wage data for this SSN, this quarter
        # The table name is built from integers computed above; the SSN hash is passed as
        # a parameter instead of being spliced into the SQL text.
        # NOTE(review): LIKE is kept to preserve the existing matching behaviour; if no
        # pattern matching is intended, "=" would express the lookup more directly.
        qtr_emp_select = "SELECT empr_no, wage FROM ides.il_wage_" + str(year) + "q" + str(quarter)
        qtr_emp_select += " WHERE ssn LIKE %s;"
        qtr_cur.execute(qtr_emp_select, (row['ssn_hash'],))
        qtr_result = qtr_cur.fetchall()

        has_job = 0
        wage = 0
        if qtr_result: #if results are obtained
            has_job = 1
            has_job_win1yr = 1 #set once any quarter shows a wage record
            for entry in qtr_result:
                # a wage record with a NULL amount still counts as a job but adds nothing
                if entry['wage'] is not None:
                    wage += entry['wage']
        elif has_job_win1yr == 1:
            lose_job_win1yr = 1 #no record this quarter although an earlier quarter had one

        qtr_values.extend([has_job, wage])
        total_wage_1yr += wage
    ## END FOR LOOP - FOR I IN RANGE (0,4)

    #write quarter-specific and row-level results in a single statement, then commit so
    # finished rows are kept even if a later row fails
    summary_values = [has_job_win1yr, lose_job_win1yr, total_wage_1yr, row['new_id']]
    update_cur.execute(update_stmt, qtr_values + summary_values)
    conn.commit()

##END FOR LOOP - FOR ROW IN RESULT
    

In [None]:
# Flag, for every spell not yet processed, whether the same recipient/case started
# another spell within one year of this spell's end_date.
# Unprocessed rows are those whose new_spell_win1yr is still NULL. The wage column
# total_wage_1yr cannot serve as the marker here: it is populated by the wage loop, so
# filtering on it would leave this loop nothing to process once that loop has run.
select_statement = "SELECT * FROM " + schema + "." + table + " WHERE new_spell_win1yr IS NULL;"
row_cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
bene_cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
update_cur = conn.cursor()

#select statement to get the spells of this individual (by recptno and ch_dpa_caseid)
#whose start dates fall between the last end and 1 year later; all of them are fetched
#because every one must be compared against the benefit type
# Values are passed as query parameters instead of being concatenated into the SQL text.
# NOTE(review): BETWEEN is inclusive, so a spell whose start_date equals its own
# end_date matches itself - confirm whether that is intended.
bene_select = "SELECT start_date, benefit_type FROM idhs.hh_indcase_spells"
bene_select += " WHERE recptno = %s AND ch_dpa_caseid = %s"
bene_select += " AND (start_date BETWEEN %s AND %s)"
bene_select += " ORDER BY start_date;"

update_stmt = "UPDATE " + schema + "." + table
update_stmt += " SET new_spell_win1yr = %s, new_spell_win1yr_benefit = %s"
update_stmt += " WHERE new_id = %s;"

row_cur.execute(select_statement)
for row in row_cur:  #loop through each row (spell) of participation
    #initialize variables - they stay 0 when no later spell is found
    new_spell_win1yr = 0
    new_spell_win1yr_benefit = 0

    #get the date 1 year (365 days) from end_date of spell
    date_1yr = row['end_date'] + datetime.timedelta(days=365)

    bene_cur.execute(bene_select,
                     (row['recptno'], row['ch_dpa_caseid'], row['end_date'], date_1yr))
    bene_result = bene_cur.fetchall()

    if bene_result: #if a spell is obtained within 1 year of end_date of the spell
        new_spell_win1yr = 1

        #check to see if its the same type of benefit - need to check all spells, not just first
        for spell in bene_result:
            if spell['benefit_type'] == row['benefit_type']:
                new_spell_win1yr_benefit = 1
                break  # one match is enough
        #END FOR LOOP - SPELL IN BENE_RESULT
    #END IF NEW SPELL EXISTS

    #update record with results; commit per row so progress is kept
    update_cur.execute(update_stmt,
                       (new_spell_win1yr, new_spell_win1yr_benefit, row['new_id']))
    conn.commit()

##END FOR LOOP - FOR ROW IN RESULT

In [None]:
# Roll back whatever transaction is currently open on the shared connection.
conn.rollback()