# VoterKarma calculation
_This notebook outlines the steps for calculating the VoterKarma score presented in the Debug Politics Hackthon 1-15-16_
### v 0.1: MVP: Logistic regression defaults to score voters
### v 0.2: XGBoost model

In [14]:
import pdb
import json
import datetime
import psycopg2 as pg
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score, roc_auc_score

In [3]:
#Read in db access information
vk_access = {}
with open('./db_vars', 'r') as f:
    for l in f.readlines():
        vk_access[l.split('=')[0]] = l.split('=')[1].strip('\n')

In [4]:
#Pull in best parameters from testing
with open('best_params.json', 'r') as f:
    best_params = json.load(f)

### Utilities

In [5]:
# Utilities
def date_to_years(start, end=datetime.date.today()):
    return (end - start.date()).days / 365

def convert_to_date(col):
    return pd.to_datetime(col, errors='ignore', format="%Y-%m-%d")

### Global vars

In [6]:
#Limit % of observations to retrieve
LIMIT = .25
#Columns in database
HEADERS = (
'id', 'dob', 'gender', 'status', 'enrollment', 'district', 'regdate', 
'idrequired', 'idmet',
'e2001_09_primary',
'e2001_11_general',
'e2005_09_primary',
'e2005_11_general',
'e2006_11_general',
'e2008_02_primary',
'e2008_11_general',
'e2009_09_primary',
'e2009_11_general',
'e2010_09_primary',
'e2010_11_general',
'e2012_06_primary',
'e2012_09_primary',
'e2012_11_general',
'e2013_09_primary',
'e2013_11_general',
'e2014_06_primary',
'e2014_11_general'
)
#Recent elections (column names)
RECENT = {'local_primary':'e2013_09_primary', 
               'local_general':'e2013_11_general', 
               'national_midterm':'e2014_11_general',
               'national_presidential': 'e2012_11_general'}

### Data pulling
Requires connection to vk RDS.  Use `LIMIT` global to limit number of observations.  Total dataset is >3M rows

In [7]:
#Data pulling
def pulldata(limit=None):
    conn = pg.connect(database = vk_access['VK_DB'], user = vk_access['VK_U'], password = vk_access['VK_PW'],
        host = vk_access['VK_HOST'], port = vk_access['VK_PORT'])

    cur = conn.cursor()

    sel = """
    SELECT {}
    FROM {}
    """.format(', '.join(HEADERS), 'rawvoters')

    if limit is not None:
        sel += " TABLESAMPLE BERNOULLI({})".format(limit)

    cur.execute(sel)
    df = pd.DataFrame.from_records(cur.fetchall(), columns=HEADERS)
    return(df)

In [8]:
#Data processing
def process(data):
    # Set index to 'id' and drop id
    df = data.set_index(['id']).copy(deep=True)
    # Convert Date fields to years duration
    df[df.select_dtypes(['object']).columns] = df.select_dtypes(['object']).apply(convert_to_date)
    df[df.select_dtypes(['datetime64[ns]']).columns] = \
            df.select_dtypes(['datetime64[ns]']).applymap(lambda x:
                    date_to_years(x))
    #DOB range 18-167, probably fair to drop anyone >=100
    df = df[df.dob<100]
    # Create categories and pivot them
    df[df.select_dtypes(['object']).columns] = \
            df.select_dtypes(['object']).apply(lambda x: x.astype('category'))

    # Pivot categorical variables
    cat_cols = df.select_dtypes(['category']).columns
    for col in cat_cols:
        dummy = pd.get_dummies(df[col], prefix=col)
        df = pd.concat([df, dummy], axis=1)

    df.drop(cat_cols, axis=1, inplace=True)
    return(df)

### Testing model

In [12]:
#Create train/test
def tr_te_split(f, l, pct):
    rnd_indices = np.random.rand(len(f)) < pct
    test_x = f[~rnd_indices]
    test_y = l[~rnd_indices]
    train_x = f[rnd_indices]
    train_y = l[rnd_indices]
    return(train_x, train_y, test_x, test_y)

#Train model
def train_model(params, dm):
    train_gb = xgb.train(params, dm)
    return(train_gb)

#Test model, return accuracy score and predictions
def test_model(test_x, test_y, model):
    test_dm = xgb.DMatrix(test_x)
    y_pred_prob = model.predict(test_dm)
    y_pred = y_pred_prob.copy()
    y_pred[y_pred > 0.5] = 1
    y_pred[y_pred <= 0.5] = 0
    acc = accuracy_score(y_pred, test_y)
    roc = roc_auc_score(y_pred, test_y)
    return(acc, roc, y_pred_prob)

In [10]:
df = pulldata(limit = LIMIT)
df = process(df)

In [15]:
#Trained models
trained = {}

for t in best_params:
    print t['target']
    train_x, train_y, test_x, test_y = tr_te_split(df.drop(RECENT[t['target']], axis=1), 
                                                   df[RECENT[t['target']]],.7)
    #Make train/test dmatrix
    train_dm = xgb.DMatrix(train_x, train_y)
    trained[t['target']] = train_model(t['params'][0], train_dm)
    acc, roc, y_pred = test_model(test_x, test_y, trained[t['target']])
    print 'Accuracy: ', acc
    print 'ROC AUC: ', roc

local_general
Accuracy:  0.898469578201
ROC AUC:  0.845107453953
local_primary
Accuracy:  0.917082179132
ROC AUC:  0.835520649168
national_presidential
Accuracy:  0.819190968956
ROC AUC:  0.812304876565
national_midterm
Accuracy:  0.890136627363
ROC AUC:  0.805146887464


### Training/predicting on full data

In [16]:
full = pulldata()
full = process(full)

In [19]:
#Trained models
trained = {}
#Scored observations
scored = pd.DataFrame(index = full.index)

for t in best_params:
    print t['target']
    X, y = full.drop(RECENT[t['target']], axis=1), full[RECENT[t['target']]]
    train_dm = xgb.DMatrix(X, y)
    trained[t['target']] = train_model(t['params'][0], train_dm)
    acc, roc, y_pred = test_model(X, y, trained[t['target']])
    print 'Accuracy: ', acc
    print 'ROC AUC: ', roc
    scored[t['target']] = y_pred

local_general
Accuracy:  0.901122365644
ROC AUC:  0.844086378812
local_primary
Accuracy:  0.922655396602
ROC AUC:  0.84847023614
national_presidential
Accuracy:  0.820772340598
ROC AUC:  0.813416362618
national_midterm
Accuracy:  0.888649315869
ROC AUC:  0.810068104205


In [None]:
# Sum score - used in v 0.1, v0.2
scored['score_total'] = scored['local_general'] + scored['national_midterm'] + scored['national_presidential']
scored['score_total_scaled'] = scored['score_total'] / scored['score_total'].max()

In [131]:
# Export to gz
scored.to_csv('scored_xgboost_v02.csv.gz', compression='gzip')

### Upload to db (takes a long time outside of AWS)

In [None]:
conn = pg.connect(database = vk_access['VK_DB'], user = vk_access['VK_U'], password = vk_access['VK_PW'],
    host = vk_access['VK_HOST'], port = vk_access['VK_PORT'])
cur = conn.cursor()

ins_cols = ['local_general', 'local_primary', 'national_presidential', 
            'national_midterm', 'raw_voter_id', 'score_w_scaled', 'score_total_scaled']

#Counter for resume upload
#Commits every 10,000 records staged
counter = 0
for ind, vals in scored.iloc[counter:][ins_cols].iterrows():
    if counter % 10000 == 0:
        print "uploaded {} records".format(counter)
        conn.commit()
    vals_str = ', '.join([str(x) for x in vals])
    rvid = vals['raw_voter_id']
    ins = """ 
    INSERT INTO {0} ({1})
    select {2}
    WHERE NOT EXISTS (SELECT * FROM {0} WHERE raw_voter_id={3})
    """.format('voter_grades', 
            ', '.join(ins_cols), 
              vals_str,
              rvid)
    cur.execute(ins)
    counter+=1
conn.commit()

### Voter counts for weighting
This retrieves the count of the number of people that voted in the most recent elections.  This isn't used in v0.1 or v0.2

In [11]:
# Voter counts
v_cnt = {}
conn = pg.connect(database = vk_access['VK_DB'], user = vk_access['VK_U'], password = vk_access['VK_PW'],
    host = vk_access['VK_HOST'], port = vk_access['VK_PORT'])
cur = conn.cursor()
cnt_sql = """
            SELECT count(*)
            from rawvoters
            where {} = TRUE
          """
for k,v in RECENT.items():
    cur.execute(cnt_sql.format(v))
    v_cnt[k] = float(cur.fetchall()[0][0])