In [14]:
import csv
import numpy as np
import xgboost as xgb
import sklearn
import random
import sklearn.linear_model

from collections import defaultdict

In [115]:
county_deaths = defaultdict(dict)

with open('./deathData.txt', 'r') as foo:
    lines = list(csv.reader(foo, delimiter='\t'))
    
    header = lines[0]
    
    for line in lines[1:]:
        if line[0] == '---':
            break
        _, name, number, year, year_number, deaths, population, _ = line
        if deaths == 'Missing':
            pass
        elif deaths == 'Suppressed':
            county_deaths[name][year] = ('Suppressed', int(population))
        else:
            county_deaths[name][year] = (int(deaths) / int(population), int(population))

years = ['2010', '2011', '2012', '2013', '2014', '2015']

spike_data = defaultdict(lambda: defaultdict(dict))

for county, county_data in county_deaths.items():
    for prev_year, next_year in zip(years, years[1:]):
        if prev_year not in county_data or next_year not in county_data:
            continue
        
        prev_death, prev_pop = county_data[prev_year]
        next_death, next_pop = county_data[next_year]
        
        if prev_death == 'Suppressed' and next_death == 'Suppressed':
            is_spike = False
        elif prev_death != 'Suppressed' and next_death == 'Suppressed':
            is_spike = False
        elif prev_death == 'Suppressed' and next_death != 'Suppressed':
            is_spike = True
        elif prev_death != 'Suppressed' and next_death != 'Suppressed':
            is_spike = (next_death - prev_death)/prev_death > 0.1
        
        spike_data[county][prev_year]['is_spike'] = is_spike
        spike_data[county][prev_year]['population'] = prev_pop
        if prev_death == 'Suppressed':
            spike_data[county][prev_year]['death_rate'] = 0
        else:
            spike_data[county][prev_year]['death_rate'] = prev_pop   

In [131]:
with open("./CHR_TRENDS_CSV_2017.csv", 'r') as foo:
    lines = list(csv.reader(foo))
    
field_names = lines[0]

years = defaultdict(int)
features = defaultdict(lambda: defaultdict(int))
counties = set()

for line in lines[1:]:
    features[line[1]][line[0]] += 1
    years[line[0]] += 1
    
for feature, y in features.items():
    print(feature, dict(y))

for line in lines[1:]:
    features[line[1]][line[0]] += 1
    years[line[0]] += 1 

for line in lines[1:]:
    feature = line[1]
    year = line[0]
    county = line[4]
    state = line[5]
    value = line[8]
    
    full_county = '{}, {}'.format(county, state)
    
    if '-' in year:
        year = year.split('-')[1]
        
    if value != '':
        value = float(value.replace(',',''))
    else:
        value = na.nan
        
    spike_data[full_county][year][feature] = value 

Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x7f63cc79e7f0>>


Premature death {'1997-1999': 3189, '1998-2000': 3189, '1999-2001': 3189, '2000-2002': 3189, '2001-2003': 3190, '2002-2004': 3190, '2003-2005': 3190, '2004-2006': 3190, '2005-2007': 3190, '2006-2008': 3190, '2007-2009': 3190, '2008-2010': 3190, '2009-2011': 3190, '2010-2012': 3190, '2011-2013': 3190, '2012-2014': 3194}
Preventable hospital stays {'2006-2007': 3190, '2008': 3190, '2009': 3190, '2010': 3190, '2011': 3190, '2012': 3190, '2013': 3190, '2014': 3194}
Diabetes monitoring {'2006-2007': 3190, '2008': 3190, '2009': 3190, '2010': 3190, '2011': 3190, '2012': 3190, '2013': 3190, '2014': 3194}
Adult obesity {'2003-2005': 3190, '2004-2006': 3190, '2005-2007': 3190, '2006-2008': 3190, '2007-2009': 3190, '2008-2010': 3190, '2009-2011': 3190, '2010-2012': 3190, '2011-2013': 3190, '2012-2014': 3194}
High school graduation {'2010-2011': 2946, '2011-2012': 2946, '2012-2013': 3145, '2013-2014': 3190, '2014-2015': 2880}
Unemployment rate {'2002': 3190, '2003': 3190, '2004': 3190, '2005': 319

Traceback (most recent call last):
  File "/home/ethanid/anaconda3/lib/python3.6/site-packages/xgboost/core.py", line 324, in __del__
    _check_call(_LIB.XGDMatrixFree(self.handle))
AttributeError: 'DMatrix' object has no attribute 'handle'


In [151]:
columns = [a for a in spike_data['Autauga County, AL']['2012'].keys() if a not in ('is_spike', 'Air pollution - particulate matter', 'Adult obesity')]

rows = defaultdict(list)

train_years = ['2011', '2012', '2013']
test_year = '2014'

train_rows = []
train_values = []

test_rows = []
test_values = []

for county, county_data in spike_data.items():
    for year, year_data in county_data.items():
        if year in train_years or year == test_year:
            row = []
            for name in columns:
                if name in year_data:
                    row.append(year_data[name])
                else:
                    break
            
            if len(row) != len(columns):
                continue
            
            if 'is_spike' not in year_data:
                continue

            value = year_data['is_spike']

            if year in train_years:
                train_rows.append(row)
                train_values.append(value)
            elif year == test_year:
                test_rows.append(row)
                test_values.append(value)

train_rows = np.array(train_rows)
test_rows = np.array(test_rows)
            
train_rows, train_values = sklearn.utils.shuffle(train_rows, train_values)

param_grid = {
    'max_depth': [3, 6],
    'min_child_weight': [1, 5],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [5, 10, 50, 200],
    'objective': ['binary:logistic'],
}

estimator = xgb.XGBClassifier()

trained_model = sklearn.model_selection.GridSearchCV(
        estimator=estimator,
        param_grid=param_grid, 
        scoring='roc_auc',
        n_jobs=1, 
        verbose=0,
        refit=True,
        cv=5,
)

trained_model.fit(train_rows, train_values)

print(trained_model.best_params_)
print(trained_model.best_score_)


def calc_scores(trained_model, rows, values):
    names = [
        "roc_auc",
        "average_precision",
        "recall",
        "accuracy",
        "precision"
    ]

    return {
         name: sklearn.metrics.get_scorer(name)(trained_model, rows, values) for name in names
    }

final_score = calc_scores(trained_model, test_rows, test_values)
print(final_score)

{'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 50, 'objective': 'binary:logistic'}
0.871127145547
{'roc_auc': 0.88138744414439518, 'average_precision': 0.48375058982122637, 'recall': 0.0099601593625498006, 'accuracy': 0.84077855775366939, 'precision': 0.7142857142857143}


In [157]:
linear_train = np.array(train_rows)
linear_test = np.array(test_rows)

imputer = sklearn.preprocessing.Imputer()
linear_train = imputer.fit_transform(linear_train)
linear_test = imputer.transform(linear_test)

scaler = sklearn.preprocessing.StandardScaler()
linear_train = scaler.fit_transform(linear_train)
linear_test = scaler.transform(linear_test)

param_grid = {
    'C': [10**(a) for a in range(-5, 4)],
}


estimator = sklearn.linear_model.LogisticRegression()

trained_linear_model = sklearn.model_selection.GridSearchCV(
        estimator=estimator,
        param_grid=param_grid, 
        scoring='roc_auc',
        n_jobs=1, 
        verbose=3,
        refit=True,
        cv=5,
)

trained_linear_model.fit(linear_train, train_values)

print(trained_linear_model.best_params_)
print(trained_linear_model.best_score_)

print(trained_linear_model.best_estimator_.coef_)

for i, name in enumerate(columns):
    coef = trained_linear_model.best_estimator_.coef_[0,i]
    print(coef, name)

final_score = calc_scores(trained_linear_model, linear_test, test_values)
print(final_score)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] C=1e-05 .........................................................
[CV] ................ C=1e-05, score=0.7591083705122386, total=   0.0s
[CV] C=1e-05 .........................................................
[CV] ................ C=1e-05, score=0.7359864849894759, total=   0.0s
[CV] C=1e-05 .........................................................
[CV] ................ C=1e-05, score=0.7815741899596717, total=   0.0s
[CV] C=1e-05 .........................................................
[CV] ................ C=1e-05, score=0.7410513141426784, total=   0.0s
[CV] C=1e-05 .........................................................
[CV] ................ C=1e-05, score=0.7551606174384649, total=   0.0s
[CV] C=0.0001 ........................................................
[CV] ............... C=0.0001, score=0.7625928550078048, total=   0.0s
[CV] C=0.0001 ........................................................
[CV] ............

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV] ................. C=0.01, score=0.7605198773372891, total=   0.0s
[CV] C=0.01 ..........................................................
[CV] ................. C=0.01, score=0.7366566965769359, total=   0.0s
[CV] C=0.01 ..........................................................
[CV] ................. C=0.01, score=0.7755305242664442, total=   0.0s
[CV] C=0.01 ..........................................................
[CV] ................. C=0.01, score=0.7530190515922681, total=   0.0s
[CV] C=0.01 ..........................................................
[CV] ................. C=0.01, score=0.7468085106382979, total=   0.0s
[CV] C=0.1 ...........................................................
[CV] .................. C=0.1, score=0.7581618306413221, total=   0.0s
[CV] C=0.1 ...........................................................
[CV] .................. C=0.1, score=0.7348039215686275, total=   0.0s
[CV] C=0.1 ...........................................................
[CV] .

[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:    1.0s finished
