In [1]:
import numpy
import csv
import numpy as np
from collections import defaultdict
import pickle

In [2]:
def _load_pickle(path):
    """Open `path` in binary mode and return the single object pickled in it."""
    # NOTE(review): unpickling can run arbitrary code; only point this at trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Pre-computed lookup objects saved by an earlier step of the project.
disease_map = _load_pickle('data/disease_map.pkl')
# Indexed below by header position to list the values seen for a column.
values_per_column = _load_pickle('data/values_per_column.pkl')

In [3]:
# Read only the first row (the column names) of the raw CSV.
# newline='' lets the csv module do its own newline handling, so a quoted
# field containing a line break is not split by the file object first.
with open('data/stanford_blueprint_datathon_2019_data.csv', newline='') as f:
    reader = csv.reader(f)
    header = next(reader)

    # Column name -> position of that column in the header row.
    header_map = {name: i for i, name in enumerate(header)}

# Disease names; a disease's position in this list is later used as its
# column index into the saved label arrays.
# NOTE(review): 'gential_warts' looks like a misspelling of 'genital_warts';
# left untouched because it may have to match a name used in the data files.
diseases = ['chlamydia', 'gential_warts', 'gonorrhea', 'herpes', 'hpv', 'other_std', 'parasitic', 'std_screen', 'syphilis', 'trich']


In [4]:
# Expand the CSV header into one display name per expanded column.
# A header position present in `values_per_column` contributes a "name|"
# entry followed by one "name|value" entry per listed value; every other
# header position contributes its plain name.
# NOTE(review): the "name|" entry presumably stands for an empty/missing
# value of that column — confirm against the code that built the row arrays.
expanded_names = []
for col_idx, col_name in enumerate(header):
    if col_idx not in values_per_column:
        expanded_names.append(col_name)
        continue
    expanded_names.append(col_name + "|" + '')
    expanded_names.extend(col_name + "|" + level for level in values_per_column[col_idx])

# Expanded column position -> display name.
column_to_name = dict(enumerate(expanded_names))
# Total number of expanded columns (same value the running counter ended on).
index = len(expanded_names)

# Reverse lookup: display name -> expanded column position.
name_to_column = {label: position for position, label in column_to_name.items()}

In [5]:
# Disease name -> its position in `diseases`; used to pick a label column.
disease_to_index = dict(zip(diseases, range(len(diseases))))

# File stems of the saved row/label arrays used for fitting and for evaluation.
# Despite the variable names, the values are full date strings.
train_year, test_year = '2016-10-01', '2017-01-01'

# Disease whose label column is modelled in the rest of the notebook.
target_disease = 'parasitic'

def get_x_y(year, target_disease):
    """Load the feature rows and one disease's labels for a saved period.

    Args:
        year: Stem of the ``.npy`` files under ``data/labels/`` and
            ``data/rows/`` (e.g. ``'2016-10-01'``).
        target_disease: Key of ``disease_to_index`` selecting which label
            column to return.

    Returns:
        ``(rows, labels)`` where both are restricted to the entries whose
        label for ``target_disease`` is not NaN.
    """
    all_labels = np.load('data/labels/' + year + '.npy')
    labels = all_labels[:, disease_to_index[target_disease]]
    # Keep only the entries that actually have a label for this disease.
    keep = ~np.isnan(labels)
    all_rows = np.load('data/rows/' + year + '.npy')
    return all_rows[keep, :], labels[keep]
    
# Build the fitting and evaluation splits for the chosen disease.
train_x, train_y = get_x_y(train_year, target_disease)
test_x, test_y = get_x_y(test_year, target_disease)

# Show how many labelled rows each split kept (train first, then test).
for split_rows in (train_x, test_x):
    print(len(split_rows))

10135
9673


In [6]:
import sklearn.impute
import sklearn.preprocessing

# Fill missing feature values with SimpleImputer's default strategy.
# The imputer is fitted on the training rows only and then reused for the
# test rows, so no test-set statistics enter the transformation.
mean_inputer = sklearn.impute.SimpleImputer()
T_train_x = mean_inputer.fit_transform(train_x)
T_test_x = mean_inputer.transform(test_x)

# SimpleImputer can discard columns that were entirely missing when it was
# fitted. A later cell labels coefficients with column_to_name[i], so a
# dropped column would silently shift every following name. Fail loudly.
assert T_train_x.shape[1] == train_x.shape[1], (
    'imputation changed the number of columns from %d to %d; '
    'column_to_name indices would no longer line up'
    % (train_x.shape[1], T_train_x.shape[1])
)

In [7]:
# Standardize the imputed features. The scaler is fitted on the training
# matrix only and the same fitted transform is applied to both splits.
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(T_train_x)
T_train_x = scaler.transform(T_train_x)
T_test_x = scaler.transform(T_test_x)

In [8]:
import sklearn.linear_model

# L1-regularised linear regression on the imputed, standardized features.
# The fit call is left as the last expression so the cell shows the fitted
# estimator.
model = sklearn.linear_model.Lasso(alpha=0.001)
model.fit(X=T_train_x, y=train_y)

Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [9]:
# Print the model's score on the training split, then on the test split.
for features, targets in ((T_train_x, train_y), (T_test_x, test_y)):
    print(model.score(features, targets))

0.6599929129864321
0.64802157903439


In [10]:
# Pair every non-zero coefficient with its column's display name, then list
# them from largest to smallest absolute value, one (name, value) per line.
nonzero_terms = (
    (column_to_name[position], weight)
    for position, weight in enumerate(model.coef_)
    if weight != 0
)
coefs = sorted(nonzero_terms, key=lambda term: abs(term[1]), reverse=True)
print(*coefs, sep='\n')

('technology_and_connectivity__electronics_enthusiast', 0.047638178546395286)
('technology_and_connectivity__science_&_new_tech_enthusiasts', 0.04432601582332632)
('age|18-24 years old', 0.036271513138214716)
('education|High School', -0.03617764956793113)
('state|California', -0.029259491742187168)
('education|Some High School or Less', 0.02643165602625926)
('herpes', 0.025189588214639375)
('state|Texas', -0.023752701065832955)
('other_std', 0.023750989736325265)
('restaurants_genres__steakhouse_customers', 0.023031121748072366)
('state|Michigan', -0.02030686500441746)
('hpv', 0.019920689336290688)
('state|Washington, DC', 0.019293278833931806)
('income|$250,000 or more', -0.017331592558080092)
('parasitic', 0.016873971333240777)
('state|Connecticut', 0.016361279411676876)
('gender|Female', -0.01584802258027059)
('state|Nevada', 0.014483417989796651)
('state|Delaware', 0.013692299461684423)
('state|Massachusetts', 0.013666009202496129)
('entertainment_movies__online_movie_ticket_buyer

In [13]:
import lightgbm

# Wrap the raw feature matrices (not the imputed/scaled T_* versions) and
# their labels as LightGBM datasets.
train_data = lightgbm.Dataset(train_x, label=train_y)
test_data = lightgbm.Dataset(test_x, label=test_y)

<lightgbm.basic.Dataset object at 0x7fd403fd6978>


In [30]:
# Settings for a gradient-boosted regression tracked with the l2 metric.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

# Train for up to 1000 rounds, stopping once the validation metric has not
# improved for 5 consecutive rounds. Early stopping is requested through the
# callback because the `early_stopping_rounds=` keyword of lightgbm.train()
# is not accepted by LightGBM 4.x; the callback works on older releases too.
# On releases that no longer print per-round metrics by default, add
# lightgbm.log_evaluation(1) to `callbacks` to get them back.
# NOTE(review): test_data is the early-stopping validation set here and is
# also what the later R^2 cell scores on, so that score is optimistic.
# Consider holding out a separate validation split from the training rows.
gbm = lightgbm.train(params,
    train_data,
    num_boost_round=1000,
    valid_sets=[train_data, test_data],
    callbacks=[lightgbm.early_stopping(5)])


[1]	training's l2: 0.0451449	valid_1's l2: 0.0462867
Training until validation scores don't improve for 5 rounds.
[2]	training's l2: 0.0420191	valid_1's l2: 0.04325
[3]	training's l2: 0.0391906	valid_1's l2: 0.0405171
[4]	training's l2: 0.0366414	valid_1's l2: 0.0380304
[5]	training's l2: 0.0343499	valid_1's l2: 0.0358043
[6]	training's l2: 0.0322431	valid_1's l2: 0.0337725
[7]	training's l2: 0.0303491	valid_1's l2: 0.0319387
[8]	training's l2: 0.0286141	valid_1's l2: 0.0302624
[9]	training's l2: 0.0270499	valid_1's l2: 0.0287692
[10]	training's l2: 0.0256355	valid_1's l2: 0.0274088
[11]	training's l2: 0.0243249	valid_1's l2: 0.0261487
[12]	training's l2: 0.0231419	valid_1's l2: 0.0250054
[13]	training's l2: 0.0220617	valid_1's l2: 0.0239643
[14]	training's l2: 0.0210853	valid_1's l2: 0.0230363
[15]	training's l2: 0.020202	valid_1's l2: 0.022197
[16]	training's l2: 0.0193759	valid_1's l2: 0.0213943
[17]	training's l2: 0.0186282	valid_1's l2: 0.0206925
[18]	training's l2: 0.0179612	vali

In [28]:
# Imported explicitly: no earlier cell imports sklearn.metrics by name, so
# the attribute lookup below would otherwise depend on another sklearn
# submodule having pulled it in as a side effect.
import sklearn.metrics

# Score the boosted model on the test split with R^2.
predictions = gbm.predict(test_x)

print(sklearn.metrics.r2_score(test_y, predictions))

0.7270072210271461
