In [1]:
import numpy
import csv
import numpy as np
from collections import defaultdict

In [2]:
# Keyed by the first five CSV columns; each entry maps a date string to that
# row's list of disease values. Filled while the CSV is read.
disease_map = defaultdict(dict)

# column index -> {raw string value: integer code}; only columns that contain
# at least one non-numeric value ever get an entry.
values_per_column = defaultdict(dict)


def add_to_dict(column_index, value):
    """Register `value` as a category of column `column_index`.

    A value seen for the first time receives the next unused integer code
    (0, 1, 2, ... in order of first appearance). A value that is already
    registered keeps its existing code. Returns nothing; the mapping is
    stored in the module-level `values_per_column`.
    """
    codes = values_per_column[column_index]
    # setdefault only inserts when the key is absent, and len(codes) is
    # evaluated before the insert, so it is exactly the next free code.
    codes.setdefault(value, len(codes))

# Number of CSV rows seen for each distinct value of the `date` column.
counts_per_year = defaultdict(int)

# Disease columns read from the CSV. The names are looked up in the header, so
# the spellings here must match the file exactly.
diseases = ['chlamydia', 'gential_warts', 'gonorrhea', 'herpes', 'hpv', 'other_std', 'parasitic', 'std_screen', 'syphilis', 'trich']

# First pass over the data: discover the categorical values of every column,
# count rows per date, and record each row's disease values.
with open('data/stanford_blueprint_datathon_2019_data.csv') as f:
    reader = csv.reader(f)
    header = next(reader)

    header_map = {name: i for i, name in enumerate(header)}

    # Column positions are fixed for the whole file, so resolve them once
    # instead of once per row.
    date_column = header_map['date']
    disease_columns = [header_map[disease] for disease in diseases]

    for row in reader:
        # Any non-empty cell that does not parse as a number marks its column
        # as categorical and is registered as one of that column's values.
        # NOTE(review): a numeric-looking cell in an otherwise categorical
        # column is not registered here; the later one-hot encoding looks every
        # non-empty cell of such a column up by value - confirm no column mixes both.
        for i, column in enumerate(row):
            if column == '':
                continue
            try:
                float(column)
            except ValueError:
                # Only a failed numeric parse means "categorical"; anything
                # else (e.g. KeyboardInterrupt) must propagate.
                add_to_dict(i, column)

        # Empty disease cells become NaN so they can be told apart from 0.
        disease_values = [
            float(row[j]) if row[j] != '' else float('nan')
            for j in disease_columns
        ]

        date = row[date_column]
        counts_per_year[date] += 1

        # The first five columns (age, gender, state, income, education)
        # identify the group this row belongs to.
        disease_map[tuple(row[:5])][date] = disease_values

In [6]:
def add_months(date, delta):
    """Shift a 'YYYY-MM-DD' date string by `delta` months.

    `delta` may be negative. Only the year and month are recomputed; the day
    part is carried over verbatim as text, with no check that such a day
    exists in the resulting month. The month is zero-padded to two digits,
    the year is printed as a plain integer.
    """
    year_text, month_text, day = date.split('-')

    # Count months from year 0 (January == 0) so that a single floor division
    # handles roll-over in both directions.
    months_total = int(year_text) * 12 + (int(month_text) - 1) + delta
    year, month_index = divmod(months_total, 12)

    return f'{year}-{month_index + 1:02}-{day}'

# Width of one encoded row: a categorical column expands to one slot per known
# value plus a leading "missing" slot; every other column stays a single number.
total_columns = sum(
    len(values_per_column[i]) + 1 if i in values_per_column else 1
    for i in range(len(header))
)

# Pre-allocated feature matrix, label matrix and write cursor for every
# distinct `date` value counted during the first pass.
rows_per_year = {}
labels_per_year = {}
next_index_per_year = {}
for year, num in counts_per_year.items():
    rows_per_year[year] = np.zeros((num, total_columns))
    labels_per_year[year] = np.zeros((num, len(diseases)))
    next_index_per_year[year] = 0

# Second pass over the same file: encode every row and attach its label.
with open('data/stanford_blueprint_datathon_2019_data.csv') as f:
    reader = csv.reader(f)
    header = next(reader)

    header_map = {name: i for i, name in enumerate(header)}

    for row in reader:
        year = row[header_map['date']]

        result = []
        for i, column in enumerate(row):
            codes = values_per_column.get(i)
            if codes is not None:
                # Categorical column: slot 0 flags an empty cell, known
                # values occupy slots 1..len(codes).
                one_hot = [0] * (len(codes) + 1)
                slot = 0 if column == '' else 1 + codes[column]
                one_hot[slot] = 1
                result.extend(one_hot)
            elif column == '':
                result.append(float('nan'))
            else:
                result.append(float(column))

        # The label is the disease vector recorded for the same first-five-column
        # key three months later, or all-NaN when no such row was seen.
        next_date = add_months(year, 3)
        group_history = disease_map[tuple(row[:5])]
        nan_label = [float('nan')] * len(diseases)
        lab = group_history.get(next_date, nan_label)

        # Write into the next free line of this date's matrices.
        index = next_index_per_year[year]
        next_index_per_year[year] = index + 1
        rows_per_year[year][index, :] = result
        labels_per_year[year][index, :] = lab

In [110]:
# Write every date's feature and label matrix to disk, one file per date
# (np.save appends ".npy" to the given name).
for year, rows in rows_per_year.items():
    np.save('data/rows/' + year, rows)
    np.save('data/labels/' + year, labels_per_year[year])

In [111]:
import pickle

# Persist the two lookup tables built while reading the CSV.
for path, table in (
    ('data/disease_map.pkl', disease_map),
    ('data/values_per_column.pkl', values_per_column),
):
    with open(path, 'wb') as f:
        pickle.dump(table, f)

In [35]:
# One display name per encoded feature column, in the order the encoded rows
# lay them out: "<column>|" for a categorical column's missing-value slot,
# then "<column>|<value>" for each of its known values; every other column
# keeps its plain header name.
encoded_names = []
for i, name in enumerate(header):
    if i not in values_per_column:
        encoded_names.append(name)
        continue
    encoded_names.append(name + "|")
    encoded_names.extend(name + "|" + val for val in values_per_column[i])

column_to_name = dict(enumerate(encoded_names))

# Reverse lookup: display name -> encoded column position.
name_to_column = {column: index for index, column in column_to_name.items()}

In [46]:
# Position of each disease inside a label row.
disease_to_index = dict(zip(diseases, range(len(diseases))))

# Dates whose rows are used for fitting and for evaluation respectively.
train_year, test_year = '2016-10-01', '2017-01-01'

# Disease whose label column is the regression target.
target_disease = 'parasitic'

def get_x_y(year, target_disease):
    """Return `(features, labels)` for one date, keeping only labelled rows.

    `year` selects the matrices in `rows_per_year` / `labels_per_year`;
    `target_disease` selects the label column via `disease_to_index`.
    Rows whose label for that disease is NaN are dropped from both outputs.
    """
    year_labels = labels_per_year[year]
    labels = year_labels[:, disease_to_index[target_disease]]
    has_label = np.logical_not(np.isnan(labels))
    return rows_per_year[year][has_label, :], labels[has_label]
    
# Build both splits, then report how many labelled rows each one contains.
train_x, train_y = get_x_y(train_year, target_disease)
test_x, test_y = get_x_y(test_year, target_disease)

for split in (train_x, test_x):
    print(len(split))

10135
9673


In [47]:
import sklearn.impute

# Replace NaN features with per-column means learned from the training split
# only; the test split is filled with those same training means.
# NOTE(review): with the mean strategy, SimpleImputer documents that columns
# which are entirely NaN at fit time are dropped from the output. That would
# shift column positions relative to `column_to_name` - confirm no such column.
mean_inputer = sklearn.impute.SimpleImputer(strategy='mean')
mean_inputer.fit(train_x)
T_train_x = mean_inputer.transform(train_x)
T_test_x = mean_inputer.transform(test_x)

In [48]:
# `sklearn.preprocessing` must be imported explicitly: importing another
# sklearn submodule does not guarantee this one is loaded, so the attribute
# access below could otherwise fail on a fresh kernel.
import sklearn.preprocessing

# Standardise every feature using statistics fitted on the training split
# only, then apply that same transform to the test split.
scaler = sklearn.preprocessing.StandardScaler()
T_train_x = scaler.fit_transform(T_train_x)
T_test_x = scaler.transform(T_test_x)

In [101]:
import sklearn.linear_model

# L1-regularised linear regression on the imputed, standardised features.
# The fit call stays last so the cell still displays the fitted estimator.
lasso_alpha = 0.001
model = sklearn.linear_model.Lasso(alpha=lasso_alpha)
model.fit(T_train_x, train_y)

Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [102]:
# Model score (R^2 for a regressor) on the training split, then on the test split.
for features, targets in ((T_train_x, train_y), (T_test_x, test_y)):
    print(model.score(features, targets))

0.6599929129864321
0.64802157903439


In [105]:
# Features the Lasso kept (non-zero weight), paired with their display name
# and listed from largest to smallest absolute weight, one per line.
nonzero_weights = (
    (column_to_name[position], weight)
    for position, weight in enumerate(model.coef_)
    if weight != 0
)
coefs = sorted(nonzero_weights, key=lambda pair: abs(pair[1]), reverse=True)
print('\n'.join(map(str, coefs)))

('technology_and_connectivity__electronics_enthusiast', 0.047638178546395286)
('technology_and_connectivity__science_&_new_tech_enthusiasts', 0.04432601582332632)
('age|18-24 years old', 0.036271513138214716)
('education|High School', -0.03617764956793113)
('state|California', -0.029259491742187168)
('education|Some High School or Less', 0.02643165602625926)
('herpes', 0.025189588214639375)
('state|Texas', -0.023752701065832955)
('other_std', 0.023750989736325265)
('restaurants_genres__steakhouse_customers', 0.023031121748072366)
('state|Michigan', -0.02030686500441746)
('hpv', 0.019920689336290688)
('state|Washington, DC', 0.019293278833931806)
('income|$250,000 or more', -0.017331592558080092)
('parasitic', 0.016873971333240777)
('state|Connecticut', 0.016361279411676876)
('gender|Female', -0.01584802258027059)
('state|Nevada', 0.014483417989796651)
('state|Delaware', 0.013692299461684423)
('state|Massachusetts', 0.013666009202496129)
('entertainment_movies__online_movie_ticket_buyer

In [106]:
import lightgbm