In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler



In [2]:
# Load the raw WHO table. index_col=0 turns the first CSV column into the row index
# (presumably the country name, since later code drops a 'Country' column after
# reset_index — verify against the CSV header).
raw_data = pd.read_csv('../../../data/who/who.csv',index_col=0)

In [3]:
# Replacement headers for the raw table. They are assigned positionally,
# so the order here has to match the column order of the CSV.
columns = [
    'year', 'status', 'label',
    'adult_mortality', 'infant_deaths', 'alcohol', 'pct_expenditure',
    'hep_b', 'measles', 'bmi', 'under_5_deaths', 'polio',
    'total_expenditure', 'diptheria', 'hiv_aids',
    'gdp', 'population',
    'thinness_119', 'thinness_59',
    'income_comp', 'schooling',
]

# Rename the raw frame's columns, then derive the modelling frame without year/status
raw_data.columns = columns
data = raw_data.drop(['year', 'status'], axis=1)

In [4]:
def drop_labels_fix_vals(data):
    """Strip the country identifier, discard unlabeled rows, and impute gaps.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose index is named ``Country`` and which has a ``label`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) with rows numbered 0..n-1,
        no ``Country`` column, no rows whose ``label`` is missing, and every
        remaining missing value replaced by the median of its column.
    """
    # Turn the index into a regular column so the country identifier can be removed
    without_country = data.reset_index().drop(columns=['Country'])

    # Keep only the rows that carry a label, then renumber them
    labelled = without_country[without_country['label'].notnull()]
    labelled = labelled.reset_index(drop=True)

    # Medians are taken over the labelled rows only
    return labelled.fillna(labelled.median())

In [5]:
# Clean the full table, then hold out a test split; random_state=0 fixes the split.
# NOTE(review): drop_labels_fix_vals imputes with medians of the whole table, and it runs
# before the split, so test rows contribute to the values imputed into training rows —
# confirm this is acceptable for the experiments that consume these files.
d = drop_labels_fix_vals(data)
train, test = train_test_split(d, random_state=0)

In [11]:
# save (regression) datasets: labels are still continuous at this point
# NOTE(review): these files go to 'data/who/' relative to the working directory, whereas
# the raw CSV is read from '../../../data/who/' — confirm the intended output directory.
train.to_csv('data/who/who_train.csv',index=False)
test.to_csv('data/who/who_test.csv', index=False)

# Convert to binary label

In [6]:
# Renumber the rows of both splits 0..n-1, discarding the row labels
# carried over from the pre-split frame
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [7]:
def convert_to_binary(train, test, med=None):
    """Binarize the ``label`` column of a train/test pair at a shared threshold.

    A row's new label is 1 when its original label is greater than or equal to
    the threshold and 0 otherwise (a missing label compares as False and
    therefore becomes 0).

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with a numeric ``label`` column. Neither frame is modified, and
        their indices may be arbitrary (they do not need to be 0..n-1).
    med : float, optional
        Threshold to apply to both frames. When omitted, the median of
        ``train``'s labels is used, so the test set never influences it.

    Returns
    -------
    train_bin : pandas.DataFrame
        Copy of ``train`` with an integer 0/1 ``label`` column.
    test_bin : pandas.DataFrame
        Copy of ``test`` with an integer 0/1 ``label`` column.
    med : float
        The threshold that was applied, so callers can reuse it on other data.
    """
    if med is None:
        med = np.median(train['label'])

    train_bin = train.copy()
    test_bin = test.copy()

    # Compare the raw values positionally instead of looking rows up by the
    # keys 0..n-1: that lookup raised KeyError (or read the wrong row) whenever
    # the frame's index had not been reset beforehand.
    train_bin['label'] = (train['label'].to_numpy() >= med).astype(int)
    test_bin['label'] = (test['label'].to_numpy() >= med).astype(int)

    return train_bin, test_bin, med

In [8]:
# Binarize the labels (threshold = median of the training labels) and save both splits.
# NOTE(review): these files go to 'data/who/' relative to the working directory, whereas
# the raw CSV is read from '../../../data/who/' — confirm the intended output directory.
train_bin, test_bin, _ = convert_to_binary(train, test)
train_bin.to_csv('data/who/whobin_train.csv',index=False)
test_bin.to_csv('data/who/whobin_test.csv', index=False)

# Break into two datasets for dataset shift

In [9]:
# Centre of the year distribution, used to pick the old-vs-new cutoff year
print(raw_data['year'].median())
print(raw_data['year'].mean())

2008.0
2007.5187202178352


The median year is 2008 and the mean year is about 2007.5, so we start with 2008 as the cutoff between old and new data, then gradually raise the cutoff year to produce progressively smaller temporal shifts.


I'm not sure whether to combine the shifted dataset with the original. I think I will, to represent receiving new data and updating a model (even though, in practice, an update wouldn't retrain from scratch).

In [10]:
def get_shifted_datasets(year, raw_data, random_state=0):
    """Build binary-label train/test sets before and after a temporal cutoff.

    Rows with ``raw_data.year < year`` form the "original" data. The "shifted"
    data is the newer rows (``year >= year``) combined with the original rows,
    representing a dataset that has grown with newer observations. Each part
    is cleaned separately with ``drop_labels_fix_vals`` before being combined.

    Parameters
    ----------
    year : int
        Cutoff year separating original rows from newer rows.
    raw_data : pandas.DataFrame
        Frame with ``year``, ``status`` and ``label`` columns, in the form
        expected by ``drop_labels_fix_vals``.
    random_state : int, optional
        Seed for the shuffle of the shifted data and for both train/test
        splits. The shuffle used to be unseeded, so the shifted train/test
        sets changed on every run; the default of 0 keeps the seed the splits
        already used.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(orig_train, orig_test, shift_train, shift_test)``, each with a 0/1
        ``label`` column. All four share one threshold: the median label of
        the original training set.

    Notes
    -----
    The shifted data contains every original row and is split independently,
    so rows of ``orig_train`` can appear in ``shift_test`` and vice versa.
    """
    dropped = ['year', 'status']
    is_old = raw_data.year < year

    # Clean each period on its own: medians for imputation are per period
    data_orig = drop_labels_fix_vals(raw_data[is_old].drop(columns=dropped))
    data_new = drop_labels_fix_vals(raw_data[raw_data.year >= year].drop(columns=dropped))

    # Shifted set = newer rows + original rows, shuffled reproducibly
    data_shift = pd.concat([data_new, data_orig]).reset_index(drop=True)
    data_shift = data_shift.sample(frac=1, random_state=random_state).reset_index(drop=True)

    data_orig_train, data_orig_test = train_test_split(data_orig, random_state=random_state)
    data_shift_train, data_shift_test = train_test_split(data_shift, random_state=random_state)

    # reset indices so every split is numbered 0..n-1
    data_orig_train = data_orig_train.reset_index(drop=True)
    data_orig_test = data_orig_test.reset_index(drop=True)
    data_shift_train = data_shift_train.reset_index(drop=True)
    data_shift_test = data_shift_test.reset_index(drop=True)

    # convert to binary; the shifted sets reuse the original training median
    # so that labels mean the same thing in both datasets
    data_orig_train_bin, data_orig_test_bin, med = convert_to_binary(data_orig_train, data_orig_test)
    data_shift_train_bin, data_shift_test_bin, _ = convert_to_binary(data_shift_train, data_shift_test, med)

    return data_orig_train_bin, data_orig_test_bin, data_shift_train_bin, data_shift_test_bin

In [13]:
# Cutoff year 2014: rows with year < 2014 form the "original" data; the "shifted" data
# adds the rows from 2014 onwards to them. All four returned frames have 0/1 labels.
data_orig_train, data_orig_test, data_shift_train, data_shift_test = get_shifted_datasets(2014, raw_data)

In [14]:
# Size of the original training set relative to the shifted (original + newer) training set
print(len(data_orig_train)/len(data_shift_train))

0.8747723132969034


In [15]:
# save to csv
# `pct` tags the file names; presumably the orig/shift training-size ratio as a
# percentage, which has to be kept in sync by hand — verify when the cutoff year changes
pct = '87'
out_stem = '../../../data/who/whobin_' + pct
for frame, suffix in (
    (data_orig_train, '_orig_train.csv'),
    (data_orig_test, '_orig_test.csv'),
    (data_shift_train, '_shift_train.csv'),
    (data_shift_test, '_shift_test.csv'),
):
    frame.to_csv(out_stem + suffix, index=False)



In [72]:
# NOTE(review): `data_orig` is not assigned in any visible top-level cell (it only appears
# as a local inside get_shifted_datasets), so this cell presumably relies on leftover
# kernel state — confirm where it comes from, or remove the cell.
data_orig

Unnamed: 0,label,adult_mortality,infant_deaths,alcohol,pct_expenditure,hep_b,measles,bmi,under_5_deaths,polio,total_expenditure,diptheria,hiv_aids,gdp,population,thinness_119,thinness_59,income_comp,schooling
0,57.5,295.0,82,0.02,10.910156,63.0,1141,15.2,113,63.0,6.73,63.0,0.1,369.835796,26616792.0,19.0,19.1,0.415,8.4
1,57.3,295.0,84,0.03,17.171518,64.0,1990,14.7,116,58.0,7.43,58.0,0.1,272.563770,2589345.0,19.2,19.3,0.405,8.1
2,57.3,291.0,85,0.02,1.388648,66.0,1296,14.2,118,58.0,8.70,58.0,0.1,25.294130,257798.0,19.3,19.5,0.396,7.9
3,57.0,293.0,87,0.02,15.296066,67.0,466,13.8,120,5.0,8.79,5.0,0.1,219.141353,24118979.0,19.5,19.7,0.381,6.8
4,56.7,295.0,87,0.01,11.089053,65.0,798,13.4,122,41.0,8.82,41.0,0.1,198.728544,2364851.0,19.7,19.9,0.373,6.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1459,44.3,723.0,27,4.36,0.000000,68.0,31,27.1,42,67.0,7.13,65.0,33.6,454.366654,12777511.0,9.4,9.4,0.407,9.2
1460,44.5,715.0,26,4.06,0.000000,7.0,998,26.7,41,7.0,6.52,68.0,36.7,453.351155,12633897.0,9.8,9.9,0.418,9.5
1461,44.8,73.0,25,4.43,0.000000,73.0,304,26.3,40,73.0,6.53,71.0,39.8,57.348340,125525.0,1.2,1.3,0.427,10.0
1462,45.3,686.0,25,1.72,0.000000,76.0,529,25.9,39,76.0,6.16,75.0,42.1,548.587312,12366165.0,1.6,1.7,0.427,9.8
