# Data Preprocessing

In [2]:
from utils import *

## External Experiments

### Load Data

In [21]:
# Read the train and test splits with identical options: `time_window` is
# parsed into datetimes and no column is used as the index.
train, test = (
    pd.read_csv(csv_path, index_col=False, parse_dates=['time_window'])
    for csv_path in (
        'health_state_prediction/data/data.train.csv',
        'health_state_prediction/data/data.test.csv',
    )
)

### Exercise Behavior Extraction

In [22]:
def extract_exercise_behavior(data, pattern_context):
    """Add exercise-pattern features from later months to every row of ``data``.

    For each feature column of ``pattern_context`` and each month offset
    ``i`` in 1..4, a column ``'<col>_m<i+1>'`` is written into ``data``. Its
    value is looked up under ``(userID, month)``, where ``month`` is the first
    seven characters (``YYYY-MM``) of the row's ``time_window`` shifted ``i``
    months forward. Rows without a matching entry get ``None``.

    Args:
        data: DataFrame with ``userID`` and datetime-like ``time_window``
            columns. It is modified in place.
        pattern_context: DataFrame with ``userID`` and ``time_window`` columns
            plus feature columns. ``time_window`` presumably holds ``'YYYY-MM'``
            strings, since that is the key format used for the lookup -- verify
            against the files that feed this function. The columns ``cluster``
            and ``periodicity`` and any column whose name contains ``'std'``
            are not used as features.

    Returns:
        The same ``data`` object, with the new columns appended.
    """
    ignored = ('userID', 'time_window', 'cluster', 'periodicity')
    feature_cols = [
        col for col in pattern_context.columns
        if col not in ignored and 'std' not in col
    ]
    exercise_dict = pattern_context.set_index(['userID', 'time_window']).to_dict('index')

    # The lookup keys depend only on the month offset, not on the feature
    # column, so they are built once per offset instead of once per column.
    month_offsets = range(1, 5)
    keys_by_offset = {}
    for i in month_offsets:
        shift = pd.DateOffset(months=i)
        keys_by_offset[i] = [
            (user, str(month + shift)[:7])
            for user, month in zip(data['userID'], data['time_window'])
        ]

    no_entry = {}
    # Column-major order (all offsets of one feature before the next feature)
    # keeps the resulting column layout unchanged.
    for col in feature_cols:
        for i in month_offsets:
            data['{}_m{}'.format(col, 1 + i)] = [
                exercise_dict.get(key, no_entry).get(col)
                for key in keys_by_offset[i]
            ]
    return data

In [23]:
def add_pure_features(data):
    """Return ``data`` extended with the PURE exercise-pattern features.

    The clustered exercise patterns of the train and test splits are read
    from disk, stacked into one table and passed to
    ``extract_exercise_behavior`` together with ``data``. Every missing value
    in the resulting frame (not only in the new columns) is replaced by 0.
    """
    cluster_files = (
        'health_state_prediction/data/exercise_patterns.clusters.train.csv',
        'health_state_prediction/data/exercise_patterns.clusters.test.csv',
    )
    # User representations of both splits, one table.
    pattern_context = pd.concat([pd.read_csv(path) for path in cluster_files])

    enriched = extract_exercise_behavior(data, pattern_context)
    return enriched.fillna(0)

In [24]:
# Enrich the train split first, then the test split, with the PURE features.
train_samples, test_samples = (add_pure_features(split) for split in (train, test))

### Save Data

In [25]:
def save_data(data, file_path, random_state=None):
    """Derive the ``target`` label, shuffle the rows and write them to a CSV file.

    ``target`` is 0 where ``target_belowAvg`` equals ``True`` and 1 for every
    other value. The column is added to ``data`` in place, so the caller's
    frame gains it as well; the shuffled frame itself is only written to disk.

    Args:
        data: DataFrame containing a ``target_belowAvg`` column.
        file_path: Destination of the CSV file (written without the index).
        random_state: Optional seed passed to ``DataFrame.sample``. The
            default ``None`` keeps the shuffle unseeded; pass a value to make
            the row order reproducible.
    """
    # `== True` is intentional: unlike `is True` it also matches numpy
    # booleans, and unlike plain truthiness it maps other truthy values to 1.
    data['target'] = [0 if flag == True else 1 for flag in data['target_belowAvg']]  # noqa: E712
    shuffled = data.sample(frac=1, random_state=random_state).reset_index(drop=True)
    shuffled.to_csv(file_path, index=False)

In [26]:
# Write the enriched splits to disk, train first and then test.
for samples, destination in (
    (train_samples, 'health_state_prediction/data/train.csv'),
    (test_samples, 'health_state_prediction/data/test.csv'),
):
    save_data(samples, destination)

## Internal Experiments

### PFPM + PCA

In [34]:
# Read the exercise-pattern tables of both splits; no column is used as the index.
ptn_train, ptn_test = (
    pd.read_csv(csv_path, index_col=False)
    for csv_path in (
        'health_state_prediction/data/exercise_patterns.train.csv',
        'health_state_prediction/data/exercise_patterns.test.csv',
    )
)

In [36]:
from sklearn.decomposition import PCA
# Number of principal components to keep.
K = 11
# Names of the pattern feature columns: every column starting with 'pattern_'.
Ep = [name for name in ptn_train.columns if name.startswith('pattern_')]

In [37]:
# Fit a K-component PCA on the pattern columns of the training table.
# The fit call stays the last expression so the cell still echoes the estimator.
train_pattern_matrix = ptn_train[Ep].values
pca = PCA(n_components=K)
pca.fit(train_pattern_matrix)

PCA(copy=True, iterated_power='auto', n_components=11, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [38]:
# Project the pattern columns of both tables with the fitted PCA (train first).
ptn_train_decomp, ptn_test_decomp = (
    pca.transform(frame[Ep].values) for frame in (ptn_train, ptn_test)
)

In [39]:
# Keep only the identifying columns. The explicit copies make the frames
# independent of the originals, so the column assignments below do not act on
# a selection of another frame (which triggers SettingWithCopyWarning).
ptn_train = ptn_train[['userID', 'time_window']].copy()
ptn_test = ptn_test[['userID', 'time_window']].copy()
for i in range(K):
    # Component i of every row is column i of the decomposed matrix.
    ptn_train['pattern_{}'.format(i)] = ptn_train_decomp[:, i]
    ptn_test['pattern_{}'.format(i)] = ptn_test_decomp[:, i]

In [40]:
# Stack both splits and map each (userID, time_window) pair to the list of its
# remaining column values, in column order.
pattern_context = pd.concat([ptn_train, ptn_test])
pattern_dict = {
    key: list(row.values())
    for key, row in pattern_context.set_index(['userID', 'time_window']).to_dict('index').items()
}

# Store the lookup table as a pickle file.
with open('health_state_prediction/data/exercise_patterns.decomposed.pkl', 'wb') as f:
    pkl.dump(pattern_dict, f)

### FPM + PCA

In [4]:
# Read the exercise-pattern tables of both splits from the tmp directory;
# no column is used as the index.
ptn_train, ptn_test = (
    pd.read_csv(csv_path, index_col=False)
    for csv_path in (
        'health_state_prediction/data/tmp/exercise_patterns.train.csv',
        'health_state_prediction/data/tmp/exercise_patterns.test.csv',
    )
)

In [5]:
from sklearn.decomposition import PCA
# Number of principal components to keep.
K = 6
# Names of the pattern feature columns: every column starting with 'pattern_'.
Ep = [name for name in ptn_train.columns if name.startswith('pattern_')]

In [6]:
# Fit a K-component PCA on the pattern columns of the training table.
# The fit call stays the last expression so the cell still echoes the estimator.
train_pattern_matrix = ptn_train[Ep].values
pca = PCA(n_components=K)
pca.fit(train_pattern_matrix)

PCA(copy=True, iterated_power='auto', n_components=6, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [7]:
# Project the pattern columns of both tables with the fitted PCA (train first).
ptn_train_decomp, ptn_test_decomp = (
    pca.transform(frame[Ep].values) for frame in (ptn_train, ptn_test)
)

In [8]:
# Keep only the identifying columns. The explicit copies make the frames
# independent of the originals, so the column assignments below do not act on
# a selection of another frame (which triggers SettingWithCopyWarning).
ptn_train = ptn_train[['userID', 'time_window']].copy()
ptn_test = ptn_test[['userID', 'time_window']].copy()
for i in range(K):
    # Component i of every row is column i of the decomposed matrix.
    ptn_train['pattern_{}'.format(i)] = ptn_train_decomp[:, i]
    ptn_test['pattern_{}'.format(i)] = ptn_test_decomp[:, i]

In [9]:
# Stack both splits and map each (userID, time_window) pair to the list of its
# remaining column values, in column order.
pattern_context = pd.concat([ptn_train, ptn_test])
pattern_dict = {
    key: list(row.values())
    for key, row in pattern_context.set_index(['userID', 'time_window']).to_dict('index').items()
}

# Store the lookup table as a pickle file.
with open('health_state_prediction/data/tmp/exercise_patterns.decomposed.pkl', 'wb') as f:
    pkl.dump(pattern_dict, f)

---