In [9]:
import os
import pandas as pd
import numpy as np
import seaborn as sns

sns.set()

# Anomaly detection in NHAMCS data

## Loading the dataset

In [10]:
nhamcs = pd.read_csv(
    os.path.join('data', 'nhamcs2018.csv'),
    usecols=[
        'VMONTH',
        'VDAYR',
        'RFV1',
    ],
    na_values={
        'RFV1': -9,
    }
)
nhamcs['year'] = 2018

nhamcs.rename(columns={
    'VMONTH': 'month',
    'VDAYR': 'day',
},
inplace=True)

nhamcs['date'] = pd.to_datetime(nhamcs[['year', 'month', 'day']])

nhamcs.drop([
        'year',
        'month',
        'day',
    ],
    axis=1,
    inplace=True,
)

nhamcs.head(10)

Unnamed: 0,RFV1,date
0,10100.0,2018-12-01
1,55450.0,2018-12-01
2,10100.0,2018-12-06
3,15451.0,2018-12-04
4,55450.0,2018-11-02
5,10100.0,2018-12-01
6,14400.0,2018-12-07
7,14400.0,2018-12-04
8,55050.0,2018-12-07
9,18600.0,2018-12-06


## Creating feature vectors

Restricting reasons for visit to only those associated with flu-like symptoms then switching from categorical encoding to on-hot.

In [11]:
import rfv
nhamcs = nhamcs[nhamcs["RFV1"].isin(rfv.rfv1)].reindex()

new = pd.get_dummies(nhamcs['RFV1'], prefix='RFV')

nhamcs.drop('RFV1', axis=1, inplace=True)

nhamcs = nhamcs.join(new)

nhamcs.head(10)

Unnamed: 0,date,RFV_10050.0,RFV_10100.0,RFV_10150.0,RFV_10200.0,RFV_10250.0,RFV_10300.0,RFV_10350.0,RFV_10352.0,RFV_10353.0,...,RFV_14851.0,RFV_15250.0,RFV_15300.0,RFV_15350.0,RFV_15400.0,RFV_15450.0,RFV_15451.0,RFV_16150.0,RFV_19651.0,RFV_46050.0
0,2018-12-01,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2018-12-06,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2018-12-04,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5,2018-12-01,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,2018-12-07,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,2018-12-04,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11,2018-12-04,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
12,2018-12-06,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17,2018-12-01,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18,2018-12-07,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
syndromic_features = nhamcs.groupby('date').sum().apply(lambda row: row/float(row.sum()))

syndromic_features.head(10)

Unnamed: 0_level_0,RFV_10050.0,RFV_10100.0,RFV_10150.0,RFV_10200.0,RFV_10250.0,RFV_10300.0,RFV_10350.0,RFV_10352.0,RFV_10353.0,RFV_10501.0,...,RFV_14851.0,RFV_15250.0,RFV_15300.0,RFV_15350.0,RFV_15400.0,RFV_15450.0,RFV_15451.0,RFV_16150.0,RFV_19651.0,RFV_46050.0
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-01,0.0,0.018182,0.0,0.005076,0.0,0.015748,0.0,0.0,0.0,0.009444,...,0.0,0.003704,0.025641,0.0,0.0,0.047619,0.008621,0.0,0.142857,0.0
2018-01-02,0.058824,0.038636,0.046512,0.020305,0.043478,0.015748,0.0,0.0,0.0,0.018888,...,0.0,0.018519,0.037037,0.0,0.0,0.015873,0.015674,0.111111,0.142857,0.016129
2018-01-03,0.0,0.030682,0.046512,0.025381,0.0,0.0,0.0,0.0,0.0,0.022036,...,0.0,0.022222,0.031339,0.0,0.0,0.015873,0.014107,0.0,0.0,0.016129
2018-01-04,0.0,0.031818,0.046512,0.010152,0.043478,0.015748,0.0,0.0,0.0,0.011542,...,0.0,0.022222,0.031339,0.0,0.0,0.047619,0.019592,0.055556,0.0,0.064516
2018-01-05,0.0,0.022727,0.0,0.020305,0.0,0.023622,0.0,0.0,0.0,0.017838,...,0.0,0.011111,0.034188,0.0,0.0,0.0,0.010972,0.0,0.0,0.016129
2018-01-06,0.058824,0.031818,0.023256,0.025381,0.021739,0.0,0.0,0.0,0.0,0.011542,...,0.0,0.007407,0.008547,0.0,0.0,0.0,0.009404,0.0,0.0,0.032258
2018-01-07,0.0,0.013636,0.023256,0.0,0.043478,0.0,0.0,0.0,0.0,0.017838,...,0.0,0.003704,0.011396,0.0,0.0,0.0,0.00627,0.0,0.0,0.048387
2018-02-01,0.0,0.017045,0.0,0.005076,0.021739,0.031496,0.0,0.166667,0.0,0.008395,...,0.0,0.011111,0.005698,0.090909,0.0,0.015873,0.009404,0.055556,0.0,0.016129
2018-02-02,0.0,0.023864,0.023256,0.015228,0.0,0.015748,0.083333,0.0,0.0,0.009444,...,0.0,0.02963,0.017094,0.0,0.0,0.0,0.015674,0.0,0.0,0.032258
2018-02-03,0.0,0.023864,0.0,0.020305,0.021739,0.015748,0.0,0.0,0.0,0.018888,...,0.0,0.011111,0.019943,0.0,0.0,0.0,0.009404,0.0,0.0,0.0
