In [8]:
import os
import pandas as pd
import numpy as np
import seaborn as sns

sns.set()

# Anomaly detection in NHAMCS data

## Loading the dataset

The NHAMCS dataset is loaded, where aach row represents a patient visit. All columns except those related to the date of the visit and the reason for the visit are discarded. The primary reason for visit column (RFV1) is categorical, with each symptom being assigned a unique numeric code.

Here, the visit year, month, and day are combined into a datetime and the original date columns are discarded.

In [9]:
years = [
    2015,
    2016,
    2017,
    2018,
]

cols = [
    'VMONTH',
    'VDAYR',
    'RFV1',
]

col_rename = {
    'VMONTH': 'month',
    'VDAYR': 'day',
}

na = {
    'RFV1': -9,
}

def get_nhamcs_data(years):
    for year in years:
        df = pd.read_csv(
            os.path.join('data', 'nhamcs2018.csv'),
            usecols=cols,
            na_values=na,
        )
        df['year'] = year
        yield df

frames = list(get_nhamcs_data(years))
nhamcs = pd.concat(frames, join='inner', ignore_index=True)

nhamcs.rename(columns=col_rename, inplace=True)
nhamcs['date'] = pd.to_datetime(nhamcs[['year', 'month', 'day']])
nhamcs.drop([
        'year',
        'month',
        'day',
    ],
    axis=1,
    inplace=True,
)

nhamcs.head(10)

Unnamed: 0,RFV1,date
0,10100.0,2015-12-01
1,55450.0,2015-12-01
2,10100.0,2015-12-06
3,15451.0,2015-12-04
4,55450.0,2015-11-02
5,10100.0,2015-12-01
6,14400.0,2015-12-07
7,14400.0,2015-12-04
8,55050.0,2015-12-07
9,18600.0,2015-12-06


## Creating feature vectors

The reasons for the visit are restricting to only those associated with flu-like symptoms. The categorical information is replaced with a new one-hot encoding, with one column for each symptom.

To create the feature vectors, the symptom are grouped by date with each row representing the fraction of patients visiting for that symptom as a fraction of the total visits.

In [10]:
import rfv

nhamcs = nhamcs[nhamcs["RFV1"].isin(rfv.rfv1)].reindex()

onehot = pd.get_dummies(nhamcs['RFV1'], prefix='RFV')
nhamcs.drop('RFV1', axis=1, inplace=True)
nhamcs = nhamcs.join(onehot)

syndromic_features = nhamcs.groupby('date').sum().apply(lambda row: row/float(row.sum()))
syndromic_features.head()

Unnamed: 0_level_0,RFV_10050.0,RFV_10100.0,RFV_10150.0,RFV_10200.0,RFV_10250.0,RFV_10300.0,RFV_10350.0,RFV_10352.0,RFV_10353.0,RFV_10501.0,...,RFV_14851.0,RFV_15250.0,RFV_15300.0,RFV_15350.0,RFV_15400.0,RFV_15450.0,RFV_15451.0,RFV_16150.0,RFV_19651.0,RFV_46050.0
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-01,0.0,0.004545,0.0,0.001269,0.0,0.003937,0.0,0.0,0.0,0.002361,...,0.0,0.000926,0.00641,0.0,0.0,0.011905,0.002155,0.0,0.035714,0.0
2015-01-02,0.014706,0.009659,0.011628,0.005076,0.01087,0.003937,0.0,0.0,0.0,0.004722,...,0.0,0.00463,0.009259,0.0,0.0,0.003968,0.003918,0.027778,0.035714,0.004032
2015-01-03,0.0,0.00767,0.011628,0.006345,0.0,0.0,0.0,0.0,0.0,0.005509,...,0.0,0.005556,0.007835,0.0,0.0,0.003968,0.003527,0.0,0.0,0.004032
2015-01-04,0.0,0.007955,0.011628,0.002538,0.01087,0.003937,0.0,0.0,0.0,0.002886,...,0.0,0.005556,0.007835,0.0,0.0,0.011905,0.004898,0.013889,0.0,0.016129
2015-01-05,0.0,0.005682,0.0,0.005076,0.0,0.005906,0.0,0.0,0.0,0.00446,...,0.0,0.002778,0.008547,0.0,0.0,0.0,0.002743,0.0,0.0,0.004032
