# Data cleaning
The data cleaning pipeline has four stages:
1. remove outliers using a) the interquartile range (IQR) with a cutoff of 2.5 and b) "natural" bounds from other research or experimental design
2. standardize continuous variables using StandardScaler
3. calculate interaction terms, e.g. between anxiety and depression
4. propagate fixed demographic variables captured at baseline across subsequent time points

The target variables for each step are specified in `data_cleaning.py` and interaction functions are defined in `interactions.py`. It is straightforward to edit either of these files to add/remove variables and interactions.

Notes on the definition of the a priori bounds and the raw distribution of each variable can be found in this Google Doc: https://docs.google.com/document/d/1-m_O8VCCq6M2dacm8rz9VW5QbgtNk3xpaAzX3wQObpg/edit

In [None]:
import pandas as pd
import seaborn as sns
from datetime import date
import matplotlib.pyplot as plt
import numpy as np
# Notebook display settings: never truncate the column list, and render
# floats with thousands separators and two decimal places.
pd.options.display.max_columns = None
pd.set_option('display.float_format', '{:,.2f}'.format)

In [32]:
# note: in this version, tb_cryst and tb_fluid have been commented out of standardize_vars in cleaning.py 
# `!cd ../scripts` only changes directory inside a short-lived subshell, so it
# never affected where the kernel looks for modules. Put the scripts directory
# on the import path explicitly instead; the kernel's working directory (and
# therefore the relative data paths used below) is left unchanged.
import sys

scripts_dir = '../scripts'
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

from data_cleaning import (
    remove_outlier_bounds,
    remove_outlier_IQR,
    standardize,
    outlier_vars,
    standardize_vars,
    fixed_vars,
)
from interactions import add_interactions

In [4]:
# Relative path prefixes (with trailing slash) for the raw and derived data
# directories; later cells build file paths by string interpolation.
raw_data, derived_data = '../raw/', '../derived/'

In [5]:
# Version tags in YYYYMMDD form: today's date labels the output written by
# this notebook, while the fixed tag selects the derived-data version to read.
version_new = f'{date.today():%Y%m%d}'
version_old = '20211206'

In [17]:
# Load the panel from the previous derived-data version and discard the
# 'Unnamed: 0' column (presumably an index written by an earlier to_csv
# call -- confirm against the upstream notebook).
panel_path = f'{derived_data}{version_old}/task_demo_outcomes_panel.csv'
data = pd.read_csv(panel_path).drop(columns=['Unnamed: 0'])

In [18]:
# Clean an independent deep copy so the loaded `data` frame stays untouched.
data_clean = data.copy(deep=True)

# Remove outliers

In [8]:
# IQR-based outlier removal, applied to every variable named in outlier_vars
# with a cutoff of 2.5.
# NOTE(review): the recorded execution counts ([8] here vs. [17]-[19] for the
# load, copy and bounds cells) suggest this cell last ran before the data was
# reloaded -- confirm whether IQR removal is meant to apply to the saved output.
outliers = list(outlier_vars)
iqr_cleaned = remove_outlier_IQR(data_clean[outliers], cutoff=2.5)
data_clean[outliers] = iqr_cleaned

In [19]:
# Bounds-based outlier removal: outlier_vars maps each variable name to its
# logical/prior bounds, which are applied one column at a time.
for column in outlier_vars:
    data_clean[column] = remove_outlier_bounds(data_clean[column], outlier_vars[column])

In [20]:
# Split the cleaned panel into one frame per value of `time` (0 through 3),
# each with a fresh zero-based index.
t0, t1, t2, t3 = (
    data_clean.query(f'time == {wave}').reset_index(drop=True)
    for wave in range(4)
)

# Standardize

In [21]:
# Standardize the selected columns separately within each time-point frame,
# writing the scaled values back in place.
for wave_df in (t0, t1, t2, t3):
    wave_df[standardize_vars] = standardize(wave_df[standardize_vars])

# Calculate interactions

In [22]:
# Add the interaction terms to each time-point frame; the copy leaves every
# frame as an independent object.
t0, t1, t2, t3 = [
    add_interactions(wave_df).copy()
    for wave_df in (t0, t1, t2, t3)
]

# Propagate fixed variables

In [23]:
# Keep the time-0 values of the fixed variables, keyed by subject, then strip
# those columns from every time-point frame so they can be re-attached from
# this single table.
fixed = t0.loc[:, ['subject', *fixed_vars]].copy()
for wave_df in (t0, t1, t2, t3):
    wave_df.drop(columns=fixed_vars, inplace=True)

In [24]:
# Re-attach the time-0 fixed variables to every time-point frame by subject.
# The join is outer, so a subject that appears in `fixed` but not in a given
# frame still gets a row there, with missing values in that frame's columns.
t0, t1, t2, t3 = (
    wave_df.merge(fixed, how='outer', on=['subject'])
    for wave_df in (t0, t1, t2, t3)
)

In [25]:
# Stack the four time-point frames back into one long panel, in time order.
# pd.concat replaces the chained DataFrame.append calls: append was deprecated
# in pandas 1.4 and removed in 2.0. As with append's defaults, each frame
# keeps its own row index (index labels therefore repeat across frames).
data_clean_std = pd.concat([t0, t1, t2, t3])

In [26]:
# Display (rows, columns) of the combined frame as a quick sanity check.
data_clean_std.shape

(47504, 504)

In [27]:
# Write the cleaned panel under a directory named after today's version tag.
# to_csv does not create missing directories, so make sure the dated output
# directory exists first (no-op if it is already there).
import os

output_dir = f'{derived_data}{version_new}'
os.makedirs(output_dir, exist_ok=True)
# The row index is still written (default behaviour), so the file layout is
# unchanged for anything that reads it back.
data_clean_std.to_csv(f'{output_dir}/data_clean_bounds_{version_new}.csv')

In [30]:
# Summary statistics for the two columns that the note in the import cell says
# were left out of standardize_vars, to check they are still on their raw scale.
check_cols = ['tb_cryst', 'tb_fluid']
data_clean_std[check_cols].describe()

Unnamed: 0,tb_cryst,tb_fluid
count,19004.0,11551.0
mean,104.5,95.56
std,17.79,17.36
min,33.0,37.0
25%,90.0,84.0
50%,103.0,95.0
75%,116.0,107.0
max,197.0,211.0
