In [4]:
# %load ~/.ipython/standard_imports.py
import os
import logging
logging.basicConfig(level=logging.INFO)
import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import isajosep_util
import isajosep_util.data_frame_plotter

In [5]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


# Load

Attached CSV file contains a number of medical cases (extracted from EHRs). Each entry consists of:

- one more more “present” symptoms (symptoms that patient had at the time of visit). For example `s_0136` is GYANT code for “earache”.
- one more “absent” symptoms (symptoms that patient did not have). Keep in mind that there may other potential symptoms the patient was never asked about, which are neither “present” nor “absent”
- Age, Sex (1=Male), and Month of visit (which may be helpful, e.g. some conditions are gender-specific, some are seasonal)
- Diagnosis (“DX”) the patient was diagnosed with (using Gyant condition codes)


In [10]:
df = pd.read_csv('/Users/ijoseph/Code/Data/Gyant/data_challenge.csv', index_col=0)

# Cleaning

In [12]:
df.sample(5)

Unnamed: 0,DX,AGE,MONTH,SEX,Absent,Present
59908,c_0497,37.0,8,0,"s_0242, s_0865, s_0346, s_0327","s_1324, s_1547, s_0496, s_1316, s_0847"
8497,c_0133,19.0,8,0,,"s_0242, s_0824, s_1266, s_2697"
62265,c_0608,63.0,1,1,"s_0553, s_1213, s_1611, s_0242, s_2738, s_0084...","s_0445, s_2282"
66587,c_0273,36.0,7,0,"s_0070, s_0553, s_0078, s_0084, s_2563, s_0022...",
27272,c_0145,4.0,2,0,s_0106,"s_0136, s_0242, s_0309, s_0826, s_0180"


In [13]:
df.shape

(71662, 6)

In [14]:
df.describe()

Unnamed: 0,AGE,MONTH,SEX
count,71662.0,71662.0,71662.0
mean,32.297787,6.197357,0.407538
std,20.104694,3.531117,0.49138
min,-3.0,1.0,0.0
25%,19.0,3.0,0.0
50%,32.0,6.0,0.0
75%,45.0,9.0,1.0
max,161.0,12.0,1.0


Hm, mostly reasonable except for some impossible values for `AGE`.

## `AGE` cleaning

In [16]:
df.query("AGE > 100")

Unnamed: 0,DX,AGE,MONTH,SEX,Absent,Present
1186,c_0273,102.0,8,0,"s_0070, s_0106, s_0039, s_0346, s_0519, s_0002...","s_0578, s_0120, s_0865, s_2563"
24878,c_0497,159.0,12,1,s_1298,"s_0210, s_1317, s_2194, s_2387, s_1316"
24893,c_0036,101.0,7,0,"s_0070, s_1047, s_0519, s_0078, s_0022, s_0062",
44092,c_0720,161.0,1,0,s_0327,s_1216
47754,c_0608,102.0,4,0,"s_0400, s_0553, s_0826, s_0180","s_0022, s_2253, s_1030, s_0901, s_0460"
57856,c_0273,102.0,1,0,"s_0070, s_0864, s_0542, s_0039, s_0078","s_0578, s_0647, s_2738, s_1266"


In [18]:
df.query("AGE < 0")

Unnamed: 0,DX,AGE,MONTH,SEX,Absent,Present
14105,c_0273,-3.0,8,0,"s_2738, s_2282","s_2204, s_0578, s_0542, s_2563, s_1266"


Okay anything over 102.0 seems unlikely, and < 0 impossible. Want to assess whether missingness is correlated with anything else, rather than merely dropping these off the bat so as to not induce bias from dropping. 

In [20]:
df.query("(AGE > 103) or (AGE < 0) ")

Unnamed: 0,DX,AGE,MONTH,SEX,Absent,Present
14105,c_0273,-3.0,8,0,"s_2738, s_2282","s_2204, s_0578, s_0542, s_2563, s_1266"
24878,c_0497,159.0,12,1,s_1298,"s_0210, s_1317, s_2194, s_2387, s_1316"
44092,c_0720,161.0,1,0,s_0327,s_1216


Nothing obvious in terms of presence /absence of symptoms or DX, so safe to drop these, probably. 

In [21]:
df_age_cleaned = df.query("(AGE < 103) and (AGE > 0)")

## Missing/ Duplicated Values

Using custom helper function to check for both ([see repository here](https://github.com/ijoseph/util/blob/6d6cd84da663c39009d3894977f3e61c88d0969b/isajosep_util/__init__.py#L20))

In [23]:
isajosep_util.check_for_null(df_age_cleaned)

✗ 12,320 NaN or Inf values; 0 inf, 12,320 nan; Duplications: 185 duplicated rows by exclusively non-index content, 0 duplicated rows by exclusively index, 0 duplicated by both


(12320, 0, 12320, 185, 0, 0)

Okay, so several NaNs as expected via lack of check for absence or presence. 

Duplications are less expected. Looking more into these... 

### Duplicated by all columns other than index

In [36]:
df_age_cleaned[df_age_cleaned.duplicated(keep=False)].sort_values(by=['DX','AGE', 'MONTH', 'SEX']).head(10)

Unnamed: 0,DX,AGE,MONTH,SEX,Absent,Present
33339,c_0008,29.0,12,0,,
37010,c_0008,29.0,12,0,,
14403,c_0008,30.0,3,0,,
19340,c_0008,30.0,3,0,,
960,c_0008,30.0,6,1,,
59748,c_0008,30.0,6,1,,
29459,c_0008,31.0,1,1,,
53960,c_0008,31.0,1,1,,
32905,c_0008,31.0,6,0,,
49801,c_0008,31.0,6,0,,


In [39]:
df_age_cleaned_dedup = df_age_cleaned.drop_duplicates(subset=['DX', 'AGE', 'MONTH', 'Absent', 'Present'])

In [41]:
df_age_cleaned_dedup.shape, df_age_cleaned.shape

((69305, 6), (69646, 6))

### Duplicated by `DX`, `AGE`, `MONTH`, `SEX`

In [44]:
isajosep_util.check_for_null(df_age_cleaned_dedup.drop(['Absent', 'Present'], axis='columns'))

✗ 0 NaN or Inf values; 0 inf, 0 nan; Duplications: 48,083 duplicated rows by exclusively non-index content, 0 duplicated rows by exclusively index, 0 duplicated by both


(0, 0, 0, 48083, 0, 0)

Okay, interesting in that lots of duplications when we get rid of those columns for some reason. Off the bat, seems like too many duplications to be by chance; how many people with the exact same age (to the month) and gender could there be receiving the same diagnosis? 

Therefore, we need to merge. 

# Formatting

Create flags for each symptom found so as to be useful as features for ML modeling.  