# Russell Data Cleaning and EDA

## Imports and Helper functions

In [2]:
from warnings import simplefilter
simplefilter(action='ignore')

from IPython.display import Markdown, display
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn 
import pandas as pd
import numpy as np
import sympy as sympy
import time 
import enum

In [82]:
#Clean method to return all the image IDs of a given patient ID
def returnAllPatientImages(patient, data=None):
    """Return every StudyInstanceUID recorded for one patient.

    Parameters
    ----------
    patient : str
        Value to match against the "PatientID" column.
    data : pandas.DataFrame, optional
        Table containing "PatientID" and "StudyInstanceUID" columns. When
        omitted, the notebook-level ``train_data`` frame is used, so existing
        one-argument calls keep working.

    Returns
    -------
    list
        The matching "StudyInstanceUID" values in table order; an empty list
        when the patient does not appear in the table.
    """
    if data is None:
        # Resolved at call time, so this function can be defined before the
        # cell that loads train_data has run.
        data = train_data
    # Single .loc selection (row mask + column) instead of chained indexing.
    return data.loc[data["PatientID"] == patient, "StudyInstanceUID"].tolist()

## Data Cleaning and Integrity Checking

In [5]:
# Load the training label table from CSV, then show a preview of it.
train_data = pd.read_csv(filepath_or_buffer="train - train.csv")
train_data

Unnamed: 0,StudyInstanceUID,ETT - Abnormal,ETT - Borderline,ETT - Normal,NGT - Abnormal,NGT - Borderline,NGT - Incompletely Imaged,NGT - Normal,CVC - Abnormal,CVC - Borderline,CVC - Normal,Swan Ganz Catheter Present,PatientID
0,1.2.826.0.1.3680043.8.498.26697628953273228189...,0,0,0,0,0,0,1,0,0,0,0,ec89415d1
1,1.2.826.0.1.3680043.8.498.46302891597398758759...,0,0,1,0,0,1,0,0,0,1,0,bf4c6da3c
2,1.2.826.0.1.3680043.8.498.23819260719748494858...,0,0,0,0,0,0,0,0,1,0,0,3fc1c97e5
3,1.2.826.0.1.3680043.8.498.68286643202323212801...,0,0,0,0,0,0,0,1,0,0,0,c31019814
4,1.2.826.0.1.3680043.8.498.10050203009225938259...,0,0,0,0,0,0,0,0,0,1,0,207685cd1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
30078,1.2.826.0.1.3680043.8.498.74257566841157531124...,0,0,1,0,0,0,0,0,1,1,0,5b5b9ac30
30079,1.2.826.0.1.3680043.8.498.46510939987173529969...,0,0,0,0,0,0,0,0,0,1,0,7192404d8
30080,1.2.826.0.1.3680043.8.498.43173270582850645437...,0,0,1,0,0,1,0,1,0,1,0,d4d1b066d
30081,1.2.826.0.1.3680043.8.498.95092491950130838685...,0,0,0,0,0,0,0,0,1,0,0,01a6602b8


In [6]:
# Integrity check: number of missing values in each column.
train_data.isna().sum(axis=0)

StudyInstanceUID              0
ETT - Abnormal                0
ETT - Borderline              0
ETT - Normal                  0
NGT - Abnormal                0
NGT - Borderline              0
NGT - Incompletely Imaged     0
NGT - Normal                  0
CVC - Abnormal                0
CVC - Borderline              0
CVC - Normal                  0
Swan Ganz Catheter Present    0
PatientID                     0
dtype: int64

In [27]:
# Summary statistics (count / unique / top / freq) for the PatientID column.
train_data.loc[:, "PatientID"].describe()

count         30083
unique         3255
top       05029c63a
freq            172
Name: PatientID, dtype: object

## EDA

In [75]:
#Number of distinct PatientID values in the table
len(train_data["PatientID"].unique())

3255

Despite there being about 30,000 rows in the table, there are only 3,255 patients. This means that patients will have multiple images of them, with varying data points. On average there are ~9 images per patient (30,083 / 3,255 ≈ 9.2). Let's look further into this.

In [76]:
#One row per patient: how many table rows carry that PatientID, stored in a "Count" column
frequencies = (
    train_data.groupby(by="PatientID")["PatientID"]
    .count()
    .rename("Count")
    .to_frame()
)

In [77]:
# Patients ordered from most to fewest images.
display(frequencies.sort_values(by="Count", ascending=False))

singleImagePatients = frequencies[frequencies["Count"] == 1]
mt10ImagePatients = frequencies[frequencies["Count"] > 10]

# len() counts rows, i.e. patients. DataFrame.size counts rows * columns,
# which only equals the patient count while `frequencies` has one column.
print(f"{len(singleImagePatients)} patients have only 1 image")
print(f"{len(mt10ImagePatients)} patients have more than 10 images")

Unnamed: 0_level_0,Count
PatientID,Unnamed: 1_level_1
05029c63a,172
55073fece,167
26da0d5ad,148
8849382d0,130
34242119f,110
...,...
c09883b19,1
e46763e02,1
7bcec67b1,1
c0b57c3d9,1


397 patients have only 1 image
800 patients have more than 10 images


Knowing this, let's take a look at the patient with the most entries to see if we can root out any stories or patterns. How might these catheters change over time in a single patient?

In [91]:
#All rows for patient 05029c63a, ordered by the "ETT - Normal" label
mostImagedPatient = train_data.loc[train_data["PatientID"] == "05029c63a"]
dataToDisplay = mostImagedPatient.sort_values(by="ETT - Normal")

#Lift the row and column display caps for this cell only, so nothing is truncated
with pd.option_context("display.max_columns", None, "display.max_rows", None):
    display(dataToDisplay)

Unnamed: 0,StudyInstanceUID,ETT - Abnormal,ETT - Borderline,ETT - Normal,NGT - Abnormal,NGT - Borderline,NGT - Incompletely Imaged,NGT - Normal,CVC - Abnormal,CVC - Borderline,CVC - Normal,Swan Ganz Catheter Present,PatientID
2226,1.2.826.0.1.3680043.8.498.74190045333565570017...,0,1,0,0,0,0,1,0,0,1,1,05029c63a
12933,1.2.826.0.1.3680043.8.498.61936035308630680796...,0,1,0,0,0,0,0,0,1,0,0,05029c63a
28334,1.2.826.0.1.3680043.8.498.83695501915139952540...,0,1,0,0,0,1,0,0,1,1,0,05029c63a
24390,1.2.826.0.1.3680043.8.498.42565923266899815850...,0,1,0,0,0,0,1,0,1,1,1,05029c63a
10367,1.2.826.0.1.3680043.8.498.89784585798061898325...,0,1,0,0,0,0,1,0,0,1,1,05029c63a
14245,1.2.826.0.1.3680043.8.498.93518520610058188214...,0,0,0,0,0,0,1,0,0,0,0,05029c63a
10253,1.2.826.0.1.3680043.8.498.81612968658469626927...,0,0,0,0,0,0,0,0,0,1,1,05029c63a
27011,1.2.826.0.1.3680043.8.498.14612428778012016836...,0,1,0,0,0,1,0,0,0,1,1,05029c63a
17329,1.2.826.0.1.3680043.8.498.51595082650400612709...,0,0,0,0,0,0,0,0,0,1,1,05029c63a
4828,1.2.826.0.1.3680043.8.498.74338185865293989379...,0,1,0,0,0,0,1,0,0,1,0,05029c63a


In [92]:
#Full list of image IDs (StudyInstanceUID) for patient 05029c63a, the most-imaged patient
returnAllPatientImages(patient="05029c63a")

['1.2.826.0.1.3680043.8.498.33289872132944517059685300521052847970',
 '1.2.826.0.1.3680043.8.498.58815494070381425972841494897309562888',
 '1.2.826.0.1.3680043.8.498.18423562395321282202151467283211883687',
 '1.2.826.0.1.3680043.8.498.80010847260848855758249955549927520930',
 '1.2.826.0.1.3680043.8.498.68620140014200110095088585785330951729',
 '1.2.826.0.1.3680043.8.498.87375315768078677164756022968175911741',
 '1.2.826.0.1.3680043.8.498.99564257536158506118998101216918000997',
 '1.2.826.0.1.3680043.8.498.43908356782442779725424825386043768718',
 '1.2.826.0.1.3680043.8.498.51991674554454729241003433645598796537',
 '1.2.826.0.1.3680043.8.498.78021892236561877580290458242105043659',
 '1.2.826.0.1.3680043.8.498.86537162817021158811092283918656005072',
 '1.2.826.0.1.3680043.8.498.11525667914743618624335995337909956877',
 '1.2.826.0.1.3680043.8.498.12682264367523788590209252628203092871',
 '1.2.826.0.1.3680043.8.498.74190045333565570017428818377245264093',
 '1.2.826.0.1.3680043.8.498.294983

Let's look at a simpler patient, maybe one in the middle of the distribution with only 7 images (01a6602b8).

In [94]:
#Rows for patient 01a6602b8.
#NOTE: `mostImagedPatient` is reused here for a different patient, so after this cell it no longer refers to 05029c63a.
mostImagedPatient = train_data.loc[train_data["PatientID"] == "01a6602b8"]
dataToDisplay = mostImagedPatient

#Show every row and column without truncation
with pd.option_context("display.max_columns", None, "display.max_rows", None):
    display(dataToDisplay)

Unnamed: 0,StudyInstanceUID,ETT - Abnormal,ETT - Borderline,ETT - Normal,NGT - Abnormal,NGT - Borderline,NGT - Incompletely Imaged,NGT - Normal,CVC - Abnormal,CVC - Borderline,CVC - Normal,Swan Ganz Catheter Present,PatientID
3547,1.2.826.0.1.3680043.8.498.13368586175752823546...,0,0,0,0,0,0,0,0,0,1,0,01a6602b8
3834,1.2.826.0.1.3680043.8.498.92443648275106206350...,0,0,0,0,0,0,0,0,0,1,0,01a6602b8
18199,1.2.826.0.1.3680043.8.498.48575077712160297418...,0,0,0,0,0,0,0,0,1,0,0,01a6602b8
20679,1.2.826.0.1.3680043.8.498.38434326028461648396...,0,0,0,0,0,0,0,0,0,1,0,01a6602b8
22063,1.2.826.0.1.3680043.8.498.32265506210757468088...,0,0,0,0,0,0,0,0,0,1,0,01a6602b8
22775,1.2.826.0.1.3680043.8.498.70674394161629430460...,0,0,0,0,0,0,0,0,0,1,0,01a6602b8
30081,1.2.826.0.1.3680043.8.498.95092491950130838685...,0,0,0,0,0,0,0,0,1,0,0,01a6602b8


In [95]:
#Patients whose image count is exactly 7
frequencies.loc[frequencies["Count"] == 7]

Unnamed: 0_level_0,Count
PatientID,Unnamed: 1_level_1
01a6602b8,7
01e3dc988,7
01fa2ad89,7
0261d55c1,7
04323559c,7
...,...
fc4269a27,7
fcc1c2dfc,7
fce6e7380,7
fed489bad,7


In [98]:
#Rows for patient ff826cf51.
#NOTE: `mostImagedPatient` is reused here for a different patient, so after this cell it no longer refers to 05029c63a.
mostImagedPatient = train_data.loc[train_data["PatientID"] == "ff826cf51"]
dataToDisplay = mostImagedPatient

#Show every row and column without truncation
with pd.option_context("display.max_columns", None, "display.max_rows", None):
    display(dataToDisplay)

Unnamed: 0,StudyInstanceUID,ETT - Abnormal,ETT - Borderline,ETT - Normal,NGT - Abnormal,NGT - Borderline,NGT - Incompletely Imaged,NGT - Normal,CVC - Abnormal,CVC - Borderline,CVC - Normal,Swan Ganz Catheter Present,PatientID
9054,1.2.826.0.1.3680043.8.498.85888286754526046422...,0,0,0,0,0,0,0,0,0,1,0,ff826cf51
16211,1.2.826.0.1.3680043.8.498.50804831665099230965...,0,0,0,0,0,0,0,0,0,1,0,ff826cf51
17344,1.2.826.0.1.3680043.8.498.29983487174552869646...,0,0,1,0,0,1,0,0,0,1,0,ff826cf51
17658,1.2.826.0.1.3680043.8.498.73750776498365697385...,0,0,1,0,0,0,1,0,0,1,0,ff826cf51
18146,1.2.826.0.1.3680043.8.498.12722241245555310535...,0,0,1,0,0,0,1,0,0,1,0,ff826cf51
20747,1.2.826.0.1.3680043.8.498.65803857161092450324...,0,0,1,0,0,1,0,0,0,1,0,ff826cf51
25717,1.2.826.0.1.3680043.8.498.12724690186666099182...,0,0,0,0,0,0,0,0,1,0,0,ff826cf51


After taking a peek at some of the patients with only a few images, it can be seen that most images are focused on CVC catheters. Additionally, the CVC catheters seem to bounce back and forth a lot between normal and borderline, and sometimes abnormal. Since there is no record of the time between these images, there are two possible conjectures: <br><br>
1] The catheters move about and can change between normal and borderline between images <br>
2] Catheters start borderline or normal until an event or action displaces them into another state, in which they stay until disturbed again.