# Data Exploration for CTSlice dataset

In [1]:
import numpy as np
import pandas as pd
import os

from torchvision.datasets.utils import download_url, download_and_extract_archive
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

## `CTSlice` from `SSDKL`

In [2]:
def load_dataset_from_ssdkl(path_to_folder):
    """Load the feature and target arrays stored in a dataset folder.

    Reads ``X.npy`` and then ``y.npy`` from ``path_to_folder`` via ``np.load``.

    Args:
        path_to_folder: Directory that contains ``X.npy`` and ``y.npy``.

    Returns:
        Tuple ``(X, y)`` with the contents of the two files, in that order.
    """
    # The generator is consumed in order by the unpacking, so X.npy is read
    # before y.npy.
    features, targets = (
        np.load(os.path.join(path_to_folder, file_name))
        for file_name in ("X.npy", "y.npy")
    )
    return features, targets

# Folder holding the SSDKL copy of CTSlice (X.npy / y.npy are read from it).
# NOTE(review): absolute, user-specific path — the cell only runs on a machine
# with this exact layout; consider a configurable data directory.
PATH_TO_DATASET = "/home/flo/ssdgm/notebooks/datasets/SSDKL/ctslice"
X_ssdkl, y_ssdkl = load_dataset_from_ssdkl(PATH_TO_DATASET)
# Displayed as the cell result: the shapes of the two loaded arrays.
X_ssdkl.shape, y_ssdkl.shape

((53500, 384), (53500,))

## `CTSlice` from `UCI`

In [3]:
# Source archive on the UCI repository and the local folder it is unpacked into.
# NOTE(review): DOWNLOAD_FOLDER is an absolute, user-specific path.
URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00206/slice_localization_data.zip"
DOWNLOAD_FOLDER = "/home/flo/ssdgm/notebooks/datasets/UCI/CTSlice/raw"

# Only fetch the ~17 MB archive when the extracted CSV is not on disk yet, so
# re-running the notebook does not download the data again.
if not os.path.exists(os.path.join(DOWNLOAD_FOLDER, "slice_localization_data.csv")):
    # remove_finished=True deletes the downloaded zip once it has been
    # extracted, which replaces the separate os.remove() call.
    download_and_extract_archive(
        url=URL,
        download_root=DOWNLOAD_FOLDER,
        remove_finished=True,
    )

Downloading https://archive.ics.uci.edu/ml/machine-learning-databases/00206/slice_localization_data.zip to /home/flo/ssdgm/notebooks/datasets/UCI/CTSlice/raw/slice_localization_data.zip


  0%|          | 0/17823884 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [4]:
# Read the extracted UCI CSV from the download folder into a DataFrame.
df_uci = pd.read_csv(os.path.join(DOWNLOAD_FOLDER, "slice_localization_data.csv"))
# Displayed as the cell result: (rows, columns) of the loaded frame.
df_uci.shape

(53500, 386)

In [5]:
# Preview the first rows to see the column layout of the loaded frame.
df_uci.head()

Unnamed: 0,patientId,value0,value1,value2,value3,value4,value5,value6,value7,value8,...,value375,value376,value377,value378,value379,value380,value381,value382,value383,reference
0,0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,...,-0.25,0.980381,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,21.803851
1,0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,...,-0.25,0.977008,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,21.745726
2,0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,...,-0.25,0.977008,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,21.6876
3,0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,...,-0.25,0.977008,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,21.629474
4,0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,...,-0.25,0.976833,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,21.571348


## Necessary Cleaning Steps

In [8]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
# NOTE(review): the recorded output below has no `patientId` column, so this
# cell appears to have been executed after the drop further down (In [8] vs
# In [6]); a fresh top-to-bottom run would include `patientId` here.
df_uci.describe()

Unnamed: 0,value0,value1,value2,value3,value4,value5,value6,value7,value8,value9,...,value375,value376,value377,value378,value379,value380,value381,value382,value383,reference
count,53500.0,53500.0,53500.0,53500.0,53500.0,53500.0,53500.0,53500.0,53500.0,53500.0,...,53500.0,53500.0,53500.0,53500.0,53500.0,53500.0,53500.0,53500.0,53500.0,53500.0
mean,0.059627,0.071558,0.145819,0.218728,0.274762,0.276189,0.204531,0.062281,-0.042025,-0.231614,...,-0.029404,0.182913,0.320112,0.359373,0.342889,0.266091,0.083049,-0.031146,-0.154524,47.028039
std,0.174243,0.196921,0.30027,0.359163,0.378862,0.369605,0.351294,0.292232,0.268391,0.100085,...,0.085817,0.383333,0.463517,0.478188,0.471811,0.437633,0.279734,0.098738,0.122491,22.347042
min,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,-0.25,-0.25,...,-0.25,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,1.738733
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,29.891607
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,43.987893
75%,0.0,0.0,0.0,0.446429,0.684477,0.662382,0.441412,0.0,0.0,-0.25,...,0.0,0.0,0.996286,0.999677,0.99956,0.949478,0.0,0.0,0.0,63.735059
max,1.0,1.0,1.0,1.0,0.99879,0.996468,0.999334,1.0,1.0,1.0,...,0.961279,1.0,1.0,1.0,1.0,1.0,0.999857,0.996839,0.942851,97.489115


- Remove the `patientId` column

In [6]:
# Drop the patient identifier column. `errors="ignore"` keeps the cell safe to
# re-run: because the result is assigned back to `df_uci`, a second execution
# would otherwise raise KeyError for the already-removed column.
df_uci = df_uci.drop(columns=["patientId"], errors="ignore")

In [27]:
# Report degenerate columns: those that are entirely zero and those that hold
# a single distinct value.
for col in df_uci.columns:
    column = df_uci[col]
    if (column == 0.0).all():
        print(f"{col} is all zero.")
    # Count distinct values instead of testing `std() == 0.0`: the standard
    # deviation of a constant float column can come out as a tiny non-zero
    # number through rounding, and std() is undefined for non-numeric columns.
    if column.nunique() == 1:
        print(f"{col} has only one value: {column.unique()}")

value59 has only one value: [-0.25]
value69 has only one value: [-0.25]
value179 has only one value: [-0.25]
value189 has only one value: [-0.25]
value351 has only one value: [-0.25]


In [26]:
# Spot-check the distinct values of one column reported as constant above.
df_uci["value351"].unique()

array([-0.25])

In [24]:
# Print the minimum of every column, one value per line, in column order.
for column_name, minimum in df_uci.min().items():
    print(minimum)

0.0
0.0
0.0
0.0
0.0
-0.25
-0.25
-0.25
-0.25
-0.25
0.0
0.0
0.0
0.0
0.0
0.0
-0.25
-0.25
-0.25
-0.25
0.0
0.0
0.0
0.0
0.0
0.0
-0.25
-0.25
-0.25
-0.25
0.0
0.0
0.0
0.0
-0.25
-0.25
-0.25
-0.25
-0.25
-0.25
0.0
0.0
0.0
-0.25
-0.25
-0.25
-0.25
-0.25
-0.25
-0.25
0.0
0.0
0.0
-0.25
-0.25
-0.25
-0.25
-0.25
-0.25
-0.25
0.0
0.0
0.0
-0.25
-0.25
-0.25
-0.25
-0.25
-0.25
-0.25
0.0
0.0
0.0
-0.25
-0.25
-0.25
-0.25
-0.25
-0.25
-0.25
0.0
0.0
0.0
0.0
-0.25
-0.25
-0.25
-0.25
-0.25
-0.25
0.0
0.0
0.0
0.0
0.0
0.0
-0.25
-0.25
-0.25
-0.25
0.0
0.0
0.0
0.0
0.0
0.0
-0.25
-0.25
-0.25
-0.25
0.0
0.0
0.0
0.0
0.0
-0.25
-0.25
-0.25
-0.25
-0.25
0.0
0.0
0.0
0.0
0.0
-0.25
-0.25
-0.25
-0.25
-0.25
0.0
0.0
0.0
0.0
0.0
0.0
-0.25
-0.25
-0.25
-0.25
0.0
0.0
0.0
0.0
0.0
-0.25
-0.25
-0.25
-0.25
-0.25
0.0
0.0
0.0
0.0
-0.25
-0.25
-0.25
-0.25
-0.25
-0.25
0.0
0.0
0.0
-0.25
-0.25
-0.25
-0.25
-0.25
-0.25
-0.25
0.0
0.0
0.0
-0.25
-0.25
-0.25
-0.25
-0.25
-0.25
-0.25
0.0
0.0
0.0
-0.25
-0.25
-0.25
-0.25
-0.25
-0.25
-0.25
0.0
0.0
0.0
-0.25
-0.25
-0

## Conclusion

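- Use `reference` as the target variable.
- Remove the `patientId` column before using the data.
- `value59`, `value69`, `value179`, `value189` and `value351` hold a single value (`-0.25`) in every row.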