# Data Exploration for the Parkinsons Dataset

In [20]:
import os

import numpy as np

import pandas as pd

from torchvision.datasets.utils import download_url

from sklearn.preprocessing import StandardScaler

## Download the UCI Parkinsons dataset

In [7]:
urls = [
    "https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/telemonitoring/parkinsons_updrs.data",
    "https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/telemonitoring/parkinsons_updrs.names"
]

raw_folder = "/home/flo/ssdgm/notebooks/datasets/Parkinsons/raw"

os.makedirs(raw_folder, exist_ok=True)

['https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/telemonitoring/parkinsons_updrs.data',
 'https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/telemonitoring/parkinsons_updrs.names']

In [3]:
# Local file names are simply the last path component of each download URL.
filenames = list(map(os.path.basename, urls))
filenames

['parkinsons_updrs.data', 'parkinsons_updrs.names']

In [8]:
# Fetch every source file into the raw folder under its original file name.
for source_url, target_name in zip(urls, filenames):
    download_url(url=source_url, root=raw_folder, filename=target_name)

Downloading https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/telemonitoring/parkinsons_updrs.data to /home/flo/ssdgm/notebooks/datasets/Parkinsons/raw/parkinsons_updrs.data


  0%|          | 0/911261 [00:00<?, ?it/s]

Downloading https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/telemonitoring/parkinsons_updrs.names to /home/flo/ssdgm/notebooks/datasets/Parkinsons/raw/parkinsons_updrs.names


  0%|          | 0/4423 [00:00<?, ?it/s]

## Load the UCI Parkinsons dataset

In [14]:
# The first downloaded file (the .data file) is the comma-separated table.
uci_data_path = os.path.join(raw_folder, filenames[0])
uci_df = pd.read_csv(uci_data_path)

## Load the SSDKL Parkinsons dataset

In [11]:
# Folder holding the SSDKL version of the data set as NumPy files. The original
# absolute path stays the default; set SSDGM_SSDKL_PARKINSONS_DIR to point the
# notebook at a copy stored elsewhere. `or` also covers an empty variable.
ssdkl_folder = (
    os.environ.get("SSDGM_SSDKL_PARKINSONS_DIR")
    or "/home/flo/ssdgm/notebooks/datasets/SSDKL/parkinsons"
)

# X.npy / y.npy are presumably the feature matrix and the regression target
# used by SSDKL -- their shapes are inspected further down in the notebook.
X = np.load(os.path.join(ssdkl_folder, "X.npy"))
y = np.load(os.path.join(ssdkl_folder, "y.npy"))

## Exploration

In [15]:
# Peek at the first five rows to check that the columns were parsed as expected.
uci_df.head(n=5)

Unnamed: 0,subject#,age,sex,test_time,motor_UPDRS,total_UPDRS,Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,Jitter:DDP,Shimmer,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA,NHR,HNR,RPDE,DFA,PPE
0,1,72,0,5.6431,28.199,34.398,0.00662,3.4e-05,0.00401,0.00317,0.01204,0.02565,0.23,0.01438,0.01309,0.01662,0.04314,0.01429,21.64,0.41888,0.54842,0.16006
1,1,72,0,12.666,28.447,34.894,0.003,1.7e-05,0.00132,0.0015,0.00395,0.02024,0.179,0.00994,0.01072,0.01689,0.02982,0.011112,27.183,0.43493,0.56477,0.1081
2,1,72,0,19.681,28.695,35.389,0.00481,2.5e-05,0.00205,0.00208,0.00616,0.01675,0.181,0.00734,0.00844,0.01458,0.02202,0.02022,23.047,0.46222,0.54405,0.21014
3,1,72,0,25.647,28.905,35.81,0.00528,2.7e-05,0.00191,0.00264,0.00573,0.02309,0.327,0.01106,0.01265,0.01963,0.03317,0.027837,24.445,0.4873,0.57794,0.33277
4,1,72,0,33.642,29.187,36.375,0.00335,2e-05,0.00093,0.0013,0.00278,0.01703,0.176,0.00679,0.00929,0.01819,0.02036,0.011625,26.126,0.47188,0.56122,0.19361


In [16]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
uci_df.describe()

Unnamed: 0,subject#,age,sex,test_time,motor_UPDRS,total_UPDRS,Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,Jitter:DDP,Shimmer,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA,NHR,HNR,RPDE,DFA,PPE
count,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0
mean,21.494128,64.804936,0.317787,92.863722,21.296229,29.018942,0.006154,4.4e-05,0.002987,0.003277,0.008962,0.034035,0.31096,0.017156,0.020144,0.027481,0.051467,0.03212,21.679495,0.541473,0.65324,0.219589
std,12.372279,8.821524,0.465656,53.445602,8.129282,10.700283,0.005624,3.6e-05,0.003124,0.003732,0.009371,0.025835,0.230254,0.013237,0.016664,0.019986,0.039711,0.059692,4.291096,0.100986,0.070902,0.091498
min,1.0,36.0,0.0,-4.2625,5.0377,7.0,0.00083,2e-06,0.00033,0.00043,0.00098,0.00306,0.026,0.00161,0.00194,0.00249,0.00484,0.000286,1.659,0.15102,0.51404,0.021983
25%,10.0,58.0,0.0,46.8475,15.0,21.371,0.00358,2.2e-05,0.00158,0.00182,0.00473,0.01912,0.175,0.00928,0.01079,0.015665,0.02783,0.010955,19.406,0.469785,0.59618,0.15634
50%,22.0,65.0,0.0,91.523,20.871,27.576,0.0049,3.5e-05,0.00225,0.00249,0.00675,0.02751,0.253,0.0137,0.01594,0.02271,0.04111,0.018448,21.92,0.54225,0.6436,0.2055
75%,33.0,72.0,1.0,138.445,27.5965,36.399,0.0068,5.3e-05,0.00329,0.00346,0.00987,0.03975,0.365,0.020575,0.023755,0.032715,0.061735,0.031463,24.444,0.614045,0.711335,0.26449
max,42.0,85.0,1.0,215.49,39.511,54.992,0.09999,0.000446,0.05754,0.06956,0.17263,0.26863,2.107,0.16267,0.16702,0.27546,0.48802,0.74826,37.875,0.96608,0.8656,0.73173


In [18]:
# Shapes of the SSDKL arrays, for comparison with the UCI table above.
tuple(array.shape for array in (X, y))

((5875, 20), (5875,))

In [23]:
# Column-wise mean of the SSDKL array X (one value per column).
X.mean(axis=0)

array([-4.06369973e-16, -1.93509511e-17,  3.74924678e-17,  7.74038044e-17,
        1.45132133e-16,  1.16105707e-16, -9.67547555e-18, -2.90264267e-17,
       -4.83773778e-18, -7.74038044e-17, -4.83773778e-17,  4.83773778e-17,
       -2.90264267e-17, -3.87019022e-17,  9.67547555e-17, -2.90264267e-17,
       -6.19230435e-16,  1.93509511e-16,  4.16045449e-16, -4.16045449e-16])

In [24]:
# Column-wise (population) standard deviation of the SSDKL array X.
X.std(axis=0)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1.])

In [25]:
# Mean of the SSDKL array y.
y.mean()

-4.613266742609917e-14

In [26]:
# (Population) standard deviation of the SSDKL array y.
y.std()

10.699372551138886

In [28]:
# Split the UCI table into a feature matrix (every column except total_UPDRS)
# and the total_UPDRS target vector.
# NOTE(review): this keeps 21 columns, including subject# and motor_UPDRS,
# whereas the SSDKL X loaded in this notebook has 20 columns -- confirm which
# column SSDKL leaves out before comparing the two.
uci_data_npy = uci_df.drop(columns=["total_UPDRS"]).to_numpy()
uci_target_npy = uci_df["total_UPDRS"].to_numpy()

In [30]:
# Fit the standardiser on the UCI features; fit() returns the fitted scaler,
# so it can be bound directly and then displayed as the cell result.
scaler = StandardScaler().fit(X=uci_data_npy, y=uci_target_npy)
scaler

StandardScaler()

In [37]:
# Standardise the UCI features and check the resulting per-column means.
scaled_uci_data = scaler.transform(uci_data_npy)
scaled_uci_data.mean(axis=0)


array([ 7.74038044e-17, -4.06369973e-16, -1.93509511e-17,  3.50735989e-17,
        5.80528533e-17,  1.45132133e-16,  1.16105707e-16,  1.93509511e-17,
       -1.93509511e-17,  1.69320822e-16,  1.74158560e-16,  1.83834035e-16,
        4.83773778e-17,  1.54807609e-16, -3.87019022e-17,  9.67547555e-17,
       -3.87019022e-17, -6.38581386e-16,  1.93509511e-16,  4.45071875e-16,
        1.74158560e-16])

In [39]:
# Per-column (population) standard deviation of the standardised UCI features.
scaled_uci_data.std(axis=0)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1.])