In [1]:
import os
import numpy as np 
import pandas as pd 

# Folder that holds the ECG CSV files, relative to this notebook.
DATA_DIR = "../data"

# NOTE: despite the *_DIR suffix, each of the four constants below is the
# path to a single CSV file inside DATA_DIR, not a directory.
MITBIH_TRAIN_DIR, MITBIH_TEST_DIR, PTBDB_ABNORMAL_DIR, PTBDB_NORMAL_DIR = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in (
        "mitbih_train.csv",
        "mitbih_test.csv",
        "ptbdb_abnormal.csv",
        "ptbdb_normal.csv",
    )
)

Questions to ask:
- How many samples are there in the MITBIH dataset? What is the class distribution? Can we plot some samples?
- How many samples are there in the PTBDB dataset? What is the class distribution? Can we plot some samples?

# Analysis of MITBIH Dataset

In [4]:
# The CSV is read with header=None, so pandas numbers the columns itself.
mitbih_train_df = pd.read_csv(MITBIH_TRAIN_DIR, header=None)
mitbih_train_df.head()  # preview the first five rows (head() defaults to n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,178,179,180,181,182,183,184,185,186,187
0,0.977941,0.926471,0.681373,0.245098,0.154412,0.191176,0.151961,0.085784,0.058824,0.04902,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.960114,0.863248,0.461538,0.196581,0.094017,0.125356,0.099715,0.088319,0.074074,0.082621,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.659459,0.186486,0.07027,0.07027,0.059459,0.056757,0.043243,0.054054,0.045946,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.925414,0.665746,0.541436,0.276243,0.196133,0.077348,0.071823,0.060773,0.066298,0.058011,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.967136,1.0,0.830986,0.586854,0.356808,0.248826,0.14554,0.089202,0.117371,0.150235,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Summarise the MITBIH training frame: number of rows and columns, dtypes and memory usage.
mitbih_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87554 entries, 0 to 87553
Columns: 188 entries, 0 to 187
dtypes: float64(188)
memory usage: 125.6 MB


In [9]:
# The class label lives in column 187, the last of the 188 columns.
LABEL_COL = 187
# Count how many training rows fall into each class (largest class first).
mitbih_train_df.loc[:, LABEL_COL].value_counts()

187
0.0    72471
4.0     6431
2.0     5788
1.0     2223
3.0      641
Name: count, dtype: int64

In [10]:
# Load the MITBIH test split the same way as the training split (no header row).
mitbih_test_df = pd.read_csv(MITBIH_TEST_DIR, header=None)
mitbih_test_df.head()  # preview the first five rows (head() defaults to n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,178,179,180,181,182,183,184,185,186,187
0,1.0,0.758264,0.11157,0.0,0.080579,0.078512,0.066116,0.049587,0.047521,0.035124,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.908425,0.783883,0.531136,0.362637,0.3663,0.344322,0.333333,0.307692,0.296703,0.300366,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.730088,0.212389,0.0,0.119469,0.10177,0.10177,0.110619,0.123894,0.115044,0.132743,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.910417,0.68125,0.472917,0.229167,0.06875,0.0,0.004167,0.014583,0.054167,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.57047,0.399329,0.238255,0.147651,0.0,0.003356,0.040268,0.080537,0.07047,0.090604,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Summarise the MITBIH test frame: number of rows and columns, dtypes and memory usage.
mitbih_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21892 entries, 0 to 21891
Columns: 188 entries, 0 to 187
dtypes: float64(188)
memory usage: 31.4 MB


In [12]:
# Re-declared so this cell also runs on its own: the label is column 187.
LABEL_COL = 187
# Count how many test rows fall into each class (largest class first).
mitbih_test_df.loc[:, LABEL_COL].value_counts()

187
0.0    18118
4.0     1608
2.0     1448
1.0      556
3.0      162
Name: count, dtype: int64

## Comments:
The MITBIH dataset has 109446 data entries (87554 train / 21892 test).

Each sample contains an ECG beat's morphology represented with 187 data points (columns 0–186). The last item in each entry (column 187) is the category of that beat.
There are five categories:

| Condition                                 | Class      | #Samples in Training Set | #Samples in Test Set |
| :--------------------------------------   | :--------: | -----------------------: | -------------------: |
| Normal beat (N)                           | 0          | 72471                    | 18118                |
| Supraventricular premature beat (S)       | 1          | 2223                     | 556                  |
| Premature ventricular contraction (V)     | 2          | 5788                     | 1448                 |
| Fusion of ventricular and normal beat (F) | 3          | 641                      | 162                  |
| Unclassifiable beat (Q)                   | 4          | 6431                     | 1608                 |

# Analysis of PTBDB Dataset

In [16]:
# Load the PTBDB abnormal recordings; the file is read without a header row.
ptbdb_abnormal_df = pd.read_csv(PTBDB_ABNORMAL_DIR, header=None)
ptbdb_abnormal_df.head()  # preview the first five rows (head() defaults to n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,178,179,180,181,182,183,184,185,186,187
0,0.932233,0.869679,0.886186,0.929626,0.908775,0.93397,0.801043,0.749783,0.687229,0.6351,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.606941,0.384181,0.254237,0.223567,0.276836,0.25343,0.184826,0.153349,0.121872,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.0,0.951613,0.923963,0.853303,0.791859,0.734255,0.672043,0.6851,0.670507,0.667435,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.977819,0.899261,0.230129,0.032348,0.142329,0.22366,0.328096,0.367837,0.381701,0.389094,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.935618,0.801661,0.805815,1.0,0.722741,0.480789,0.454829,0.319834,0.266874,0.308411,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [19]:
# Summarise the PTBDB abnormal frame: number of rows and columns, dtypes and memory usage.
ptbdb_abnormal_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10506 entries, 0 to 10505
Columns: 188 entries, 0 to 187
dtypes: float64(188)
memory usage: 15.1 MB


In [20]:
# Re-declared so this cell also runs on its own: the label is column 187.
LABEL_COL = 187
# Count the rows per label value in the abnormal file.
ptbdb_abnormal_df.loc[:, LABEL_COL].value_counts()

187
1.0    10506
Name: count, dtype: int64

In [21]:
# Load the PTBDB normal recordings; the file is read without a header row.
ptbdb_normal_df = pd.read_csv(PTBDB_NORMAL_DIR, header=None)
ptbdb_normal_df.head()  # preview the first five rows (head() defaults to n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,178,179,180,181,182,183,184,185,186,187
0,1.0,0.900324,0.35859,0.051459,0.046596,0.126823,0.133306,0.119125,0.110616,0.113047,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.794681,0.375387,0.116883,0.0,0.171923,0.283859,0.293754,0.325912,0.345083,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.909029,0.791482,0.423169,0.186712,0.0,0.007836,0.063032,0.077002,0.074957,0.077342,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.478893,0.05676,0.064176,0.081289,0.072732,0.055619,0.048774,0.054478,0.041643,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.867238,0.20136,0.099349,0.141336,0.120934,0.108516,0.096393,0.093436,0.100828,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Summarise the PTBDB normal frame: number of rows and columns, dtypes and memory usage.
ptbdb_normal_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4046 entries, 0 to 4045
Columns: 188 entries, 0 to 187
dtypes: float64(188)
memory usage: 5.8 MB


In [23]:
# Re-declared so this cell also runs on its own: the label is column 187.
LABEL_COL = 187
# Count the rows per label value in the normal file.
ptbdb_normal_df.loc[:, LABEL_COL].value_counts()

187
0.0    4046
Name: count, dtype: int64

## Comments:
The PTBDB dataset has 14552 data entries (4046 normal / 10506 abnormal).

Each sample contains an ECG beat's morphology represented with 187 data points (columns 0–186). The last item in each entry (column 187) is the category of that beat.
There are two categories:

| Condition                                 | Class      | #Samples in dataset      | 
| :--------------------------------------   | :--------: | -----------------------: | 
| Normal                                    | 0          | 4046                     | 
| Abnormal                                  | 1          | 10506                    | 
