In [1]:
import numpy as np
from numpy import loadtxt
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from keras.layers import Dense
from keras.models import Sequential

# Load the data

In [2]:
arrhythmia_df = pd.read_csv('arrhythmia.data', sep=",")
arrhythmia_df

Unnamed: 0,age,sex,height,weight,QRSduration,PRinterval,Q-Tinterval,Tinterval,Pinterval,QRS,...,chV6_QwaveAmp,chV6_RwaveAmp,chV6_SwaveAmp,chV6_RPwaveAmp,chV6_SPwaveAmp,chV6_PwaveAmp,chV6_TwaveAmp,chV6_QRSA,chV6_QRSTA,class
0,75,0,190,80,91,193,371,174,121,-16,...,0.0,9.0,-0.9,0.0,0.0,0.9,2.9,23.3,49.4,8
1,56,1,165,64,81,174,401,149,39,25,...,0.0,8.5,0.0,0.0,0.0,0.2,2.1,20.4,38.8,6
2,54,0,172,95,138,163,386,185,102,96,...,0.0,9.5,-2.4,0.0,0.0,0.3,3.4,12.3,49.0,10
3,55,0,175,94,100,202,380,179,143,28,...,0.0,12.2,-2.2,0.0,0.0,0.4,2.6,34.6,61.6,1
4,75,0,190,80,88,181,360,177,103,-16,...,0.0,13.1,-3.6,0.0,0.0,-0.1,3.9,25.4,62.8,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
447,53,1,160,70,80,199,382,154,117,-37,...,0.0,4.3,-5.0,0.0,0.0,0.7,0.6,-4.4,-0.5,1
448,37,0,190,85,100,137,361,201,73,86,...,0.0,15.6,-1.6,0.0,0.0,0.4,2.4,38.0,62.4,10
449,36,0,166,68,108,176,365,194,116,-85,...,0.0,16.3,-28.6,0.0,0.0,1.5,1.0,-44.2,-33.2,2
450,32,1,155,55,93,106,386,218,63,54,...,-0.4,12.0,-0.7,0.0,0.0,0.5,2.4,25.0,46.6,1


## Changing the class target variable to be binary: 0 = regular heartbeat, 1 = irregular

In [3]:
arrhythmia_df['class'].value_counts()

1     245
10     50
2      44
6      25
16     22
3      15
4      15
5      13
9       9
15      5
14      4
7       3
8       2
Name: class, dtype: int64

In [4]:
#class output 0 = regular, 1 = irregular
arrhythmia_df['class'] = arrhythmia_df['class'].apply(lambda x: 0 if x==1 else 1)
arrhythmia_df

Unnamed: 0,age,sex,height,weight,QRSduration,PRinterval,Q-Tinterval,Tinterval,Pinterval,QRS,...,chV6_QwaveAmp,chV6_RwaveAmp,chV6_SwaveAmp,chV6_RPwaveAmp,chV6_SPwaveAmp,chV6_PwaveAmp,chV6_TwaveAmp,chV6_QRSA,chV6_QRSTA,class
0,75,0,190,80,91,193,371,174,121,-16,...,0.0,9.0,-0.9,0.0,0.0,0.9,2.9,23.3,49.4,1
1,56,1,165,64,81,174,401,149,39,25,...,0.0,8.5,0.0,0.0,0.0,0.2,2.1,20.4,38.8,1
2,54,0,172,95,138,163,386,185,102,96,...,0.0,9.5,-2.4,0.0,0.0,0.3,3.4,12.3,49.0,1
3,55,0,175,94,100,202,380,179,143,28,...,0.0,12.2,-2.2,0.0,0.0,0.4,2.6,34.6,61.6,0
4,75,0,190,80,88,181,360,177,103,-16,...,0.0,13.1,-3.6,0.0,0.0,-0.1,3.9,25.4,62.8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
447,53,1,160,70,80,199,382,154,117,-37,...,0.0,4.3,-5.0,0.0,0.0,0.7,0.6,-4.4,-0.5,0
448,37,0,190,85,100,137,361,201,73,86,...,0.0,15.6,-1.6,0.0,0.0,0.4,2.4,38.0,62.4,1
449,36,0,166,68,108,176,365,194,116,-85,...,0.0,16.3,-28.6,0.0,0.0,1.5,1.0,-44.2,-33.2,1
450,32,1,155,55,93,106,386,218,63,54,...,-0.4,12.0,-0.7,0.0,0.0,0.5,2.4,25.0,46.6,0


# Check dtypes and missing values

In [5]:
arrhythmia_df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 452 entries, 0 to 451
Data columns (total 280 columns):
 #    Column                        Non-Null Count  Dtype  
---   ------                        --------------  -----  
 0    age                           452 non-null    int64  
 1    sex                           452 non-null    int64  
 2    height                        452 non-null    int64  
 3    weight                        452 non-null    int64  
 4    QRSduration                   452 non-null    int64  
 5    PRinterval                    452 non-null    int64  
 6    Q-Tinterval                   452 non-null    int64  
 7    Tinterval                     452 non-null    int64  
 8    Pinterval                     452 non-null    int64  
 9    QRS                           452 non-null    int64  
 10   T                             452 non-null    object 
 11   P                             452 non-null    object 
 12   QRST                          452 non-null    ob

### Checking for "?" as Nan

In [6]:
#checking if "?" is in df

if '?' in arrhythmia_df.values:
    print('Element exists in Dataframe')
else:
    print('Not in Dataframe')

Element exists in Dataframe


### replace "?" with Nan

In [7]:
arrhythmia_df = arrhythmia_df.replace('?', np.nan)

In [8]:
# making sure "?" have been replaced
if '?' in arrhythmia_df.values:
    print('Element exists in Dataframe')
else:
    print('Not in Dataframe')

Not in Dataframe


### checking for missing values now that '?' has been replace with Nan

In [9]:
arrhythmia_df.isnull().sum().sort_values(ascending=False)

J                             376
P                              22
T                               8
QRST                            1
heartrate                       1
                             ... 
chV2_RPwave                     0
chV2_SPwave                     0
chV2_intrinsicReflecttions      0
chV2_RRwaveExists               0
class                           0
Length: 280, dtype: int64

### remove columns with more than 80% missing data

In [10]:
total = arrhythmia_df.isnull().sum().sort_values(ascending=False)
percent = (arrhythmia_df.isnull().sum()/arrhythmia_df.isnull().count()).sort_values(ascending=False)*100

missing_data=pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data

Unnamed: 0,Total,Percent
J,376,83.185841
P,22,4.867257
T,8,1.769912
QRST,1,0.221239
heartrate,1,0.221239
...,...,...
chV2_RPwave,0,0.000000
chV2_SPwave,0,0.000000
chV2_intrinsicReflecttions,0,0.000000
chV2_RRwaveExists,0,0.000000


In [10]:
# drop columns with more than 80% missing values
perc = 80.0
min_count = int(((100-perc)/100)*arrhythmia_df.shape[0]+1)
drop_arrhythmia_df = arrhythmia_df.dropna(axis=1, thresh=min_count)

In [11]:
#checking for missing values below 80%
drop_arrhythmia_df.isnull().sum().sort_values(ascending=False)

P                             22
T                              8
heartrate                      1
QRST                           1
chAVR_SwaveAmp                 0
                              ..
chV2_RPwave                    0
chV2_SPwave                    0
chV2_intrinsicReflecttions     0
chV2_RRwaveExists              0
class                          0
Length: 279, dtype: int64

### fill remaining missing values in df with mean

In [15]:
# fill the remaining NaN values using fillna with bfill method

#arrhythmia_df = arrhythmia_df.fillna(method='bfill')
#arrhythmia_df = arrhythmia_df.replace(to_replace = np.nan, value = 'mean')

In [16]:
#checking to make sure there are no Nans
#arrhythmia_df.isnull().sum().sort_values(ascending=False)

age                0
chDIII_PwaveAmp    0
chAVR_RwaveAmp     0
chAVR_QwaveAmp     0
chAVR_JJwaveAmp    0
                  ..
chV2_Rwave         0
chV2_Swave         0
chV2_RPwave        0
chV2_SPwave        0
class              0
Length: 279, dtype: int64

### fill remaining missing values using Simple Imputer

In [12]:
# fill missing values with the mean of that column

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(fill_value =  np.nan, strategy='mean')
heart_df = imputer.fit_transform(drop_arrhythmia_df)

In [15]:
#above output returns a numpy array, so convert back into a df
arrhythmia = pd.DataFrame(heart_df, columns = drop_arrhythmia_df.columns)
arrhythmia.head()

Unnamed: 0,age,sex,height,weight,QRSduration,PRinterval,Q-Tinterval,Tinterval,Pinterval,QRS,...,chV6_QwaveAmp,chV6_RwaveAmp,chV6_SwaveAmp,chV6_RPwaveAmp,chV6_SPwaveAmp,chV6_PwaveAmp,chV6_TwaveAmp,chV6_QRSA,chV6_QRSTA,class
0,75.0,0.0,190.0,80.0,91.0,193.0,371.0,174.0,121.0,-16.0,...,0.0,9.0,-0.9,0.0,0.0,0.9,2.9,23.3,49.4,1.0
1,56.0,1.0,165.0,64.0,81.0,174.0,401.0,149.0,39.0,25.0,...,0.0,8.5,0.0,0.0,0.0,0.2,2.1,20.4,38.8,1.0
2,54.0,0.0,172.0,95.0,138.0,163.0,386.0,185.0,102.0,96.0,...,0.0,9.5,-2.4,0.0,0.0,0.3,3.4,12.3,49.0,1.0
3,55.0,0.0,175.0,94.0,100.0,202.0,380.0,179.0,143.0,28.0,...,0.0,12.2,-2.2,0.0,0.0,0.4,2.6,34.6,61.6,0.0
4,75.0,0.0,190.0,80.0,88.0,181.0,360.0,177.0,103.0,-16.0,...,0.0,13.1,-3.6,0.0,0.0,-0.1,3.9,25.4,62.8,1.0


### Categorical Dtypes

In [16]:
arrhythmia.select_dtypes(['object']).columns

Index([], dtype='object')

In [17]:
arrhythmia.describe()

Unnamed: 0,age,sex,height,weight,QRSduration,PRinterval,Q-Tinterval,Tinterval,Pinterval,QRS,...,chV6_QwaveAmp,chV6_RwaveAmp,chV6_SwaveAmp,chV6_RPwaveAmp,chV6_SPwaveAmp,chV6_PwaveAmp,chV6_TwaveAmp,chV6_QRSA,chV6_QRSTA,class
count,452.0,452.0,452.0,452.0,452.0,452.0,452.0,452.0,452.0,452.0,...,452.0,452.0,452.0,452.0,452.0,452.0,452.0,452.0,452.0,452.0
mean,46.471239,0.550885,166.188053,68.170354,88.920354,155.152655,367.207965,169.949115,90.004425,33.676991,...,-0.278982,9.048009,-1.457301,0.003982,0.0,0.514823,1.222345,19.326106,29.47323,0.457965
std,16.466631,0.497955,37.17034,16.590803,15.364394,44.842283,33.385421,35.633072,25.826643,45.431434,...,0.548876,3.472862,2.00243,0.050118,0.0,0.347531,1.426052,13.503922,18.493927,0.498782
min,0.0,0.0,105.0,6.0,55.0,0.0,232.0,108.0,0.0,-172.0,...,-4.1,0.0,-28.6,0.0,0.0,-0.8,-6.0,-44.2,-38.6,0.0
25%,36.0,0.0,160.0,59.0,80.0,142.0,350.0,148.0,79.0,3.75,...,-0.425,6.6,-2.1,0.0,0.0,0.4,0.5,11.45,17.55,0.0
50%,47.0,1.0,164.0,68.0,86.0,157.0,367.0,162.0,91.0,40.0,...,0.0,8.8,-1.1,0.0,0.0,0.5,1.35,18.1,27.9,0.0
75%,58.0,1.0,170.0,79.0,94.0,175.0,384.0,179.0,102.0,66.0,...,0.0,11.2,0.0,0.0,0.0,0.7,2.1,25.825,41.125,1.0
max,83.0,1.0,780.0,176.0,188.0,524.0,509.0,381.0,205.0,169.0,...,0.0,23.6,0.0,0.8,0.0,2.4,6.0,88.8,115.9,1.0


### Standardize data

In [18]:
X = arrhythmia.drop('class', axis=1)
y =(arrhythmia['class'])

#split data into train and test
#standardize data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 24)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [19]:
print(arrhythmia.shape)

(452, 279)


In [33]:
#https://stackoverflow.com/questions/51293196/attributeerror-numpy-ndarray-object-has-no-attribute-drop
new_df = pd.DataFrame(StandardScaler().fit_transform(arrhythmia), columns=arrhythmia.columns, index=arrhythmia.index)

In [34]:
new_df

Unnamed: 0,age,sex,height,weight,QRSduration,PRinterval,Q-Tinterval,Tinterval,Pinterval,QRS,...,chV6_QwaveAmp,chV6_RwaveAmp,chV6_SwaveAmp,chV6_RPwaveAmp,chV6_SPwaveAmp,chV6_PwaveAmp,chV6_TwaveAmp,chV6_QRSA,chV6_QRSTA,class
0,1.734439,-1.107520,0.641327,0.713814,0.135505,0.844945,0.113709,0.113809,1.201469,-1.094661,...,0.508843,-0.013839,0.278621,-0.079546,0.0,1.109553,1.177737,0.294603,1.078670,1.087922
1,0.579312,0.902918,-0.031998,-0.251644,-0.516072,0.420769,1.013301,-0.588564,-1.977064,-0.191203,...,0.508843,-0.157972,0.728573,-0.079546,0.0,-0.906889,0.616126,0.079613,0.504874,1.087922
2,0.457720,-1.107520,0.156533,1.618932,3.197915,0.175193,0.563505,0.422853,0.464980,1.373324,...,0.508843,0.130294,-0.471299,-0.079546,0.0,-0.618826,1.528744,-0.520878,1.057018,1.087922
3,0.518516,-1.107520,0.237332,1.558590,0.721924,1.045871,0.383587,0.254284,2.054247,-0.125096,...,0.508843,0.908612,-0.371310,-0.079546,0.0,-0.330763,0.967133,1.132324,1.739077,-0.919183
4,1.734439,-1.107520,0.641327,0.713814,-0.059968,0.577044,-0.216141,0.198094,0.503742,-1.094661,...,0.508843,1.168051,-1.071235,-0.079546,0.0,-1.771079,1.879751,0.450286,1.804035,1.087922
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
447,0.396924,0.902918,-0.166663,0.110403,-0.581229,0.978896,0.443560,-0.448089,1.046419,-1.557409,...,0.508843,-1.368689,-1.771161,-0.079546,0.0,0.533427,-0.436895,-1.758926,-1.622502,-0.919183
448,-0.575815,-1.107520,0.641327,1.015520,0.721924,-0.405260,-0.186154,0.872372,-0.659136,1.152968,...,0.508843,1.888716,-0.071342,-0.079546,0.0,-0.330763,0.826730,1.384382,1.782383,1.087922
449,-0.636611,-1.107520,-0.005065,-0.010279,1.243185,0.465419,-0.066209,0.675708,1.007656,-2.615116,...,0.508843,2.090502,-13.569901,-0.079546,0.0,2.837932,-0.156089,-4.709483,-3.392610,1.087922
450,-0.879796,0.902918,-0.301328,-0.794714,0.265820,-1.097337,0.563505,1.349985,-1.046762,0.447829,...,-0.220727,0.850959,0.378610,-0.079546,0.0,-0.042700,0.826730,0.420632,0.927101,-0.919183


In [39]:
X = arrhythmia.drop('class', axis=1)
y =(arrhythmia['class'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 24)

In [42]:
# Remove columns containing NaN from X
#X.drop(X.columns[np.isnan(X).any()], axis=1)
X_train.fillna(X_train.mean(), inplace=True)

### Feature Selection using ...

In [43]:
#from sklearn.decomposition import PCA
#feature extraction
#pca = PCA(n_components=10)
#fit = pca.fit(X)

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
#from sklearn.feature_selection import chi2

#change the inputs and output to arrays
X = new_df.drop('class', axis=1).values
y =(new_df['class']).values

test = SelectKBest(score_func=f_classif, k=10)
fit = test.fit(X,y)

  f = msb / msw


In [44]:
from numpy import set_printoptions

# summarize scores
set_printoptions(precision=3)
print(fit.scores_)
features = fit.transform(X)

[7.701e-02 2.366e+01 1.687e+00 4.835e-01 5.466e+01 1.133e+00 3.038e-05
 2.179e+01 4.009e+00 2.517e+00 1.114e-02 9.303e-02 2.981e-01 4.483e+00
 8.323e-01 6.586e-01 1.112e+01 1.265e+00       nan 4.738e+00 1.184e+00
 6.814e-02 6.814e-02 2.380e+00 2.380e+00 4.806e+00 3.100e+00 8.206e-02
 1.659e+01 9.746e-02 1.673e+00 5.186e+00 5.772e-02 5.262e+00 1.184e+00
 1.424e-02 1.424e-02 1.881e+00 1.526e+00 2.189e+00 9.966e-01 4.810e-01
 1.609e+00 4.342e+00 8.446e-01 7.281e-01 8.446e-01 3.651e+00 4.095e-01
 2.380e+00 1.185e+00 1.611e+01 1.990e+00 4.847e+00 8.446e-01 4.643e+00
 1.353e+00 2.380e+00 1.424e-02 2.380e+00 2.380e+00 4.806e+00 7.757e-02
 5.916e+00 7.375e+00 9.092e-05       nan 1.656e+01       nan 9.132e-01
 8.446e-01 1.184e+00 1.882e-01 1.184e+00 1.478e+01 3.258e-01 9.707e+00
 9.639e-01 3.379e+00 9.978e+00 2.380e+00 3.464e-01       nan 8.446e-01
 8.446e-01 1.184e+00 1.177e+01 1.676e+00 1.460e+01 3.760e+01 9.878e-01
 2.916e+01 3.587e+00 8.537e+00 2.860e-02 1.385e+00 6.814e-02 7.248e-02
 2.778