In [1]:
import numpy as np
import pandas as pd
from scipy.stats import skew, kurtosis
from scipy.signal import welch
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
import pickle

In [3]:
# Open the file in binary read mode ('rb') to unpickle the data
with open('batch_of_data.pickle', 'rb') as file:
    loaded_data = pickle.load(file)

In [4]:
loaded_data

[{'I': array([-18, -23, -23, ..., -31, -33, -31], dtype=int16),
  'II': array([-21, -25, -22, ...,  10,  11,  12], dtype=int16),
  'V1': array([ 5,  7,  7, ..., 31, 32, 31], dtype=int16),
  'V2': array([-47, -38, -30, ...,  24,  26,  25], dtype=int16),
  'V3': array([9, 9, 9, ..., 2, 3, 2], dtype=int16),
  'V4': array([ -2,   0,   1, ..., -30, -28, -30], dtype=int16),
  'V5': array([-54, -49, -46, ..., -32, -30, -33], dtype=int16),
  'V6': array([  -1,    2,    3, ..., -131, -129, -132], dtype=int16)},
 {'I': array([-46, -46, -46, ...,  -8,  -5,  -4], dtype=int16),
  'II': array([-34, -34, -34, ...,  84,  86,  86], dtype=int16),
  'V1': array([-50, -50, -50, ..., -80, -80, -85], dtype=int16),
  'V2': array([40, 40, 40, ..., 80, 73, 70], dtype=int16),
  'V3': array([-4, -4, -4, ..., 42, 36, 33], dtype=int16),
  'V4': array([-8, -8, -8, ..., 28, 23, 21], dtype=int16),
  'V5': array([-12, -12, -12, ..., -73, -75, -77], dtype=int16),
  'V6': array([114, 114, 114, ..., 104, 106, 108], dtype

In [5]:
data = []
for patient in loaded_data:
    new_patient_format = np.array(list(patient.values()))
    data.append(new_patient_format)
    print(len(new_patient_format[7]))

5000
5000
5000
5000
5000
5000
5000
5000
5000
5000


In [6]:
data = np.array(data)

In [7]:
data

array([[[  -18,   -23,   -23, ...,   -31,   -33,   -31],
        [  -21,   -25,   -22, ...,    10,    11,    12],
        [    5,     7,     7, ...,    31,    32,    31],
        ...,
        [   -2,     0,     1, ...,   -30,   -28,   -30],
        [  -54,   -49,   -46, ...,   -32,   -30,   -33],
        [   -1,     2,     3, ...,  -131,  -129,  -132]],

       [[  -46,   -46,   -46, ...,    -8,    -5,    -4],
        [  -34,   -34,   -34, ...,    84,    86,    86],
        [  -50,   -50,   -50, ...,   -80,   -80,   -85],
        ...,
        [   -8,    -8,    -8, ...,    28,    23,    21],
        [  -12,   -12,   -12, ...,   -73,   -75,   -77],
        [  114,   114,   114, ...,   104,   106,   108]],

       [[  -18,   -18,   -18, ...,    -1,     1,     5],
        [  -10,   -10,   -10, ...,   -20,   -26,   -35],
        [  -12,   -12,   -12, ...,     5,     7,     8],
        ...,
        [  -16,   -16,   -16, ...,   -26,   -24,   -24],
        [  -12,   -12,   -12, ...,  -109,  -1

In [8]:
# Function to extract features from each lead
def extract_features(lead):
    mean_value = np.mean(lead)
    std_dev = np.std(lead)
    skewness = skew(lead)
    kurt = kurtosis(lead)
    
    # Spectral features using Welch method
    _, psd = welch(lead)
    max_power_freq = np.argmax(psd)
    mean_power = np.mean(psd)
    
    return mean_value, std_dev, skewness, kurt, max_power_freq, mean_power

In [9]:
ecg_data = data

In [10]:
# Apply feature extraction to each lead
lead_features = np.apply_along_axis(lambda lead: np.array(extract_features(lead)), axis=2, arr=ecg_data)

In [11]:
num_leads = 8

In [12]:
num_samples = 10

In [13]:
# Create a DataFrame with feature names
feature_names = [f'Lead{i}_{stat}' for i in range(1, num_leads + 1) for stat in ['Mean', 'StdDev', 'Skew', 'Kurt', 'MaxPowerFreq', 'MeanPower']]
ecg_df = pd.DataFrame(lead_features.reshape(num_samples, -1), columns=feature_names)

In [27]:
# Add the target variable
ecg_df['Severity'] = np.array([0,0,0,2,0,0,0,0,0,0])

In [28]:
ecg_df

Unnamed: 0,Lead1_Mean,Lead1_StdDev,Lead1_Skew,Lead1_Kurt,Lead1_MaxPowerFreq,Lead1_MeanPower,Lead2_Mean,Lead2_StdDev,Lead2_Skew,Lead2_Kurt,...,Lead7_Kurt,Lead7_MaxPowerFreq,Lead7_MeanPower,Lead8_Mean,Lead8_StdDev,Lead8_Skew,Lead8_Kurt,Lead8_MaxPowerFreq,Lead8_MeanPower,Severity
0,-1.5,36.049422,2.314799,4.975224,2.0,2517.79126,-0.2124,32.092885,-1.512631,3.894237,...,1.397434,1.0,1963.283081,-5.2038,66.383961,0.411321,-0.258793,2.0,1314.836914,0
1,-6.5872,39.197245,2.176368,4.454381,2.0,3532.866943,-3.5318,34.415499,-0.984946,2.578938,...,0.132929,1.0,3560.788574,-8.0332,151.243856,0.624826,0.040773,1.0,5510.708496,0
2,0.8286,28.751105,3.332958,12.369288,2.0,1570.428833,-4.57,21.160404,-0.312583,8.796732,...,10.491898,2.0,2001.522705,-8.7316,36.60324,2.446,11.352048,2.0,1928.089478,0
3,8.8842,39.188738,1.969945,5.408691,2.0,2106.440674,-0.6628,44.199017,-0.172189,-0.069797,...,0.246015,1.0,2411.773438,20.9372,157.659876,0.892897,0.857702,1.0,3726.195801,2
4,-3.4822,34.407989,3.142274,10.50917,2.0,2398.953857,-1.289,23.597387,0.073601,5.539113,...,6.671241,2.0,1495.157349,-24.8848,54.073202,0.66264,0.551886,1.0,1655.096558,0
5,-43.4668,47.782772,0.746709,0.919928,1.0,1818.098877,-11.1084,28.829205,-1.109292,3.937247,...,6.624335,2.0,1472.199829,7.067,35.013639,2.379774,8.364886,2.0,2005.720337,0
6,-4.8628,37.550302,2.943368,9.258202,2.0,2686.497803,-1.666,29.40987,-1.194493,4.609127,...,0.181511,2.0,2774.62793,-4.4184,62.746399,1.682007,3.577332,2.0,2740.827148,0
7,2.804,31.411163,3.041112,9.949202,2.0,1793.974487,7.099,25.867137,-1.084414,4.935357,...,9.247506,1.0,2838.928955,292.796,819.526973,-1.386061,15.321463,1.0,172831.984375,0
8,7.6514,33.738498,3.431281,13.120279,2.0,2009.421997,19.398,27.102022,-2.437146,10.232016,...,20.731822,2.0,1647.510254,-15.145,38.187794,1.8213,7.330171,2.0,1676.395996,0
9,4.028,35.662401,3.036944,10.342252,2.0,2349.001953,-2.434,25.29944,-2.249303,8.714789,...,2.963587,1.0,33805.292969,-4.3528,80.671402,-0.26428,1.733284,1.0,2917.19165,0


In [29]:
severity_labels = ecg_df['Severity']

In [30]:
# Prepare the features and target variable
X = ecg_df.drop(['Severity'], axis=1)
y = ecg_df['Severity']

In [18]:
X

Unnamed: 0,Lead1_Mean,Lead1_StdDev,Lead1_Skew,Lead1_Kurt,Lead1_MaxPowerFreq,Lead1_MeanPower,Lead2_Mean,Lead2_StdDev,Lead2_Skew,Lead2_Kurt,...,Lead7_Skew,Lead7_Kurt,Lead7_MaxPowerFreq,Lead7_MeanPower,Lead8_Mean,Lead8_StdDev,Lead8_Skew,Lead8_Kurt,Lead8_MaxPowerFreq,Lead8_MeanPower
0,-1.5,36.049422,2.314799,4.975224,2.0,2517.79126,-0.2124,32.092885,-1.512631,3.894237,...,-0.416187,1.397434,1.0,1963.283081,-5.2038,66.383961,0.411321,-0.258793,2.0,1314.836914
1,-6.5872,39.197245,2.176368,4.454381,2.0,3532.866943,-3.5318,34.415499,-0.984946,2.578938,...,0.347672,0.132929,1.0,3560.788574,-8.0332,151.243856,0.624826,0.040773,1.0,5510.708496
2,0.8286,28.751105,3.332958,12.369288,2.0,1570.428833,-4.57,21.160404,-0.312583,8.796732,...,2.13704,10.491898,2.0,2001.522705,-8.7316,36.60324,2.446,11.352048,2.0,1928.089478
3,8.8842,39.188738,1.969945,5.408691,2.0,2106.440674,-0.6628,44.199017,-0.172189,-0.069797,...,0.685994,0.246015,1.0,2411.773438,20.9372,157.659876,0.892897,0.857702,1.0,3726.195801
4,-3.4822,34.407989,3.142274,10.50917,2.0,2398.953857,-1.289,23.597387,0.073601,5.539113,...,1.233673,6.671241,2.0,1495.157349,-24.8848,54.073202,0.66264,0.551886,1.0,1655.096558
5,-43.4668,47.782772,0.746709,0.919928,1.0,1818.098877,-11.1084,28.829205,-1.109292,3.937247,...,2.111958,6.624335,2.0,1472.199829,7.067,35.013639,2.379774,8.364886,2.0,2005.720337
6,-4.8628,37.550302,2.943368,9.258202,2.0,2686.497803,-1.666,29.40987,-1.194493,4.609127,...,-0.497194,0.181511,2.0,2774.62793,-4.4184,62.746399,1.682007,3.577332,2.0,2740.827148
7,2.804,31.411163,3.041112,9.949202,2.0,1793.974487,7.099,25.867137,-1.084414,4.935357,...,1.410819,9.247506,1.0,2838.928955,292.796,819.526973,-1.386061,15.321463,1.0,172831.984375
8,7.6514,33.738498,3.431281,13.120279,2.0,2009.421997,19.398,27.102022,-2.437146,10.232016,...,3.909561,20.731822,2.0,1647.510254,-15.145,38.187794,1.8213,7.330171,2.0,1676.395996
9,4.028,35.662401,3.036944,10.342252,2.0,2349.001953,-2.434,25.29944,-2.249303,8.714789,...,1.242876,2.963587,1.0,33805.292969,-4.3528,80.671402,-0.26428,1.733284,1.0,2917.19165


In [19]:
y

0    1
1    1
2    0
3    0
4    0
5    1
6    0
7    0
8    1
9    0
Name: Severity, dtype: int32

In [32]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)


In [33]:
X_train

Unnamed: 0,Lead1_Mean,Lead1_StdDev,Lead1_Skew,Lead1_Kurt,Lead1_MaxPowerFreq,Lead1_MeanPower,Lead2_Mean,Lead2_StdDev,Lead2_Skew,Lead2_Kurt,...,Lead7_Skew,Lead7_Kurt,Lead7_MaxPowerFreq,Lead7_MeanPower,Lead8_Mean,Lead8_StdDev,Lead8_Skew,Lead8_Kurt,Lead8_MaxPowerFreq,Lead8_MeanPower
1,-6.5872,39.197245,2.176368,4.454381,2.0,3532.866943,-3.5318,34.415499,-0.984946,2.578938,...,0.347672,0.132929,1.0,3560.788574,-8.0332,151.243856,0.624826,0.040773,1.0,5510.708496
5,-43.4668,47.782772,0.746709,0.919928,1.0,1818.098877,-11.1084,28.829205,-1.109292,3.937247,...,2.111958,6.624335,2.0,1472.199829,7.067,35.013639,2.379774,8.364886,2.0,2005.720337
0,-1.5,36.049422,2.314799,4.975224,2.0,2517.79126,-0.2124,32.092885,-1.512631,3.894237,...,-0.416187,1.397434,1.0,1963.283081,-5.2038,66.383961,0.411321,-0.258793,2.0,1314.836914
7,2.804,31.411163,3.041112,9.949202,2.0,1793.974487,7.099,25.867137,-1.084414,4.935357,...,1.410819,9.247506,1.0,2838.928955,292.796,819.526973,-1.386061,15.321463,1.0,172831.984375
2,0.8286,28.751105,3.332958,12.369288,2.0,1570.428833,-4.57,21.160404,-0.312583,8.796732,...,2.13704,10.491898,2.0,2001.522705,-8.7316,36.60324,2.446,11.352048,2.0,1928.089478
9,4.028,35.662401,3.036944,10.342252,2.0,2349.001953,-2.434,25.29944,-2.249303,8.714789,...,1.242876,2.963587,1.0,33805.292969,-4.3528,80.671402,-0.26428,1.733284,1.0,2917.19165
4,-3.4822,34.407989,3.142274,10.50917,2.0,2398.953857,-1.289,23.597387,0.073601,5.539113,...,1.233673,6.671241,2.0,1495.157349,-24.8848,54.073202,0.66264,0.551886,1.0,1655.096558
3,8.8842,39.188738,1.969945,5.408691,2.0,2106.440674,-0.6628,44.199017,-0.172189,-0.069797,...,0.685994,0.246015,1.0,2411.773438,20.9372,157.659876,0.892897,0.857702,1.0,3726.195801
6,-4.8628,37.550302,2.943368,9.258202,2.0,2686.497803,-1.666,29.40987,-1.194493,4.609127,...,-0.497194,0.181511,2.0,2774.62793,-4.4184,62.746399,1.682007,3.577332,2.0,2740.827148


In [34]:
y_train

1    0
5    0
0    0
7    0
2    0
9    0
4    0
3    2
6    0
Name: Severity, dtype: int32

In [35]:
# Train a Random Forest classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [36]:
# Make predictions on the test set
y_pred = clf.predict(X_test)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [37]:
y_overfit = clf.predict(X_train)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [38]:
# Evaluate the model
accuracy = accuracy_score(y_train, y_overfit)
print(f"Accuracy: {accuracy}")
accuracy2 = accuracy_score(y_test, y_pred)
print(f"Accuracy2: {accuracy2}")

Accuracy: 1.0
Accuracy2: 1.0


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
