In [5]:
import numpy as np
import pywt  # PyWavelets: discrete wavelet decomposition
from biosppy.signals import ecg  # Biosppy is a library for biosignal processing
from scipy import signal
from scipy.fft import fft
from scipy.signal import find_peaks
from scipy.stats import skew, kurtosis

In [6]:
import pickle

In [4]:
# Open the file in binary read mode ('rb') to unpickle the data.
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# files that come from a trusted source.
with open('batch_of_data.pickle', 'rb') as file:
    # Bind to `loaded_data` (single underscore): that is the name the
    # patient-conversion loop iterates over.
    loaded_data = pickle.load(file)

NameError: name 'pickle' is not defined

In [None]:
# Convert each patient record (a dict) into an array of its values, kept in the
# dict's own key order.
ecg_data = [np.array(list(patient.values())) for patient in loaded_data]

# Sanity check: report the distinct lengths of entry 7 across all patients in
# one line, instead of printing one line per patient.
# NOTE(review): entry 7 is presumably the last of 8 leads -- confirm against
# the schema of the pickled records.
print(sorted({len(patient_entries[7]) for patient_entries in ecg_data}))

In [None]:
# Stack the per-patient arrays into one ndarray. This rebinds `ecg_data` from a
# Python list to an array.
ecg_data = np.array(ecg_data)

In [None]:
# Inspect the dimensions rather than dumping the array contents.
ecg_data.shape

In [7]:
# Dimensions of the ECG array built in the next cell:
# (patients, leads, samples per lead).
num_patients = 23_292
num_leads = 8
num_time_points = 5_000

In [8]:
# Synthetic stand-in data, seeded for repeatability.
# NOTE: this rebinds `ecg_data`, replacing whatever the earlier cells loaded
# from the pickle. The signals are pure Gaussian noise and the labels are drawn
# independently of them from {0, 1, 2}, so a classifier trained on this data
# has nothing real to learn.
# At the configured sizes the float64 array holds 23292 * 8 * 5000 values
# (about 7.5 GB).
np.random.seed(42)
ecg_shape = (num_patients, num_leads, num_time_points)
ecg_data = np.random.standard_normal(size=ecg_shape)
labels = np.random.randint(low=0, high=3, size=num_patients)

In [None]:
# Function to extract features from each lead
def extract_features(lead, sampling_rate=500):
    """Build a flat feature vector for a single ECG lead.

    Parameters
    ----------
    lead : array-like, 1-D
        Samples of one ECG lead.
    sampling_rate : float, optional
        Sampling frequency of ``lead`` in Hz (default 500, the value that was
        previously hard-coded).

    Returns
    -------
    numpy.ndarray
        1-D array laid out as::

            [mean, median, std, skewness, kurtosis,
             rr_mean, heart_rate, dominant_frequency, spectral_entropy,
             *coeffs[0], *coeffs[1], *coeffs[2], *coeffs[3]]

        where ``coeffs`` is the level-4 'db1' wavelet decomposition of the
        lead. ``rr_mean`` is expressed in samples and ``heart_rate`` in beats
        per minute. Both are NaN when fewer than two R-peaks are found.
        The total length depends on ``len(lead)`` through the wavelet part.
    """
    lead = np.asarray(lead, dtype=float)

    # 1. Statistical features
    mean_value = np.mean(lead)
    median_value = np.median(lead)
    std_dev_value = np.std(lead)
    skewness_value = skew(lead)
    kurtosis_value = kurtosis(lead)

    # 2. Time-domain features
    # biosppy's ecg.ecg returns a multi-field result (timestamps, filtered
    # signal, R-peaks, templates, heart rate, ...), so the R-peak indices are
    # selected by name instead of unpacking into two variables.
    # show=False: this function runs once per lead per patient and must not
    # open a summary plot on every call.
    ecg_summary = ecg.ecg(lead, sampling_rate=sampling_rate, show=False)
    rpeaks = np.asarray(ecg_summary['rpeaks'])
    rr_interval = np.diff(rpeaks)  # R-R distances, in samples

    if rr_interval.size:
        rr_mean = np.mean(rr_interval)
        # Convert samples per beat to beats per minute:
        # 60 s/min divided by (rr_mean / sampling_rate) s/beat.
        heart_rate = 60.0 * sampling_rate / rr_mean
    else:
        # Fewer than two R-peaks: no interval to average.
        rr_mean = np.nan
        heart_rate = np.nan

    # 3. Frequency-domain features
    # Welch averages periodograms over segments, so the PSD has
    # nperseg // 2 + 1 bins (129 with SciPy's default nperseg=256 when the
    # lead is at least that long), not num_time_points // 2 + 1.
    f, psd = signal.welch(lead, fs=sampling_rate)
    dominant_frequency = f[np.argmax(psd)]

    # Shannon entropy needs a probability distribution, so normalise the PSD
    # to unit sum first. An all-zero PSD is left as is and yields entropy 0.
    psd_total = np.sum(psd)
    psd_norm = psd / psd_total if psd_total > 0 else psd
    spectral_entropy = -np.sum(psd_norm * np.log2(psd_norm + 1e-10))

    # 4. Wavelet transform features (PyWavelets)
    # wavedec returns [cA4, cD4, cD3, cD2, cD1]; the first four arrays are
    # used, so the finest detail level (coeffs[4]) is left out.
    coeffs = pywt.wavedec(lead, 'db1', level=4)

    # Combine all features into a single array
    extracted_features = np.array([
        mean_value, median_value, std_dev_value, skewness_value, kurtosis_value,
        rr_mean, heart_rate, dominant_frequency, spectral_entropy,
        *coeffs[0], *coeffs[1], *coeffs[2], *coeffs[3]
    ])

    return extracted_features

# --- Notes on further feature ideas (not implemented; nothing below runs) ---

# Alternative single-level wavelet decomposition:
#    (cA, cD) = pywt.dwt(lead, 'db1')

# 5. Heart Rate Variability Features
# The RR intervals are already calculated in extract_features; various HRV
# features can be derived from them.

# 6. Dynamical Features
# These might need a dynamic model or dynamic time warping techniques.

# 7. Deep Learning Representations
# Use pre-trained models, or train your own CNN/RNN on the ECG data.

# 8. Principal Component Analysis (PCA)
#    from sklearn.decomposition import PCA
#
#    flattened_data = np.reshape(ecg_data, (num_patients, -1))
#    pca = PCA(n_components=10)  # Adjust the number of components
#    pca_features = pca.fit_transform(flattened_data)

# 9. Cross-Lead Features
#    lead_correlations = np.zeros((num_patients, num_leads, num_leads))
#    for i in range(num_patients):
#        for j in range(num_leads):
#            for k in range(num_leads):
#                lead_correlations[i, j, k] = np.corrcoef(ecg_data[i, j, :], ecg_data[i, k, :])[0, 1]

In [None]:
# Pre-allocate the feature matrix: one row per patient, with the feature
# vectors of all leads laid side by side. The per-lead vector length is
# measured by running the extractor once on the first lead of the first patient.
features_per_lead = len(extract_features(ecg_data[0, 0, :]))
patient_features = np.zeros((num_patients, features_per_lead * num_leads))

In [None]:
# Apply feature extraction for each lead and concatenate features for each patient.
for patient_index in range(num_patients):
    # One row per lead. Stacking the extracted vectors directly sizes the
    # per-patient array, so no extra extract_features call is spent on every
    # patient just to measure the feature length.
    patient_lead_features = np.stack([
        extract_features(ecg_data[patient_index, lead_index, :])
        for lead_index in range(num_leads)
    ])

    # Concatenate features for the current patient (lead 0 first, then lead 1, ...).
    patient_features[patient_index, :] = patient_lead_features.ravel()

In [None]:
# Target vector for the classifier: the per-patient labels.
y = labels

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

In [None]:
# patient_features is the 2-D feature matrix (patients x features) and y holds
# the target labels.
# Hold out 20% of the patients as the test set; the fixed random_state keeps
# the split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    patient_features,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Shape of the full feature matrix (patients x features). The previous
# expression referenced `lead_features`, a name that is never defined in this
# notebook.
patient_features.shape

In [None]:
# Standardize the features. The scaler learns its statistics from the training
# split only, and the same fitted transform is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Train a Random Forest classifier on the scaled training split:
# 100 trees, with random_state fixed for repeatability.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_scaled, y_train)

In [None]:
# Make predictions on the held-out test set (scaled with the scaler that was
# fitted on the training split)
y_pred = rf_classifier.predict(X_test_scaled)

In [None]:
# Evaluate the model on the test split: overall accuracy plus the text report
# produced by scikit-learn's classification_report.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

In [None]:
# Show the accuracy and the classification report computed in the evaluation cell.
print("Accuracy:", accuracy)
print("Classification Report:\n", report)