In [2]:
%pip install PyWavelets
%pip install pandas
%pip install scipy

Collecting PyWavelets
  Using cached pywavelets-1.8.0-cp310-cp310-win_amd64.whl.metadata (9.0 kB)
Collecting numpy<3,>=1.23 (from PyWavelets)
  Using cached numpy-2.2.3-cp310-cp310-win_amd64.whl.metadata (60 kB)
Using cached pywavelets-1.8.0-cp310-cp310-win_amd64.whl (4.2 MB)
Using cached numpy-2.2.3-cp310-cp310-win_amd64.whl (12.9 MB)
Installing collected packages: numpy, PyWavelets
Successfully installed PyWavelets-1.8.0 numpy-2.2.3
Note: you may need to restart the kernel to use updated packages.
Collecting pandas
  Using cached pandas-2.2.3-cp310-cp310-win_amd64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.2.3-cp310-cp310-win_amd64.whl (11.6 MB)
Using cached pytz-2025.1-py2.py3-none-any.whl (507 kB)
Using cached tzdata-2025.1-py2.py3-none-any.whl (346 kB)
Installing collected 

In [3]:
import pandas as pd
import numpy as np
import os
import shutil
import sys
import time
from scipy import signal
import pywt

In [4]:
label = pd.read_csv("labels_original.csv", header = 0, names = ('patient', 'diagonsis'))
patient_array = np.array(label['patient'])
diagonsis_array = np.array(label['diagonsis'])

In [6]:
def baseline_corr(array):
    # Function for use in baseline removal
    def wrcoef(X, coef_type, coef, wavename, level):
        N = np.array(X).size
        a, ds = coef[0], list(reversed(coef[1:]))

        if coef_type =='a':
            return pywt.upcoef('a', a, wavename, level=level)[:N]
        elif coef_type == 'd':
            return pywt.upcoef('d', ds[level-1], wavename, level=level)[:N]
        else:
            raise ValueError("Invalid coefficient type: {}".format(coef_type))
    #Baseline correction variables
    wavename = 'db1'
    #Baseline correction
    coef = pywt.wavedec(array, wavename, level=10)
    A10 = wrcoef(array, 'a', coef, wavename, 10)
    D10 = wrcoef(array, 'd', coef, wavename, 10)
    D9 = wrcoef(array, 'd', coef, wavename, 9)
    D8 = wrcoef(array, 'd', coef, wavename, 8)
    D7 = wrcoef(array, 'd', coef, wavename, 7)
    D6 = wrcoef(array, 'd', coef, wavename, 6)
    D5 = wrcoef(array, 'd', coef, wavename, 5)
    D4 = wrcoef(array, 'd', coef, wavename, 4)
    D3 = wrcoef(array, 'd', coef, wavename, 3)
    D2 = wrcoef(array, 'd', coef, wavename, 2)
    D1 = wrcoef(array, 'd', coef, wavename, 1)
    array = D10 + D9 + D8 + D7 + D6 + D5 + D4 + D3 + D2 + D1
    return array

In [7]:
def wavelet_denoise(data, wavelet='sym7', level=4, mode='soft'): #Update
    coeffs = pywt.wavedec(data, wavelet, level=level)
    threshold = np.sqrt(2 * np.log(len(data))) * np.median(np.abs(coeffs[-level])) / 0.6745
    coeffs[1:] = [pywt.threshold(c, threshold, mode=mode) for c in coeffs[1:]]
    return pywt.waverec(coeffs, wavelet)

In [8]:
#Loop through each file in the database
patient = []
diagonsis = []
original_file = []
signal_num = 0
if os.path.exists('PTB_processed'):
    shutil.rmtree('PTB_processed')
os.makedirs('PTB_processed')
for i in range (len(patient_array)): #For each patient
    if patient_array[i] < 10:
        p_string = '00' + str(patient_array[i])
    elif patient_array[i] < 100:
        p_string = '0' + str(patient_array[i])
    else:
        p_string = str(patient_array[i])
    #p_number = 'patient' + p_string
    filepath = "PTB_original/patient" + p_string
    if os.path.isdir(filepath) == True: #Check if patient folder exists
         for filename in os.listdir(filepath): #For each file in patient, load file
            start_time = time.time()
            data = pd.read_csv(filepath + '/' + filename, header = 0, skiprows = 1, usecols = range(1,13), engine = 'python')
            patient.append(i + 1)
            original_file.append(filename)
            diagonsis.append(diagonsis_array[i])
            signal_num += 1
            if signal_num < 10:
                mod_signal = '00' + str(signal_num)
            elif signal_num < 100:
                mod_signal = '0' + str(signal_num)
            else:
                mod_signal = str(signal_num)
            file_array = []
            for column in data: # Processing
                df = np.array(data[column])
                df = baseline_corr(df)
                df = wavelet_denoise(df) # Updated
                df = df[0:30000]
                df = signal.resample(df, 7500)
                file_array.append(df)
            newdata = pd.concat([pd.Series(file_array[0]),
                                 pd.Series(file_array[1]),
                                 pd.Series(file_array[2]),
                                 pd.Series(file_array[3]),
                                 pd.Series(file_array[4]),
                                 pd.Series(file_array[5]),
                                 pd.Series(file_array[6]),
                                 pd.Series(file_array[7]),
                                 pd.Series(file_array[8]),
                                 pd.Series(file_array[9]),
                                 pd.Series(file_array[10]),
                                 pd.Series(file_array[11]),], axis = 1)
            newdata.columns = ['i','ii','iii','avr','avl','avf','v1','v2','v3','v4','v5','v6']
            newdata.to_csv('PTB_processed/record' + mod_signal + '_' + p_string + '_' + str(diagonsis_array[i]) + '.csv', index = False)
    completion = (i / len(patient_array)) * 100
    end_time = time.time()
    elasped_time = round(end_time - start_time, 2)
    sys.stdout.write('\r' + p_string + ' loaded - ' + str(completion) + '% completed' + ' Took : '+ str(elasped_time) + ' seconds')


label_df = pd.concat([pd.Series(patient), pd.Series(original_file), pd.Series(diagonsis)], axis = 1)
label_df.columns = ['patient', 'original_file', 'diagonsis']
label_df.to_csv('labels.csv')

print('')
print ('Done: ' + str(signal_num) + ' signals processed and loaded')

094 loaded - 46.5% completed Took : 0.77 seconds: 0.49 seconds

  thresholded = (1 - value/magnitude)


294 loaded - 99.5% completed Took : 0.53 seconds 0.49 seconds
Done: 448 signals processed and loaded


In [9]:
from distutils.sysconfig import get_python_lib
print(get_python_lib())

c:\Users\user\Desktop\Folders\SIT STUFF\Year 3\ML\opopopopopopop\PTB\.venv\Lib\site-packages


  from distutils.sysconfig import get_python_lib
  from distutils.sysconfig import get_python_lib
