- Load the PTB-XL data
- Explore the data to get a sense of it, and prepare it for model training
- Run the train-model script on the dataset
- Get some results

In [1]:
%%capture
!pip install GitPython
from git import Repo

In [2]:
import os
from getpass import getpass

# SECURITY: never embed an access token in the notebook. The token is read from
# the GITHUB_TOKEN environment variable, or prompted for without echo.
# Any token that was previously committed in this cell must be treated as
# leaked and revoked on GitHub.
HTTPS_REMOTE_URL = 'https://github.com/Antony-gitau/physionet-challenge-2024-dsail.git'
DEST_NAME = 'ecg_classification'

if os.path.isdir(os.path.join(DEST_NAME, '.git')):
    # Already cloned (e.g. the cell was re-run): reuse the existing checkout
    # instead of failing because the destination folder is not empty.
    cloned_repo = Repo(DEST_NAME)
else:
    _github_token = os.environ.get('GITHUB_TOKEN') or getpass('GitHub personal access token: ')
    # Inject the credentials only into the URL handed to git.
    _authenticated_url = HTTPS_REMOTE_URL.replace('https://', f'https://{_github_token}@', 1)
    cloned_repo = Repo.clone_from(_authenticated_url, DEST_NAME)

In [3]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json

kaggle.json


In [4]:
# Download the PTB-XL dataset archive from Kaggle into the current working
# directory (the saved output shows a ~1.72 GB zip landing in /content).
!kaggle datasets download -d antonymgitau/ptb-xl-public-electrocardiography-dataset

Dataset URL: https://www.kaggle.com/datasets/antonymgitau/ptb-xl-public-electrocardiography-dataset
License(s): unknown
Downloading ptb-xl-public-electrocardiography-dataset.zip to /content
 99% 1.71G/1.72G [00:19<00:00, 130MB/s]
100% 1.72G/1.72G [00:19<00:00, 94.1MB/s]


In [5]:
import os, zipfile

# Create the folder the dataset will be extracted into.
# Only "already exists" is expected and tolerated; any other failure
# (missing parent, permissions, ...) is allowed to surface instead of being
# misreported by a bare except.
try:
  os.mkdir("/content/data/")
except FileExistsError:
  print("data folder already exists")

In [6]:
import os, zipfile

# Extract every .zip archive found in dir_name into target_dir, deleting each
# archive once it has been extracted.
dir_name = "/content/"
target_dir = "/content/data/"
extension = ".zip"

os.chdir(dir_name) # change directory from working dir to dir with files

for item in os.listdir(dir_name): # loop through items in dir
    if not item.endswith(extension): # skip anything that is not a ".zip"
        continue
    file_name = os.path.join(dir_name, item) # full path of the archive
    # The context manager closes the archive even if extraction raises, and
    # the archive is removed only after a successful extraction.
    with zipfile.ZipFile(file_name) as zip_ref:
        zip_ref.extractall(target_dir)
    os.remove(file_name) # delete zipped file

In [7]:
# Work from inside the cloned challenge repo so its scripts can be invoked by
# relative name in the cells below; print the directory to confirm.
os.chdir("/content/ecg_classification")
!pwd

/content/ecg_classification


In [8]:
# Pull the functions of the PhysioNet 2024 challenge repo into the notebook namespace.
# NOTE(review): every later step visible in this notebook runs these scripts via
# `!python ...`, so these imports may be unnecessary. Wildcard imports from five
# modules can also silently shadow one another; consider explicit imports. TODO confirm.
from ecg_classification.helper_code import *
from ecg_classification.train_model  import *
from ecg_classification.run_model  import *
from ecg_classification.team_code  import *
from ecg_classification.prepare_ptbxl_data  import *

## Run the `prepare_ptbxl_data.py` script on the dataset

In [9]:
!python prepare_ptbxl_data.py -i "/content/data/ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.3" -d "/content/data/ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.3/ptbxl_database.csv" -s "/content/data/ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.3/scp_statements.csv" -o "/content/data/preped_data"

### Some analysis of `ptbxl_database.csv`, which holds all the metadata

In [10]:
import pandas as pd

# Read the PTB-XL metadata table and preview its first rows.
PTBXL_DATABASE_CSV = "/content/data/ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.3/ptbxl_database.csv"
ptbxl_db = pd.read_csv(PTBXL_DATABASE_CSV)
ptbxl_db.head()

Unnamed: 0,ecg_id,patient_id,age,sex,height,weight,nurse,site,device,recording_date,...,validated_by_human,baseline_drift,static_noise,burst_noise,electrodes_problems,extra_beats,pacemaker,strat_fold,filename_lr,filename_hr
0,1,15709.0,56.0,1,,63.0,2.0,0.0,CS-12 E,1984-11-09 09:17:34,...,True,,", I-V1,",,,,,3,records100/00000/00001_lr,records500/00000/00001_hr
1,2,13243.0,19.0,0,,70.0,2.0,0.0,CS-12 E,1984-11-14 12:55:37,...,True,,,,,,,2,records100/00000/00002_lr,records500/00000/00002_hr
2,3,20372.0,37.0,1,,69.0,2.0,0.0,CS-12 E,1984-11-15 12:49:10,...,True,,,,,,,5,records100/00000/00003_lr,records500/00000/00003_hr
3,4,17014.0,24.0,0,,82.0,2.0,0.0,CS-12 E,1984-11-15 13:44:57,...,True,", II,III,AVF",,,,,,3,records100/00000/00004_lr,records500/00000/00004_hr
4,5,17448.0,19.0,1,,70.0,2.0,0.0,CS-12 E,1984-11-17 10:43:15,...,True,", III,AVR,AVF",,,,,,4,records100/00000/00005_lr,records500/00000/00005_hr


In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
ptbxl_db.describe(include="all")

Unnamed: 0,ecg_id,patient_id,age,sex,height,weight,nurse,site,device,recording_date,...,validated_by_human,baseline_drift,static_noise,burst_noise,electrodes_problems,extra_beats,pacemaker,strat_fold,filename_lr,filename_hr
count,21799.0,21799.0,21799.0,21799.0,6974.0,9421.0,20326.0,21782.0,21799,21799,...,21799,1598,3260,613,30,1949,291,21799.0,21799,21799
unique,,,,,,,,,11,21795,...,2,317,124,103,14,128,4,,21799,21799
top,,,,,,,,,CS100 3,1992-02-06 11:47:42,...,True,", V6",", I-AVR,",alles,V6,1ES,"ja, pacemaker",,records100/00000/00001_lr,records500/00000/00001_hr
freq,,,,,,,,,6140,2,...,16056,221,953,140,8,405,285,,1,1
mean,10926.658379,11250.156521,62.769301,0.47915,166.702323,70.995223,2.291745,1.544945,,,...,,,,,,,,5.503005,,
std,6302.393366,6235.026404,32.308813,0.499577,10.867321,15.878803,3.254033,4.172883,,,...,,,,,,,,2.874948,,
min,1.0,302.0,2.0,0.0,6.0,5.0,0.0,0.0,,,...,,,,,,,,1.0,,
25%,5469.5,5974.5,50.0,0.0,160.0,60.0,0.0,0.0,,,...,,,,,,,,3.0,,
50%,10926.0,11419.0,62.0,0.0,166.0,70.0,1.0,1.0,,,...,,,,,,,,6.0,,
75%,16386.5,16607.5,72.0,1.0,174.0,80.0,3.0,2.0,,,...,,,,,,,,8.0,,


In [None]:
# Column names, non-null counts and dtypes of the metadata table.
ptbxl_db.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21799 entries, 0 to 21798
Data columns (total 28 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   ecg_id                        21799 non-null  int64  
 1   patient_id                    21799 non-null  float64
 2   age                           21799 non-null  float64
 3   sex                           21799 non-null  int64  
 4   height                        6974 non-null   float64
 5   weight                        9421 non-null   float64
 6   nurse                         20326 non-null  float64
 7   site                          21782 non-null  float64
 8   device                        21799 non-null  object 
 9   recording_date                21799 non-null  object 
 10  report                        21799 non-null  object 
 11  scp_codes                     21799 non-null  object 
 12  heart_axis                    13331 non-null  object 
 13  i

# Generate synthetic images

In [11]:
# Clone the ECG image generator (ecg-image-kit) into ./ecg_generate.
imagen_repo = 'https://github.com/alphanumericslab/ecg-image-kit.git'
imagen_folder = 'ecg_generate'

if os.path.isdir(os.path.join(imagen_folder, '.git')):
    # Already cloned (e.g. the cell was re-run): reuse the checkout rather
    # than failing because the destination folder is not empty.
    cloned_repo = Repo(imagen_folder)
else:
    cloned_repo = Repo.clone_from(imagen_repo, imagen_folder)

In [12]:
# Move into the image generator's folder so that requirements.txt and
# gen_ecg_images_from_data_batch.py can be referenced by relative name below.
os.chdir("/content/ecg_classification/ecg_generate/codes/ecg-image-generator")

In [None]:
# List the folder's contents to confirm the clone and see the available scripts.
!ls

config.yaml		 extract_leads.py		    helper_functions.py  template1.json
CreasesWrinkles		 Fonts				    ImageAugmentation	 template2.json
documentation		 gen_ecg_image_from_data.py	    README.md		 TemplateFiles
ecg_plot.py		 gen_ecg_images_from_data_batch.py  requirements.txt
environment_droplet.yml  HandwrittenText		    SampleData


In [13]:
%%capture
!pip install -r requirements.txt

In [30]:
# Show which GPU, if any, the runtime exposes.
!nvidia-smi

Fri May 10 09:50:14 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

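The cell below took about 34 minutes to run on a Colab runtime with a Tesla T4 GPU. Note that the TensorFlow log in its output reports that no CUDA drivers were found, so the GPU may not actually have been used.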

In [14]:
# Generate ECG images for the records under records100/00000, writing them to
# processed_records100/00000.
# NOTE(review): --print_header and --bbox presumably print the header text on
# the image and emit bounding boxes; confirm against the ecg-image-kit README.
!python gen_ecg_images_from_data_batch.py -i "/content/data/preped_data/records100/00000" -o "/content/data/preped_data/processed_records100/00000" --print_header --bbox


2024-05-10 08:29:21.690343: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-10 08:29:21.753669: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-10 08:29:21.753727: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-10 08:29:21.753784: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-10 08:29:21.767088: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-10 08:29:21.767446: I tensorflow/core/platform/cpu_feature_guard.cc:182] This Tens

## Add the image filenames to the prepared data

In [15]:
# Return to the challenge repo root, from which the remaining challenge
# scripts are invoked by relative name.
os.chdir("/content/ecg_classification")

In [16]:
# Run add_image_filenames.py with the prepared records as input and the
# generated-image folder as output.
# Presumably this records each image's filename in the matching record header; TODO confirm.
!python add_image_filenames.py -i "/content/data/preped_data/records100/00000"  -o "/content/data/preped_data/processed_records100/00000"

# Then we create a hidden set for inference

In [17]:
# Run remove_hidden_data.py over the processed records and write the result
# to hidden_records100, which is used as model input at inference time.
# Presumably this strips the labels/signals a model must not see; TODO confirm.
! python remove_hidden_data.py -i "/content/data/preped_data/processed_records100" -o "/content/data/preped_data/hidden_records100"

# Train the classification model

In [None]:
# Create the folder the trained model is saved to.
# Only "already exists" is expected and tolerated; other failures surface
# instead of being misreported by a bare except.
try:
  os.mkdir("/content/models")
except FileExistsError:
  print("model folder already created")

In [24]:
# Train on the processed records (-d) and save the model under /content/models (-m).
# NOTE(review): `-v '--verbose'` looks like the same flag passed twice (short
# and long form); a single `-v` is probably sufficient. TODO confirm against
# train_model.py's argument parser.
!python train_model.py -d "/content/data/preped_data/processed_records100/00000" -m "/content/models" -v '--verbose'

Training the digitization model...
Finding the Challenge data...
Extracting features and labels from the data...
-   1/987: 00001_lr...
-   2/987: 00002_lr...
-   3/987: 00003_lr...
-   4/987: 00004_lr...
-   5/987: 00005_lr...
-   6/987: 00006_lr...
-   7/987: 00007_lr...
-   8/987: 00008_lr...
-   9/987: 00009_lr...
-  10/987: 00010_lr...
-  11/987: 00011_lr...
-  12/987: 00012_lr...
-  13/987: 00013_lr...
-  14/987: 00014_lr...
-  15/987: 00015_lr...
-  16/987: 00016_lr...
-  17/987: 00017_lr...
-  18/987: 00018_lr...
-  19/987: 00019_lr...
-  20/987: 00020_lr...
-  21/987: 00021_lr...
-  22/987: 00022_lr...
-  23/987: 00023_lr...
-  24/987: 00024_lr...
-  25/987: 00025_lr...
-  26/987: 00026_lr...
-  27/987: 00027_lr...
-  28/987: 00028_lr...
-  29/987: 00029_lr...
-  30/987: 00030_lr...
-  31/987: 00031_lr...
-  32/987: 00032_lr...
-  33/987: 00033_lr...
-  34/987: 00034_lr...
-  35/987: 00035_lr...
-  36/987: 00036_lr...
-  37/987: 00037_lr...
-  38/987: 00038_lr...
-  39/987: 00

# Evaluate the model

In [25]:
# Create the folder the model's predictions are written to.
# Only "already exists" is expected and tolerated; other failures surface
# instead of being misreported by a bare except.
try:
  os.mkdir("/content/test_outputs")
except FileExistsError:
  print("test output folder already created")

In [26]:
# Run the trained model (-m) on the hidden records (-d) and write its
# predictions to /content/test_outputs (-o).
# NOTE(review): hidden_records100/00000 is derived from processed_records100/00000,
# the same records the model was trained on, so the resulting scores are not
# held-out performance. Confirm, and evaluate on a separate fold if so.
!python run_model.py -d "/content/data/preped_data/hidden_records100/00000" -m "/content/models" -o "/content/test_outputs"

In [None]:
import os
from getpass import getpass

# SECURITY: never embed an access token in the notebook. The token is read from
# the GITHUB_TOKEN environment variable, or prompted for without echo.
# Any token that was previously committed in this cell must be treated as
# leaked and revoked on GitHub.
eval_repo = 'https://github.com/Antony-gitau/evaluation-2024.git'
eval_folder = 'ecg_dx_evaluation'

if os.path.isdir(os.path.join(eval_folder, '.git')):
    # Already cloned (e.g. the cell was re-run): reuse the existing checkout
    # instead of failing because the destination folder is not empty.
    clone_eval = Repo(eval_folder)
else:
    _github_token = os.environ.get('GITHUB_TOKEN') or getpass('GitHub personal access token: ')
    # Inject the credentials only into the URL handed to git.
    _authenticated_url = eval_repo.replace('https://', f'https://{_github_token}@', 1)
    clone_eval = Repo.clone_from(_authenticated_url, eval_folder)

In [34]:
# Move into the evaluation repo (the ecg_dx_evaluation folder inside the
# challenge repo) so evaluate_model.py can be invoked by relative name.
os.chdir("/content/ecg_classification/ecg_dx_evaluation")

In [37]:
# Score the predictions in /content/test_outputs (-o) against the records in
# processed_records100/00000 (-d); the script prints an SNR and an F-measure.
!python evaluate_model.py -d "/content/data/preped_data/processed_records100/00000" -o "/content/test_outputs"

SNR: -11.463
F-measure: 0.349

