### Diagnostic Information
Outputs:
* ID
* PatientAge
* Gender
* SnomedCode
* DiagnosisName
* MergedSnomedCode
* MergedDiagnosisName
* Rhythm, Duration, Amplitude, Morphology, Unlabeled

In [1]:
import wfdb
import pandas as pd
import numpy as np
import re
import os
import csv

# Load the Diagnosis - Acronym Name - Integration Code information from the labeling file
def load_diagnosis_map(label_file_path):
    """Load the SNOMED labeling table from a semicolon-delimited CSV file.

    The file must provide the columns "Snomed Code", "Acronym Name",
    "Integration Code", "Integration Name" and "Group".

    Args:
        label_file_path: Path to the labeling CSV file.

    Returns:
        A tuple ``(diagnosis_map, group_info, groups)``:
        ``diagnosis_map`` maps each integer Snomed code to a dict with the
        keys "Acronym", "IntegrationCode" and "IntegrationName" (values are
        the strings read from the file); ``group_info`` maps each integration
        code string to its group name; ``groups`` is the set of group names.

    Rows whose Snomed code is missing or cannot be parsed as an integer are
    reported with a printed message and skipped.
    """
    diagnosis_map = {}
    group_info = {}
    groups = set()

    # newline="" lets the csv module do its own newline handling, as the csv
    # documentation recommends for file objects passed to a reader.
    with open(label_file_path, "r", newline="") as file:
        reader = csv.DictReader(file, delimiter=";")
        for row in reader:
            raw_code = row["Snomed Code"]
            try:
                # DictReader fills absent trailing fields with None; treat that
                # like any other unparsable code instead of crashing on .replace.
                if raw_code is None:
                    raise ValueError("missing Snomed Code")
                # Remove thousands separators (commas) before converting.
                diagnosis_code = int(raw_code.replace(',', ''))
            except ValueError:
                print(f"Invalid Snomed Code: {row['Snomed Code']}")
                continue  # Skip this row if the conversion fails

            # Build the diagnosis map entry for this code
            diagnosis_map[diagnosis_code] = {
                "Acronym": row["Acronym Name"],
                "IntegrationCode": row["Integration Code"],
                "IntegrationName": row["Integration Name"]
            }

            integration_code = row["Integration Code"]
            group = row["Group"]
            group_info[integration_code] = group
            groups.add(group)

    return diagnosis_map, group_info, groups

# Convert DiagnosisCodes to Acronym Name and add Integration Code and Integration Name
def get_acronym_names_and_integration_codes(diagnosis_codes, diagnosis_map):
    """Look up acronym and integration data for a sequence of code strings.

    Args:
        diagnosis_codes: Iterable of strings, each expected to hold one
            integer Snomed code (surrounding whitespace is tolerated).
        diagnosis_map: Mapping from integer Snomed code to a dict with the
            keys "Acronym", "IntegrationCode" and "IntegrationName".

    Returns:
        Three parallel lists ``(acronym_names, integration_codes,
        integration_names)`` holding one entry per code found in
        ``diagnosis_map``, in input order. Blank entries and codes missing
        from the map are skipped silently; entries that are not integers are
        reported with a printed message and skipped.
    """
    acronym_names, integration_codes, integration_names = [], [], []

    for raw_code in diagnosis_codes:
        try:
            # Blank entries (e.g. from splitting an empty string) are ignored.
            if not raw_code.strip():
                continue
            key = int(raw_code)
        except ValueError as e:
            print(f"ValueError for code {raw_code}: {e}")
            continue

        if key not in diagnosis_map:
            continue

        entry = diagnosis_map[key]
        acronym = entry["Acronym"]
        integration_code = entry["IntegrationCode"]
        integration_name = entry["IntegrationName"]

        acronym_names.append(acronym)
        integration_codes.append(integration_code)
        integration_names.append(integration_name)

    return acronym_names, integration_codes, integration_names

# Extract patient information from a record file
def extract_patient_info(file_path, diagnosis_map):
    """Build a one-row DataFrame describing the record at ``file_path``.

    The record is read with ``wfdb.rdrecord`` and its ``comments`` are
    searched for "Age: <digits>", "Sex: <word>" and "Dx: <text>" entries.

    Args:
        file_path: Record path passed to ``wfdb.rdrecord`` (the caller passes
            it without the ".hea" extension).
        diagnosis_map: Mapping passed through to
            ``get_acronym_names_and_integration_codes``.

    Returns:
        A DataFrame with the columns ID, PatientAge, Gender, SnomedCode,
        DiagnosisName, MergedSnomedCode and MergedDiagnosisName. PatientAge
        is the matched digit string, or NaN when absent or equal to 0.
        SnomedCode is the raw text after "Dx:" ("" when absent). The three
        diagnosis columns are ", "-joined strings, or NaN when no code was
        mapped. If anything raises, the error is printed and an empty
        DataFrame is returned.
    """
    def first_tagged_value(keyword, pattern, comment_lines):
        # Only the first comment containing the keyword is inspected; if the
        # pattern does not match that comment the value is NaN.
        for line in comment_lines:
            if keyword in line:
                found = re.search(pattern, line)
                return found.group(1) if found else np.nan
        return np.nan

    try:
        record = wfdb.rdrecord(file_path)
        comments = getattr(record, 'comments', [])

        record_id = record.record_name
        age = first_tagged_value("Age", r'Age:\s*(\d+)', comments)
        gender = first_tagged_value("Sex", r'Sex:\s*(\w+)', comments)
        snomed = first_tagged_value("Dx", r'Dx:\s*(.*)', comments)

        # An age of 0 is treated as missing
        if not pd.isna(age) and int(age) == 0:
            age = np.nan

        # A missing Dx entry becomes an empty string so it can be split below
        if pd.isna(snomed):
            snomed = ""

        acronym_names, integration_codes, integration_names = (
            get_acronym_names_and_integration_codes(snomed.split(','), diagnosis_map)
        )

        patient_info = {
            'ID': record_id,
            'PatientAge': age,
            'Gender': gender,
            'SnomedCode': snomed,
            'DiagnosisName': ", ".join(acronym_names) if acronym_names else np.nan,
            'MergedSnomedCode': ", ".join(map(str, integration_codes)) if integration_codes else np.nan,
            'MergedDiagnosisName': ", ".join(integration_names) if integration_names else np.nan,
        }

        return pd.DataFrame([patient_info])
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        return pd.DataFrame()


# Format DiagnosisCodes as a list
def format_diagnosis_codes(patient_info_df):
    """Turn the comma-joined diagnosis columns into Python lists, in place.

    Args:
        patient_info_df: DataFrame with the columns SnomedCode, DiagnosisName,
            MergedSnomedCode and MergedDiagnosisName.

    Returns:
        The same DataFrame object. SnomedCode is split on "," with each part
        stripped; the other three columns are split on ", ". Any value that
        is not a non-empty string (e.g. NaN or "") becomes an empty list.
    """
    def split_and_strip(value):
        if isinstance(value, str) and value:
            return [part.strip() for part in value.split(',')]
        return []

    def split_joined(value):
        if isinstance(value, str) and value:
            return value.split(', ')
        return []

    patient_info_df['SnomedCode'] = patient_info_df['SnomedCode'].apply(split_and_strip)

    # These columns were built with ", ".join(...), so split on the same token.
    for column in ('DiagnosisName', 'MergedSnomedCode', 'MergedDiagnosisName'):
        patient_info_df[column] = patient_info_df[column].apply(split_joined)

    return patient_info_df

# Add group information to the DataFrame with specific header order
def add_group_columns(df, group_info, groups):
    """Add one boolean column per diagnosis group and flag matching rows.

    Args:
        df: DataFrame with a MergedSnomedCode column holding lists of codes.
            It is modified in place.
        group_info: Mapping from integration code to group name.
        groups: Not used by this function; kept for the existing call
            signature.

    Returns:
        The same DataFrame. The columns Rhythm, Duration, Amplitude,
        Morphology and Unlabeled are created with False when absent, and set
        to True on every row whose MergedSnomedCode list contains a code
        mapped to that group. Rows whose MergedSnomedCode is not a list are
        left unchanged.
    """
    group_columns = ['Rhythm', 'Duration', 'Amplitude', 'Morphology', 'Unlabeled']

    # Create the flag columns in the fixed order, leaving existing ones alone
    absent = [name for name in group_columns if name not in df.columns]
    for name in absent:
        df[name] = False

    for row_label, record in df.iterrows():
        codes = record['MergedSnomedCode']
        if not isinstance(codes, list):
            continue
        for code in codes:
            # Unknown codes yield None, which is never one of the group columns
            target = group_info.get(code)
            if target in group_columns:
                df.at[row_label, target] = True

    return df

# Process all files
def process_all_files(directory_path, output_folder, label_file_path):
    """Extract diagnostic info from every ".hea" record and write one CSV.

    Args:
        directory_path: Folder that is listed for files ending in ".hea".
        output_folder: Folder receiving "DiagnosticInfo_all.csv"; created if
            it does not exist.
        label_file_path: Labeling CSV passed to ``load_diagnosis_map``.

    Returns:
        None. When at least one record yields data, the combined table is
        sorted by ID and written to ``output_folder``; otherwise a message is
        printed and nothing is written.
    """
    diagnosis_map, group_info, groups = load_diagnosis_map(label_file_path)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    patient_info_all = []

    for filename in os.listdir(directory_path):
        if filename.endswith('.hea'):
            # Strip only the trailing extension; str.replace would also remove
            # a '.hea' that happens to occur earlier in the file name.
            record_name = filename[:-len('.hea')]
            file_path = os.path.join(directory_path, record_name)
            patient_info_df = extract_patient_info(file_path, diagnosis_map)
            # extract_patient_info returns an empty frame for unreadable records
            if not patient_info_df.empty:
                patient_info_all.append(patient_info_df)

    if patient_info_all:
        all_patient_info = pd.concat(patient_info_all, ignore_index=True)
        all_patient_info = format_diagnosis_codes(all_patient_info)

        all_patient_info = add_group_columns(all_patient_info, group_info, groups)

        # Ensure the columns are in the desired order, including group headers
        column_order = ['ID', 'PatientAge', 'Gender', 'SnomedCode', 'DiagnosisName',
                        'MergedSnomedCode', 'MergedDiagnosisName',
                        'Rhythm', 'Duration', 'Amplitude', 'Morphology', 'Unlabeled']

        # Chain instead of sorting in place: the column selection returns a new
        # frame, and an in-place sort on it can raise SettingWithCopyWarning.
        all_patient_info = all_patient_info[column_order].sort_values(by='ID', ascending=True)

        patient_info_path = os.path.join(output_folder, 'DiagnosticInfo_all.csv')
        all_patient_info.to_csv(patient_info_path, index=False)

        print("All files have been processed and saved to the CSV.")
        print(f"DiagnosticInfo_all.csv: {patient_info_path}")
    else:
        print("No files to process.")

# Define file paths
# directory_path and label_file_path are relative to the working directory.
# NOTE(review): output_folder is an absolute, machine-specific path, while the
# later cells read "DiagnosticInfo_all.csv" through a relative path; presumably
# both point at the notebook's own folder — confirm, and consider a relative path.
directory_path = "../01_Database_PhysioNet"
output_folder = "/Users/dogukankorkut/Library/CloudStorage/OneDrive-ozyegin.edu.tr/Ozyegin_MSc_Thesis/04_Technical_Works/ECG_Datasets/01_Ningbo/02_Diagnosis_PreProcess"
label_file_path = "../01_SNOMED_CT_Code/Chapman_Ningbo_ECG_DB_Labeling_Info.csv"

# Run the full extraction; writes DiagnosticInfo_all.csv into output_folder.
process_all_files(directory_path, output_folder, label_file_path)

All files have been processed and saved to the CSV.
DiagnosticInfo_all.csv: /Users/dogukankorkut/Library/CloudStorage/OneDrive-ozyegin.edu.tr/Ozyegin_MSc_Thesis/04_Technical_Works/ECG_Datasets/01_Ningbo/02_Diagnosis_PreProcess/DiagnosticInfo_all.csv


### Exploratory Data Analysis

In [2]:
import pandas as pd

# Load the DiagnosticInfo_all.csv file (path is relative to the working directory)
diagnosis_info_path = "DiagnosticInfo_all.csv"

try:
    # Read the DiagnosticInfo_all.csv file into a DataFrame
    diagnostic_info_df = pd.read_csv(diagnosis_info_path)

except FileNotFoundError:
    print(f"Error: The file {diagnosis_info_path} was not found. Please check the path and try again.")
    # Re-raise so the cell stops here; otherwise the next line would fail with
    # a misleading NameError because diagnostic_info_df was never assigned.
    raise

diagnostic_info_df.head()

Unnamed: 0,ID,PatientAge,Gender,SnomedCode,DiagnosisName,MergedSnomedCode,MergedDiagnosisName,Rhythm,Duration,Amplitude,Morphology,Unlabeled
0,JS00001,85.0,Male,"['164889003', '59118001', '164934002']","['AF', 'NBBB', 'TWC']","['164889003', '59118001', '55930002']","['AFIB', 'NBBB', 'STTA']",True,False,False,True,True
1,JS00002,59.0,Female,"['426177001', '164934002']","['SB', 'TWC']","['426177001', '55930002']","['SB', 'STTA']",True,False,False,True,False
2,JS00004,66.0,Male,['426177001'],['SB'],['426177001'],['SB'],True,False,False,False,False
3,JS00005,73.0,Female,"['164890007', '429622005', '428750005']","['AFL', 'STDD', 'STTC']","['164889003', '55930002', '55930002']","['AFIB', 'STTA', 'STTA']",True,False,False,True,False
4,JS00006,46.0,Female,['426177001'],['SB'],['426177001'],['SB'],True,False,False,False,False


In [3]:
# Shape of dataset: prints the (rows, columns) tuple of diagnostic_info_df
print("Shape of Dataset:", diagnostic_info_df.shape)

Shape of Dataset: (45152, 12)


In [4]:
# Tabulate the dtype of every column as a two-column frame
column_types = (
    diagnostic_info_df.dtypes
    .rename_axis('Column Name')
    .reset_index(name='Data Type')
)
column_types

Unnamed: 0,Column Name,Data Type
0,ID,object
1,PatientAge,float64
2,Gender,object
3,SnomedCode,object
4,DiagnosisName,object
5,MergedSnomedCode,object
6,MergedDiagnosisName,object
7,Rhythm,bool
8,Duration,bool
9,Amplitude,bool


#### Cleaned Undefined Values (6180003, '')

In [5]:
import ast

# Load the file (re-reads the CSV at diagnosis_info_path, replacing the
# DataFrame held in diagnostic_info_df)
diagnostic_info_df = pd.read_csv(diagnosis_info_path)

# Function that removes empty values ('') and the code 6180003
def clean_snomed_code_list(snomed_code_str):
    """Parse a serialized code list and drop '' and '6180003' entries.

    Args:
        snomed_code_str: Text form of a Python list of code strings, as
            stored in the CSV (e.g. "['164889003', '']").

    Returns:
        A list with the parsed entries except '' and '6180003'. An empty
        list is returned when the input cannot be parsed as a literal or
        does not evaluate to a list, tuple or set.
    """
    try:
        # ast.literal_eval safely evaluates the list literal (no code execution)
        snomed_code_list = ast.literal_eval(snomed_code_str)
    except (ValueError, SyntaxError, TypeError):
        # literal_eval may also raise TypeError on some malformed input
        return []

    # A bare number would not be iterable and a bare string would be split
    # into single characters; neither is a code list, so treat as unparsable.
    if not isinstance(snomed_code_list, (list, tuple, set)):
        return []

    # Drop '6180003' and empty strings ('')
    return [code for code in snomed_code_list if code != '' and code != '6180003']

# Apply the cleaning to the SnomedCode and MergedSnomedCode columns
# (DiagnosisName and MergedDiagnosisName are left untouched by this cell)
diagnostic_info_df['SnomedCode'] = diagnostic_info_df['SnomedCode'].apply(clean_snomed_code_list)
diagnostic_info_df['MergedSnomedCode'] = diagnostic_info_df['MergedSnomedCode'].apply(clean_snomed_code_list)

# Save the updated data (overwrites the CSV at diagnosis_info_path)
diagnostic_info_df.to_csv(diagnosis_info_path, index=False)

print("SnomedCode ve MergedSnomedCode sütunlarında bulunan '' (boş değerler) ve 6180003 gibi değerleri çıkartılmıştır")

SnomedCode ve MergedSnomedCode sütunlarında bulunan '' (boş değerler) ve 6180003 gibi değerleri çıkartılmıştır


#### Missing Value Elimination (PatientAge)

In [6]:
# Count the missing values in every column
isnull_number = [
    diagnostic_info_df[column].isnull().sum()
    for column in diagnostic_info_df.columns
]

pd.DataFrame(isnull_number, index=diagnostic_info_df.columns, columns=["Total Missing Values"])

Unnamed: 0,Total Missing Values
ID,0
PatientAge,302
Gender,0
SnomedCode,0
DiagnosisName,0
MergedSnomedCode,0
MergedDiagnosisName,0
Rhythm,0
Duration,0
Amplitude,0


In [7]:
# Drop rows with NaN values
# dropna() without arguments removes every row that has a NaN in any column,
# not only in PatientAge.
diagnostic_info_df = diagnostic_info_df.dropna()

# Save the cleaned dataframe back to the same file
# (this overwrites the CSV at diagnosis_info_path)
diagnostic_info_df.to_csv(diagnosis_info_path, index=False)

print("Rows with NaN values have been removed and the file has been updated.")

Rows with NaN values have been removed and the file has been updated.


In [8]:
# Re-count the missing values in every column after the cleanup
isnull_number = [
    diagnostic_info_df[column].isnull().sum()
    for column in diagnostic_info_df.columns
]

pd.DataFrame(isnull_number, index=diagnostic_info_df.columns, columns=["Total Missing Values"])

Unnamed: 0,Total Missing Values
ID,0
PatientAge,0
Gender,0
SnomedCode,0
DiagnosisName,0
MergedSnomedCode,0
MergedDiagnosisName,0
Rhythm,0
Duration,0
Amplitude,0


In [9]:
# Count the distinct non-null values in every column
unique_number = [
    diagnostic_info_df[column].value_counts().count()
    for column in diagnostic_info_df.columns
]

pd.DataFrame(unique_number, index=diagnostic_info_df.columns, columns=["Total Unique Values"])

Unnamed: 0,Total Unique Values
ID,44850
PatientAge,86
Gender,3
SnomedCode,4971
DiagnosisName,4901
MergedSnomedCode,4108
MergedDiagnosisName,4108
Rhythm,2
Duration,2
Amplitude,2


#### Unknown Value Elimination (Gender)

In [10]:
# Frequency of each Gender value, used to spot non Male/Female entries
diagnostic_info_df["Gender"].value_counts()

Gender
Male       25287
Female     19561
Unknown        2
Name: count, dtype: int64

In [11]:
# Keep only the rows whose Gender is something other than 'Unknown'
diagnostic_info_df = diagnostic_info_df.loc[diagnostic_info_df['Gender'].ne('Unknown')]

# Write the filtered frame back over the same CSV file
diagnostic_info_df.to_csv(diagnosis_info_path, index=False)

print("Rows with 'Unknown' in the Gender column have been removed and the file has been updated.")

Rows with 'Unknown' in the Gender column have been removed and the file has been updated.


#### Cleaned Data Analysis

In [12]:
# Shape of dataset after cleaning: prints the (rows, columns) tuple
print("Shape of Dataset:", diagnostic_info_df.shape)

Shape of Dataset: (44848, 12)


In [13]:
# Count the distinct non-null values in every column of the cleaned frame
unique_number = [
    diagnostic_info_df[column].value_counts().count()
    for column in diagnostic_info_df.columns
]

pd.DataFrame(unique_number, index=diagnostic_info_df.columns, columns=["Total Unique Values"])

Unnamed: 0,Total Unique Values
ID,44848
PatientAge,86
Gender,2
SnomedCode,4971
DiagnosisName,4901
MergedSnomedCode,4108
MergedDiagnosisName,4108
Rhythm,2
Duration,2
Amplitude,2


In [14]:
# Count how many rows have each Rhythm flag value
rhythm_counts = diagnostic_info_df['Rhythm'].value_counts()

# Present the counts as a two-column frame: the flag value and its frequency
rhythm_counts_df = rhythm_counts.rename_axis('Rhythm').reset_index(name='Count')

rhythm_counts_df

Unnamed: 0,Rhythm,Count
0,True,43948
1,False,900
