# Obtención de datos

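En este código recorremos los archivos de ECG, calculamos para cada uno un resumen (ritmo cardiaco, segmentos RR y amplitudes de las ondas P, R y T) y guardamos el resultado en un DataFrame.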

In [1]:
# paquetes
import neurokit2 as nk # incluye herramientas de análisis y detección de ondas de ECG
import pandas as pd # para trabajar con DataFrames
import numpy as np # para manejar tipos de números
import math # para operaciones especiales
import statistics # para operaciones especiales
import os # para acciones del sistema operativo

In [2]:
# Helper for 'analiza': summarizes the gaps between consecutive peaks
# (the RR segments) of a single ECG.
def seg_RR(loc_picos: list[np.float64]):
    """Return the mean and sample standard deviation of consecutive peak gaps.

    Args:
        loc_picos: Peak locations, in the order in which they occur.

    Returns:
        A ``(promedio, sd)`` tuple expressed in the same units as
        ``loc_picos``.

    Raises:
        statistics.StatisticsError: If there are no gaps to average, or
            fewer than two gaps for the standard deviation.
    """
    diferencias = []
    # Pair every location with its successor to obtain each gap.
    for anterior, siguiente in zip(loc_picos, loc_picos[1:]):
        diferencias.append(siguiente - anterior)
    return statistics.mean(diferencias), statistics.stdev(diferencias)

In [3]:
def _amplitudes_en(senal_limpia, indices):
    """Return the values of ``senal_limpia`` at ``indices`` as floats, skipping NaN indices."""
    # int() guards against indices that arrive as floats (the lists may mix
    # integer positions with NaN placeholders).
    return [float(senal_limpia[int(i)]) for i in indices if not math.isnan(i)]


def _resumen(valores):
    """Return ``(min, mean, max, sample stdev)`` of ``valores``."""
    return (min(valores), statistics.mean(valores),
            max(valores), statistics.stdev(valores))


# Returns the main summary information of one ECG as a flat row.
def analiza(id_ecg, categoria, senal, sr=100):
    """Summarize one ECG signal into a 17-element feature row.

    Args:
        id_ecg: Identifier of the ECG; copied into the result unchanged.
        categoria: Category label; copied into the result unchanged.
        senal: The raw ECG signal handed to ``nk.ecg_clean``.
        sr: Sampling rate passed to the neurokit2 calls and used to convert
            RR intervals from samples to seconds.

    Returns:
        ``[id_ecg, categoria, ritmo_cardiaco, media_RR, sd_RR,
        min_P, media_P, max_P, sd_P, min_R, media_R, max_R, sd_R,
        min_T, media_T, max_T, sd_T]``. If any step fails, the 15 feature
        slots are ``None`` instead.
    """
    # The whole analysis is best-effort: problematic signals yield a row of
    # None values rather than aborting the caller's loop.
    try:
        # 1) Process the signal: clean it, locate R peaks, then the other peaks.
        senal_limpia = nk.ecg_clean(senal, sampling_rate=sr, method='neurokit')
        _, picos_R = nk.ecg_peaks(senal_limpia, sampling_rate=sr)
        _, picos_otros = nk.ecg_delineate(senal_limpia, picos_R, sampling_rate=sr, method="peak")

        # 2) Organize the values: peak positions (samples) and peak amplitudes.
        indices_R = picos_R['ECG_R_Peaks']
        loc_picos_R = [float(x) for x in indices_R]
        amp_R = _amplitudes_en(senal_limpia, indices_R)
        amp_T = _amplitudes_en(senal_limpia, picos_otros['ECG_T_Peaks'])
        amp_P = _amplitudes_en(senal_limpia, picos_otros['ECG_P_Peaks'])

        # 3) Compute the results.
        # NOTE(review): the x6 factor assumes a 10-second recording
        # (beats in 10 s times 6); confirm before using other durations.
        ritmo_cardiaco = len(amp_R) * 6
        media_RR, sd_RR = seg_RR(loc_picos_R)
        # Convert from samples to seconds using the actual sampling rate
        # (previously a hard-coded 100, which was only right for sr=100).
        media_RR /= sr
        sd_RR /= sr

        # 4) Assemble the row. Amplitudes are in the units of the input
        # signal (the original notes say mV; not verifiable from here).
        res = [id_ecg, categoria, ritmo_cardiaco, media_RR, sd_RR,
               *_resumen(amp_P), *_resumen(amp_R), *_resumen(amp_T)]
    except Exception:
        # Could not complete the analysis: return None for every feature.
        # KeyboardInterrupt/SystemExit are deliberately not intercepted.
        res = [id_ecg, categoria] + [None] * 15

    return res

In [4]:
# Walks the ECG folder and returns one summary row per signal file
# as a DataFrame.
def explorar(direccion, tipo = ".csv"):
    """Summarize every ECG file found under ``direccion``.

    Args:
        direccion: Root folder to walk recursively.
        tipo: File-name suffix of the files to process.

    Returns:
        A DataFrame with one row per matching file, holding the 17 values
        returned by ``analiza`` for lead ``II`` of that file.
    """
    columnas = ["id_ecg", "categoria", "ritmo_cardiaco", "media_RR", "sd_RR",
                "min_P", "media_P", "max_P", "sd_P", "min_R", "media_R", "max_R",
                "sd_R", "min_T", "media_T", "max_T", "sd_T"]
    renglones = []  # collected rows; the frame is built once at the end
    for root, _dirs, files in os.walk(direccion):
        # The category is the folder path relative to 'direccion'. Computing
        # it with relpath (instead of slicing a fixed number of characters)
        # keeps it correct for any root folder name.
        relativa = os.path.relpath(root, direccion)
        cat = "" if relativa == os.curdir else relativa
        for name in files:
            if not name.endswith(tipo):
                continue
            # File names look like 'patient_####<tipo>': drop the prefix and
            # whatever suffix was requested.
            id_ecg = name[len("patient_"):len(name) - len(tipo)]
            archivo = os.path.join(root, name)  # portable path join
            senales = pd.read_csv(archivo, index_col='Unnamed: 0')
            senal_II = senales["II"]  # lead II is the one analyzed
            renglones.append(
                analiza(id_ecg=id_ecg, categoria=cat, senal=senal_II, sr=100)
            )
    # dtype=object matches the dtype of an empty, columns-only frame and
    # keeps None placeholders from failed analyses untouched.
    return pd.DataFrame(renglones, columns=columnas, dtype=object)

In [5]:
# Run the analysis and cache the resulting data frame on disk so it can be
# reloaded quickly. As a safeguard against repeating the whole process
# needlessly, recomputation only happens when the switch below is True.

recalcular_df = False  # switch: set to True to rebuild the data frame

if recalcular_df:
    columnas_paciente = ['ecg_id', 'patient_id', 'age', 'sex', 'height', 'weight']
    df1 = explorar(direccion='Datos_Leonel')
    df2 = pd.read_csv('ptbxl_database.csv')
    # Cast the id to float before joining it against 'ecg_id'.
    df1 = df1.astype({'id_ecg': float})
    df1 = pd.merge(df1, df2[columnas_paciente], how='left',
                   left_on='id_ecg', right_on='ecg_id', sort=True)
    # 'ecg_id' duplicates 'id_ecg' after the join, so it is dropped.
    df1 = df1.drop(columns='ecg_id')
    df1.to_csv('df_datos.csv')
    print("datos analizados y guardados :)")

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


datos analizados y guardados :)


In [8]:
# Now load the cached data frame from disk and look at its first 10 rows.
df = pd.read_csv('df_datos.csv', index_col='Unnamed: 0')
df.head(10)

Unnamed: 0,id_ecg,categoria,ritmo_cardiaco,media_RR,sd_RR,min_P,media_P,max_P,sd_P,min_R,...,sd_R,min_T,media_T,max_T,sd_T,patient_id,age,sex,height,weight
0,0.0,other,60.0,0.94,0.02,0.049611,0.071296,0.083545,0.013746,0.263022,...,0.015744,0.167191,0.178537,0.192539,0.008707,,,,,
1,1.0,other,42.0,1.271667,0.082321,-0.073595,0.047206,0.191584,0.096574,0.650633,...,0.110319,0.268503,0.375628,0.520278,0.096918,15709.0,56.0,1.0,,63.0
2,2.0,other,60.0,0.941111,0.022048,0.00041,0.071276,0.116616,0.036733,0.35367,...,0.050161,0.060588,0.123789,0.234671,0.065835,13243.0,19.0,0.0,,70.0
3,3.0,other,72.0,0.800909,0.045267,-0.16952,0.015421,0.247193,0.122509,0.850498,...,0.11344,0.238351,0.43337,0.606892,0.129951,20372.0,37.0,1.0,,69.0
4,4.0,other,66.0,0.905,0.051694,-0.02764,0.009301,0.058326,0.028887,0.748575,...,0.067279,0.281886,0.327968,0.389802,0.037975,17014.0,24.0,0.0,,82.0
5,5.0,other,78.0,0.721667,0.037859,0.086328,0.128303,0.164272,0.022178,0.704945,...,0.037433,-0.002302,0.048551,0.086224,0.022773,17448.0,19.0,1.0,,70.0
6,6.0,other,60.0,0.967778,0.016415,-0.0106,0.011937,0.036254,0.013678,0.506519,...,0.026496,0.064727,0.085343,0.106496,0.011511,19005.0,18.0,1.0,,58.0
7,7.0,mi,72.0,0.813636,0.015015,-0.00772,0.032875,0.065326,0.022375,0.274961,...,0.018002,0.13571,0.174177,0.197507,0.020348,16193.0,54.0,0.0,,83.0
8,8.0,other,60.0,0.986667,0.033912,-0.014985,0.010951,0.030378,0.014967,0.329058,...,0.035664,0.159501,0.210604,0.229076,0.02112,11275.0,48.0,0.0,,95.0
9,9.0,other,60.0,0.953333,0.061644,0.000818,0.02854,0.063939,0.01845,0.562056,...,0.043867,0.121482,0.17094,0.193339,0.022899,18792.0,55.0,0.0,,70.0
