<a href="https://colab.research.google.com/github/Fermu25/Cursos/blob/main/Data_sets_filtrado_datos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import re
from datetime import datetime
from pathlib import Path
import pandas as pd

# Default input (raw serial-monitor log) and output (clean CSV) paths used by main().
INPUT_PATH  = "serial_log_parte1.csv"
OUTPUT_PATH = "salida1.csv"


# A log line of the form "HH:MM:SS.mmm -> <payload>"; `ts` captures the
# timestamp and `rest` captures everything after the arrow.
TS_RE = re.compile(r'^(?P<ts>\d{2}:\d{2}:\d{2}\.\d{3})\s*->\s*(?P<rest>.*)$')

# "Presiones: P1 = <v> mmHg | P2 = <v> mmHg" -> groups 1 and 2 are the raw
# value tokens. The token class accepts digits, '-', '.' and the letters of
# "nan" (case-insensitive), so NaN readings are captured as text.
PRESS_RE = re.compile(
    r'Presiones:\s*P1\s*=\s*([-\d\.NaNnan]+)\s*mmHg\s*\|\s*P2\s*=\s*([-\d\.NaNnan]+)\s*mmHg',
    re.IGNORECASE
)

# "IR Canal <n>: Obj = <v> °C, Amb = <v> °C" -> group 1 is the channel number,
# groups 2 and 3 are the object and ambient value tokens.
IR_RE = re.compile(
    r'IR\s*Canal\s*(\d+):\s*Obj\s*=\s*([-\d\.NaNnan]+)\s*°C,\s*Amb\s*=\s*([-\d\.NaNnan]+)\s*°C',
    re.IGNORECASE
)

# "DS18B20: T1 = <v> °C | T2 = <v> °C | T3 = <v> °C | T4 = <v> °C" -> groups
# 1-4 are the four value tokens, in order.
DS_RE = re.compile(
    r'DS18B20:\s*T1\s*=\s*([-\d\.NaNnan]+)\s*°C\s*\|\s*T2\s*=\s*([-\d\.NaNnan]+)\s*°C\s*\|\s*T3\s*=\s*([-\d\.NaNnan]+)\s*°C\s*\|\s*T4\s*=\s*([-\d\.NaNnan]+)\s*°C',
    re.IGNORECASE
)

# A separator line: one or more dashes, optionally followed by whitespace.
SEP_RE = re.compile(r'^-+\s*$')

def to_float(x: str):
    """Convert a captured value token to ``float``, mapping unusable text to NaN.

    Args:
        x: Text captured by one of the sensor regexes. ``None`` is accepted
            and treated like an empty string, which is the same contract as
            the ``to_float`` helper used for the second log file.

    Returns:
        The parsed number, or ``float('nan')`` when the text is empty, spells
        "nan" in any letter case, or is not a valid float literal.
    """
    # `x or ""` keeps a missing value from raising AttributeError on .strip().
    x = (x or "").strip()
    if x.lower() == 'nan' or x == '':
        return float('nan')
    try:
        return float(x)
    except ValueError:
        # Tokens such as "1.2.3" or "--" pass the regex character class but
        # are not numbers; treat them as missing readings.
        return float('nan')

def parse_file(text_lines):
    """Group timestamped log lines into one dict per measurement block.

    A block is opened by a line matching ``PRESS_RE``; its timestamp becomes
    the block's ``time``. Following ``IR_RE`` (channels 2 and 3 only) and
    ``DS_RE`` lines fill in the remaining fields. A block is closed by a
    separator line (with or without timestamp), by the next pressure line, or
    by the end of the input. Lines without a timestamp that are not
    separators, and data lines seen while no block is open, are ignored.

    Args:
        text_lines: Iterable of raw log lines (trailing newline allowed).

    Returns:
        List of dicts with keys ``time``, ``P1_mmHg``, ``P2_mmHg``,
        ``IR2_Obj_C``, ``IR2_Amb_C``, ``IR3_Obj_C``, ``IR3_Amb_C`` and
        ``DS_T1_C`` .. ``DS_T4_C``; fields never seen in a block stay NaN.
    """
    ir_columns = {
        '2': ('IR2_Obj_C', 'IR2_Amb_C'),
        '3': ('IR3_Obj_C', 'IR3_Amb_C'),
    }
    ds_columns = ('DS_T1_C', 'DS_T2_C', 'DS_T3_C', 'DS_T4_C')
    # Fields that start as NaN in every new block, in output order.
    pending_columns = ir_columns['2'] + ir_columns['3'] + ds_columns

    records = []
    current = None

    for raw_line in text_lines:
        text = raw_line.strip('\n')
        if not text.strip():
            continue

        stamped = TS_RE.match(text)
        # Separators may appear with or without a leading timestamp.
        payload = stamped.group('rest') if stamped else text
        if SEP_RE.match(payload):
            if current is not None:
                records.append(current)
                current = None
            continue
        if not stamped:
            continue

        pressures = PRESS_RE.search(payload)
        if pressures:
            # A pressure line always starts a new block, closing any open one.
            if current is not None:
                records.append(current)
            current = {
                'time': stamped.group('ts'),
                'P1_mmHg': to_float(pressures.group(1)),
                'P2_mmHg': to_float(pressures.group(2)),
            }
            current.update(dict.fromkeys(pending_columns, float('nan')))
            continue

        if current is None:
            # Data before the first pressure line has no block to attach to.
            continue

        infrared = IR_RE.search(payload)
        if infrared:
            channel, obj_text, amb_text = infrared.groups()
            target = ir_columns.get(channel)
            if target is not None:
                current[target[0]] = to_float(obj_text)
                current[target[1]] = to_float(amb_text)
            continue

        probes = DS_RE.search(payload)
        if probes:
            for column, token in zip(ds_columns, probes.groups()):
                current[column] = to_float(token)

    if current is not None:
        records.append(current)
    return records

def compute_times(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``elapsed_s`` and ``dt_s`` columns derived from the ``time`` column.

    ``time`` holds time-of-day strings ``HH:MM:SS.mmm`` in log order. Because
    they carry no date, a value smaller than the previous one is taken as a
    midnight rollover and a further 24 h is added from that row on, so the
    elapsed time keeps increasing instead of jumping to a large negative
    number.

    Args:
        df: Frame with a ``time`` column. It is modified in place.

    Returns:
        The same frame, with ``elapsed_s`` (seconds since the first row) and
        ``dt_s`` (seconds since the previous row, 0.0 for the first row).
    """
    seconds_per_day = 24 * 3600
    t0 = None
    previous = None
    day_offset = 0.0
    elapsed = []
    for t in df['time']:
        h, mi, rest = t.split(':')
        total = int(h) * 3600 + int(mi) * 60 + float(rest) + day_offset
        if previous is not None and total < previous:
            # The clock wrapped past 23:59:59.999 since the previous row.
            day_offset += seconds_per_day
            total += seconds_per_day
        if t0 is None:
            t0 = total
        previous = total
        elapsed.append(total - t0)
    df['elapsed_s'] = elapsed
    df['dt_s'] = df['elapsed_s'].diff().fillna(0.0)
    return df

def main(in_path=INPUT_PATH, out_path=OUTPUT_PATH):
    """Convert a timestamped serial log into a clean CSV, one row per block.

    Args:
        in_path: Path of the raw log to read.
        out_path: Path of the CSV to write.

    Raises:
        FileNotFoundError: If ``in_path`` does not exist.
        RuntimeError: If no measurement block could be extracted.
    """
    p = Path(in_path)
    if not p.exists():
        raise FileNotFoundError(f"No se encontró el archivo de entrada: {p.resolve()}")

    # The sensor regexes require a literal degree sign. In a cp1252/latin-1 log
    # that sign is the single byte 0xB0, which is invalid UTF-8; decoding with
    # errors='ignore' would delete it and every temperature line would stop
    # matching. 'surrogateescape' keeps undecodable bytes as U+DC80..U+DCFF so
    # the degree sign can be restored; all other undecodable bytes are then
    # dropped, exactly as errors='ignore' did.
    undecodable = re.compile('[\udc80-\udcff]')
    with p.open('r', encoding='utf-8', errors='surrogateescape') as f:
        lines = [
            undecodable.sub('', raw.replace('\udcb0', '\u00b0'))
            for raw in f
        ]

    blocks = parse_file(lines)
    if not blocks:
        raise RuntimeError("No se pudieron extraer bloques. ¿El formato coincide con los ejemplos?")

    df = pd.DataFrame(blocks, columns=[
        'time','P1_mmHg','P2_mmHg',
        'IR2_Obj_C','IR2_Amb_C','IR3_Obj_C','IR3_Amb_C',
        'DS_T1_C','DS_T2_C','DS_T3_C','DS_T4_C'
    ])
    df = compute_times(df)

    # Fixed output column order: timing columns first, then the readings.
    cols = [
        'time','elapsed_s','dt_s',
        'P1_mmHg','P2_mmHg',
        'IR2_Obj_C','IR2_Amb_C','IR3_Obj_C','IR3_Amb_C',
        'DS_T1_C','DS_T2_C','DS_T3_C','DS_T4_C'
    ]
    df = df[cols]

    df.to_csv(out_path, index=False, encoding='utf-8')
    print(f"Listo. Se escribió el CSV limpio en: {Path(out_path).resolve()}")

# Run the conversion with the default paths when this code is executed as the
# main module (not when it is imported).
if __name__ == "__main__":
    main()

Listo. Se escribió el CSV limpio en: /content/salida1.csv


In [5]:
import re
from pathlib import Path
from datetime import datetime
import pandas as pd
import numpy as np

# Default input (CSV log with sequence number, ISO timestamp and message) and
# output (clean CSV) paths used by main().
INPUT_PATH  = "serial_log_parte2.csv"
OUTPUT_PATH = "salida2.csv"

# "Presiones: P1 = <v> mmHg | P2 = <v> mmHg" -> groups 1 and 2 are the raw
# value tokens (digits, '-', '.', or the letters of "nan", case-insensitive).
PRESS_RE = re.compile(
    r'Presiones:\s*P1\s*=\s*([-\d\.NaNnan]+)\s*mmHg\s*\|\s*P2\s*=\s*([-\d\.NaNnan]+)\s*mmHg',
    re.IGNORECASE
)
# "IR Canal <n>: Obj = <v> °C, Amb = <v> °C" -> group 1 is the channel number,
# groups 2 and 3 are the object and ambient value tokens.
IR_RE = re.compile(
    r'IR\s*Canal\s*(\d+):\s*Obj\s*=\s*([-\d\.NaNnan]+)\s*°C,\s*Amb\s*=\s*([-\d\.NaNnan]+)\s*°C',
    re.IGNORECASE
)
# "DS18B20: T1 = <v> °C | ... | T4 = <v> °C" -> groups 1-4 are the value tokens.
DS_RE = re.compile(
    r'DS18B20:\s*T1\s*=\s*([-\d\.NaNnan]+)\s*°C\s*\|\s*T2\s*=\s*([-\d\.NaNnan]+)\s*°C\s*\|\s*T3\s*=\s*([-\d\.NaNnan]+)\s*°C\s*\|\s*T4\s*=\s*([-\d\.NaNnan]+)\s*°C',
    re.IGNORECASE
)
# A separator message: one or more dashes, optionally followed by whitespace.
SEP_RE = re.compile(r'^-+\s*$')

def to_float(x: str):
    """Parse a captured value token as a float, yielding NaN when unusable.

    ``None``, empty or whitespace-only text, the word "nan" in any letter case
    and text that is not a valid float literal all produce ``np.nan``.
    """
    text = "" if not x else x.strip()
    if text == "" or text.lower() == "nan":
        return np.nan
    try:
        value = float(text)
    except ValueError:
        return np.nan
    return value

def iso_to_hms_ms(iso_str: str) -> tuple[str, float]:
    """Split an ISO-8601 timestamp into a clock string and an epoch value.

    The ``Z`` suffix is removed before parsing. A timestamp that then carries
    no UTC offset is pinned to UTC rather than left naive: ``timestamp()`` on
    a naive datetime interprets it as host-local time, which makes the epoch
    depend on the machine's time zone and distorts differences across a
    daylight-saving change. The clock fields themselves are not shifted.

    Args:
        iso_str: Timestamp such as ``2024-01-01T12:34:56.789Z``.

    Returns:
        ``(hms, epoch)`` where ``hms`` is ``HH:MM:SS.mmm`` (milliseconds
        truncated) and ``epoch`` is the POSIX timestamp in seconds.

    Raises:
        ValueError: If the text is not a valid ISO-8601 timestamp.
    """
    # Local import: only `datetime` itself is imported at the top of the cell.
    from datetime import timezone

    dt = datetime.fromisoformat(iso_str.replace("Z", ""))
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)
    hms = dt.strftime("%H:%M:%S.") + f"{dt.microsecond // 1000:03d}"
    return hms, dt.timestamp()

def parse_blocks(df_raw: pd.DataFrame) -> list[dict]:
    """Group log rows into one dict per measurement block.

    Rows are read in frame order from the ``mensaje`` and ``iso_ts`` columns.
    A message matching ``PRESS_RE`` opens a block (closing any open one) and
    supplies its ``time`` and ``epoch`` through ``iso_to_hms_ms``. Later
    ``IR_RE`` (channels 2 and 3 only) and ``DS_RE`` messages fill the block.
    A separator message closes the open block; data messages seen while no
    block is open are ignored.

    Args:
        df_raw: Frame with at least the ``mensaje`` and ``iso_ts`` columns.

    Returns:
        List of dicts with keys ``time``, ``epoch``, ``P1_mmHg``, ``P2_mmHg``,
        ``IR2_Obj_C``, ``IR2_Amb_C``, ``IR3_Obj_C``, ``IR3_Amb_C`` and
        ``DS_T1_C`` .. ``DS_T4_C``; fields never seen in a block stay NaN.
    """
    ir_columns = {
        '2': ('IR2_Obj_C', 'IR2_Amb_C'),
        '3': ('IR3_Obj_C', 'IR3_Amb_C'),
    }
    ds_columns = ('DS_T1_C', 'DS_T2_C', 'DS_T3_C', 'DS_T4_C')
    # Fields that start as NaN in every new block, in output order.
    pending_columns = ir_columns['2'] + ir_columns['3'] + ds_columns

    records = []
    current = None

    for _, fila in df_raw.iterrows():
        message = str(fila['mensaje']).strip()
        stamp = str(fila['iso_ts']).strip()

        if SEP_RE.match(message):
            if current is not None:
                records.append(current)
                current = None
            continue

        pressures = PRESS_RE.search(message)
        if pressures:
            # A pressure message always starts a new block.
            if current is not None:
                records.append(current)
            clock, epoch = iso_to_hms_ms(stamp)
            current = {
                'time': clock,
                'epoch': epoch,
                'P1_mmHg': to_float(pressures.group(1)),
                'P2_mmHg': to_float(pressures.group(2)),
            }
            current.update(dict.fromkeys(pending_columns, np.nan))
            continue

        if current is None:
            # Nothing to attach this message to yet.
            continue

        infrared = IR_RE.search(message)
        if infrared:
            channel, obj_text, amb_text = infrared.groups()
            target = ir_columns.get(channel)
            if target is not None:
                current[target[0]] = to_float(obj_text)
                current[target[1]] = to_float(amb_text)
            continue

        probes = DS_RE.search(message)
        if probes:
            for column, token in zip(ds_columns, probes.groups()):
                current[column] = to_float(token)

    if current is not None:
        records.append(current)
    return records

def compute_time_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy ordered by ``epoch`` with relative timing columns.

    ``elapsed_s`` is each row's ``epoch`` minus the smallest one and ``dt_s``
    is the difference to the previous row (0.0 for the first row). The
    ``epoch`` column is removed from the result; the input frame is left
    untouched.
    """
    ordered = df.sort_values('epoch').reset_index(drop=True)
    elapsed = ordered['epoch'] - ordered.loc[0, 'epoch']
    timed = ordered.assign(
        elapsed_s=elapsed,
        dt_s=elapsed.diff().fillna(0.0),
    )
    return timed.drop(columns=['epoch'])

def main(in_path=INPUT_PATH, out_path=OUTPUT_PATH):
    """Convert a (sequence, ISO timestamp, message) CSV log into a clean CSV.

    The input is read without a header row as the columns ``seq``, ``iso_ts``
    and ``mensaje``; the output has one row per measurement block in a fixed
    column order.

    Raises:
        FileNotFoundError: If ``in_path`` does not exist.
        RuntimeError: If no measurement block was found.
    """
    source = Path(in_path)
    if not source.exists():
        raise FileNotFoundError(f"No se encontró el archivo: {source.resolve()}")

    read_options = {
        'header': None,
        'names': ['seq', 'iso_ts', 'mensaje'],
        'dtype': {'seq': 'Int64', 'iso_ts': str, 'mensaje': str},
        # Keep empty fields as empty strings instead of NaN.
        'keep_default_na': False,
        'encoding': 'utf-8',
        'delimiter': ',',
    }
    df_raw = pd.read_csv(source, **read_options)

    records = parse_blocks(df_raw)
    if not records:
        raise RuntimeError("No se detectaron bloques (líneas con 'Presiones'). Revisa el formato.")

    table = compute_time_cols(pd.DataFrame(records))

    ordered_columns = [
        'time', 'elapsed_s', 'dt_s',
        'P1_mmHg', 'P2_mmHg',
        'IR2_Obj_C', 'IR2_Amb_C', 'IR3_Obj_C', 'IR3_Amb_C',
        'DS_T1_C', 'DS_T2_C', 'DS_T3_C', 'DS_T4_C',
    ]
    # Guarantee every expected column exists before selecting them in order.
    absent = [name for name in ordered_columns if name not in table.columns]
    for name in absent:
        table[name] = np.nan

    table = table[ordered_columns]
    table.to_csv(out_path, index=False, encoding='utf-8')
    print(f"CSV homogéneo escrito en: {Path(out_path).resolve()}")

# Run the conversion with the default paths when this code is executed as the
# main module (not when it is imported).
if __name__ == "__main__":
    main()


CSV homogéneo escrito en: /content/salida2.csv


In [6]:
import pandas as pd
import numpy as np
from pathlib import Path

# The two input CSVs to merge, in order, and the path of the merged result.
CSV1 = "salida1.csv"
CSV2 = "salida2.csv"
OUTPUT = "salida_unida.csv"

def unir_csvs(csv1, csv2, out_path=OUTPUT):
    """Append ``csv2`` after ``csv1`` and write the result with renumbered timing.

    Each input's ``elapsed_s`` is relative to that file's own first row, so
    the two files must not be sorted together on it: doing so would interleave
    their rows. Each file is therefore ordered by its own ``elapsed_s`` and
    all rows of ``csv1`` are placed before all rows of ``csv2``.

    In the output, ``elapsed_s`` is the 0-based row position (as a float) and
    ``dt_s`` is its row-to-row difference (0.0 for the first row); the
    measured values from the inputs are not carried over.

    Args:
        csv1: Path of the first CSV.
        csv2: Path of the second CSV, appended after the first.
        out_path: Path of the merged CSV to write.
    """
    parts = []
    for path in (csv1, csv2):
        part = pd.read_csv(path)
        if 'elapsed_s' in part.columns:
            part['elapsed_s'] = pd.to_numeric(part['elapsed_s'], errors='coerce')
            # mergesort is stable, so rows with equal elapsed_s keep file order.
            part = part.sort_values('elapsed_s', kind='mergesort')
        parts.append(part)

    df = pd.concat(parts, ignore_index=True)
    df['elapsed_s'] = df.index.to_numpy(dtype=float)
    df['dt_s'] = df['elapsed_s'].diff().fillna(0.0)

    cols = [
        'time','elapsed_s','dt_s',
        'P1_mmHg','P2_mmHg',
        'IR2_Obj_C','IR2_Amb_C','IR3_Obj_C','IR3_Amb_C',
        'DS_T1_C','DS_T2_C','DS_T3_C','DS_T4_C'
    ]
    # Guarantee every expected column exists before selecting them in order.
    for c in cols:
        if c not in df.columns:
            df[c] = np.nan
    df = df[cols]
    df.to_csv(out_path, index=False, encoding='utf-8')
    print(f"Archivo combinado guardado en: {Path(out_path).resolve()}")

# Merge the two default CSVs when this code is executed as the main module
# (not when it is imported).
if __name__ == "__main__":
    unir_csvs(CSV1, CSV2, OUTPUT)



Archivo combinado guardado en: /content/salida_unida.csv
