In [1]:
#!/usr/bin/python

import csv
import os
import sys
import re
from datetime import datetime, timedelta
from geopy import distance
import math as ma
from itertools import islice
import pandas as pd
import numpy as np
from collections import defaultdict

# Matches one engine-run report, e.g.
#   "ENL monitor reports Engine Run 3 minutes, starts at T=123456 and:
#    1500 msl [300 agl]; Height gain/loss is: -120"
# The msl group accepts a leading minus sign, like agl and hgl, so that a
# report with a below-sea-level altitude is not silently skipped.
_ENGINE_RUN_PATTERN = re.compile(
    r"(?P<sensor>[A-Za-z0-9]+)\s+monitor\s+reports\s+Engine\s+Run\s+(?P<duration>\d+)\s+minutes,\s+"
    r"starts\s+at\s+T=(?P<start_time>\d+)\s+and:\s+(?P<msl>-?\d+)\s+msl\s+\[(?P<agl>-?\d+)\s+agl\];\s+"
    r"Height\s+gain/loss\s+is:\s+(?P<hgl>-?\d+)"
)

# Matches one noise-registration report, e.g.
#   "Motor noise registered by ENL sensor at t=[101500, 101530] and [350.0, 420.5]AGL"
_NOISE_REGISTRATION_PATTERN = re.compile(
    r"Motor\s+noise\s+registered\s+by\s+(?P<sensor>[A-Za-z0-9]+)\s+sensor\s+at\s+t=\[\s*(?P<times>[^\]]+)\s*\]"
    r"\s+and\s+\[\s*(?P<agl>[^\]]+)\s*\]AGL"
)


def parse_sensor_info(sensor_info: str):
    """
    Parses the Sensor Info column to extract structured data for sensors.

    Two kinds of report are recognised anywhere in the text:

    * engine-run reports, stored under ``result[sensor]["Engine_Run"]`` with the
      list keys ``Duration_Minutes``, ``Start_Times``, ``Altitudes_MSL``,
      ``Altitudes_AGL`` and ``Height_Gain_Loss``;
    * noise-registration reports, stored under
      ``result[sensor]["Noise_Registration"]`` with the list keys ``Times``
      and ``Altitudes_AGL``.

    Times are converted with ``int()``, so leading zeros are not preserved.
    AGL values are stored as floats; every other number is stored as an int.

    Parameters:
        sensor_info (str): The Sensor Info string. ``None`` and pandas null
            values are treated as an empty string; any other value is passed
            through ``str()``.

    Returns:
        defaultdict: A nested defaultdict (sensor -> event type -> field -> list)
        with structured data for each sensor found. It is empty when nothing
        matches.

    Raises:
        ValueError: If a non-empty noise-registration AGL entry is not a
            valid float literal.
    """
    # Ensure sensor_info is a string
    if sensor_info is None or pd.isnull(sensor_info):
        sensor_info = ""
    else:
        sensor_info = str(sensor_info)

    parsed_data = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

    # Parse Engine Run Events
    for match in _ENGINE_RUN_PATTERN.finditer(sensor_info):
        engine_run = parsed_data[match.group("sensor")]["Engine_Run"]
        engine_run["Duration_Minutes"].append(int(match.group("duration")))
        engine_run["Start_Times"].append(int(match.group("start_time")))
        engine_run["Altitudes_MSL"].append(int(match.group("msl")))
        engine_run["Altitudes_AGL"].append(float(match.group("agl")))
        engine_run["Height_Gain_Loss"].append(int(match.group("hgl")))

    # Parse Noise Registration Events
    for match in _NOISE_REGISTRATION_PATTERN.finditer(sensor_info):
        noise = parsed_data[match.group("sensor")]["Noise_Registration"]

        # Extract numbers without requiring quotes
        noise["Times"].extend(int(t) for t in re.findall(r"(\d+)", match.group("times")))

        # Tolerate the same decorations the time list tolerates: surrounding
        # quotes are removed and empty slots (e.g. from a trailing comma) are
        # skipped instead of crashing float().
        for token in match.group("agl").split(","):
            cleaned = token.strip().strip("'\"").strip()
            if cleaned:
                noise["Altitudes_AGL"].append(float(cleaned))

    return parsed_data

def tokens_match_format(s):
    """
    Checks a comma-separated list of time tokens.

    Returns True when 's' is empty or whitespace-only, or when every non-blank
    token (after trimming surrounding whitespace) consists of exactly 6
    digits, i.e. looks like HHMMSS. Returns False as soon as one token
    does not.
    """
    if not s or not s.strip():
        return True

    # Blank entries between commas are ignored rather than rejected.
    trimmed = (piece.strip() for piece in s.split(","))
    return all(
        re.fullmatch(r"\d{6}", candidate) is not None
        for candidate in trimmed
        if candidate
    )

def main(input_csv="Flt-times.805143.csv", updated_csv=None):
    """
    Reads a flight CSV, expands its "Sensor Info" column into per-sensor
    columns, and writes the rows that pass validation to a new CSV.

    For each of the sensors ENL, MOP and RPM, five columns are added:
    engine-run start times, engine-run MSL altitudes, engine-run AGL
    altitudes, noise-registration times and noise-registration AGL
    altitudes. Each holds a comma-joined list, or an empty string when
    nothing was parsed for that sensor.

    A row is written only if all three ``*_Engine_Run_Start_Times`` values
    pass ``tokens_match_format``; other rows are skipped.

    NOTE(review): parse_sensor_info stores start times as ints, so a time
    with a leading zero (e.g. T=093015) is rendered here with five digits
    and its row is dropped by the 6-digit check. Confirm whether that is
    intended.

    Parameters:
        input_csv (str): Path of the CSV to read.
        updated_csv (str | None): Path of the CSV to write. When None, the
            name ``Flt-times-updated.<pid>.csv`` is used, where ``<pid>`` is
            the current process id.

    Returns:
        None. The output path is printed once the file has been written.
    """
    if updated_csv is None:
        updated_csv = f"Flt-times-updated.{os.getpid()}.csv"

    # Define sensors to extract data for
    sensor_types = ["ENL", "MOP", "RPM"]

    # (output column suffix, event key, field key) as produced by parse_sensor_info
    column_specs = [
        ("Engine_Run_Start_Times", "Engine_Run", "Start_Times"),
        ("Engine_Run_Altitudes_MSL", "Engine_Run", "Altitudes_MSL"),
        ("Engine_Run_Altitudes_AGL", "Engine_Run", "Altitudes_AGL"),
        ("Noise_Registration_Times", "Noise_Registration", "Times"),
        ("Noise_Registration_Altitudes_AGL", "Noise_Registration", "Altitudes_AGL"),
    ]

    # Column order: grouped by kind of data first, then by sensor.
    sensor_columns = [
        f"{sensor}_{suffix}"
        for suffix, _event, _field in column_specs
        for sensor in sensor_types
    ]

    with open(input_csv, "r", newline="", encoding="utf-8") as infile, \
         open(updated_csv, "w", newline="", encoding="utf-8") as outfile:

        reader = csv.DictReader(infile)

        # fieldnames is None for a completely empty input file; treat that as
        # "no original columns" instead of failing on None + list.
        original_columns = list(reader.fieldnames or [])

        # Do not repeat a sensor column the input already carries (e.g. when
        # the script is re-run on its own output); the value is overwritten
        # below either way.
        fieldnames = original_columns + [
            column for column in sensor_columns if column not in original_columns
        ]
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()

        for row in reader:
            structured_data = parse_sensor_info(row.get("Sensor Info", ""))

            # .get() is used throughout so that looking up an absent sensor
            # does not insert empty entries into the nested defaultdict.
            for sensor in sensor_types:
                sensor_data = structured_data.get(sensor, {})
                for suffix, event, field in column_specs:
                    values = sensor_data.get(event, {}).get(field, [])
                    # Start times are already ints, so str() yields no
                    # decimals for any sensor.
                    row[f"{sensor}_{suffix}"] = ",".join(map(str, values))

            # Skip the row unless every sensor's start-time list is valid.
            start_times_valid = all(
                tokens_match_format(row.get(f"{sensor}_Engine_Run_Start_Times", ""))
                for sensor in sensor_types
            )
            if not start_times_valid:
                continue

            writer.writerow(row)

    print(f"Updated CSV saved to {updated_csv}")

# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()


Updated CSV saved to Flt-times-updated.586473.csv
