In [1]:
import os
import struct
import pandas as pd
import xml.etree.ElementTree as ET
import json
import numpy as np
from datetime import datetime
import pyproj

In [2]:
# ==========================================
# CONFIGURATION
# ==========================================
# Folder that holds TASKDATA.XML and the TLGxxxxx.bin / TLGxxxxx.xml log files.
DATA_FOLDER = r'./data/TASKDATA'
# Folder that receives task_index.csv and one CSV per converted binary log.
INTERIM_FOLDER  = r'./data/taskdata_out2'
# Output folder for a later stage; created here but not written to by the
# cells visible in this part of the notebook.
ENRICHED_FOLDER = r'./data/ENRICHED'
# NOTE(review): the name looks like a misspelling of "EPSG"; it is kept as-is
# because other cells may reference it. Presumably the target CRS for pyproj.
OUT_ESPG = "EPSG:25832"
# Safety Buffer: Add ~1km (0.01 deg) around the field to catch headland turns
GEO_BUFFER = 0.01
# Fallback bounding box, ordered (min_lat, max_lat, min_lon, max_lon).
BBOX_DEFAULT = (54.0, 58.0, 8.0, 16.0) # Denmark

# exist_ok=True makes re-running this cell safe and avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(INTERIM_FOLDER, exist_ok=True)
os.makedirs(ENRICHED_FOLDER, exist_ok=True)

In [10]:
# ==========================================
# 1. METADATA PARSER (Corrected with Sidecar XML Check)
# ==========================================
def parse_isobus_taskdata(data_folder):
    """Build a per-task index from the TASKDATA.XML file inside ``data_folder``.

    For every TSK element that has a TLG child with an ``A`` attribute, one row
    is produced describing the binary log that belongs to the task.

    The list of DDI codes for a task is resolved in this order:
      A. DLV elements found anywhere in the sidecar file ``<TLG A>.xml``
         located in ``data_folder``;
      B. DLV elements that are direct children of the task's first TIM element;
      C. a fixed four-code fallback, so the list is never empty.

    Args:
        data_folder: Directory containing TASKDATA.XML (and optional sidecars).

    Returns:
        pandas.DataFrame with the columns ``LogFilename``, ``Year``, ``Crop``,
        ``FieldName``, ``DDI_List`` (JSON string) and ``Bounds`` (JSON string of
        ``[min_lat, max_lat, min_lon, max_lon]``), or ``None`` when
        TASKDATA.XML is missing or cannot be parsed.
    """
    print(f"Scanning {data_folder} for TASKDATA.XML...")
    taskdata_path = os.path.join(data_folder, 'TASKDATA.XML')

    if not os.path.exists(taskdata_path):
        print("Error: TASKDATA.XML not found.")
        # A single None (not a tuple): callers test the result with `is not None`.
        return None

    try:
        root = ET.parse(taskdata_path).getroot()
    except Exception as e:
        print(f"Error parsing XML: {e}")
        return None

    # 1. Parse Products & Fields (id -> display name)
    products = {p.attrib.get('A'): p.attrib.get('B') for p in root.findall(".//PDT")}
    field_names = {f.attrib.get('A'): f.attrib.get('C') for f in root.findall(".//PFD")}

    # 2. Get Field Bounds: min/max of all PNT coordinates below each PFD,
    #    padded by GEO_BUFFER on every side.
    field_bounds = {}
    for pfd in root.findall(".//PFD"):
        lats, lons = [], []
        for pnt in pfd.findall(".//PNT"):
            try:
                lat = float(pnt.attrib.get('C'))
                lon = float(pnt.attrib.get('D'))
            except (TypeError, ValueError):
                # Missing or non-numeric coordinate: drop the whole point so
                # the two lists always stay the same length.
                continue
            lats.append(lat)
            lons.append(lon)
        if lats:
            field_bounds[pfd.attrib.get('A')] = (min(lats)-GEO_BUFFER, max(lats)+GEO_BUFFER,
                                                 min(lons)-GEO_BUFFER, max(lons)+GEO_BUFFER)

    # Used when a task's field has no usable points (same values as the
    # BBOX_DEFAULT constant in the configuration cell).
    default_bounds = (54.0, 58.0, 8.0, 16.0)

    # 3. Build Task Index
    tasks_list = []

    for tsk in root.findall(".//TSK"):
        tlg = tsk.find("TLG")
        if tlg is None:
            continue

        tlg_name = tlg.attrib.get('A')
        if not tlg_name:
            # Without a file name there is no log to convert.
            continue

        log_filename = tlg_name + '.bin'
        tim = tsk.find("TIM")

        ddi_list = []

        # Strategy A: Check TLGxxxxx.xml in the same folder
        sidecar_path = os.path.join(data_folder, tlg_name + '.xml')
        if os.path.exists(sidecar_path):
            try:
                side_tree = ET.parse(sidecar_path)
            except Exception as e:
                # Best effort: report the problem and fall through to Strategy B.
                print(f"Warning: could not parse sidecar {sidecar_path}: {e}")
            else:
                # Look for DLV tags anywhere in the sidecar file
                ddi_list = [dlv.attrib.get('A') for dlv in side_tree.findall(".//DLV")]

        # Strategy B: Check TASKDATA.XML if Strategy A failed
        if not ddi_list and tim is not None:
            ddi_list = [dlv.attrib.get('A') for dlv in tim.findall("DLV")]

        # Strategy C: Fallback to Standard 4 (Prevent 0-byte payload crash)
        if not ddi_list:
            ddi_list = ['0054', '0095', '018D', '0063']

        # Metadata
        crop = "Unknown"
        pan = tsk.find("PAN")
        if pan is not None:
            crop = products.get(pan.attrib.get('A'), crop)

        field_id = tsk.attrib.get('E')
        field_name = field_names.get(field_id, f"Unknown_{field_id}")
        bounds = field_bounds.get(field_id, default_bounds)

        try:
            # Safe date parsing: a trailing 'Z' is stripped before parsing.
            start_str = tim.attrib.get('A') if tim is not None else ""
            year = datetime.fromisoformat(start_str.replace('Z', '')).year
        except (AttributeError, TypeError, ValueError):
            # Missing or unparseable start time: keep the original fallback year.
            year = 2024

        tasks_list.append({
            'LogFilename': log_filename,
            'Year': year,
            'Crop': crop,
            'FieldName': field_name,
            'DDI_List': json.dumps(ddi_list),
            'Bounds': json.dumps(bounds)
        })

    return pd.DataFrame(tasks_list)

# ==========================================
# 2. DYNAMIC BINARY CONVERTER
# ==========================================
def convert_bin_to_csv(bin_path, out_csv_path, ddi_list, bounds):
    """Scan a binary log for geo-plausible records and write them to a CSV.

    The file is read fully into memory and scanned from the start. At each
    offset a 12-byte little-endian header (uint32 time, int32 latitude,
    int32 longitude; coordinates scaled by 1e-7) is decoded. When the position
    lies strictly inside ``bounds`` the following ``len(ddi_list)`` unsigned
    32-bit integers are taken as the sensor values and the scan jumps past the
    record; otherwise the scan advances by a single byte.

    NOTE(review): this is a heuristic scanner; the record layout and the
    unsigned interpretation of the values are taken from the code as written
    and should be confirmed against the logger's format description.

    Args:
        bin_path: Path of the binary log to read.
        out_csv_path: Path of the CSV file to write.
        ddi_list: DDI code strings, one per value stored in each record.
            Codes found in the built-in map get a readable column name, others
            become ``DDI_<code>``. Repeated codes get ``_2``, ``_3``, ...
            suffixes so that no value overwrites another.
        bounds: ``(min_lat, max_lat, min_lon, max_lon)`` in degrees.

    Returns:
        True when at least one record was found and the CSV was written,
        False when the input file is missing or no record matched.
    """
    if not os.path.exists(bin_path):
        return False

    min_lat, max_lat, min_lon, max_lon = bounds

    # Extended Map
    DDI_MAP = {
        '0054': 'Yield_Mass',   '0095': 'Yield_Vol',
        '018D': 'Speed',        '0063': 'Moisture',
        '0053': 'Dry_Mass',     '008D': 'Engine_Load',
        '013A': 'Fuel_Rate',    'E122': 'Header_Status',
        '0055': 'Crop_Temp_Or_Count'
    }

    # Generate column names: Use Name if known, else DDI_Code. A code that
    # occurs more than once would otherwise map to the same dict key and
    # silently overwrite the earlier value in every row.
    col_names = []
    occurrences = {}
    for ddi in ddi_list:
        base = DDI_MAP.get(ddi, f"DDI_{ddi}")
        occurrences[base] = occurrences.get(base, 0) + 1
        nth = occurrences[base]
        col_names.append(base if nth == 1 else f"{base}_{nth}")

    # Explicit little-endian for both parts so the result does not depend on
    # the byte order of the machine running the notebook.
    header = struct.Struct('<Lii')
    payload = struct.Struct(f'<{len(ddi_list)}I')   # e.g. 14 sensors = 56 bytes
    record_size = header.size + payload.size

    with open(bin_path, 'rb') as f:
        content = f.read()

    total_len = len(content)
    cursor = 0
    valid_rows = []
    last_valid_end = 0

    # `<=` so that a record ending exactly at end-of-file is still examined.
    while cursor + record_size <= total_len:
        # 1. Header
        time_ms, lat_raw, lon_raw = header.unpack_from(content, cursor)
        lat = lat_raw * 1e-7
        lon = lon_raw * 1e-7

        # 2. Geo-Filter
        if (min_lat < lat < max_lat) and (min_lon < lon < max_lon):
            # 3. Dynamic Payload
            values = payload.unpack_from(content, cursor + header.size)

            row = {
                'Time_ms': time_ms,
                'Latitude': lat,
                'Longitude': lon,
                # Bytes skipped since the end of the previous accepted record.
                'Gap_Bytes': cursor - last_valid_end
            }
            row.update(zip(col_names, values))
            valid_rows.append(row)

            cursor += record_size
            last_valid_end = cursor
        else:
            # No plausible record here: slide the window by one byte.
            cursor += 1

    if valid_rows:
        pd.DataFrame(valid_rows).to_csv(out_csv_path, index=False)
        return True
    return False

# ==========================================
# 3. EXECUTION
# ==========================================
print("--- STAGE 1: METADATA & DDI MAPPING ---")
df_index = parse_isobus_taskdata(DATA_FOLDER)

# The parser reports failure with None values instead of raising. Checking for
# an actual DataFrame (rather than `is not None`) also rejects a tuple of
# Nones, which would otherwise crash on `.to_csv` below.
if isinstance(df_index, pd.DataFrame):
    df_index.to_csv(os.path.join(INTERIM_FOLDER, 'task_index.csv'), index=False)
    print(f"Index created. Found {len(df_index)} tasks.")

    print("\n--- STAGE 2: DYNAMIC BINARY EXTRACTION ---")
    count = 0
    for idx, row in df_index.iterrows():
        bin_file = os.path.join(DATA_FOLDER, row['LogFilename'])
        # Swap only the extension; the stem is reused as the CSV name.
        out_file = os.path.join(INTERIM_FOLDER, os.path.splitext(row['LogFilename'])[0] + '.csv')
        # Load the specific DDI list and bounding box stored for this file
        ddi_list = json.loads(row['DDI_List'])
        bounds = json.loads(row['Bounds'])

        # '\r' keeps the progress report on a single, continuously rewritten line.
        print(f"Processing {row['LogFilename']} ({len(ddi_list)} sensors)...", end='\r')
        if convert_bin_to_csv(bin_file, out_file, ddi_list, bounds):
            count += 1

    print(f"\nDone. Extracted {count} files.")
    print(f"Check {INTERIM_FOLDER}/TLG00008.csv - It should now have 17+ columns including Proprietary data.")
else:
    print("Failed to parse metadata.")

--- STAGE 1: METADATA & DDI MAPPING ---
Scanning ./data/TASKDATA for TASKDATA.XML...
Index created. Found 142 tasks.

--- STAGE 2: DYNAMIC BINARY EXTRACTION ---
./data/TASKDATA/TLG00001.bin
./data/TASKDATA/TLG00002.binsensors)...
./data/TASKDATA/TLG00003.binsensors)...
./data/TASKDATA/TLG00004.binsensors)...
./data/TASKDATA/TLG00005.binsensors)...
./data/TASKDATA/TLG00006.binsensors)...
./data/TASKDATA/TLG00007.binsensors)...
./data/TASKDATA/TLG00008.binsensors)...
./data/TASKDATA/TLG00009.binsensors)...
./data/TASKDATA/TLG00010.binsensors)...
./data/TASKDATA/TLG00011.binsensors)...
./data/TASKDATA/TLG00012.binsensors)...
./data/TASKDATA/TLG00013.binsensors)...
./data/TASKDATA/TLG00014.binsensors)...
./data/TASKDATA/TLG00015.binsensors)...
./data/TASKDATA/TLG00016.binsensors)...
./data/TASKDATA/TLG00017.binsensors)...
./data/TASKDATA/TLG00018.binsensors)...
./data/TASKDATA/TLG00019.binsensors)...
./data/TASKDATA/TLG00020.binsensors)...
./data/TASKDATA/TLG00021.binsensors)...
./data/TAS