In [2]:
import os
import struct
import pandas as pd
import xml.etree.ElementTree as ET
import json
import numpy as np
from datetime import datetime
import pyproj

In [3]:
# ==========================================
# CONFIGURATION
# ==========================================
DATA_FOLDER = r'./data/TASKDATA'
INTERIM_FOLDER  = r'./data/taskdata_out2'
ENRICHED_FOLDER = r'./data/ENRICHED'
# Target CRS handed to pyproj for the projected coordinates.
# (The "ESPG" spelling of the name is kept because other cells reference it.)
OUT_ESPG = "EPSG:25832"
# Safety Buffer: degrees added on every side of a field's bounding box
# to catch headland turns (0.01 deg of latitude is roughly 1 km).
GEO_BUFFER = 0.01
# Fallback bounding box, ordered (min_lat, max_lat, min_lon, max_lon).
BBOX_DEFAULT = (54.0, 58.0, 8.0, 16.0) # Denmark
# exist_ok=True makes the cell safely re-runnable and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(INTERIM_FOLDER, exist_ok=True)
os.makedirs(ENRICHED_FOLDER, exist_ok=True)

In [3]:
# ==========================================
# 1. METADATA PARSER (Generates Index + Bounding Box)
# ==========================================
def parse_isobus_taskdata(data_folder):
    """
    Build the task index and a field-boundary GeoJSON from TASKDATA.XML.

    The XML attributes are read as follows (this is how the code uses them):
    PDT A/B = product id/name, PFD A/C = field id/name, PNT C/D = lat/lon,
    TSK A/E = task id/field reference, TLG A = log file name,
    PAN A = product reference, TIM A = start timestamp.

    Args:
        data_folder: Folder expected to contain TASKDATA.XML (the file name
            is matched case-insensitively).

    Returns:
        (geojson, tasks) where geojson is a FeatureCollection dict with one
        polygon feature per task whose field has a boundary, and tasks is a
        DataFrame with one row per task that references a log file
        (TaskID, Year, Crop, FieldName, LogFilename, MinLat, MaxLat, MinLon,
        MaxLon). Returns (None, None) when the folder or the XML file is
        missing, or when the XML cannot be parsed.
    """
    print(f"Scanning {data_folder} for TASKDATA.XML...")

    # A missing folder is reported like every other failure instead of
    # letting os.listdir raise FileNotFoundError.
    if not os.path.isdir(data_folder):
        print(f"Error: data folder not found: {data_folder}")
        return None, None

    # Find XML (first directory entry whose upper-cased name matches)
    taskdata_path = next(
        (os.path.join(data_folder, name)
         for name in os.listdir(data_folder)
         if name.upper() == 'TASKDATA.XML'),
        None,
    )

    if taskdata_path is None:
        print("Error: TASKDATA.XML not found.")
        return None, None

    try:
        tree = ET.parse(taskdata_path)
        root = tree.getroot()
    except Exception as e:
        print(f"Error parsing XML: {e}")
        return None, None

    # --- A. Map Products (PDT) ---
    products = {pdt.attrib.get('A'): pdt.attrib.get('B') for pdt in root.findall(".//PDT")}

    # --- B. Store Field Geometries & Calculate Bounds ---
    field_shapes = {}
    field_bounds = {} # PFD_ID -> (min_lat, max_lat, min_lon, max_lon)
    field_names = {}

    for pfd in root.findall(".//PFD"):
        pfd_id = pfd.attrib.get('A')
        field_names[pfd_id] = pfd.attrib.get('C')

        for pln in pfd.findall("PLN"):
            for lsg in pln.findall("LSG"):
                lats, lons = [], []
                for pnt in lsg.findall("PNT"):
                    try:
                        lat = float(pnt.attrib.get('C'))
                        lon = float(pnt.attrib.get('D'))
                    except (TypeError, ValueError):
                        # Attribute missing (None) or not a number: skip the point
                        continue
                    lats.append(lat)
                    lons.append(lon)

                if not lats:
                    continue

                # Save Polygon as a closed [lon, lat] ring
                coords = [[lon, lat] for lon, lat in zip(lons, lats)]
                if coords[0] != coords[-1]:
                    coords.append(list(coords[0]))
                field_shapes[pfd_id] = coords

                # Calculate Bounding Box (With Buffer)
                field_bounds[pfd_id] = (
                    min(lats) - GEO_BUFFER, max(lats) + GEO_BUFFER,
                    min(lons) - GEO_BUFFER, max(lons) + GEO_BUFFER
                )
                # Stop at the first LSG of this PLN that yields points.
                # NOTE(review): this only leaves the LSG loop, so when a field
                # has several PLN elements the last one with points wins -
                # confirm that is intended.
                break

    # --- C. Build Task Index ---
    tasks_list = []
    geojson_features = []

    for tsk in root.findall(".//TSK"):
        task_id = tsk.attrib.get('A')
        field_ref = tsk.attrib.get('E')

        field_name = field_names.get(field_ref, f"Unknown ({field_ref})")

        # Get Log File
        tlg = tsk.find("TLG")
        log_filename = tlg.attrib.get('A') if tlg is not None else None

        # Get Crop (falls back to the raw reference when the product is unknown)
        crop_name = "Unknown"
        pan = tsk.find("PAN")
        if pan is not None:
            pdt_ref = pan.attrib.get('A')
            crop_name = products.get(pdt_ref, pdt_ref)

        # Get Year: taken from the first TIM whose start timestamp parses
        task_year = None
        for tim in tsk.findall("TIM"):
            start_str = tim.attrib.get('A')
            if not start_str:
                continue
            try:
                task_year = datetime.fromisoformat(start_str.replace('Z', '+00:00')).year
            except ValueError:
                continue
            break

        if not log_filename:
            # Tasks without a log file are not indexed
            continue

        # Get Bounds for this field (Default to Denmark if unknown)
        bounds = field_bounds.get(field_ref, BBOX_DEFAULT)

        tasks_list.append({
            'TaskID': task_id,
            'Year': task_year,
            'Crop': crop_name,
            'FieldName': field_name,
            'LogFilename': log_filename + '.bin' if not log_filename.endswith('.bin') else log_filename,
            'MinLat': bounds[0], 'MaxLat': bounds[1],
            'MinLon': bounds[2], 'MaxLon': bounds[3]
        })

        # GeoJSON
        if field_ref in field_shapes:
            geojson_features.append({
                "type": "Feature",
                "properties": { "TaskID": task_id, "FieldName": field_name, "Crop": crop_name },
                "geometry": { "type": "Polygon", "coordinates": [field_shapes[field_ref]] }
            })

    return { "type": "FeatureCollection", "features": geojson_features }, pd.DataFrame(tasks_list)

In [4]:
# ==========================================
# 2. RAW BINARY CONVERTER (Guided by BBox)
# ==========================================
def convert_bin_to_csv(bin_path, out_csv_path, bounds):
    """
    Reads binary, checks Geo-Bounds, outputs Raw Unsigned Values.

    The file is scanned byte by byte for 28-byte little-endian records:
    uint32 time, int32 latitude and int32 longitude (both scaled by 1e-7),
    followed by four uint32 sensor values. A position is accepted as a record
    only when its decoded latitude/longitude fall strictly inside `bounds`.

    Args:
        bin_path: Path of the binary log file.
        out_csv_path: Path of the CSV to write (written only if rows were found).
        bounds: (min_lat, max_lat, min_lon, max_lon) in degrees.

    Returns:
        True if at least one record was found and the CSV was written,
        False if the file does not exist or no record matched.

    NOTE(review): scanning stops 30 bytes before the end of the file although
    a record is 28 bytes long, so a record starting within the last 30 bytes
    is never examined - confirm that this margin is intended.
    """
    if not os.path.exists(bin_path): return False

    min_lat, max_lat, min_lon, max_lon = bounds

    with open(bin_path, 'rb') as f:
        content = f.read()

    # Both parts are explicitly little-endian so the result does not depend on
    # the byte order / alignment of the machine running the notebook.
    header = struct.Struct('<Lii')   # Time=Unsigned, Lat/Lon=Signed
    payload = struct.Struct('<4I')   # 4 sensors * 4 bytes, strictly unsigned
    record_size = header.size + payload.size  # 12 + 16 = 28 bytes

    gap_labels = {0: "Contiguous", 10: "Transport (10b)", 25: "Harvest (25b)"}

    scan_end = len(content) - 30
    cursor = 0
    last_valid_end = 0
    valid_rows = []

    # cursor < len - 30 guarantees that a full 28-byte record is readable at
    # every scanned position, so unpack_from cannot run past the buffer and
    # no exception handler is needed (a bare except here would also swallow
    # a kernel interrupt during the long byte-wise scan).
    while cursor < scan_end:
        # 1. Header
        time_ms, lat_raw, lon_raw = header.unpack_from(content, cursor)
        lat = lat_raw * 1e-7
        lon = lon_raw * 1e-7

        # 2. Geo-Filter (Using XML Bounds): no match -> slide one byte forward
        if not ((min_lat < lat < max_lat) and (min_lon < lon < max_lon)):
            cursor += 1
            continue

        # 3. Payload
        s1, s2, s3, s4 = payload.unpack_from(content, cursor + header.size)

        # 4. Gap Logic: bytes skipped since the end of the previous record
        gap_size = cursor - last_valid_end
        pkt_type = gap_labels.get(gap_size, f"Gap ({gap_size}b)")

        valid_rows.append({
            'Time_ms': time_ms,
            'Latitude': lat,
            'Longitude': lon,
            'Yield_Mass': s1, # Raw Unsigned
            'Yield_Vol': s2,  # Raw Unsigned
            'Speed': s3,      # Raw Unsigned
            'Moisture': s4,   # Raw Unsigned
            'Gap_Bytes': gap_size,
            'Packet_Type': pkt_type
        })

        cursor += record_size
        last_valid_end = cursor

    if valid_rows:
        pd.DataFrame(valid_rows).to_csv(out_csv_path, index=False)
        return True
    return False

In [5]:
# ==========================================
# 3. EXECUTION LOOP
# ==========================================
print("--- STAGE 1: METADATA & BOUNDS ---")
geojson, df_tasks = parse_isobus_taskdata(DATA_FOLDER)

if df_tasks is None:
    # The parser already printed the reason
    print("Metadata parsing failed.")
else:
    # Persist the master index; the enrichment stage reads it back
    index_csv = os.path.join(INTERIM_FOLDER, 'task_index.csv')
    df_tasks.to_csv(index_csv, index=False)

    # Persist the field boundaries
    geojson_path = os.path.join(INTERIM_FOLDER, 'harvest_tasks.geojson')
    with open(geojson_path, 'w') as f:
        json.dump(geojson, f)

    print(f"Index created. Found {len(df_tasks)} tasks.")

    print("\n--- STAGE 2: BINARY EXTRACTION ---")
    count = 0
    for _, task in df_tasks.iterrows():
        log_name = task['LogFilename']
        source_bin = os.path.join(DATA_FOLDER, log_name)
        target_csv = os.path.join(INTERIM_FOLDER, log_name.replace('.bin', '.csv'))

        # Per-task search window stored in the index
        task_bounds = (task['MinLat'], task['MaxLat'], task['MinLon'], task['MaxLon'])

        # '\r' keeps the progress on a single console line
        print(f"Processing {task['LogFilename']}...", end='\r')
        extracted = convert_bin_to_csv(source_bin, target_csv, task_bounds)
        if extracted:
            count += 1

    print(f"\nDone. Extracted {count} files to {INTERIM_FOLDER}/")

--- STAGE 1: METADATA & BOUNDS ---
Scanning C:/dev/agri_analysis/data/taskdata for TASKDATA.XML...
Index created. Found 142 tasks.

--- STAGE 2: BINARY EXTRACTION ---
Processing TLG00142.bin...
Done. Extracted 142 files to C:/dev/agri_analysis/data/taskdata_out2/


In [4]:
# ==========================================
# 1. HELPER: GET ANCHOR TIMES FROM XML
# ==========================================
def load_anchor_times(xml_folder):
    """
    Extracts the PRECISE start time from XML to fix the "2003" binary date.

    For every TSK element that references a log file (TLG attribute A), the
    start timestamp (attribute A) of its first parseable TIM element is
    recorded.

    Args:
        xml_folder: Folder expected to contain TASKDATA.XML (the file name is
            matched case-insensitively, like parse_isobus_taskdata does).

    Returns:
        dict mapping log file name (always ending in '.bin', built the same
        way as the LogFilename column of the task index) to a datetime.
        Empty when the folder or file is missing or the XML cannot be parsed.
    """
    anchors = {}

    if not os.path.isdir(xml_folder):
        return anchors

    xml_path = next(
        (os.path.join(xml_folder, name)
         for name in os.listdir(xml_folder)
         if name.upper() == 'TASKDATA.XML'),
        None,
    )
    if xml_path is None:
        return anchors

    try:
        tree = ET.parse(xml_path)
    except Exception:
        # Best effort: callers fall back to a default date when no anchor exists
        print("Warning: Could not parse XML dates.")
        return anchors

    for tsk in tree.findall(".//TSK"):
        tlg = tsk.find("TLG")
        log_name = tlg.attrib.get('A') if tlg is not None else None
        if not log_name:
            # A task without a usable log reference must not abort the
            # remaining tasks; just skip it.
            continue

        # Same normalisation as the task index, so dictionary look-ups by
        # LogFilename match even if the XML value already ends in '.bin'.
        filename = log_name if log_name.endswith('.bin') else log_name + '.bin'

        for tim in tsk.findall("TIM"):
            start_str = tim.attrib.get('A')
            if not start_str:
                continue
            try:
                # Parse "2024-08-12T09:00:00" (a trailing 'Z' is read as UTC)
                anchors[filename] = datetime.fromisoformat(start_str.replace('Z', '+00:00'))
            except ValueError:
                continue
            break

    return anchors

# ==========================================
# 2. ENRICHMENT PROCESSOR
# ==========================================
def enrich_file(raw_path, meta, anchor_time):
    """
    Turn one raw extracted CSV into an enriched yield CSV.

    Steps: rebuild absolute timestamps from `anchor_time` plus the relative
    Time_ms ticks, scale the raw sensor columns, project the coordinates to
    OUT_ESPG, derive a GPS speed over a rolling window, compute yield in t/ha
    and write the result to ENRICHED_FOLDER/<year>/<year>_<field>_<crop>.csv.

    Args:
        raw_path: Path of the raw CSV (columns Time_ms, Latitude, Longitude,
            Yield_Mass, Yield_Vol, Speed, Moisture, Packet_Type are used).
        meta: Task index row providing 'Year', 'FieldName' and 'Crop'.
        anchor_time: datetime of the task start, or None to fall back to
            noon on Jan 1st of the task year.

    Returns:
        True when a file was written, False when the raw CSV has no rows.

    Note:
        If the output file already exists the rows are appended (this merges
        split tasks), so running the enrichment twice over the same output
        folder appends the same rows again.
    """
    # 1. Load Raw Bits
    df = pd.read_csv(raw_path)
    if df.empty: return False

    # Resolve the task year ONCE so the time fallback and the output path
    # agree. Previously a missing year was tolerated for the time fallback
    # but crashed with int(NaN) when the output name was built.
    if pd.notnull(meta['Year']):
        year = int(meta['Year'])
    elif anchor_time is not None:
        year = anchor_time.year
    else:
        year = 2024

    # 2. FIX TIME (Anchor + Offset)
    # We ignore the absolute value of Time_ms (2003) and just use the relative ticks
    t0 = df['Time_ms'].iloc[0]
    df['Seconds_Elapsed'] = (df['Time_ms'] - t0) / 1000.0

    # Fallback if XML time is missing: Use Jan 1st of the Task Year
    if anchor_time is None:
        anchor_time = datetime(year, 1, 1, 12, 0, 0)

    df['Datetime'] = anchor_time + pd.to_timedelta(df['Seconds_Elapsed'], unit='s')

    # 3. CLEAN & SCALE SENSORS
    # Helper: Clean unsigned noise (>2B) and scale
    def clean(s, scale):
        s = pd.to_numeric(s, errors='coerce').fillna(0)
        s = np.where(s > 2000000000, 0, s) # Filter Error Flags
        return s / scale

    # Scale based on your findings
    if 'Speed' in df.columns:
        df['Speed'] = clean(df['Speed'], 1000.0) # mm/s -> m/s
    if 'Moisture' in df.columns:
        df['Moisture'] = clean(df['Moisture'], 100.0) # 0.01% -> %

    df['Raw_Mass'] = clean(df['Yield_Mass'], 1_000_000.0) # mg/s -> kg/s
    df['Raw_Vol']  = clean(df['Yield_Vol'], 1000.0)       # ml/s -> L/s

    # 4. RECALCULATE GPS SPEED (Rolling Window)
    # 20m steps -> Window 5 (100m) is appropriate
    transformer = pyproj.Transformer.from_crs("EPSG:4326", OUT_ESPG, always_xy=True)
    xx, yy = transformer.transform(df['Longitude'].values, df['Latitude'].values)
    df['UTM_Easting'] = xx
    df['UTM_Northing'] = yy

    WINDOW = 5
    dx = df['UTM_Easting'].diff(WINDOW)
    dy = df['UTM_Northing'].diff(WINDOW)
    dist = np.sqrt(dx**2 + dy**2)
    dt = df['Seconds_Elapsed'].diff(WINDOW)

    # A zero time delta gives +/-inf (or NaN for 0/0). Treat both the same:
    # back-fill from the next valid value, then 0 for whatever is left.
    gps_speed = (dist / dt).replace([np.inf, -np.inf], np.nan)
    df['GPS_Speed'] = gps_speed.bfill().fillna(0)

    # 5. YIELD CALCULATION
    # Use Sensor Speed if moving, else GPS Speed
    use_speed = np.where(df['Speed'] > 0.1, df['Speed'], df['GPS_Speed'])
    HEADER_WIDTH = 9.0 # Standard combine header width

    # Yield (t/ha) = (kg/s * 10) / (m/s * m)
    df['Yield_T_Ha'] = (df['Raw_Mass'] * 10.0) / (use_speed * HEADER_WIDTH)
    df['Yield_T_Ha'] = df['Yield_T_Ha'].fillna(0).replace([np.inf, -np.inf], 0)

    # 6. SAVE (Year_FieldName_Crop.csv)
    # Sanitize names for Windows filesystems
    safe_field = str(meta['FieldName']).replace(' ', '_').replace('/', '-').replace(',', '').replace('"', '')
    safe_crop  = str(meta['Crop']).replace(' ', '_').replace('/', '-')

    # ---  Create Year Subfolder ---
    year_folder = os.path.join(ENRICHED_FOLDER, str(year))
    os.makedirs(year_folder, exist_ok=True)

    out_name = f"{year}_{safe_field}_{safe_crop}.csv"
    out_path = os.path.join(year_folder, out_name)

    # Append to existing if file exists (Merge split tasks); the header is
    # only written when the file is created.
    append = os.path.exists(out_path)

    cols = ['Datetime', 'Latitude', 'Longitude','UTM_Northing', 'UTM_Easting','Yield_T_Ha', 'GPS_Speed', 'Speed', 'Moisture', 'Packet_Type']
    df[cols].to_csv(out_path, mode='a' if append else 'w', header=not append, index=False)

    return True

# ==========================================
# 3. EXECUTION
# ==========================================
print("--- STARTING ENRICHMENT ---")
index_path = os.path.join(INTERIM_FOLDER, 'task_index.csv')

if not os.path.exists(index_path):
    # The index is produced by the extraction stage
    print("Error: task_index.csv missing. Run the Extractor first.")
else:
    df_index = pd.read_csv(index_path)
    anchors = load_anchor_times(DATA_FOLDER)

    print(f"Found {len(df_index)} tasks. Processing...")

    count = 0
    for _, task in df_index.iterrows():
        log_name = task['LogFilename']
        raw_path = os.path.join(INTERIM_FOLDER, log_name.replace('.bin', '.csv'))

        # Tasks whose log produced no raw CSV are skipped silently
        if not os.path.exists(raw_path):
            continue

        # None when the XML holds no start time for this log
        start_time = anchors.get(log_name)
        if enrich_file(raw_path, task, start_time):
            count += 1
            print(f"Processed: {task['LogFilename']} -> {task['Year']}_{task['FieldName']}_{task['Crop']}", end='\r')

    print(f"\n\nSUCCESS. {count} tasks enriched.")
    print(f"Data stored in: {ENRICHED_FOLDER}/")

--- STARTING ENRICHMENT ---
Found 142 tasks. Processing...
Processed: TLG00142.bin -> 2023_010-0, Møllemark_HV - HvedeajgræsHvedeseyg

SUCCESS. 142 tasks enriched.
Data stored in: C:/dev/agri_analysis/data/ENRICHED/
