In [1]:
import struct
import pandas as pd
import xml.etree.ElementTree as ET
import json
import os
import numpy as np
from datetime import datetime

In [2]:
# Specify the taskdata folder
# data_folder is scanned below for TASKDATA.XML and the TLG*.bin / TLG*.xml log files.

data_folder = r'./data/taskdata_3' 
out_folder = r'./data/taskdata_3_out'
# Create output folder if it doesn't exist
os.makedirs(out_folder, exist_ok=True)

# Buffer to add around the farm boundaries (in degrees)
# 0.02 deg is approx 2km. This accounts for headlands/approach roads.
# NOTE(review): the same value is applied to latitude and longitude, so the
# east-west margin in metres is smaller than the north-south one away from the equator.
GEO_BUFFER = 0.02 

## XML task file reading

In [3]:
def _find_taskdata_file(data_folder):
    """Return the path of TASKDATA.XML inside ``data_folder`` (case-insensitive), or None."""
    for file in os.listdir(data_folder):
        if file.upper() == 'TASKDATA.XML':
            return os.path.join(data_folder, file)
    return None


def _first_field_polygon(pfd):
    """Return the first ring under a PFD element with more than two valid points, or None.

    Points are read from PNT attributes C (latitude) and D (longitude) and
    returned as ``[lon, lat]`` pairs; the ring is closed if it is not already.
    """
    for pln in pfd.findall("PLN"):
        for lsg in pln.findall("LSG"):
            coordinates = []
            for pnt in lsg.findall("PNT"):
                try:
                    # XML attributes are strings (or missing -> None)
                    lat = float(pnt.attrib.get('C'))
                    lon = float(pnt.attrib.get('D'))
                except (TypeError, ValueError):
                    continue  # skip points with missing / non-numeric coordinates
                coordinates.append([lon, lat])

            if len(coordinates) > 2:
                # Close the polygon loop
                if coordinates[0] != coordinates[-1]:
                    coordinates.append(list(coordinates[0]))
                # Returning here stops BOTH loops, so a later PLN can no longer
                # overwrite the first valid shape.
                return coordinates
    return None


def _utc_naive(dt):
    """Return a naive datetime usable as a sort key for mixed naive/aware values."""
    offset = dt.utcoffset()
    if offset is None:
        return dt
    return (dt - offset).replace(tzinfo=None)


def _earliest_task_year(tsk):
    """Return the year of the earliest parseable TIM start (attribute A), or None."""
    start_times = []
    for tim in tsk.findall("TIM"):
        start_str = tim.attrib.get('A')  # Format: 2023-08-16T11:34:57...
        if not start_str:
            continue
        try:
            # fromisoformat does not accept a trailing 'Z' on older Python versions
            start_times.append(datetime.fromisoformat(start_str.replace('Z', '+00:00')))
        except ValueError:
            continue  # unparseable timestamp: ignore this TIM
    if not start_times:
        return None
    return min(start_times, key=_utc_naive).year


def parse_isobus_taskdata(data_folder):
    """Build a task index and a GeoJSON layer from the TASKDATA.XML in ``data_folder``.

    Only tasks (TSK) that reference a log file (TLG) are kept. Each kept task
    gets one row in the index and, when its field (PFD) has a usable polygon,
    one GeoJSON feature.

    Parameters
    ----------
    data_folder : str
        Folder that contains TASKDATA.XML (file name matched case-insensitively).

    Returns
    -------
    (dict, pandas.DataFrame) or (None, None)
        A GeoJSON FeatureCollection and a DataFrame with the columns
        TaskID, Year, Crop, FieldName, FieldID, LogFilename (these columns are
        present even when no task qualifies). ``(None, None)`` is returned,
        after printing an error, when the file is missing or cannot be parsed.
    """
    task_columns = ['TaskID', 'Year', 'Crop', 'FieldName', 'FieldID', 'LogFilename']

    # 1. Find TASKDATA.XML
    taskdata_path = _find_taskdata_file(data_folder)
    if taskdata_path is None:
        print("Error: TASKDATA.XML not found.")
        return None, None

    try:
        root = ET.parse(taskdata_path).getroot()
    except Exception as e:
        print(f"Error parsing XML: {e}")
        return None, None

    # --- 1. Map Products (PDT) ---
    products = {pdt.attrib.get('A'): pdt.attrib.get('B') for pdt in root.findall(".//PDT")}

    # --- 2. Store Field Geometries (PFD) ---
    field_shapes = {}     # PFD_ID -> Coordinates
    field_names_map = {}  # PFD_ID -> Name
    for pfd in root.findall(".//PFD"):
        pfd_id = pfd.attrib.get('A')
        field_names_map[pfd_id] = pfd.attrib.get('C')
        polygon = _first_field_polygon(pfd)
        if polygon is not None:
            field_shapes[pfd_id] = polygon

    # --- 3. Build Tasks & Generate GeoJSON ---
    tasks_list = []
    geojson_features = []

    for tsk in root.findall(".//TSK"):
        # Get Log File; only tasks with logs are interesting
        tlg = tsk.find("TLG")
        log_filename = tlg.attrib.get('A') if tlg is not None else None
        if not log_filename:
            continue

        task_id = tsk.attrib.get('A')
        field_ref = tsk.attrib.get('E')
        field_name = field_names_map.get(field_ref, f"Unknown ({field_ref})")

        # Get Crop (falls back to the raw product reference when it is not mapped)
        crop_name = "Unknown"
        pan = tsk.find("PAN")
        if pan is not None:
            pdt_ref = pan.attrib.get('A')
            crop_name = products.get(pdt_ref, pdt_ref)

        task_year = _earliest_task_year(tsk)

        # -- Build DataFrame Entry --
        tasks_list.append({
            'TaskID': task_id,
            'Year': task_year,
            'Crop': crop_name,
            'FieldName': field_name,
            'FieldID': field_ref,
            'LogFilename': log_filename
        })

        # -- Build GeoJSON Feature --
        if field_ref in field_shapes:
            geojson_features.append({
                "type": "Feature",
                "properties": {
                    "TaskID": task_id,
                    "Year": int(task_year) if task_year else None,
                    "FieldName": field_name,
                    "Crop": crop_name,
                    "LogFilename": log_filename
                },
                "geometry": {
                    "type": "Polygon",
                    "coordinates": [field_shapes[field_ref]]
                }
            })

    # --- Output ---
    geojson_output = {
        "type": "FeatureCollection",
        "features": geojson_features
    }
    # Explicit columns keep the schema stable even when tasks_list is empty.
    df_tasks = pd.DataFrame(tasks_list, columns=task_columns)

    return geojson_output, df_tasks




In [4]:
# --- EXECUTION ---
# Parse the task file, then persist the task index (CSV) and field polygons (GeoJSON).
geojson, df_tasks = parse_isobus_taskdata(data_folder)

if df_tasks is not None:
    # Save CSV
    csv_path = os.path.join(out_folder, 'task_index.csv')
    df_tasks.to_csv(csv_path, index=False)
    
    # Save GeoJSON (explicit encoding so the result does not depend on the OS locale)
    geo_path = os.path.join(out_folder, 'harvest_tasks.geojson')
    with open(geo_path, 'w', encoding='utf-8') as f:
        json.dump(geojson, f)
        
    print(f"Success. Index saved to {csv_path}")
    # An index with no rows may also have no columns, in which case selecting
    # the sample columns would raise KeyError.
    if df_tasks.empty:
        print("No tasks with log files were found; the index is empty.")
    else:
        print(f"Sample:\n{df_tasks[['Year', 'Crop', 'FieldName']].head()}")

Success. Index saved to ./data/taskdata_3_out\task_index.csv
Sample:
   Year              Crop             FieldName
0  2022      RG - Rajgræs         021-0, Monica
1  2022      RG - Rajgræs         021-0, Monica
2  2021  RA - Raps / rybs  016-0, Stendyssegård
3  2021  RA - Raps / rybs  016-0, Stendyssegård
4  2022        HV - Hvede  037-0, Bispegård øst


## Binary file reading

In [5]:
# --- HELPER: PACKET CLASSIFIER ---
def classify_packet(gap_size):
    """
    Translate the number of bytes found in front of a data record into a
    machine-state label.

    The gap is treated as the size of the proprietary header that precedes
    the record: 25 and 10 map to the two named modes, 0 means the record
    directly followed the previous one, and anything else is reported as a
    transition together with its size.
    """
    known_gaps = (
        (25, "Harvest (Mode A)"),
        (10, "Transport (Mode B)"),
        (0, "Contiguous"),  # Rare, usually start of file
    )
    for header_size, label in known_gaps:
        if gap_size == header_size:
            return label
    return f"Transition ({gap_size}b)"

# --- HELPER: DYNAMIC BOUNDING BOX ---
def get_farm_bounding_box(taskdata_path, buffer=None):
    """Compute a buffered lat/lon bounding box around every field point in the task file.

    Parameters
    ----------
    taskdata_path : str
        Path to TASKDATA.XML.
    buffer : float, optional
        Margin in degrees added on every side. Defaults to the notebook-level
        ``GEO_BUFFER`` when omitted.

    Returns
    -------
    tuple
        ``(min_lat, max_lat, min_lon, max_lon)``. When the file cannot be read
        or contains no usable PNT under a PFD, a fixed fallback box
        ``(54.5, 58.0, 8.0, 15.0)`` is returned instead.
    """
    # NOTE(review): the fallback box looks like it was chosen for Denmark — confirm
    # before using this notebook on data from another region.
    default_bounds = (54.5, 58.0, 8.0, 15.0)

    print(f"Reading Farm Geometry from: {taskdata_path}")
    try:
        root = ET.parse(taskdata_path).getroot()
    except (ET.ParseError, OSError) as exc:
        print(f"Warning: could not read farm geometry ({exc}); using default bounds.")
        return default_bounds

    lats, lons = [], []
    for pnt in root.findall(".//PFD//PNT"):
        try:
            lat = float(pnt.attrib.get('C'))
            lon = float(pnt.attrib.get('D'))
        except (TypeError, ValueError):
            continue  # missing or non-numeric coordinate
        # Append only after BOTH values parsed, so a half-valid point cannot
        # contribute its latitude without its longitude.
        lats.append(lat)
        lons.append(lon)

    if not lats:
        return default_bounds

    if buffer is None:
        buffer = GEO_BUFFER
    return min(lats) - buffer, max(lats) + buffer, min(lons) - buffer, max(lons) + buffer

# --- HELPER: METADATA & DEFS ---
def load_metadata(taskdata_path):
    """Map each task log name to its crop and start time.

    For every TSK that has a TLG with a non-empty ``A`` attribute, two keys are
    stored: the log id itself and the log id with ``.bin`` appended. Both map to
    ``{'Crop': ..., 'Start_Time': ...}`` where Crop is the PDT name referenced
    by the task's first PAN ('Unknown' when absent or unmapped) and Start_Time
    is the raw ``A`` attribute of the task's first TIM (None when absent).

    Parameters
    ----------
    taskdata_path : str
        Path to TASKDATA.XML.

    Returns
    -------
    dict
        The lookup described above; ``{}`` (after a warning) when the file
        cannot be read or parsed.
    """
    try:
        root = ET.parse(taskdata_path).getroot()
    except (ET.ParseError, OSError) as exc:
        print(f"Warning: could not read task metadata from {taskdata_path}: {exc}")
        return {}

    pdt_map = {pdt.attrib.get('A'): pdt.attrib.get('B') for pdt in root.findall(".//PDT")}
    meta_map = {}
    for tsk in root.findall(".//TSK"):
        tlg = tsk.find(".//TLG")
        if tlg is None:
            continue
        log_id = tlg.attrib.get('A')
        if not log_id:
            # A TLG without a file name cannot be matched to a .bin file; skip it
            # instead of losing the metadata of every other task.
            continue

        pan = tsk.find(".//PAN")
        tim = tsk.find(".//TIM")
        crop = pdt_map.get(pan.attrib.get('A'), 'Unknown') if pan is not None else 'Unknown'
        start = tim.attrib.get('A') if tim is not None else None

        meta_map[log_id] = {'Crop': crop, 'Start_Time': start}
        meta_map[log_id + '.bin'] = {'Crop': crop, 'Start_Time': start}
    return meta_map

def get_dlv_defs(xml_path):
    """Return the ``A`` attribute of every DLV element in a log-definition XML.

    The values are returned in document order (None for a DLV without ``A``).
    An empty list is returned, after a warning, when the file cannot be read
    or is not well-formed XML.
    """
    try:
        tree = ET.parse(xml_path)
    except (ET.ParseError, OSError) as exc:
        # Only parse / I/O failures are treated as "no definitions"; anything
        # else is a programming error and should surface.
        print(f"Warning: could not read DLV definitions from {xml_path}: {exc}")
        return []
    return [elem.attrib.get('A') for elem in tree.findall(".//DLV")]

# --- CORE: ENRICHED FORENSIC SCANNER ---
def forensic_scan(bin_path, definitions, metadata, bounds):
    """Scan a TLG binary byte-by-byte for records whose position lies inside ``bounds``.

    At every cursor position the next 12 bytes are read as little-endian
    ``uint32 time, int32 lat, int32 lon`` (lat/lon scaled by 1e-7). If the
    position is strictly inside ``bounds`` and enough bytes remain, the
    following ``len(definitions)`` little-endian int32 values are taken as the
    record payload and the cursor jumps past the record; otherwise the cursor
    advances by one byte.

    Parameters
    ----------
    bin_path : str
        Path of the binary log file.
    definitions : list
        DDI codes, one per payload value, in payload order. Known codes are
        renamed (e.g. '0054' -> 'Yield_Mass'); others become ``DDI_<code>``.
        If a code occurs more than once, the last value wins for that column.
    metadata : dict
        Lookup from log name to ``{'Crop': ..., 'Start_Time': ...}``. The file
        name is tried first, then the name without extension (also upper-cased),
        so an upper-case ``.BIN`` extension still finds its entry.
    bounds : tuple
        ``(min_lat, max_lat, min_lon, max_lon)`` in degrees.

    Returns
    -------
    pandas.DataFrame
        One row per accepted record with Time_ms, Latitude, Longitude, the
        payload columns, File, Crop, Start_Time_Str, Gap_Bytes (bytes skipped
        since the previous accepted record) and Packet_Type.
    """
    min_lat, max_lat, min_lon, max_lon = bounds
    filename = os.path.basename(bin_path)

    # Resolve metadata without depending on the case of the file extension.
    stem = os.path.splitext(filename)[0]
    meta = {'Crop': 'Unknown', 'Start_Time': None}
    for key in (filename, stem, stem.upper()):
        if key in metadata:
            meta = metadata[key]
            break

    with open(bin_path, 'rb') as f:
        content = f.read()

    total_len = len(content)
    cursor = 0
    valid_rows = []
    last_valid_end = 0

    ddi_lookup = {'0054': 'Yield_Mass', '0095': 'Yield_Vol', '018D': 'Speed', '0063': 'Moisture'}
    column_names = [ddi_lookup.get(ddi, f'DDI_{ddi}') for ddi in definitions]

    # Precompiled layouts. Both use '<' so header and payload share the same
    # (little-endian, unpadded) byte order on every platform.
    header = struct.Struct('<Lii')
    payload = struct.Struct(f'<{len(definitions)}i')
    record_len = header.size + payload.size

    # The scan stops 50 bytes before the end of the file, which also guarantees
    # that a full 12-byte header is always available at ``cursor``.
    scan_limit = total_len - 50

    while cursor < scan_limit:
        time_ms, lat_raw, lon_raw = header.unpack_from(content, cursor)
        lat = lat_raw * 1e-7
        lon = lon_raw * 1e-7

        inside = (min_lat < lat < max_lat) and (min_lon < lon < max_lon)
        if not inside or cursor + record_len > total_len:
            cursor += 1
            continue

        values = payload.unpack_from(content, cursor + header.size)

        row_data = {'Time_ms': time_ms, 'Latitude': lat, 'Longitude': lon}
        row_data.update(zip(column_names, values))

        # --- PACKET CLASSIFICATION ---
        gap_size = cursor - last_valid_end

        # Enrich row with Forensic Info
        row_data['File'] = filename
        row_data['Crop'] = meta['Crop']
        row_data['Start_Time_Str'] = meta['Start_Time']
        row_data['Gap_Bytes'] = gap_size
        row_data['Packet_Type'] = classify_packet(gap_size)

        valid_rows.append(row_data)

        cursor += record_len
        last_valid_end = cursor

    return pd.DataFrame(valid_rows)


In [6]:
# --- EXECUTION ---
# Index the folder by upper-cased name so TASKDATA.XML and the per-log XML files
# are found regardless of their case on disk (the .BIN match below is already
# case-insensitive, as is the lookup in parse_isobus_taskdata).
dir_entries = {name.upper(): name for name in os.listdir(data_folder)}

taskdata_xml = os.path.join(data_folder, dir_entries.get('TASKDATA.XML', 'TASKDATA.XML'))
farm_bounds = get_farm_bounding_box(taskdata_xml)
metadata_map = load_metadata(taskdata_xml)

all_data = []

print("Starting Enriched Forensic Scan...")

for filename in sorted(dir_entries.values()):
    if filename.upper().startswith("TLG") and filename.upper().endswith(".BIN"):
        stem = filename.rsplit('.', 1)[0]
        # Prefer the actual on-disk name of the definition file; fall back to
        # the conventional lower-case extension when no entry matches.
        xml_name = dir_entries.get(stem.upper() + ".XML", stem + ".xml")
        xml_full = os.path.join(data_folder, xml_name)
        bin_full = os.path.join(data_folder, filename)
        
        if os.path.exists(xml_full):
            
            defs = get_dlv_defs(xml_full)
            
            df = forensic_scan(bin_full, defs, metadata_map, farm_bounds)
 
            if not df.empty:
                all_data.append(df)
        else:
            # Without the DLV definitions the payload layout is unknown.
            print(f"Skipping {filename}: no matching definition XML found.")

# --- POST-PROCESSING ---
if all_data:
    print("Processing Metrics...")
    master = pd.concat(all_data, ignore_index=True)

    # The yield columns only exist when at least one log defines the matching
    # DDI; create them as NaN otherwise so the steps below do not raise KeyError.
    for required_col in ('Yield_Mass', 'Yield_Vol'):
        if required_col not in master.columns:
            master[required_col] = np.nan
    
    # Standard Post-Processing (Time, Speed, Yield, Density)
    master['Task_Start'] = pd.to_datetime(master['Start_Time_Str'])
    master = master.sort_values(['File', 'Time_ms'])
    # Rows that share (File, Time_ms) are spread 10 ms apart so they stay distinct.
    master['Time_Fix'] = master['Time_ms'] + master.groupby(['File', 'Time_ms']).cumcount() * 10
    master['Datetime'] = master['Task_Start'] + pd.to_timedelta(master['Time_Fix'], unit='ms')

    master['Raw_Mass'] = pd.to_numeric(master['Yield_Mass'], errors='coerce')
    master['Raw_Vol']  = pd.to_numeric(master['Yield_Vol'], errors='coerce')
    # Low byte of the raw mass value
    master['Status_Code'] = master['Raw_Mass'].fillna(0).astype(int) & 0xFF
    master['Density_kg_L'] = np.where(master['Raw_Vol'] > 1000, (master['Raw_Mass'] / master['Raw_Vol']) * 10.0, 0)

    # GPS Speed: per-file distance between consecutive fixes divided by elapsed time.
    # 111132 is used as metres per degree; longitude is scaled by cos(latitude).
    lat_rad = np.radians(master['Latitude'])
    dlat = master.groupby('File')['Latitude'].diff() * 111132
    dlon = master.groupby('File')['Longitude'].diff() * 111132 * np.cos(lat_rad)
    dist = np.sqrt(dlat**2 + dlon**2)
    dt = master.groupby('File')['Time_Fix'].diff() / 1000.0
    master['GPS_Speed'] = dist / dt
    master['GPS_Speed'] = master['GPS_Speed'].fillna(0).replace([np.inf, -np.inf], 0)
    # Smooth within each file only: a plain rolling window over the whole frame
    # would average the first rows of a file with the last rows of the previous one.
    master['GPS_Speed'] = (
        master.groupby('File')['GPS_Speed']
              .transform(lambda speeds: speeds.rolling(window=5, min_periods=1).mean())
    )

    # Yield
    HEADER_WIDTH = 9.0
    master['Mass_kg_s'] = master['Raw_Mass'] / 1_000_000
    # NOTE(review): if Mass_kg_s is kg/s, GPS_Speed is m/s and HEADER_WIDTH is metres,
    # this quotient is kg/m^2, which needs a factor of 10 to become t/ha — confirm
    # the intended units before relying on the Yield_T_Ha column.
    master['Yield_T_Ha'] = (master['Mass_kg_s']) / (master['GPS_Speed'] * HEADER_WIDTH)
    master['Yield_T_Ha'] = master['Yield_T_Ha'].fillna(0).replace([np.inf, -np.inf], 0)

    # --- SAVING ---
    out_csv = os.path.join(out_folder, 'UNIVERSAL_ENRICHED_DATASET.csv')
    
    # Include the forensic columns
    cols = ['Datetime', 'Latitude', 'Longitude', 'File', 'Crop', 
            'Packet_Type', 'Gap_Bytes',
            'Yield_T_Ha', 'Density_kg_L', 'GPS_Speed', 'Status_Code', 
            'Raw_Mass', 'Raw_Vol']
            
    master[cols].to_csv(out_csv, index=False)
    
    print("\nSUCCESS.")
    print(f"Dataset with Packet Types saved to: {out_csv}")
    print("Check the 'Packet_Type' column to see 'Harvest (Mode A)' vs 'Transport (Mode B)'")

else:
    print("No valid data found.")

Reading Farm Geometry from: ./data/taskdata_3\TASKDATA.XML
Starting Enriched Forensic Scan...
Processing Metrics...

SUCCESS.
Dataset with Packet Types saved to: ./data/taskdata_3_out\UNIVERSAL_ENRICHED_DATASET.csv
Check the 'Packet_Type' column to see 'Harvest (Mode A)' vs 'Transport (Mode B)'
