In [50]:
import xml.etree.ElementTree as ET
import os
import pandas as pd
import json
import struct

In [46]:
# Soecifu the taskdata folder

data_folder = r'./data/taskdata_3' 
out_folder = r'./data/taskdata_3_out'
# Create output folder if it doesn't exist
os.makedirs(out_folder, exist_ok=True)


In [29]:
def parse_isobus_taskdata(data_folder):
    # 1. Find TASKDATA.XML
    taskdata_path = None
    for file in os.listdir(data_folder):
        if file.upper() == 'TASKDATA.XML':
            taskdata_path = os.path.join(data_folder, file)
            break
            
    if taskdata_path is None:
        return "Error: TASKDATA.XML not found.", None

    try:
        tree = ET.parse(taskdata_path)
        root = tree.getroot()
    except Exception as e:
        return f"Error parsing XML: {e}", None

    # --- 1. Map Products (PDT) ---
    # PDT ID -> Crop Name
    products = {}
    for pdt in root.findall(".//PDT"):
        products[pdt.attrib.get('A')] = pdt.attrib.get('B')

    # --- 2. Store Field Geometries (PFD) ---
    # We don't save GeoJSON yet; we just store the shapes in a dictionary
    # so we can look them up later by Field ID.
    field_shapes = {}   # Mapping: PFD_ID -> List of coordinates
    field_names_map = {} # Mapping: PFD_ID -> Name

    for pfd in root.findall(".//PFD"):
        pfd_id = pfd.attrib.get('A')
        field_names_map[pfd_id] = pfd.attrib.get('C')
        
        # Extract Geometry
        for pln in pfd.findall("PLN"):
            for lsg in pln.findall("LSG"):
                coordinates = []
                for pnt in lsg.findall("PNT"):
                    try:
                        lat = float(pnt.attrib.get('C'))
                        lon = float(pnt.attrib.get('D'))
                        coordinates.append([lon, lat])
                    except (ValueError, TypeError):
                        continue
                
                if len(coordinates) > 2:
                    if coordinates[0] != coordinates[-1]:
                        coordinates.append(coordinates[0])
                    # Store the shape
                    field_shapes[pfd_id] = coordinates

    # --- 3. Build Tasks & Generate GeoJSON ---
    tasks_list = []
    geojson_features = []

    for tsk in root.findall(".//TSK"):
        task_id = tsk.attrib.get('A')
        field_ref = tsk.attrib.get('E')
        
        # Get Attributes
        field_name = field_names_map.get(field_ref, f"Unknown ({field_ref})")
        
        # Get Log File
        tlg = tsk.find("TLG")
        log_filename = tlg.attrib.get('A') if tlg is not None else None

        # Get Crop
        crop_name = None
        pan = tsk.find("PAN")
        if pan is not None:
            pdt_ref = pan.attrib.get('A')
            crop_name = products.get(pdt_ref, pdt_ref)

        # -- Build DataFrame Entry --
        tasks_list.append({
            'TaskID': task_id,
            'FieldRef': field_ref,
            'FieldName': field_name,
            'LogFilename': log_filename,
            'Crop': crop_name
        })

        # -- Build GeoJSON Feature --
        # Only create a polygon if we actually have geometry for this field
        if field_ref in field_shapes:
            feature = {
                "type": "Feature",
                "properties": {
                    "TaskID": task_id,
                    "FieldName": field_name,
                    "Crop": crop_name,
                    "LogFilename": log_filename 
                },
                "geometry": {
                    "type": "Polygon",
                    "coordinates": [field_shapes[field_ref]]
                }
            }
            geojson_features.append(feature)

    # --- Output ---
    geojson_output = {
        "type": "FeatureCollection", 
        "features": geojson_features
    }
    df_tasks = pd.DataFrame(tasks_list)
    
    return geojson_output, df_tasks



In [51]:

# --- HELPER: Binary Parser ---
def parse_tlg_binary(xml_path, bin_path):
    try:
        tree = ET.parse(xml_path)
        root = tree.getroot()
    except:
        return None

    # DDI Mapping (Based on your file snippets)
    ddi_map = {
        '0054': 'Yield_Mass_Flow',   # Mass Yield
        '0053': 'Yield_Volume_Flow', # Volume Yield
        '0063': 'Moisture_Pct',      # Moisture (Specific to your file)
        '0106': 'Moisture_Pct_Alt',  # Alternate Moisture
        '018D': 'Speed_ms',          # Speed
        '0095': 'Fuel_Rate',         # Fuel
        '0074': 'Area_Worked'
    }

    # Build Structure
    struct_fmt = '<L' # Time (4 bytes)
    columns = ['Time_ms']
    
    # Position
    has_pos = False
    if root.find("PTN") is not None:
        struct_fmt += 'ii' # Lat, Lon
        columns.extend(['Latitude', 'Longitude'])
        has_pos = True
    
    # Values
    for dlv in root.findall("DLV"):
        struct_fmt += 'i'
        ddi = dlv.attrib.get('A', '')
        columns.append(ddi_map.get(ddi, f"DDI_{ddi}"))

    row_size = struct.calcsize(struct_fmt)
    data_rows = []

    # Read Binary
    try:
        if os.path.exists(bin_path):
            with open(bin_path, 'rb') as f:
                while True:
                    chunk = f.read(row_size)
                    if len(chunk) < row_size: break
                    val = list(struct.unpack(struct_fmt, chunk))
                    if has_pos:
                        val[1] *= 1e-7 # Scale Lat
                        val[2] *= 1e-7 # Scale Lon
                    data_rows.append(val)
    except:
        return None

    df = pd.DataFrame(data_rows, columns=columns)
    if has_pos:
        # Filter invalid coordinates (0,0)
        df = df[(df['Latitude'] != 0) & (df['Longitude'] != 0)]
    return df

# --- MAIN EXECUTION ---
print(f"Processing files from: {data_folder}")
print(f"Saving results to:     {out_folder}")

all_dfs = []

# 1. Process every TLG file
for filename in os.listdir(data_folder):
    if filename.upper().startswith("TLG") and filename.upper().endswith(".XML"):
        xml_full = os.path.join(data_folder, filename)
        bin_full = os.path.join(data_folder, filename.replace('.xml', '.bin').replace('.XML', '.BIN'))
        
        if os.path.exists(bin_full):
            # print(f"  Parsing {filename}...") # Uncomment to see progress
            df = parse_tlg_binary(xml_full, bin_full)
            
            if df is not None and not df.empty:
                # Add LogID for merging later
                log_id = os.path.splitext(filename)[0] # e.g., TLG00001
                df['LogID'] = log_id
                all_dfs.append(df)

# 2. Combine all data
if all_dfs:
    master_df = pd.concat(all_dfs, ignore_index=True)
    print(f"Combined {len(master_df)} data points.")

    # 3. Try to Merge with Field Names (Task Index)
    # We look for 'task_index.csv' in the output folder (where you likely saved it last time)
    # OR the data_folder.
    index_path = os.path.join(out_folder, 'task_index.csv')
    if not os.path.exists(index_path):
        index_path = os.path.join(data_folder, 'task_index.csv')

    if os.path.exists(index_path):
        print("Found Task Index! Merging Field Names and Crops...")
        df_index = pd.read_csv(index_path)
        
        # Ensure ID columns match types (string)
        master_df['LogID'] = master_df['LogID'].astype(str)
        df_index['LogFilename'] = df_index['LogFilename'].astype(str)
        
        # Merge
        master_df = pd.merge(master_df, df_index, left_on='LogID', right_on='LogFilename', how='left')
        
        # Drop rows that don't belong to a known field (optional)
        # master_df = master_df.dropna(subset=['FieldName'])
    else:
        print("Warning: 'task_index.csv' not found. Data will have no Field Names.")

    # 4. Save Final CSV
    out_file = os.path.join(out_folder, 'FULL_HARVEST_DATASET.csv')
    master_df.to_csv(out_file, index=False)
    print(f"\nDONE. Saved to: {out_file}")
    print(master_df.head())

else:
    print("No valid TLG data found.")

Processing files from: ./data/taskdata_3
Saving results to:     ./data/taskdata_3_out
Combined 51952 data points.
Found Task Index! Merging Field Names and Crops...

DONE. Saved to: ./data/taskdata_3_out\FULL_HARVEST_DATASET.csv
      Time_ms      Latitude   Longitude    Speed_ms  Fuel_Rate  \
0    61342000  1.144798e+02 -188.559331    33621782     164864   
1      162304  7.680000e-05  100.296294  1018561448  556156618   
2  3053693030  6.421264e+01   37.829533      131335          0   
3  3053693050  6.421269e+01   37.829584      131335          0   
4           0  3.000000e-07 -146.479718  1228715523 -685693370   

   Yield_Volume_Flow  Yield_Mass_Flow    DDI_0055  Moisture_Pct    DDI_013A  \
0                768        347602944  1018561448     556156224   118918706   
1          118918226      -1493171455    50331648             0      123136   
2                  3      -1468893184   339523075    1445013062    17241740   
3                  3      -1467101184   255636995    16463

In [45]:
# --- process task metadata ---


geojson, df_tasks = parse_isobus_taskdata(data_folder)

if df_tasks is not None:
    # Save the Enhanced GeoJSON
    out_geo = os.path.join(out_folder, 'harvest_tasks.geojson')
    with open(out_geo, 'w') as f:
        json.dump(geojson, f)
    
    # Save the CSV Index
    df_tasks.to_csv(os.path.join(out_folder, 'task_index.csv'), index=False)
    
    print(f"Success! Saved {out_geo} with Crop and Log info.")

Success! Saved ./data/taskdata_3_out\harvest_tasks.geojson with Crop and Log info.


In [53]:

# 3. Build the full paths automatically
xml_full_path = os.path.join(data_folder, file_name + '.xml')
bin_full_path = os.path.join(data_folder, file_name + '.bin')

# 4. Run the parser
df = parse_tlg_binary(xml_full_path, bin_full_path)

if df is not None:
    print(f"Loaded {file_name} successfully.")
    # Export to CSV inside the same folder
    output_csv = os.path.join(data_folder, f'{file_name}_harvest_points.csv')
    df.to_csv(output_csv, index=False)
else:
    print(status)

Loaded TLG00001 successfully.


In [43]:
if geojson_data is not None:
    # Define the output path
    output_path = os.path.join(out_folder, 'fields.geojson')
    
    # Save to file
    with open(output_path, 'w') as f:
        json.dump(geojson_data, f)

In [54]:


# ISO 11783-10 Clause 8.6.4 Format
# < = Little Endian
# I = Time (4 bytes)
# q = Latitude (8 bytes, signed long long)
# q = Longitude (8 bytes, signed long long)
# B = Position Status (1 byte)
# B = Count of DLVs following (1 byte)
HEADER_FMT = "<IqqBB"
HEADER_SIZE = struct.calcsize(HEADER_FMT)

def parse_xml_definitions(xml_path):
    """
    Reads the TLG...XML file to understand what 'Index 0', 'Index 1' mean.
    Returns a list of DDI strings (e.g., ['0054', '0106', ...])
    """
    try:
        tree = ET.parse(xml_path)
        root = tree.getroot()
        # The binary file refers to DLVs by their ORDER in the XML.
        # So we just make a list of them.
        definitions = []
        for dlv in root.findall(".//DLV"):
            ddi = dlv.attrib.get('A')
            if ddi:
                definitions.append(ddi)
        return definitions
    except Exception as e:
        print(f"XML Error {xml_path}: {e}")
        return []

def parse_complex_log(bin_path, dlv_definitions):
    data_rows = []
    
    # Common readable names for the DDI codes
    ddi_lookup = {
        '0054': 'Yield_Mass', 
        '0053': 'Yield_Vol', 
        '018D': 'Speed', 
        '0063': 'Moisture',
        '0106': 'Moisture_Alt',
        '0074': 'Area'
    }

    with open(bin_path, 'rb') as f:
        file_content = f.read()
        
    offset = 0
    total_len = len(file_content)
    
    while offset + HEADER_SIZE <= total_len:
        # 1. Read the Fixed Header
        try:
            chunk = file_content[offset : offset + HEADER_SIZE]
            time_ms, lat_raw, lon_raw, pos_status, dlv_count = struct.unpack(HEADER_FMT, chunk)
            offset += HEADER_SIZE
            
            # 2. Parse Coordinates (Clause 8.6.4 uses 1e-16 scaling)
            lat = lat_raw * 1e-16
            lon = lon_raw * 1e-16
            
            # 3. Read Dynamic Data (DLVs)
            # The structure is: [List of Indices (Bytes)] + [List of Values (Integers)]
            # Size = (Count * 1 byte) + (Count * 4 bytes)
            payload_size = dlv_count + (dlv_count * 4)
            
            if offset + payload_size > total_len:
                break # Unexpected end of file
            
            payload = file_content[offset : offset + payload_size]
            offset += payload_size
            
            # Unpack Indices
            indices = struct.unpack(f"{dlv_count}B", payload[:dlv_count])
            # Unpack Values
            values = struct.unpack(f"{dlv_count}i", payload[dlv_count:])
            
            # 4. Build the Row
            row = {
                'Time_ms': time_ms,
                'Latitude': lat,
                'Longitude': lon,
                'Status': pos_status
            }
            
            # Map values to their DDI names using the XML definition list
            for i, dlv_index in enumerate(indices):
                if dlv_index < len(dlv_definitions):
                    ddi_code = dlv_definitions[dlv_index]
                    col_name = ddi_lookup.get(ddi_code, f"DDI_{ddi_code}")
                    row[col_name] = values[i]
            
            data_rows.append(row)
            
        except struct.error:
            break # Stop if parsing fails

    return pd.DataFrame(data_rows)

# --- MAIN EXECUTION ---
print("Starting Clause 8.6.4 Decoder...")
all_dfs = []

for filename in os.listdir(data_folder):
    if filename.upper().startswith("TLG") and filename.upper().endswith(".XML"):
        xml_path = os.path.join(data_folder, filename)
        bin_path = os.path.join(data_folder, filename.replace('.xml', '.bin').replace('.XML', '.BIN'))
        
        if os.path.exists(bin_path):
            # 1. Get Definitions from XML
            dlv_defs = parse_xml_definitions(xml_path)
            
            if dlv_defs:
                # 2. Parse Binary
                # print(f"Processing {filename}...") 
                df = parse_complex_log(bin_path, dlv_defs)
                
                if not df.empty:
                    # Sanity Check: If Lat is > 90, this might be a Type 1 log (the old format)
                    # But since your data looked "Russian" (Lat 59) instead of 114, 
                    # this format is likely the correct one.
                    
                    df['LogID'] = os.path.splitext(filename)[0]
                    all_dfs.append(df)

# --- SAVE ---
if all_dfs:
    master_df = pd.concat(all_dfs, ignore_index=True)
    
    # Optional: Filter out pure noise (0,0) or physics errors
    master_df = master_df[
        (master_df['Latitude'].between(54, 58)) & # Rough box for Denmark
        (master_df['Longitude'].between(8, 15))
    ]
    
    out_file = os.path.join(out_folder, 'FIXED_HARVEST_DATA.csv')
    master_df.to_csv(out_file, index=False)
    
    print(f"DONE! Processed {len(master_df)} points.")
    print(f"Saved to: {out_file}")
    print("Load this into QGIS - The 'Russian' cluster should now be in Denmark.")
else:
    print("No valid data found.")

Starting Clause 8.6.4 Decoder...
DONE! Processed 0 points.
Saved to: ./data/taskdata_3_out\FIXED_HARVEST_DATA.csv
Load this into QGIS - The 'Russian' cluster should now be in Denmark.


In [56]:
import struct
import xml.etree.ElementTree as ET
import pandas as pd
import os

# --- CONFIGURATION ---

data_folder = r'./data/taskdata_3' 
out_folder = r'./data/taskdata_3_out'
os.makedirs(out_folder, exist_ok=True)

# --- PARSING HELPERS ---

def get_dlv_defs(xml_path):
    """Extracts DDI definitions from the XML sidecar file."""
    try:
        tree = ET.parse(xml_path)
        return [elem.attrib.get('A') for elem in tree.findall(".//DLV")]
    except:
        return []

def parse_type_1(chunk, offset):
    """Attempts to parse as ISOBUS Type 1 (32-bit, 1e-7)."""
    # Format: Time(4) + Lat(4) + Lon(4) = 12 bytes min (simplified)
    # This is a heuristic guess since Type 1 is variable.
    # We assume [Time, Lat, Lon] follows immediately if PTN is present.
    try:
        # Unpack Time(L), Lat(i), Lon(i)
        time_ms, lat_raw, lon_raw = struct.unpack('<Lii', chunk[:12])
        lat = lat_raw * 1e-7
        lon = lon_raw * 1e-7
        return lat, lon, 12 # Return bytes consumed
    except:
        return None, None, 0

def parse_type_2(chunk):
    """Attempts to parse as ISOBUS Type 2 (64-bit, 1e-16)."""
    # Format: Time(4) + Lat(8) + Lon(8) + Status(1) + Count(1) = 22 bytes
    try:
        time_ms, lat_raw, lon_raw, _, _ = struct.unpack('<IqqBB', chunk[:22])
        lat = lat_raw * 1e-16
        lon = lon_raw * 1e-16
        return lat, lon, 22
    except:
        return None, None, 0

def process_file_smart(bin_path, xml_path, definitions):
    """
    Decides whether the file is Type 1 or Type 2 by checking the first point,
    then parses the whole file.
    """
    with open(bin_path, 'rb') as f:
        content = f.read()

    if len(content) < 50: return pd.DataFrame() # Skip empty files

    # --- AUTO-DETECTION ---
    format_type = None
    
    # Check Type 2 First (The "Russian" Fix)
    lat_2, lon_2, _ = parse_type_2(content)
    # Check if it lands roughly in Denmark (54-58N, 8-15E)
    if lat_2 and 54 < lat_2 < 58 and 8 < lon_2 < 15:
        format_type = "TYPE_2"
    
    if not format_type:
        # Check Type 1 (The "Standard" Fix)
        lat_1, lon_1, _ = parse_type_1(content, 0)
        if lat_1 and 54 < lat_1 < 58 and 8 < lon_1 < 15:
            format_type = "TYPE_1"

    if not format_type:
        print(f"  [SKIP] {os.path.basename(bin_path)}: Could not auto-detect format (Lat/Lon outside Denmark).")
        return pd.DataFrame()

    print(f"  [OK]   {os.path.basename(bin_path)} detected as {format_type}")

    # --- FULL PARSING ---
    data_rows = []
    offset = 0
    total_len = len(content)
    
    ddi_lookup = {'0054': 'Yield_Mass', '0053': 'Yield_Vol', '018D': 'Speed', '0063': 'Moisture', '0106': 'Moisture_Alt', '0074': 'Area'}

    while offset < total_len:
        try:
            if format_type == "TYPE_2":
                # Header: 22 bytes
                if offset + 22 > total_len: break
                time_ms, lat_raw, lon_raw, status, count = struct.unpack('<IqqBB', content[offset:offset+22])
                offset += 22
                
                lat = lat_raw * 1e-16
                lon = lon_raw * 1e-16
                
                # Payload: count bytes (indices) + count*4 bytes (values)
                payload_len = count + (count * 4)
                if offset + payload_len > total_len: break
                
                chunk = content[offset : offset + payload_len]
                offset += payload_len
                
                indices = struct.unpack(f"{count}B", chunk[:count])
                values = struct.unpack(f"{count}i", chunk[count:])
                
            elif format_type == "TYPE_1":
                # Header: 4 bytes (Time)
                if offset + 4 > total_len: break
                time_ms = struct.unpack('<L', content[offset:offset+4])[0]
                offset += 4
                
                # Check for Position (PTN) logic is complex in Type 1 without full XML parsing
                # SIMPLIFICATION: Assuming PTN is always first if present, 2x 4bytes
                # We reuse the "detect" logic to assume 8 bytes of Lat/Lon follow
                if offset + 8 > total_len: break
                lat_raw, lon_raw = struct.unpack('<ii', content[offset:offset+8])
                offset += 8
                
                lat = lat_raw * 1e-7
                lon = lon_raw * 1e-7
                
                # In Type 1, the values follow based on the XML order. 
                # This is "risky" without strict XML parsing, but usually works for simple logs.
                # We will skip value parsing for Type 1 here to avoid crashes, 
                # focusing on recovering the Type 2 "Russian" data.
                indices = []
                values = []
                # Type 1 is tricky because DLV sizes aren't in the binary header.
                # If you need Type 1 values, use the specific parser for those files.
                # For this script, we assume Type 1 files use a fixed block size or we skip the DLVs.
                # Hack: Skip a fixed amount (e.g. 50 bytes) to next row? No, impossible.
                # BETTER: If Type 1, use the basic parser logic from Script 1.
                # ... Implementing simplified Type 1 value skipper ...
                # Let's assume standard DLV count matching definitions
                count = len(definitions)
                needed = count * 4 # 4 bytes per DLV value
                if offset + needed > total_len: break
                values = struct.unpack(f"{count}i", content[offset:offset+needed])
                indices = range(count) # Assume sequential
                offset += needed

            # Store Data
            row = {'Time_ms': time_ms, 'Latitude': lat, 'Longitude': lon, 'File': os.path.basename(bin_path)}
            
            for i, idx in enumerate(indices):
                # Handle Type 1 vs Type 2 index differences
                actual_idx = idx if format_type == "TYPE_2" else indices[i]
                if actual_idx < len(definitions):
                    ddi = definitions[actual_idx]
                    row[ddi_lookup.get(ddi, f'DDI_{ddi}')] = values[i]

            data_rows.append(row)

        except struct.error:
            break

    return pd.DataFrame(data_rows)

# --- MAIN LOOP ---
print("Starting Smart Hybrid Parser...")
all_dfs = []

for filename in os.listdir(data_folder):
    if filename.upper().startswith("TLG") and filename.upper().endswith(".BIN"):
        xml_name = filename.rsplit('.', 1)[0] + ".xml"
        xml_full = os.path.join(data_folder, xml_name)
        bin_full = os.path.join(data_folder, filename)
        
        # 1. Get Definitions
        defs = get_dlv_defs(xml_full)
        
        # 2. Process
        df = process_file_smart(bin_full, xml_full, defs)
        
        if not df.empty:
            all_dfs.append(df)

# --- SAVE ---
if all_dfs:
    master = pd.concat(all_dfs, ignore_index=True)
    # Final cleanup of noise
    master = master[(master['Latitude'].between(54, 58)) & (master['Longitude'].between(8, 15))]
    
    out_path = os.path.join(out_folder, 'SMART_FIXED_HARVEST.csv')
    master.to_csv(out_path, index=False)
    print(f"\nSUCCESS. Saved {len(master)} points to {out_path}")
else:
    print("\nNo valid data found (Check paths or definitions).")

Starting Smart Hybrid Parser...
  [SKIP] TLG00001.bin: Could not auto-detect format (Lat/Lon outside Denmark).
  [SKIP] TLG00002.bin: Could not auto-detect format (Lat/Lon outside Denmark).
  [SKIP] TLG00003.bin: Could not auto-detect format (Lat/Lon outside Denmark).
  [SKIP] TLG00004.bin: Could not auto-detect format (Lat/Lon outside Denmark).
  [SKIP] TLG00005.bin: Could not auto-detect format (Lat/Lon outside Denmark).
  [SKIP] TLG00006.bin: Could not auto-detect format (Lat/Lon outside Denmark).
  [SKIP] TLG00007.bin: Could not auto-detect format (Lat/Lon outside Denmark).
  [SKIP] TLG00008.bin: Could not auto-detect format (Lat/Lon outside Denmark).
  [SKIP] TLG00009.bin: Could not auto-detect format (Lat/Lon outside Denmark).
  [SKIP] TLG00010.bin: Could not auto-detect format (Lat/Lon outside Denmark).
  [SKIP] TLG00011.bin: Could not auto-detect format (Lat/Lon outside Denmark).
  [SKIP] TLG00012.bin: Could not auto-detect format (Lat/Lon outside Denmark).
  [SKIP] TLG00013.bi

In [57]:
import struct
import xml.etree.ElementTree as ET
import pandas as pd
import os

# --- CONFIGURATION ---
data_folder = r'./data/taskdata'
out_folder = r'./data/taskdata_out'
os.makedirs(out_folder, exist_ok=True)

# LEDREBORG BOUNDING BOX (The "Target")
LAT_MIN, LAT_MAX = 55.5, 55.7
LON_MIN, LON_MAX = 11.8, 12.1

def get_dlv_defs(xml_path):
    try:
        tree = ET.parse(xml_path)
        return [elem.attrib.get('A') for elem in tree.findall(".//DLV")]
    except: return []

def parse_with_auto_align(bin_path, definitions):
    with open(bin_path, 'rb') as f:
        content = f.read()

    total_len = len(content)
    cursor = 0
    valid_rows = []
    
    # Header format: Time(4) + Lat(8) + Lon(8) + Status(1) + Count(1) = 22 bytes
    HEADER_SIZE = 22
    
    print(f"  Scanning {os.path.basename(bin_path)}...")

    while cursor < total_len - HEADER_SIZE:
        # 1. Try to decode at current cursor
        try:
            time_ms, lat_raw, lon_raw, status, count = struct.unpack('<IqqBB', content[cursor:cursor+HEADER_SIZE])
            
            # Apply Type 4 Scaling
            lat = lat_raw * 1e-16
            lon = lon_raw * 1e-16
            
            # 2. CHECK: Is this valid Ledreborg data?
            if LAT_MIN < lat < LAT_MAX and LON_MIN < lon < LON_MAX:
                # HIT! We found a valid row.
                
                # Parse Payload
                payload_len = count + (count * 4)
                if cursor + HEADER_SIZE + payload_len > total_len: break
                
                chunk = content[cursor + HEADER_SIZE : cursor + HEADER_SIZE + payload_len]
                indices = struct.unpack(f"{count}B", chunk[:count])
                values = struct.unpack(f"{count}i", chunk[count:])
                
                # Store Data
                row = {'Time_ms': time_ms, 'Latitude': lat, 'Longitude': lon, 'File': os.path.basename(bin_path)}
                for i, idx in enumerate(indices):
                    if idx < len(definitions):
                        row[definitions[idx]] = values[i]
                valid_rows.append(row)
                
                # Advance Cursor by the full row length (Header + Payload)
                cursor += HEADER_SIZE + payload_len
                
            else:
                # MISS! This byte alignment is wrong (or it's header/garbage).
                # Shift forward by ONE byte and try again.
                cursor += 1
                
        except struct.error:
            cursor += 1

    return pd.DataFrame(valid_rows)

# --- MAIN EXECUTION ---
all_dfs = []
ddi_names = {'0054': 'Yield_Mass', '0053': 'Yield_Vol', '018D': 'Speed', '0063': 'Moisture', '0074': 'Area'}

print("Starting Byte-Shift Scanner...")

for filename in os.listdir(data_folder):
    if filename.upper().startswith("TLG") and filename.upper().endswith(".BIN"):
        xml_full = os.path.join(data_folder, filename.rsplit('.', 1)[0] + ".xml")
        bin_full = os.path.join(data_folder, filename)
        
        if os.path.exists(xml_full):
            defs = get_dlv_defs(xml_full)
            # Run the scanner
            df = parse_with_auto_align(bin_full, defs)
            
            if not df.empty:
                print(f"    -> Recovered {len(df)} valid points.")
                all_dfs.append(df)
            else:
                print("    -> No valid data found.")

if all_dfs:
    master = pd.concat(all_dfs, ignore_index=True)
    
    # Rename columns to readable names
    master.rename(columns=ddi_names, inplace=True)
    
    out_path = os.path.join(out_folder, 'ALIGNED_HARVEST_DATA.csv')
    master.to_csv(out_path, index=False)
    print(f"\nSUCCESS! Total {len(master)} points recovered.")
    print(f"Saved to: {out_path}")
    print("This dataset should have ZERO Russian points and ZERO noise.")

Starting Byte-Shift Scanner...
  Scanning TLG00001.bin...
    -> No valid data found.
  Scanning TLG00002.bin...
    -> No valid data found.
  Scanning TLG00003.bin...
    -> No valid data found.
  Scanning TLG00004.bin...
    -> No valid data found.
  Scanning TLG00005.bin...
    -> No valid data found.
  Scanning TLG00006.bin...
    -> No valid data found.
  Scanning TLG00007.bin...
    -> No valid data found.
  Scanning TLG00008.bin...
    -> No valid data found.
  Scanning TLG00009.bin...
    -> No valid data found.
  Scanning TLG00010.bin...
    -> No valid data found.
  Scanning TLG00011.bin...
    -> No valid data found.
  Scanning TLG00012.bin...
    -> No valid data found.
  Scanning TLG00013.bin...
    -> No valid data found.
  Scanning TLG00014.bin...
    -> No valid data found.
  Scanning TLG00015.bin...
    -> No valid data found.
  Scanning TLG00016.bin...
    -> No valid data found.
  Scanning TLG00017.bin...
    -> No valid data found.
  Scanning TLG00018.bin...
    -> 

In [58]:
import struct
import xml.etree.ElementTree as ET
import pandas as pd
import os

# --- CONFIGURATION ---
data_folder = r'./data/taskdata'  # Path to your TLG files
out_folder = r'./data/taskdata_out'
os.makedirs(out_folder, exist_ok=True)

def get_dlv_defs(xml_path):
    try:
        tree = ET.parse(xml_path)
        return [elem.attrib.get('A') for elem in tree.findall(".//DLV")]
    except: return []

def parse_blindly(bin_path, definitions):
    """
    Scans file byte-by-byte. Accepts ANY valid coordinate (-90 to 90).
    """
    with open(bin_path, 'rb') as f:
        content = f.read()

    total_len = len(content)
    cursor = 0
    valid_rows = []
    
    # ISO 11783-10 Type 4 Header: Time(4) + Lat(8) + Lon(8) + Status(1) + Count(1)
    HEADER_SIZE = 22
    
    print(f"  Scanning {os.path.basename(bin_path)}...")

    while cursor < total_len - HEADER_SIZE:
        try:
            # 1. Try to decode header
            time_ms, lat_raw, lon_raw, status, count = struct.unpack('<IqqBB', content[cursor:cursor+HEADER_SIZE])
            
            # 2. Check Type 4 Scaling (1e-16)
            lat = lat_raw * 1e-16
            lon = lon_raw * 1e-16
            
            # 3. SANITY CHECK (Is it a coordinate on Earth?)
            # We accept "Russian" data (Lat 60) and "Danish" data (Lat 55).
            # We reject pure noise (Lat 0 or > 90).
            if abs(lat) > 1 and abs(lat) <= 90 and abs(lon) > 1 and abs(lon) <= 180 and count < 50:
                
                # We found a valid frame!
                payload_len = count + (count * 4)
                if cursor + HEADER_SIZE + payload_len > total_len: break
                
                chunk = content[cursor + HEADER_SIZE : cursor + HEADER_SIZE + payload_len]
                indices = struct.unpack(f"{count}B", chunk[:count])
                values = struct.unpack(f"{count}i", chunk[count:])
                
                row = {
                    'Time_ms': time_ms, 
                    'Latitude': lat, 
                    'Longitude': lon, 
                    'File': os.path.basename(bin_path)
                }
                
                # Map DDI values
                for i, idx in enumerate(indices):
                    if idx < len(definitions):
                        row[definitions[idx]] = values[i]
                
                valid_rows.append(row)
                
                # Jump forward by the full packet size
                cursor += HEADER_SIZE + payload_len
            else:
                # Garbage/Header/Offset -> Shift 1 byte and try again
                cursor += 1
                
        except struct.error:
            cursor += 1

    return pd.DataFrame(valid_rows)

# --- EXECUTION ---
all_dfs = []
ddi_lookup = {'0054': 'Yield_Mass', '0053': 'Yield_Vol', '018D': 'Speed', '0063': 'Moisture', '0074': 'Area'}

print("Starting Blind Scanner...")

for filename in os.listdir(data_folder):
    if filename.upper().startswith("TLG") and filename.upper().endswith(".BIN"):
        xml_full = os.path.join(data_folder, filename.rsplit('.', 1)[0] + ".xml")
        bin_full = os.path.join(data_folder, filename)
        
        if os.path.exists(xml_full):
            defs = get_dlv_defs(xml_full)
            df = parse_blindly(bin_full, defs)
            
            if not df.empty:
                print(f"    -> Found {len(df)} points.")
                all_dfs.append(df)

if all_dfs:
    master = pd.concat(all_dfs, ignore_index=True)
    master.rename(columns=ddi_lookup, inplace=True)
    
    out_path = os.path.join(out_folder, 'BLIND_RAW_DATA.csv')
    master.to_csv(out_path, index=False)
    print(f"\nSUCCESS! Extracted {len(master)} raw points.")
    print(f"Saved to: {out_path}")
else:
    print("No data found even with blind scanning.")

Starting Blind Scanner...
  Scanning TLG00001.bin...
    -> Found 235 points.
  Scanning TLG00002.bin...
    -> Found 283 points.
  Scanning TLG00003.bin...
    -> Found 2913 points.
  Scanning TLG00004.bin...
    -> Found 3086 points.
  Scanning TLG00005.bin...
    -> Found 325 points.
  Scanning TLG00006.bin...
    -> Found 194 points.
  Scanning TLG00007.bin...
    -> Found 5965 points.
  Scanning TLG00008.bin...
    -> Found 7862 points.
  Scanning TLG00009.bin...
    -> Found 6498 points.
  Scanning TLG00010.bin...
    -> Found 215 points.
  Scanning TLG00011.bin...
    -> Found 404 points.
  Scanning TLG00012.bin...
    -> Found 408 points.
  Scanning TLG00013.bin...
    -> Found 18 points.
  Scanning TLG00014.bin...
    -> Found 157 points.
  Scanning TLG00015.bin...
    -> Found 175 points.
  Scanning TLG00016.bin...
    -> Found 332 points.
  Scanning TLG00017.bin...
    -> Found 320 points.
  Scanning TLG00018.bin...
    -> Found 789 points.
  Scanning TLG00019.bin...
    -> 

In [59]:
import struct
import xml.etree.ElementTree as ET
import pandas as pd
import os

# --- CONFIGURATION ---
data_folder = r'./data/taskdata'
out_folder = r'./data/taskdata_out'
os.makedirs(out_folder, exist_ok=True)

# THE TARGET: Ledreborg Gods (Strict Filter)
# We only accept data that falls exactly inside this box.
TARGET_LAT_MIN, TARGET_LAT_MAX = 55.50, 55.75
TARGET_LON_MIN, TARGET_LON_MAX = 11.80, 12.10

def get_dlv_defs(xml_path):
    try:
        tree = ET.parse(xml_path)
        return [elem.attrib.get('A') for elem in tree.findall(".//DLV")]
    except: return []

def brute_force_recover(bin_path, definitions):
    print(f"Forensic scan of: {os.path.basename(bin_path)}...")
    
    with open(bin_path, 'rb') as f:
        content = f.read()

    total_len = len(content)
    cursor = 0
    valid_rows = []
    
    # We define the column names mapping here
    ddi_lookup = {
        '0054': 'Yield_Mass', 
        '0053': 'Yield_Vol', 
        '018D': 'Speed', 
        '0063': 'Moisture', 
        '0074': 'Area',
        '0106': 'Moisture_Alt'
    }

    while cursor < total_len - 50: # Need at least 50 bytes for a row
        match_found = False
        
        # --- ATTEMPT 1: TYPE 2 (64-bit, 1e-16 scaling) ---
        # Header: Time(4) + Lat(8) + Lon(8) + Status(1) + Count(1) = 22 bytes
        try:
            time_ms, lat_raw, lon_raw, status, count = struct.unpack('<IqqBB', content[cursor:cursor+22])
            
            lat = lat_raw * 1e-16
            lon = lon_raw * 1e-16
            
            if (TARGET_LAT_MIN < lat < TARGET_LAT_MAX) and (TARGET_LON_MIN < lon < TARGET_LON_MAX):
                # HIT! We found a Type 2 row in Ledreborg
                payload_len = count + (count * 4)
                
                # Check if payload fits in file
                if cursor + 22 + payload_len <= total_len:
                    chunk = content[cursor + 22 : cursor + 22 + payload_len]
                    indices = struct.unpack(f"{count}B", chunk[:count])
                    values = struct.unpack(f"{count}i", chunk[count:])
                    
                    row = {'Time_ms': time_ms, 'Latitude': lat, 'Longitude': lon, 'Type': 'Type2'}
                    for i, idx in enumerate(indices):
                        if idx < len(definitions):
                            ddi = definitions[idx]
                            row[ddi_lookup.get(ddi, f'DDI_{ddi}')] = values[i]
                    
                    valid_rows.append(row)
                    cursor += 22 + payload_len
                    match_found = True
        except: pass

        if match_found: continue

        # --- ATTEMPT 2: TYPE 1 (32-bit, 1e-7 scaling) ---
        # Header: Time(4) + Lat(4) + Lon(4) = 12 bytes
        # Note: Type 1 is harder because it doesn't have a "Count" byte. 
        # We assume the number of DLVs matches the XML definition exactly.
        try:
            time_ms, lat_raw, lon_raw = struct.unpack('<Lii', content[cursor:cursor+12])
            
            lat = lat_raw * 1e-7
            lon = lon_raw * 1e-7
            
            if (TARGET_LAT_MIN < lat < TARGET_LAT_MAX) and (TARGET_LON_MIN < lon < TARGET_LON_MAX):
                # HIT! We found a Type 1 row in Ledreborg
                
                # Assume standard payload: all DLVs defined in XML are present as 4-byte ints
                count = len(definitions)
                payload_len = count * 4
                
                if cursor + 12 + payload_len <= total_len:
                    values = struct.unpack(f"{count}i", content[cursor+12 : cursor+12+payload_len])
                    
                    row = {'Time_ms': time_ms, 'Latitude': lat, 'Longitude': lon, 'Type': 'Type1'}
                    for i, val in enumerate(values):
                        ddi = definitions[i]
                        row[ddi_lookup.get(ddi, f'DDI_{ddi}')] = val
                    
                    valid_rows.append(row)
                    cursor += 12 + payload_len
                    match_found = True
        except: pass

        if match_found: continue
        
        # If no match, shift by ONE byte and try again
        cursor += 1

    return pd.DataFrame(valid_rows)

# --- EXECUTION ---
all_dfs = []
print(f"Starting Forensic Scan (Target: Ledreborg {TARGET_LAT_MIN}-{TARGET_LAT_MAX}N)...")

for filename in os.listdir(data_folder):
    if filename.upper().startswith("TLG") and filename.upper().endswith(".BIN"):
        xml_full = os.path.join(data_folder, filename.rsplit('.', 1)[0] + ".xml")
        bin_full = os.path.join(data_folder, filename)
        
        if os.path.exists(xml_full):
            defs = get_dlv_defs(xml_full)
            df = brute_force_recover(bin_full, defs)
            
            if not df.empty:
                df['File'] = filename
                print(f"  -> SUCCESS: Recovered {len(df)} points from {filename}")
                all_dfs.append(df)
            else:
                print(f"  -> No valid Ledreborg data found in {filename}")

if all_dfs:
    master = pd.concat(all_dfs, ignore_index=True)
    out_path = os.path.join(out_folder, 'FORENSIC_HARVEST_DATA.csv')
    master.to_csv(out_path, index=False)
    print(f"\nDONE. Saved {len(master)} clean points to: {out_path}")
else:
    print("\nFAILED. Could not find any coordinates inside Ledreborg.")

Starting Forensic Scan (Target: Ledreborg 55.5-55.75N)...
Forensic scan of: TLG00001.bin...
  -> SUCCESS: Recovered 205 points from TLG00001.bin
Forensic scan of: TLG00002.bin...
  -> SUCCESS: Recovered 275 points from TLG00002.bin
Forensic scan of: TLG00003.bin...
  -> SUCCESS: Recovered 2349 points from TLG00003.bin
Forensic scan of: TLG00004.bin...
  -> SUCCESS: Recovered 2753 points from TLG00004.bin
Forensic scan of: TLG00005.bin...
  -> SUCCESS: Recovered 300 points from TLG00005.bin
Forensic scan of: TLG00006.bin...
  -> SUCCESS: Recovered 185 points from TLG00006.bin
Forensic scan of: TLG00007.bin...
  -> SUCCESS: Recovered 4390 points from TLG00007.bin
Forensic scan of: TLG00008.bin...
  -> SUCCESS: Recovered 4782 points from TLG00008.bin
Forensic scan of: TLG00009.bin...
  -> SUCCESS: Recovered 4213 points from TLG00009.bin
Forensic scan of: TLG00010.bin...
  -> SUCCESS: Recovered 151 points from TLG00010.bin
Forensic scan of: TLG00011.bin...
  -> SUCCESS: Recovered 367 point

In [62]:
import pandas as pd
import xml.etree.ElementTree as ET
import numpy as np
import os

# --- CONFIGURATION ---
input_csv = './data/taskdata_out/FORENSIC_HARVEST_DATA.csv'
taskdata_xml = './data/taskdata/TASKDATA.XML'
output_csv = './data/taskdata_out/CLEANED_HARVEST_WITH_DATE.csv'

# --- 1. LOAD DATA & DATES ---
print("Loading data...")
df = pd.read_csv(input_csv)
df['Time_ms'] = pd.to_numeric(df['Time_ms'], errors='coerce')

# Parse Task Start Times from XML
print("Parsing Task Start Times...")
tree = ET.parse(taskdata_xml)
root = tree.getroot()

tlg_map = {}
for tsk in root.findall(".//TSK"):
    tlg = tsk.find("TLG")
    tim = tsk.find("TIM")
    if tlg is not None and tim is not None:
        fname = tlg.attrib.get('A')
        start = tim.attrib.get('A')
        if fname and start:
            # Map both "TLG00001" and "TLG00001.bin" to be safe
            tlg_map[fname] = start
            tlg_map[fname + '.bin'] = start

# --- 2. CALCULATE DATETIME ---
print("Calculating Datetime...")
# Map the file to its start time
df['TaskStart'] = df['File'].map(tlg_map)
df['TaskStart'] = pd.to_datetime(df['TaskStart'])

# Add the millisecond offset
df['Datetime'] = df['TaskStart'] + pd.to_timedelta(df['Time_ms'], unit='ms')

# --- 3. CLEANING & RECALCULATION ---
print("Cleaning Noise...")

# A. Keep only rows with valid Yield (0 to 50 kg/s)
# Raw Yield is mg/s. 50 kg/s = 50,000,000.
df_clean = df[
    (df['Yield_Mass'] > 0) & 
    (df['Yield_Mass'] < 50000000)
].copy()

# B. Deduplicate
# The scanner found duplicates (same time). We keep the first valid one.
df_clean = df_clean.drop_duplicates(subset=['File', 'Time_ms'])

# C. Calculate GPS Speed (Fixes the "Strange Effect" from broken Speed sensor)
print("Recalculating Speed from GPS...")
df_clean = df_clean.sort_values(['File', 'Time_ms'])

# Calculate Distance (Pythagoras on Lat/Lon degrees -> meters)
# 1 deg Lat ~= 111,132 m
# 1 deg Lon ~= 111,132 * cos(lat) m
lat_rad = np.radians(df_clean['Latitude'])
dlat = df_clean.groupby('File')['Latitude'].diff() * 111132
dlon = df_clean.groupby('File')['Longitude'].diff() * 111132 * np.cos(lat_rad)
dist_m = np.sqrt(dlat**2 + dlon**2)

# Calculate Time Delta (seconds)
dt_s = df_clean.groupby('File')['Time_ms'].diff() / 1000.0

# Speed = Dist / Time
df_clean['Speed_GPS_m_s'] = dist_m / dt_s

# Smooth Speed (Rolling average to remove GPS jitter)
df_clean['Speed_Smooth'] = df_clean.groupby('File')['Speed_GPS_m_s'].transform(
    lambda x: x.rolling(window=5, min_periods=1).mean()
)

# Filter out impossible speeds (e.g., > 20 m/s or stopped)
df_final = df_clean[
    (df_clean['Speed_Smooth'] > 0.5) & 
    (df_clean['Speed_Smooth'] < 20)
].copy()

# --- 4. FORMAT OUTPUT ---
# Convert Yield to kg/s for readability
df_final['Yield_kg_s'] = df_final['Yield_Mass'] / 1_000_000

# Select useful columns
out_cols = [
    'Datetime', 'Latitude', 'Longitude', 
    'Yield_kg_s', 'Speed_Smooth', 'Moisture', 'File'
]

df_final[out_cols].to_csv(output_csv, index=False)

print(f"Success! Saved {len(df_final)} clean points to {output_csv}")
print("Columns: Datetime, Latitude, Longitude, Yield_kg_s, Speed_Smooth (m/s)")

Loading data...
Parsing Task Start Times...
Calculating Datetime...
Cleaning Noise...
Recalculating Speed from GPS...
Success! Saved 8 clean points to ./data/taskdata_out/CLEANED_HARVEST_WITH_DATE.csv
Columns: Datetime, Latitude, Longitude, Yield_kg_s, Speed_Smooth (m/s)
