In [2]:
import os
import struct
import json
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET
from datetime import datetime
import pyproj

# --- CONFIGURATION -----------------------------------------------------------
# Folder that holds TASKDATA.XML plus the per-log "<LogID>.bin" / "<LogID>.xml" files.
INPUT_FOLDER    = r'/Users/holmes/local_dev/agri_analysis/data/TASKDATA'
# Root folder for the generated CSV files; a sub-folder per year is created below it.
OUTPUT_FOLDER   = r'/Users/holmes/local_dev/agri_analysis/data/ENRICHED_SIMPLE'
# Plausibility window in decimal degrees. Both the stride detection and the
# record decoder only accept positions strictly inside these bounds
# (the comparisons are exclusive on both ends).
MIN_LAT, MAX_LAT = 54.0, 58.0   # Denmark Latitude Bounds
MIN_LON, MAX_LON = 8.0, 16.0    # Denmark Longitude Bounds
# -----------------------------------------------------------------------------

# DDI Registry (The Dictionary of known sensors)
# Keys are DDI code strings as read from the 'A' attribute of the sidecar XML's
# DLV elements; values are the DataFrame column names used for that sensor.
# Codes that are not listed here end up in a column named "DDI_<code>".
# NOTE(review): the unit remarks below describe the intended unit; in this file
# only Speed_M_S, Yield_Mass_Kg_S and Moisture_Pct are actually rescaled and
# written to the CSV. The remaining entries are stored as raw integers and are
# not exported -- confirm the units against the DDI specification before use.
DDI_MAP = {
    '018D': 'Speed_M_S',           # Scaled to m/s
    '0095': 'Yield_Vol_L_S',       # Scaled to L/s
    '0054': 'Yield_Mass_Kg_S',     # Scaled to kg/s
    '0063': 'Moisture_Pct',        # Scaled to %
    '013A': 'Fuel_Rate_L_H',       # Scaled to L/h
    '008D': 'Work_State',          # 0-100%
    '0055': 'Crop_Temp_C',         # Celsius
    'E122': 'Header_Status',       # Enum
    '0074': 'Total_Area_Ha',       # Hectares
    '0077': 'Duration_Sec',        # Seconds
}

def ensure_folder(path):
    """Create directory ``path`` (and any missing parents) if it does not exist.

    Uses ``exist_ok=True`` instead of a separate existence check, so the call
    is idempotent and cannot fail when the directory appears between the
    check and the creation.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
        OSError: If the directory cannot be created (e.g. permissions).
    """
    os.makedirs(path, exist_ok=True)

def parse_xml_structure(xml_path):
    """Read the sidecar XML and return the DDI codes of its DLV elements.

    The codes are returned in document order because the caller maps the
    i-th payload integer of every record to the i-th entry of this list.
    A DLV element without an 'A' attribute therefore yields ``None`` at its
    position instead of being dropped, so later entries keep their index.

    Args:
        xml_path: Path of the sidecar XML file. It may be missing.

    Returns:
        list: DDI code strings (entries may be ``None``, see above). If the
        file is missing, unreadable, malformed or contains no DLV element,
        the default ``['0054', '0095', '018D', '0063']`` is returned.
    """
    ddis = []
    if os.path.exists(xml_path):
        try:
            tree = ET.parse(xml_path)
        except (ET.ParseError, OSError, ValueError):
            # Broken or unreadable sidecar: fall through to the default list.
            tree = None
        if tree is not None:
            # Find all Data Log Values (DLV) anywhere in the document.
            ddis = [dlv.attrib.get('A') for dlv in tree.findall(".//DLV")]

    # Fallback default if XML is broken/missing
    if not ddis:
        ddis = ['0054', '0095', '018D', '0063']
    return ddis

def detect_stride(content, lat_bounds=None, lon_bounds=None, scan_limit=200000):
    """Find the record size by locating the repeating coordinate signature.

    Every byte offset in the scanned prefix is treated as a candidate record
    start. A candidate is accepted when the two little-endian signed 32-bit
    integers at bytes 4-12 behind it, scaled by 1e-7, fall strictly inside
    the latitude/longitude window. The stride is the most frequent distance
    between consecutive accepted offsets.

    Args:
        content: Raw bytes of the binary log.
        lat_bounds: Optional ``(min_lat, max_lat)`` pair in degrees.
            Defaults to the module constants ``(MIN_LAT, MAX_LAT)``.
        lon_bounds: Optional ``(min_lon, max_lon)`` pair in degrees.
            Defaults to the module constants ``(MIN_LON, MAX_LON)``.
        scan_limit: Number of leading bytes to scan (default: first 200 KB).

    Returns:
        int | None: The detected stride in bytes (16..256), or ``None`` when
        fewer than 10 candidate offsets are found or no distance lies in the
        accepted range. If several strides are equally frequent, the smallest
        one is returned.
    """
    min_lat, max_lat = (MIN_LAT, MAX_LAT) if lat_bounds is None else lat_bounds
    min_lon, max_lon = (MIN_LON, MAX_LON) if lon_bounds is None else lon_bounds

    limit = min(len(content), scan_limit)
    coord = struct.Struct('<ii')

    # Header is always Time(4) + Lat(4) + Lon(4) = 12 bytes.
    # i + 12 never exceeds ``limit``, so every read below is in bounds and
    # no exception handling is needed.
    offsets = []
    for i in range(0, limit - 12):
        lat_raw, lon_raw = coord.unpack_from(content, i + 4)
        lat, lon = lat_raw * 1e-7, lon_raw * 1e-7
        if (min_lat < lat < max_lat) and (min_lon < lon < max_lon):
            offsets.append(i)

    if len(offsets) < 10:
        return None

    # Distances between consecutive candidates, restricted to realistic
    # packet sizes (16 to 256 bytes).
    diffs = np.diff(offsets)
    valid_diffs = diffs[(diffs >= 16) & (diffs <= 256)]
    if valid_diffs.size == 0:
        return None

    # Mode of the distances. np.unique returns sorted values, and argmax
    # picks the first maximum, so ties resolve to the smallest stride.
    sizes, counts = np.unique(valid_diffs, return_counts=True)
    return int(sizes[np.argmax(counts)])

# Raw readings above this value are treated as unsigned wrap-around noise.
_WRAP_NOISE_THRESHOLD = 2000000000


def _zero_wrap_noise(series):
    """Return ``series`` as an array with NaN and wrap-noise readings set to 0."""
    filled = series.fillna(0)
    return np.where(filled > _WRAP_NOISE_THRESHOLD, 0, filled)


def process_file(bin_path, xml_path, meta):
    """Decode one binary log and write it as an enriched CSV file.

    Steps: detect the record stride, decode every record whose position lies
    inside the configured latitude/longitude window, scale the raw sensor
    values, rebuild a timeline from travelled distance and speed, derive the
    yield in t/ha and save the result below ``OUTPUT_FOLDER/<year>/``.

    Args:
        bin_path: Path of the binary log file.
        xml_path: Path of the sidecar XML describing the payload layout.
        meta: Task metadata dict. 'Field', 'Crop' and 'LogID' are required;
            'Start' (datetime) and 'Year' are optional and default to
            2024-01-01 and 2024.

    Returns:
        bool: True when a CSV was written, False when no stride could be
        detected or no record passed the position check.

    Raises:
        OSError: If the binary file cannot be read or the CSV not written.
        KeyError: If a required ``meta`` key is missing.
    """
    # 1. Setup
    with open(bin_path, 'rb') as f:
        content = f.read()

    # 2. Auto-Detect Structure
    stride = detect_stride(content)
    if not stride:
        return False  # Skip garbage files
    stride = int(stride)  # plain int even if a NumPy scalar was returned

    ddi_list = parse_xml_structure(xml_path)
    num_ints = (stride - 12) // 4

    # Pre-compiled formats: 12-byte header and the generic integer payload.
    header_struct = struct.Struct('<Lii')
    payload_struct = struct.Struct(f'<{num_ints}I')

    # 3. Extract Raw Data
    rows = []
    cursor = 0
    # Inclusive bound: a record starting exactly ``stride`` bytes before the
    # end of the file is complete and must be decoded too. Because
    # cursor + stride <= len(content) holds inside the loop, every read
    # below is in bounds.
    last_start = len(content) - stride
    while cursor <= last_start:
        time_ms, lat_raw, lon_raw = header_struct.unpack_from(content, cursor)
        lat, lon = lat_raw * 1e-7, lon_raw * 1e-7

        # Geo-Check: not a plausible record start -> scan forward byte-wise.
        if not ((MIN_LAT < lat < MAX_LAT) and (MIN_LON < lon < MAX_LON)):
            cursor += 1
            continue

        p_start = cursor + 12
        row = {'Time_Raw': time_ms, 'Latitude': lat, 'Longitude': lon}

        if stride == 31:
            # Case A: compact 31-byte format with fixed payload offsets
            # (TLG00001 style files).
            row['Yield_Mass_Kg_S'] = struct.unpack_from('<I', content, p_start + 2)[0]
            row['Speed_M_S']       = struct.unpack_from('<H', content, p_start + 10)[0]
            row['Moisture_Pct']    = struct.unpack_from('<H', content, p_start + 15)[0]
        else:
            # Case B: payload is a run of unsigned 32-bit integers whose
            # order follows the DDI list from the sidecar XML.
            values = payload_struct.unpack_from(content, p_start)
            for code, val in zip(ddi_list, values):
                name = DDI_MAP.get(code, f"DDI_{code}")
                row[name] = val

        rows.append(row)
        cursor += stride  # Jump to next

    if not rows:
        return False
    df = pd.DataFrame(rows)

    # 4. Clean & Scale Values
    # Speed: mm/s -> m/s
    if 'Speed_M_S' in df.columns:
        df['Speed_M_S'] = _zero_wrap_noise(df['Speed_M_S']) * 0.001
    else:
        df['Speed_M_S'] = 2.0  # Default

    # Yield: mg/s -> kg/s
    if 'Yield_Mass_Kg_S' in df.columns:
        df['Yield_Mass_Kg_S'] = _zero_wrap_noise(df['Yield_Mass_Kg_S']) * 0.000001
    else:
        df['Yield_Mass_Kg_S'] = 0

    # Moisture
    # NOTE(review): the original remark says "ppm -> %", which would be a
    # factor of 1e-4; the net factor applied here is 1e-2. Kept unchanged --
    # confirm the raw unit of the moisture channel.
    if 'Moisture_Pct' in df.columns:
        df['Moisture_Pct'] = _zero_wrap_noise(df['Moisture_Pct']) * 0.0001 * 100
    else:
        df['Moisture_Pct'] = 0.0  # Force creation if missing

    # 5. Physics-Based Time & Yield Recalculation
    # Project Lat/Lon to Meters (UTM32N)
    transformer = pyproj.Transformer.from_crs("EPSG:4326", "EPSG:25832", always_xy=True)
    xx, yy = transformer.transform(df['Longitude'].values, df['Latitude'].values)

    # Distance to the previous point (0 for the first row because of prepend).
    dist = np.sqrt(np.diff(xx, prepend=xx[0])**2 + np.diff(yy, prepend=yy[0])**2)
    # Avoid Div/0: Minimum speed 0.1 m/s
    speed = df['Speed_M_S'].clip(lower=0.1)
    dt = dist / speed
    dt = np.clip(dt, 0, 15)  # Cap gaps at 15s

    # Reconstruct Timeline
    start_dt = meta.get('Start', datetime(2024, 1, 1))
    df['Datetime'] = start_dt + pd.to_timedelta(np.cumsum(dt), unit='s')

    # Calculate Yield (t/ha)
    # (kg/s * 10) / (m/s * width_m)
    HEADER_WIDTH = 9.0
    df['Yield_T_Ha'] = (df['Yield_Mass_Kg_S'] * 10.0) / (speed * HEADER_WIDTH)
    df['Yield_T_Ha'] = df['Yield_T_Ha'].fillna(0).replace([np.inf, -np.inf], 0)

    # 6. Save
    year = meta.get('Year', 2024)

    # Sanitize Filename
    def clean(s):
        return str(s).strip().replace(' ', '_').replace('/', '-').replace(':', '')

    fname = f"{year}_{clean(meta['Field'])}_{clean(meta['Crop'])}.csv"
    if 'Import' in fname:
        fname = f"{year}_{clean(meta['LogID'])}.csv"

    out_dir = os.path.join(OUTPUT_FOLDER, str(year))
    ensure_folder(out_dir)

    # Handle collisions: an existing file gets the log id appended instead
    # of being overwritten.
    full_path = os.path.join(out_dir, fname)
    if os.path.exists(full_path):
        fname = fname.replace('.csv', f"_{clean(meta['LogID'])}.csv")
        full_path = os.path.join(out_dir, fname)

    # Select columns - ensure all of them exist before exporting.
    cols = ['Datetime', 'Latitude', 'Longitude', 'Yield_T_Ha', 'Speed_M_S', 'Moisture_Pct', 'Yield_Mass_Kg_S']
    for c in cols:
        if c not in df.columns:
            df[c] = 0.0

    df[cols].to_csv(full_path, index=False)
    return True

# --- MAIN EXECUTION ----------------------------------------------------------
if __name__ == '__main__':
    # Script entry point: index all tasks listed in TASKDATA.XML, then
    # convert every task whose binary log exists into an enriched CSV.
    ensure_folder(OUTPUT_FOLDER)

    print("1. Indexing Tasks...")
    # Parse TASKDATA.XML for Metadata
    tasks = []
    try:
        tree = ET.parse(os.path.join(INPUT_FOLDER, 'TASKDATA.XML'))
        # Lookup tables: product id -> attribute B, field id -> attribute C.
        products = {p.attrib.get('A'): p.attrib.get('B') for p in tree.findall(".//PDT")}
        fields = {f.attrib.get('A'): f.attrib.get('C') for f in tree.findall(".//PFD")}

        for tsk in tree.findall(".//TSK"):
            tlg = tsk.find("TLG")
            if tlg is None:
                continue
            log_id = tlg.attrib.get('A')
            if not log_id:
                # Without a log id no file name can be built. Skip this task
                # instead of aborting the indexing of all remaining tasks.
                continue

            # Crop & Field
            pan = tsk.find("PAN")
            crop_id = pan.attrib.get('A') if pan is not None else ""
            crop = products.get(crop_id, "Unknown")
            field = fields.get(tsk.attrib.get('E'), "Unknown")

            # Time: fall back to a fixed date when the start is missing
            # or not a valid ISO timestamp.
            tim = tsk.find("TIM")
            s_time = (tim.attrib.get('A') or "") if tim is not None else ""
            try:
                start_dt = datetime.fromisoformat(s_time.replace('Z', ''))
            except ValueError:
                start_dt = datetime(2024, 1, 1)

            tasks.append({
                'LogID': log_id,
                'BinPath': os.path.join(INPUT_FOLDER, log_id + '.bin'),
                'XmlPath': os.path.join(INPUT_FOLDER, log_id + '.xml'),
                'Year': start_dt.year,
                'Start': start_dt,
                'Crop': crop,
                'Field': field
            })
    except Exception as e:
        # Top-level boundary: report the problem and continue with whatever
        # tasks were indexed so far.
        print(f"Error reading TASKDATA.XML: {e}")

    print(f"Found {len(tasks)} tasks. Processing...")

    count = 0
    for t in tasks:
        if os.path.exists(t['BinPath']):
            print(f"Processing {t['LogID']}...", end='\r')
            if process_file(t['BinPath'], t['XmlPath'], t):
                count += 1

    print(f"\nDone! Processed {count} files. Check: {OUTPUT_FOLDER}")

1. Indexing Tasks...
Found 142 tasks. Processing...
Processing TLG00142...
Done! Processed 141 files. Check: /Users/holmes/local_dev/agri_analysis/data/ENRICHED_SIMPLE
