<a href="https://colab.research.google.com/github/A00785001/TC5035/blob/main/01_rosbag_lidar_pre_process.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ROS Bag LiDAR Data Reader & 1D CNN Preprocessor
Extract and process LiDAR data for Geometric Branch (1D CNN) feature extraction

## Section 1: Read and Visualize LiDAR Data

In [None]:
# Install required packages
!pip install --quiet --upgrade jinja2>=3.1.2 markupsafe>=2.1.0
!pip install --quiet bagpy pandas numpy matplotlib seaborn

print("✓ All packages installed successfully!")

In [None]:
# Import libraries
from bagpy import bagreader
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import os
from glob import glob
import json
import csv
from datetime import datetime
import warnings

# Filter warnings for invalid LiDAR values (expected behavior)
warnings.filterwarnings('ignore', message='invalid value encountered')

print("Libraries loaded successfully!")

In [None]:
# Upload your ROS bag file (for Colab)
from google.colab import files

uploaded = files.upload()

# The mapping may be empty (e.g. the upload dialog was dismissed); fail with a
# clear message instead of an IndexError when picking the file name below.
if not uploaded:
    raise RuntimeError("No file was uploaded; re-run this cell and select a ROS bag file.")

# Only the first uploaded file is used as the bag to process.
bag_file = list(uploaded.keys())[0]
print(f"Uploaded: {bag_file}")

In [None]:
# Read the bag file
# `bag` is reused by the following cells to list topics and export messages.
bag = bagreader(bag_file)
print(f"Opened bag file: {bag_file}")

In [None]:
# List all topics in the bag
# Use this table to find the topic name to assign to `lidar_topic` below.
print("Topics in bag:")
print(bag.topic_table)
print("\nLook for LiDAR topics like: /scan, /lidar/scan, /laser_scan, etc.")

In [None]:
# Specify your LiDAR topic (common for Waveshare Jetbot with RPLidar)
# This name is passed to bag.message_by_topic() and recorded in dataset_info.json.
lidar_topic = "/scan"  # Modify if your topic is different
print(f"Using LiDAR topic: {lidar_topic}")

In [None]:
# Extract LiDAR data from the topic
# The returned value is used as a CSV path by pd.read_csv() in the next cell.
# NOTE(review): what message_by_topic() returns when the topic is absent from
# the bag is not visible here - confirm before relying on it.
print(f"Extracting LiDAR data from topic: {lidar_topic}")
lidar_csv_path = bag.message_by_topic(lidar_topic)
print(f"LiDAR data extracted to: {lidar_csv_path}")

In [None]:
# Read the exported CSV into a DataFrame (one row per LiDAR message)
print("Loading LiDAR data...")
lidar_df = pd.read_csv(lidar_csv_path)

# Report size and schema so the expected LaserScan columns can be checked
message_count = len(lidar_df)
column_names = list(lidar_df.columns)

print(f"\nLoaded {message_count} LiDAR messages")
print(f"\nDataFrame shape: {lidar_df.shape}")
print("\nColumn names:")
print(column_names)

In [None]:
# Display first few rows
print("First few rows of LiDAR data:")
# Last expression of the cell, so the notebook renders the preview table.
lidar_df.head()

In [None]:
# Display basic statistics (filtering invalid values)
print("LiDAR data statistics (valid readings only):")
# Only numeric columns are summarised; text columns are excluded by dtype.
numeric_cols = lidar_df.select_dtypes(include=[np.number]).columns
# Turn +/-inf into NaN so describe() leaves those entries out of its aggregates.
clean_df = lidar_df[numeric_cols].replace([np.inf, -np.inf], np.nan)
clean_df.describe()

In [None]:
# Parse LaserScan data
def parse_array_string(array_str):
    """
    Parse a serialized numeric sequence from a bagpy CSV cell into a numpy array.

    Both list-style ('[1.2, 3.4, 5.6]') and tuple-style ('(1.2, 3.4, 5.6)')
    text are accepted, with optional surrounding whitespace. Tokens understood
    by float(), such as 'inf' and 'nan', are kept as-is so callers can decide
    how to treat invalid readings.

    Args:
        array_str: Cell value. Usually a string; may be NaN/None for a missing
            cell, or an already materialized list/tuple/ndarray.

    Returns:
        1-D float numpy array. Empty when the cell is missing, blank, or
        contains a token that cannot be converted to float.

    Example: '[1.2, 3.4, 5.6]' -> np.array([1.2, 3.4, 5.6])
    """
    # Already a sequence: convert directly instead of going through text.
    if isinstance(array_str, (list, tuple, np.ndarray)):
        try:
            return np.asarray(array_str, dtype=float).ravel()
        except (TypeError, ValueError):
            return np.array([])

    if not isinstance(array_str, str):
        if pd.isna(array_str):
            return np.array([])
        array_str = str(array_str)

    # Drop outer whitespace first so padded cells still lose their brackets.
    body = array_str.strip().strip('[]()').strip()
    if body == '':
        return np.array([])

    try:
        # Empty tokens (e.g. the trailing comma of a 1-tuple) are skipped.
        values = [float(token) for token in body.split(',') if token.strip()]
    except ValueError:
        # Best-effort contract: an unparseable cell yields an empty array.
        return np.array([])
    return np.array(values, dtype=float)

print("Array parsing function ready!")

In [None]:
# Take the first message as the reference for the scan geometry;
# `first_scan` and `ranges_sample` are reused by later cells.
first_scan = lidar_df.iloc[0]

angle_lo = first_scan['angle_min']
angle_hi = first_scan['angle_max']
angle_step = first_scan['angle_increment']

print("LaserScan Parameters:")
print(f"  angle_min: {angle_lo:.4f} rad ({np.rad2deg(angle_lo):.1f}°)")
print(f"  angle_max: {angle_hi:.4f} rad ({np.rad2deg(angle_hi):.1f}°)")
print(f"  angle_increment: {angle_step:.6f} rad ({np.rad2deg(angle_step):.2f}°)")
print(f"  time_increment: {first_scan['time_increment']:.6f} sec")
print(f"  scan_time: {first_scan['scan_time']:.4f} sec")
print(f"  range_min: {first_scan['range_min']:.4f} m")
print(f"  range_max: {first_scan['range_max']:.4f} m")

# Decode the first scan's range readings to report how many beams a scan has
ranges_sample = parse_array_string(first_scan['ranges'])
beam_count = len(ranges_sample)
print(f"\n  Number of range readings per scan: {beam_count}")

In [None]:
# Function to convert LaserScan to Cartesian coordinates
def laserscan_to_cartesian(ranges, angle_min, angle_increment):
    """
    Convert LaserScan ranges to Cartesian (x, y) coordinates.

    Beam i is placed at angle_min + i * angle_increment (radians). Readings
    that are not finite or not strictly positive are dropped, so the returned
    arrays can be shorter than the input.

    Args:
        ranges: Sequence (list, tuple or ndarray) of range readings.
        angle_min: Angle of the first beam, in radians.
        angle_increment: Angular step between consecutive beams, in radians.

    Returns:
        Tuple (x, y, valid_angles, valid_ranges) of equally sized 1-D arrays.
    """
    # Accept plain Python sequences too; element-wise comparison needs an array.
    ranges = np.asarray(ranges, dtype=float)

    angles = angle_min + np.arange(len(ranges)) * angle_increment
    valid_mask = np.isfinite(ranges) & (ranges > 0)

    valid_ranges = ranges[valid_mask]
    valid_angles = angles[valid_mask]

    x = valid_ranges * np.cos(valid_angles)
    y = valid_ranges * np.sin(valid_angles)

    return x, y, valid_angles, valid_ranges

print("Cartesian conversion function ready!")

In [None]:
# Visualize a single LiDAR scan
scan_idx = 0
scan = lidar_df.iloc[scan_idx]

ranges = parse_array_string(scan['ranges'])
x, y, angles, valid_ranges = laserscan_to_cartesian(
    ranges, scan['angle_min'], scan['angle_increment']
)

# Create each axes with its final projection up front. Building Cartesian axes
# with plt.subplots() and then calling plt.subplot(121, projection='polar')
# leaves the original axes underneath the polar one on Matplotlib versions
# that no longer auto-remove overlapping axes.
fig = plt.figure(figsize=(16, 6))
ax1 = fig.add_subplot(1, 2, 1, projection='polar')
ax2 = fig.add_subplot(1, 2, 2)

# Polar plot
ax1.scatter(angles, valid_ranges, s=5, c=valid_ranges, cmap='viridis')
ax1.set_title(f'LiDAR Scan #{scan_idx} - Polar View', fontsize=14)
ax1.set_theta_zero_location('N')
ax1.set_ylim(0, scan['range_max'])

# Cartesian plot (sensor at the origin)
scatter = ax2.scatter(x, y, s=10, c=valid_ranges, cmap='viridis')
ax2.plot(0, 0, 'r*', markersize=15, label='Robot')
ax2.set_xlabel('X (meters)', fontsize=12)
ax2.set_ylabel('Y (meters)', fontsize=12)
ax2.set_title(f'LiDAR Scan #{scan_idx} - Cartesian View', fontsize=14)
ax2.axis('equal')
ax2.grid(True, alpha=0.3)
ax2.legend()
plt.colorbar(scatter, ax=ax2, label='Range (m)')

plt.tight_layout()
plt.show()

print(f"Scan timestamp: {scan['Time']}")
print(f"Valid points: {len(valid_ranges)} / {len(ranges)}")

In [None]:
# Visualize multiple scans
# Up to six scans, evenly spaced over the recording, are drawn on a 2x3 grid.
num_scans_to_plot = min(6, len(lidar_df))
scan_indices = np.linspace(0, len(lidar_df)-1, num_scans_to_plot, dtype=int)

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

for i, scan_idx in enumerate(scan_indices):
    scan = lidar_df.iloc[scan_idx]
    ranges = parse_array_string(scan['ranges'])
    x, y, angles, valid_ranges = laserscan_to_cartesian(
        ranges, scan['angle_min'], scan['angle_increment']
    )

    ax = axes[i]
    # Shared colour limits (0..range_max) keep the panels comparable.
    scatter = ax.scatter(x, y, s=5, c=valid_ranges, cmap='viridis', vmin=0, vmax=scan['range_max'])
    ax.plot(0, 0, 'r*', markersize=10)
    ax.set_xlabel('X (m)')
    ax.set_ylabel('Y (m)')
    ax.set_title(f'Scan #{scan_idx} - t={scan["Time"]:.2f}s')
    ax.axis('equal')
    ax.grid(True, alpha=0.3)

# Hide panels that received no scan (bags with fewer than six scans).
for unused_ax in axes[num_scans_to_plot:]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Analyze range statistics across all scans
print("Analyzing range statistics across all scans...")

# Keep the valid (finite, strictly positive) readings of every scan as arrays
# and join them once; extending a Python list element by element is far slower.
valid_chunks = []
for raw_ranges in lidar_df['ranges']:
    ranges = parse_array_string(raw_ranges)
    valid_ranges = ranges[np.isfinite(ranges) & (ranges > 0)]
    valid_chunks.append(valid_ranges)

# np.concatenate() rejects an empty list, so fall back to an empty array.
all_ranges = np.concatenate(valid_chunks) if valid_chunks else np.array([])

print(f"\nTotal valid range measurements: {len(all_ranges):,}")
if all_ranges.size:
    print(f"Min range: {all_ranges.min():.4f} m")
    print(f"Max range: {all_ranges.max():.4f} m")
    print(f"Mean range: {all_ranges.mean():.4f} m")
    print(f"Median range: {np.median(all_ranges):.4f} m")
    print(f"Std dev: {all_ranges.std():.4f} m")
else:
    # min()/max() of an empty array raise; report the condition instead.
    print("No valid range measurements found; skipping range statistics.")

In [None]:
# Range distribution: histogram on the left, box plot on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, box_ax = axes

hist_ax.hist(all_ranges, bins=50, edgecolor='black', alpha=0.7)
hist_ax.set_title('Distribution of LiDAR Range Measurements', fontsize=14)
hist_ax.set_xlabel('Range (meters)', fontsize=12)
hist_ax.set_ylabel('Frequency', fontsize=12)

box_ax.boxplot(all_ranges, vert=True)
box_ax.set_title('LiDAR Range Box Plot', fontsize=14)
box_ax.set_ylabel('Range (meters)', fontsize=12)

# Same light grid on both panels
for panel in (hist_ax, box_ax):
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Print a one-screen summary of the raw LiDAR data
rule = "=" * 60
angle_min_deg = np.rad2deg(first_scan['angle_min'])
angle_max_deg = np.rad2deg(first_scan['angle_max'])
angle_step_deg = np.rad2deg(first_scan['angle_increment'])
# Nominal rate from the message's scan_time field (first message only)
nominal_rate_hz = 1.0/first_scan['scan_time']
valid_per_scan = len(all_ranges)/len(lidar_df)

print("\n" + rule)
print("LIDAR DATA SUMMARY")
print(rule)
print(f"Source bag: {bag_file}")
print(f"LiDAR topic: {lidar_topic}")
print(f"Total scans: {len(lidar_df)}")
print("\nScan parameters:")
print(f"  Angular range: {angle_min_deg:.1f}° to {angle_max_deg:.1f}°")
print(f"  Angular resolution: {angle_step_deg:.2f}°")
print(f"  Range: {first_scan['range_min']:.2f}m to {first_scan['range_max']:.2f}m")
print(f"  Points per scan: {len(ranges_sample)}")
print(f"  Scan rate: {nominal_rate_hz:.1f} Hz")
print("\nData quality:")
print(f"  Total measurements: {len(all_ranges):,}")
print(f"  Average valid points per scan: {valid_per_scan:.1f}")
print(rule)

## Section 2: Process LiDAR for 1D CNN (Geometric Branch)

### Purpose
This section prepares LiDAR scans for geometric feature extraction using a custom 1D CNN. The goal is to clean, normalize, and format 360-degree distance measurements for the Geometric Branch of the sensor fusion pipeline.

### What This Section Does
1. **Cleans** invalid readings (inf/nan) by replacing with max_range
2. **Resamples** to exactly 360 points (if needed)
3. **Normalizes** ranges to [0, 1] for CNN training stability
4. **Saves** as CSV files (one scan per file)
5. **Preserves** ROS timestamps for temporal alignment with camera
6. **Generates** metadata for tracking and reproducibility

### Processing Pipeline
```
ROS Bag Scans → Parse Ranges → Clean (inf/nan→max_range) → Resample (360 points) → Normalize [0,1] → Save CSV
```

### Output Structure
```
processed_lidar/
├── scan_00000.csv         # Single row: 360 normalized values
├── scan_00001.csv
├── scan_00002.csv
├── ...
├── metadata.csv           # Per-scan data (timestamps, statistics)
└── dataset_info.json      # Dataset-level metadata
```

### File Formats

**Scan CSVs (scan_*.csv):**
- Format: CSV with **no header**, single row
- Shape: 1 row × 360 columns
- Values: Normalized ranges in [0, 1]
- Example: `0.234,0.456,0.789,...` (360 values)
- Load with: `pd.read_csv('scan_00000.csv', header=None).values[0]`

**metadata.csv:**
| Column | Type | Description |
|--------|------|-------------|
| filename | str | Scan filename (scan_XXXXX.csv) |
| timestamp | float | ROS timestamp (seconds) |
| timestamp_sec | int | Timestamp seconds part |
| timestamp_nsec | int | Timestamp nanoseconds part |
| frame_id | int | Sequential frame number |
| num_points | int | Number of points (always 360) |
| valid_points | int | Original valid points before cleaning |
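| avg_range | float | Average range (meters, after cleaning/resampling, before normalization) |
| min_range | float | Minimum range (meters, after cleaning/resampling, before normalization) |
| max_range | float | Maximum range (meters, after cleaning/resampling, before normalization) |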

**dataset_info.json:**
- ROS bag information (source, duration, timestamps)
- LiDAR specifications (model, topic, angular resolution, range limits)
- Processing parameters (invalid handling, normalization method)
- Geometric branch pipeline info (input shape, expected output)
- Dataset statistics (scan rate, average valid points)

### Data Cleaning Rationale

**Why replace inf/nan with max_range?**
- `inf` means "beyond sensor range" → semantically equals max_range
- `nan` means "no valid reading" → treat as max_range (no obstacle)
- Preserves information: "far away" vs "error at 0m"
- CNN-friendly: no special handling for inf/nan needed

**Why normalize to [0, 1]?**
- Standard practice for neural network inputs
- Training stability and convergence
- Model becomes sensor-agnostic (different LiDARs have different max_range)
- Value of 1.0 = max_range, 0.0 = at sensor

### Geometric Branch Pipeline

**Target Architecture:**
- Input: 360 distance values (normalized)
- Model: 1D CNN with 4 Conv1D layers + Global Average Pooling
- Parameters: ~350K
- Output: 256D raw geometric descriptor
- Post-processing: L2 normalization
- Final output: 256D normalized feature vector
- Inference: ~20-30ms on TensorRT FP16

### Important Notes
- **No feature extraction yet**: This notebook only preprocesses data. Feature extraction happens in a separate notebook.
- **Timestamps critical**: Enable alignment with camera features for sensor fusion.
- **No labels yet**: Data is unlabeled; loop closure labels assigned after feature extraction.
- **Normalized output**: Values in [0, 1] ready for 1D CNN input.

### Next Step: Feature Extraction
After running this section, scans are ready for 1D CNN feature extraction. The feature extraction notebook will:
1. Load these normalized CSVs
2. Pass through 1D CNN (4 Conv1D + GAP)
3. Output 256D geometric descriptors
4. Apply L2 normalization
5. Save features for sensor fusion

### Usage Example (for next pipeline stage)
```python
# Load a processed scan
import pandas as pd
import torch

scan = pd.read_csv('processed_lidar/scan_00000.csv', header=None).values[0]
# Shape: (360,), Range: [0, 1]

# Convert to tensor for PyTorch
scan_tensor = torch.tensor(scan, dtype=torch.float32).unsqueeze(0).unsqueeze(0)
# Shape: (1, 1, 360) for Conv1D input

# Pass through 1D CNN (in feature extraction notebook)
# features = model(scan_tensor)  # Output: (1, 256)
```

In [None]:
# Set processing parameters
# Range limits are taken from the first LaserScan message and applied to every scan.
RANGE_MIN = first_scan['range_min']  # in the cells shown here, only recorded in dataset_info.json
RANGE_MAX = first_scan['range_max']  # passed to process_lidar_scan() as range_max
TARGET_LENGTH = 360  # Expected by 1D CNN

print(f"Processing parameters:")
print(f"  Range min: {RANGE_MIN:.2f} m")
print(f"  Range max: {RANGE_MAX:.2f} m")
print(f"  Target length: {TARGET_LENGTH} points")
print(f"  Output format: CSV")
print(f"  Normalization: [0, 1]")

In [None]:
# Create output folder
OUTPUT_DIR = "processed_lidar"
# exist_ok=True: re-running the cell does not fail if the folder already exists.
# NOTE: files left in the folder by an earlier run are not removed here.
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"Created output directory: {OUTPUT_DIR}/")

In [None]:
# Function to process LiDAR scan
def process_lidar_scan(ranges, range_max, target_length=360):
    """
    Process LiDAR scan for 1D CNN:
    1. Replace inf/nan with max_range (semantically correct)
    2. Clip to [0, range_max] so the normalized output really lies in [0, 1]
    3. Resample to target_length if needed (linear interpolation over index)
    4. Normalize to [0, 1]

    Args:
        ranges: Sequence (list, tuple or ndarray) of raw range readings in meters.
        range_max: Sensor maximum range in meters; must be finite and > 0.
        target_length: Number of points in the returned scan.

    Returns:
        Tuple (processed_ranges, num_valid_points, statistics):
        - processed_ranges: float array of length target_length in [0, 1]
        - num_valid_points: int, count of finite and strictly positive
          readings in the raw input (before cleaning and resampling)
        - statistics: dict with 'min', 'max', 'mean', 'median' in meters,
          computed on the cleaned/resampled scan before normalization

    Raises:
        ValueError: If range_max is not a finite positive number or ranges is empty.
    """
    if not np.isfinite(range_max) or range_max <= 0:
        raise ValueError(f"range_max must be a finite positive number, got {range_max!r}")

    ranges = np.asarray(ranges, dtype=float)
    if ranges.size == 0:
        raise ValueError("ranges is empty; nothing to process")

    # Count original valid points
    original_valid = int(np.count_nonzero(np.isfinite(ranges) & (ranges > 0)))

    # Replace inf (beyond range) and nan (no reading) with max_range
    ranges_clean = np.where(np.isfinite(ranges), ranges, range_max)

    # Readings above range_max or below zero would fall outside [0, 1] after
    # normalization, so bound them before any further processing.
    ranges_clean = np.clip(ranges_clean, 0.0, range_max)

    # Resample to target length if needed
    if len(ranges_clean) != target_length:
        indices = np.linspace(0, len(ranges_clean)-1, target_length)
        ranges_clean = np.interp(indices, np.arange(len(ranges_clean)), ranges_clean)

    # Calculate statistics before normalization
    stats = {
        'min': float(np.min(ranges_clean)),
        'max': float(np.max(ranges_clean)),
        'mean': float(np.mean(ranges_clean)),
        'median': float(np.median(ranges_clean))
    }

    # Normalize to [0, 1]
    ranges_normalized = ranges_clean / range_max

    return ranges_normalized, original_valid, stats

print("Processing function ready!")

In [None]:
# Process all scans
# For every row of lidar_df: parse the ranges, clean/resample/normalize them,
# write one CSV per scan, and collect a metadata record for metadata.csv.
print("Processing LiDAR scans...")

metadata_list = []
processed_count = 0
failed_count = 0

for idx in range(len(lidar_df)):
    # Errors are handled per scan so one bad row does not abort the whole run.
    try:
        scan = lidar_df.iloc[idx]

        # Parse ranges
        ranges = parse_array_string(scan['ranges'])

        # parse_array_string() returns an empty array for missing or
        # unparseable cells; such scans are counted as failures and skipped.
        if len(ranges) == 0:
            print(f"Warning: Empty scan at index {idx}")
            failed_count += 1
            continue

        # Process scan
        ranges_processed, valid_points, stats = process_lidar_scan(
            ranges, RANGE_MAX, TARGET_LENGTH
        )

        # Save as CSV (one row with 360 values)
        # The file name uses the DataFrame row index, so numbering has gaps
        # wherever a scan was skipped or failed.
        output_filename = f"scan_{idx:05d}.csv"
        output_path = os.path.join(OUTPUT_DIR, output_filename)

        # Save the normalized ranges as a single row
        pd.DataFrame([ranges_processed]).to_csv(output_path, index=False, header=False)

        # Get timestamp info
        # 'Time' is split into whole seconds and the fractional part expressed
        # in nanoseconds (truncated, not rounded).
        timestamp = scan['Time']
        timestamp_sec = int(timestamp)
        timestamp_nsec = int((timestamp - timestamp_sec) * 1e9)

        # Store metadata
        # NOTE: the scan file is written before this point, so an exception
        # raised after the write leaves a CSV without a metadata record.
        metadata_list.append({
            'filename': output_filename,
            'timestamp': timestamp,
            'timestamp_sec': timestamp_sec,
            'timestamp_nsec': timestamp_nsec,
            'frame_id': idx,
            'num_points': TARGET_LENGTH,
            'valid_points': valid_points,
            'avg_range': round(stats['mean'], 4),
            'min_range': round(stats['min'], 4),
            'max_range': round(stats['max'], 4)
        })

        processed_count += 1

        # Progress report every 100 rows
        if (idx + 1) % 100 == 0:
            print(f"Processed {idx + 1}/{len(lidar_df)} scans")

    except Exception as e:
        print(f"Error processing scan {idx}: {e}")
        failed_count += 1

print(f"\n✓ Completed: {processed_count} scans processed")
print(f"✗ Failed: {failed_count} scans")

In [None]:
# Write the per-scan metadata table (one row per processed scan)
csv_path = os.path.join(OUTPUT_DIR, 'metadata.csv')

# Column order of metadata.csv; matches the keys of each metadata_list entry
fieldnames = [
    'filename', 'timestamp', 'timestamp_sec', 'timestamp_nsec',
    'frame_id', 'num_points', 'valid_points', 'avg_range', 'min_range', 'max_range',
]

with open(csv_path, 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(metadata_list)

print(f"✓ CSV metadata saved: {csv_path}")

In [None]:
# Calculate statistics
# Averages over the processed scans; all values fall back to 0 when nothing
# was processed so the metadata cells below can still run.
if metadata_list:
    avg_valid_points = np.mean([m['valid_points'] for m in metadata_list])
    avg_range = np.mean([m['avg_range'] for m in metadata_list])
else:
    avg_valid_points = 0
    avg_range = 0

# Calculate scan rate
# N scans span N-1 inter-scan intervals, so the rate is (N - 1) / elapsed time;
# dividing N by the elapsed time would overestimate it.
# NOTE: scans that failed processing are missing from metadata_list, which
# lowers this estimate when failures occur.
scan_rate = 0
if len(metadata_list) > 1:
    time_diff = metadata_list[-1]['timestamp'] - metadata_list[0]['timestamp']
    if time_diff > 0:
        scan_rate = (len(metadata_list) - 1) / time_diff

In [None]:
# Assemble dataset-level metadata and write it to dataset_info.json

# Time span covered by the processed scans (zeros when nothing was processed)
if metadata_list:
    bag_start_time = metadata_list[0]['timestamp']
    bag_end_time = metadata_list[-1]['timestamp']
else:
    bag_start_time = 0
    bag_end_time = 0
bag_duration_sec = bag_end_time - bag_start_time if len(metadata_list) > 1 else 0

scan_rate_hz = round(scan_rate, 2)

ros_bag_info = {
    "source_file": bag_file,
    "bag_duration_sec": bag_duration_sec,
    "bag_start_time": bag_start_time,
    "bag_end_time": bag_end_time
}

# Sensor description; angles are stored in degrees, ranges in meters
lidar_info = {
    "model": "RPLidar (detected from data)",
    "topic": lidar_topic,
    "angle_min_deg": round(np.rad2deg(first_scan['angle_min']), 2),
    "angle_max_deg": round(np.rad2deg(first_scan['angle_max']), 2),
    "angle_resolution_deg": round(np.rad2deg(first_scan['angle_increment']), 2),
    "scan_rate_hz": scan_rate_hz,
    "range_min_m": float(RANGE_MIN),
    "range_max_m": float(RANGE_MAX)
}

processing_info = {
    "target_length": TARGET_LENGTH,
    "output_format": "csv",
    "invalid_handling": "inf/nan -> max_range",
    "normalization": "[0, 1] range",
    "normalization_factor": float(RANGE_MAX),
    "total_scans_processed": processed_count,
    "processing_script": "rosbag_lidar_processor_v1.ipynb"
}

json_metadata = {
    "dataset_metadata": {
        "creation_date": datetime.now().isoformat(),
        "ros_bag_info": ros_bag_info,
        "lidar_info": lidar_info,
        "processing_info": processing_info
    },
    "geometric_branch_pipeline": {
        "input_shape": [TARGET_LENGTH],
        "input_range": [0, 1],
        "model_type": "1D CNN (4 Conv1D + GAP)",
        "output_features": 256,
        "normalization": "L2",
        "notes": "Ready for loop closure detection via geometric descriptors"
    },
    "statistics": {
        "scan_rate_hz": scan_rate_hz,
        "avg_valid_points_per_scan": round(avg_valid_points, 1),
        "avg_range_m": round(avg_range, 2),
        "total_scans": processed_count,
        "failed_scans": failed_count
    }
}

# Save JSON metadata
json_path = os.path.join(OUTPUT_DIR, 'dataset_info.json')
with open(json_path, 'w') as jsonfile:
    json.dump(json_metadata, jsonfile, indent=2)

print(f"✓ JSON metadata saved: {json_path}")

In [None]:
# Print a recap of what was written and where
rule = "=" * 60

print("\n" + rule)
print("LIDAR PROCESSING COMPLETE")
print(rule)
print(f"Output directory: {OUTPUT_DIR}/")
print(f"Total scans processed: {processed_count}")
print(f"Points per scan: {TARGET_LENGTH}")
print("Value range: [0, 1] (normalized)")
print(f"Average scan rate: {scan_rate:.2f} Hz")
print("\nMetadata files:")
for metadata_file in (csv_path, json_path):
    print(f"  - {metadata_file}")
print("\n" + rule)

In [None]:
# Verify processed data
print("\nVerifying processed scans...")

# Load a sample processed scan (the first one recorded in metadata_list)
sample_file = os.path.join(OUTPUT_DIR, metadata_list[0]['filename'])
sample_scan = pd.read_csv(sample_file, header=None).values[0]

print(f"Sample scan shape: {sample_scan.shape}")
print(f"Value range: [{sample_scan.min():.4f}, {sample_scan.max():.4f}]")
print(f"Mean value: {sample_scan.mean():.4f}")
print(f"\nFirst 10 values: {sample_scan[:10]}")

# Visualize processed scan
# Create each axes with its final projection up front. Building Cartesian axes
# with plt.subplots() and then calling plt.subplot(122, projection='polar')
# leaves the original axes underneath the polar one on Matplotlib versions
# that no longer auto-remove overlapping axes.
fig = plt.figure(figsize=(14, 5))
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2, projection='polar')

# Line plot
ax1.plot(sample_scan, linewidth=0.5)
ax1.set_xlabel('Point Index', fontsize=12)
ax1.set_ylabel('Normalized Range [0, 1]', fontsize=12)
ax1.set_title('Processed LiDAR Scan (Normalized)', fontsize=14)
ax1.grid(True, alpha=0.3)

# Polar plot
# Resampling keeps the first and last beams, so the processed points span the
# sensor's own angle_min..angle_max rather than a fixed 0..2*pi sweep.
angles = np.linspace(first_scan['angle_min'], first_scan['angle_max'], TARGET_LENGTH)
ax2.plot(angles, sample_scan, linewidth=0.5)
ax2.set_title('Polar View (Normalized)', fontsize=14)
ax2.set_ylim(0, 1)

plt.tight_layout()
plt.show()

## Next Steps: 1D CNN Feature Extraction

Processed scans are ready for the Geometric Branch pipeline:

```python
import pandas as pd
import torch

# Load metadata
metadata = pd.read_csv('processed_lidar/metadata.csv')

# Load a scan
scan = pd.read_csv(f"processed_lidar/{metadata.iloc[0]['filename']}", header=None).values[0]

# Convert to tensor (shape: [1, 1, 360] = batch, channels, length)
scan_tensor = torch.tensor(scan, dtype=torch.float32).unsqueeze(0).unsqueeze(0)

# Pass through 1D CNN (your model)
# features = model(scan_tensor)  # Output: [1, 256] raw features
# features_normalized = F.normalize(features, p=2, dim=1)  # L2 normalized
```

**Data Format:**
- Each CSV: 360 normalized range values (single row)
- Range: [0, 1] where 1.0 = max_range
- Ready for Conv1D input: (batch, channels=1, length=360)
- Timestamps preserved for sensor fusion alignment