In [None]:
import os
import sys
import argparse
import shutil
import urllib.request
import zipfile
import logging
from pathlib import Path
import numpy as np
import time

# Logging: timestamped INFO-level records for every step of the setup
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# KITTI archives as (url, local archive filename) pairs; the filename is the
# last path component of the URL.
KITTI_URLS = [
    (archive_url, archive_url.rsplit('/', 1)[-1])
    for archive_url in (
        'https://s3.eu-central-1.amazonaws.com/avg-kitti/data_object_velodyne.zip',
        'https://s3.eu-central-1.amazonaws.com/avg-kitti/data_object_calib.zip',
        'https://s3.eu-central-1.amazonaws.com/avg-kitti/data_object_label_2.zip',
        'https://s3.eu-central-1.amazonaws.com/avg-kitti/data_object_image_2.zip',
    )
]

# Where the organized dataset and the intermediate downloads go by default
DEFAULT_OUTPUT_PATH = './data/kitti'
DEFAULT_TEMP_DIR = './data/temp/kitti'

print("Libraries imported and basic parameters set up ✓")


Libraries imported and basic parameters set up ✓


In [2]:
# Cell 2: Define helper classes and utility functions
# ===================================
class DownloadProgressBar:
    """Console progress reporter usable as a ``urllib.request.urlretrieve`` reporthook.

    An instance is called as ``hook(count, block_size, total_size)`` and writes a
    single, carriage-return-refreshed status line to ``sys.stdout``.

    Attributes:
        url: The URL being downloaded.
        filename: Last path component of ``url``, used as the line prefix.
        downloaded_bytes: Bytes transferred so far (never more than ``total_size``
            when the size is known).
        total_size: Size reported by the most recent hook call (``<= 0`` when the
            server did not report one).
    """

    # Minimum number of seconds between two refreshes of the status line
    PRINT_INTERVAL = 0.5

    def __init__(self, url):
        self.url = url
        self.filename = os.path.basename(url)
        self.downloaded_bytes = 0
        self.total_size = 0
        self.start_time = time.time()
        self.last_print_time = 0

    def __call__(self, count, block_size, total_size):
        """Record progress and refresh the status line.

        Args:
            count: Number of blocks transferred so far.
            block_size: Size of one block in bytes.
            total_size: Total size in bytes, or a non-positive value if unknown.
        """
        self.total_size = total_size
        size_known = total_size > 0

        # count * block_size overshoots on the last (partial) block, so clamp
        # it to the real size whenever that size is known.
        transferred = count * block_size
        if size_known:
            transferred = min(transferred, total_size)
        self.downloaded_bytes = transferred

        # Throttle intermediate updates, but always show the final state so the
        # line does not stay stuck below 100% after a finished download.
        finished = size_known and transferred >= total_size
        current_time = time.time()
        if not finished and current_time - self.last_print_time <= self.PRINT_INTERVAL:
            return
        self.last_print_time = current_time

        elapsed_time = current_time - self.start_time
        speed = transferred / (1024 * 1024 * elapsed_time) if elapsed_time > 0 else 0
        transferred_mb = transferred / (1024 * 1024)

        if size_known:
            percent = transferred * 100 // total_size
            line = (
                f"\r{self.filename}: {percent}% | {transferred_mb:.1f}MB of "
                f"{total_size / (1024 * 1024):.1f}MB | {speed:.1f} MB/s"
            )
        else:
            # No size reported: a percentage or total would be meaningless.
            line = f"\r{self.filename}: {transferred_mb:.1f}MB downloaded | {speed:.1f} MB/s"

        sys.stdout.write(line)
        sys.stdout.flush()

def download_file(url, output_path, force_download=False):
    """Download ``url`` to ``output_path``, skipping files that already exist.

    The data is first written to ``<output_path>.part`` and only renamed to
    ``output_path`` once the transfer has finished. A failed or interrupted
    download therefore never leaves a truncated file at ``output_path`` that a
    later call would mistake for a complete one.

    Args:
        url: Source URL.
        output_path: Destination file path; missing parent directories are created.
        force_download: If True, download even when ``output_path`` exists.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises (re-raised after the
        partial file has been removed), including ``KeyboardInterrupt``.
    """
    if os.path.exists(output_path) and not force_download:
        logger.info(f"File already exists at {output_path}, skipping download")
        return

    logger.info(f"Downloading {url} to {output_path}")
    # dirname is '' for a bare filename, and os.makedirs('') raises.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    partial_path = f"{output_path}.part"
    try:
        progress_bar = DownloadProgressBar(url)
        urllib.request.urlretrieve(url, partial_path, reporthook=progress_bar)
        print()  # New line after progress bar
        os.replace(partial_path, output_path)
        logger.info(f"Successfully downloaded {url}")
    except BaseException as e:
        # BaseException on purpose: interrupting the kernel raises
        # KeyboardInterrupt, which `except Exception` would not see.
        reason = str(e) or type(e).__name__
        logger.error(f"Failed to download {url}: {reason}")
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise

def extract_file(input_path, output_dir, delete_after=True):
    """Unpack the archive at ``input_path`` into ``output_dir``.

    Only ``.zip`` archives are handled; any other path is reported through the
    logger and left untouched. Members are extracted one by one with a percentage
    written to stdout. On success the archive is removed when ``delete_after``
    is true. Any exception is logged and re-raised.
    """
    os.makedirs(output_dir, exist_ok=True)

    logger.info(f"Extracting {input_path} to {output_dir}")

    try:
        # Guard clause: nothing but zip archives is supported
        if not input_path.endswith('.zip'):
            logger.error(f"Unsupported file format: {input_path}")
            return

        archive_name = os.path.basename(input_path)
        with zipfile.ZipFile(input_path, 'r') as archive:
            members = archive.namelist()
            member_total = len(members)

            for done, member in enumerate(members, start=1):
                archive.extract(member, output_dir)
                # Refresh the progress line every 100 members and on the last one
                if done == member_total or done % 100 == 0:
                    percent = min(100, done * 100 // member_total)
                    sys.stdout.write(f"\rExtracting {archive_name}: {percent}%")
                    sys.stdout.flush()

            print()  # New line after progress bar

        logger.info(f"Successfully extracted {input_path}")

        # The archive is no longer needed once its contents are on disk
        if delete_after:
            logger.info(f"Deleting archive file {input_path}")
            os.remove(input_path)

    except Exception as e:
        logger.error(f"Failed to extract {input_path}: {e}")
        raise

print("Helper classes and utility functions defined ✓")

Helper classes and utility functions defined ✓


In [4]:
# Cell 3: Define main functions
# ===================================
def create_kitti_structure(kitti_root):
    """Make sure the KITTI folder skeleton exists below ``kitti_root``.

    Creates ``training/{velodyne,calib,label_2,image_2}``,
    ``testing/{velodyne,calib,image_2}`` and ``ImageSets``; directories that are
    already present are left as they are.
    """
    logger.info("Creating KITTI directory structure")

    # Sub-folders per split; the testing split has no labels
    layout = (
        ('training', ('velodyne', 'calib', 'label_2', 'image_2')),
        ('testing', ('velodyne', 'calib', 'image_2')),
    )

    for split, subfolders in layout:
        for subfolder in subfolders:
            os.makedirs(os.path.join(kitti_root, split, subfolder), exist_ok=True)
    os.makedirs(os.path.join(kitti_root, 'ImageSets'), exist_ok=True)

    logger.info("KITTI directory structure created")

def check_dataset_files(kitti_root):
    """Verify that each expected KITTI directory holds a minimum number of files.

    Every directory is checked (no early exit) so that the log lists all
    problems at once. Only regular files directly inside a directory count.

    Returns:
        True when every directory exists and meets its minimum, else False.
    """
    logger.info("Checking for dataset files in the final location")

    # (path components below kitti_root, minimum number of files)
    requirements = (
        (('training', 'velodyne'), 10),
        (('training', 'calib'), 10),
        (('training', 'label_2'), 10),
        (('training', 'image_2'), 10),
        (('testing', 'velodyne'), 5),
        (('testing', 'calib'), 5),
        (('testing', 'image_2'), 5),
        (('ImageSets',), 1),
    )

    problems_found = False

    for parts, min_files in requirements:
        dir_path = os.path.join(kitti_root, *parts)

        if not os.path.exists(dir_path):
            logger.warning(f"Directory {dir_path} does not exist")
            problems_found = True
            continue

        files_count = sum(
            1 for entry in os.listdir(dir_path)
            if os.path.isfile(os.path.join(dir_path, entry))
        )
        if files_count >= min_files:
            logger.info(f"Directory {dir_path} contains {files_count} files ✓")
        else:
            logger.warning(f"Directory {dir_path} contains only {files_count} files, expected at least {min_files}")
            problems_found = True

    if problems_found:
        logger.warning("Some dataset files are missing or incomplete!")
    else:
        logger.info("All dataset files verified successfully ✓")

    return not problems_found

def _copy_split_folder(src_folder, dst_folder, folder, split):
    """Copy every regular file of ``src_folder`` into ``dst_folder`` with a progress line."""
    logger.info(f"Moving {folder} {split} files")
    os.makedirs(dst_folder, exist_ok=True)

    # List the directory once; sub-directories are ignored.
    filenames = [
        name for name in os.listdir(src_folder)
        if os.path.isfile(os.path.join(src_folder, name))
    ]
    file_count = len(filenames)

    for processed, filename in enumerate(filenames, start=1):
        shutil.copy2(os.path.join(src_folder, filename), os.path.join(dst_folder, filename))

        # Refresh the progress line every 100 files and on the last one
        if processed % 100 == 0 or processed == file_count:
            percent = min(100, processed * 100 // file_count)
            sys.stdout.write(f"\rMoving {folder} {split} files: {percent}%")
            sys.stdout.flush()

    print()  # New line after progress


def organize_kitti_files(kitti_root, temp_dir, immediate_cleanup=True):
    """Copy extracted KITTI files from ``temp_dir`` into the layout under ``kitti_root``.

    For each of ``training/{velodyne,calib,label_2,image_2}`` and
    ``testing/{velodyne,calib,image_2}`` that exists in ``temp_dir``, the regular
    files are copied to the same relative location below ``kitti_root``. The
    result is then verified with ``check_dataset_files``.

    Args:
        kitti_root: Destination root of the organized dataset.
        temp_dir: Directory the archives were extracted into.
        immediate_cleanup: If True, delete ``temp_dir`` afterwards. The deletion
            does not depend on the verification result. It is skipped when no
            extracted folder was found at all, because ``temp_dir`` may then
            still hold downloaded archives that were never extracted.

    Returns:
        The boolean result of ``check_dataset_files(kitti_root)``.
    """
    logger.info("Organizing KITTI files")

    # Folders expected per split; the testing split has no labels
    split_folders = (
        ('training', ('velodyne', 'calib', 'label_2', 'image_2')),
        ('testing', ('velodyne', 'calib', 'image_2')),
    )

    organized_any = False
    for split, folders in split_folders:
        for folder in folders:
            src_folder = os.path.join(temp_dir, split, folder)
            if not os.path.isdir(src_folder):
                continue
            dst_folder = os.path.join(kitti_root, split, folder)
            _copy_split_folder(src_folder, dst_folder, folder, split)
            organized_any = True

    logger.info("KITTI files organized successfully")

    # Verify dataset files are in place
    check_result = check_dataset_files(kitti_root)

    if immediate_cleanup and os.path.exists(temp_dir):
        if organized_any:
            logger.info(f"Cleaning up temporary directory {temp_dir}")
            shutil.rmtree(temp_dir)
            logger.info(f"Temporary directory {temp_dir} has been removed")
        else:
            # Nothing was extracted here; deleting the directory could throw
            # away archives that still need to be extracted.
            logger.warning(
                f"No extracted KITTI folders found in {temp_dir}; "
                "leaving the temporary directory in place"
            )

    return check_result

def create_imagesets(kitti_root):
    """Create ``train.txt``/``val.txt``/``test.txt`` in ``ImageSets`` where missing.

    Sample IDs are the ``.bin`` file names (without extension) found in
    ``training/velodyne`` and ``testing/velodyne``. Training samples are split
    80/20 into train/val with a fixed seed (42), so the split is reproducible.
    Existing split files are never overwritten.

    Nothing is written when the training velodyne directory is missing or
    contains no ``.bin`` files: empty split files would be treated as valid
    existing splits on the next run.

    Args:
        kitti_root: Root directory of the organized KITTI dataset.
    """
    logger.info("Checking dataset splits")

    # Create ImageSets directory
    imagesets_dir = os.path.join(kitti_root, 'ImageSets')
    os.makedirs(imagesets_dir, exist_ok=True)

    split_paths = {
        split: os.path.join(imagesets_dir, f"{split}.txt")
        for split in ('train', 'val', 'test')
    }
    if all(os.path.exists(path) for path in split_paths.values()):
        logger.info("Dataset splits already exist, preserving existing files")
        return

    # Get training sample IDs (file name without extension)
    train_velodyne_dir = os.path.join(kitti_root, 'training', 'velodyne')
    if not os.path.isdir(train_velodyne_dir):
        logger.error(f"Training velodyne directory {train_velodyne_dir} doesn't exist")
        return

    train_samples = sorted(
        os.path.splitext(f)[0] for f in os.listdir(train_velodyne_dir) if f.endswith('.bin')
    )
    if not train_samples:
        logger.error(f"No .bin samples found in {train_velodyne_dir}, not creating dataset splits")
        return

    # Get testing sample IDs; the testing split is optional
    test_velodyne_dir = os.path.join(kitti_root, 'testing', 'velodyne')
    test_samples = []
    if os.path.isdir(test_velodyne_dir):
        test_samples = sorted(
            os.path.splitext(f)[0] for f in os.listdir(test_velodyne_dir) if f.endswith('.bin')
        )

    # 80/20 train/val split. A private RandomState seeded with 42 yields the
    # same permutation as np.random.seed(42) followed by np.random.permutation,
    # but leaves the global NumPy random state untouched.
    rng = np.random.RandomState(42)
    indices = rng.permutation(len(train_samples))
    split_idx = int(len(train_samples) * 0.8)

    # Sorted for better readability of the split files
    train_split = sorted(train_samples[i] for i in indices[:split_idx])
    val_split = sorted(train_samples[i] for i in indices[split_idx:])

    # Only create the split files that are missing
    for split, label, samples in (
        ('train', 'train', train_split),
        ('val', 'validation', val_split),
        ('test', 'test', test_samples),
    ):
        if os.path.exists(split_paths[split]):
            logger.info(f"Using existing {split}.txt file")
            continue
        with open(split_paths[split], 'w') as f:
            f.write('\n'.join(samples))
        logger.info(f"Created {label} split with {len(samples)} samples")

print("Main functions defined ✓")

Main functions defined ✓


In [5]:
# Cell 4: Set up paths and parameters
# ===================================
# This cell can be run multiple times to change parameter settings

# User-modifiable parameters
output_path = DEFAULT_OUTPUT_PATH  # Path to save the dataset
temp_dir = DEFAULT_TEMP_DIR        # Temporary directory
force_download = False             # Whether to force re-download
skip_download = False              # Whether to skip download
keep_temp = False                  # Whether to keep temporary files
keep_archives = False              # Whether to keep archive files

# Reference directory for relative paths: the script's own directory when this
# code runs as a .py file, otherwise the current working directory
# (__file__ is normally not defined inside a notebook kernel).
try:
    script_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    script_dir = os.getcwd()

# Make both paths absolute. Every relative path is resolved against script_dir,
# whether or not it is written with a leading './'.
if not os.path.isabs(output_path):
    output_path = os.path.join(script_dir, output_path)
output_path = os.path.abspath(output_path)

if not os.path.isabs(temp_dir):
    temp_dir = os.path.join(script_dir, temp_dir)
temp_dir = os.path.abspath(temp_dir)

# Ensure the temporary directory exists (the output tree is created in a later cell)
os.makedirs(temp_dir, exist_ok=True)

logger.info(f"Script directory: {script_dir}")
logger.info(f"Setting up KITTI dataset at {output_path}")
logger.info(f"Using temporary directory: {temp_dir}")

print("Parameters set up ✓")
print("\nCurrent parameters:")
print(f"- Dataset save path: {output_path}")
print(f"- Temporary directory: {temp_dir}")
print(f"- Force re-download: {force_download}")
print(f"- Skip download: {skip_download}")
print(f"- Keep temporary files: {keep_temp}")
print(f"- Keep archive files: {keep_archives}")


2025-03-18 00:37:38 - INFO - Script directory: /teamspace/studios/this_studio/FinalYearProject
2025-03-18 00:37:38 - INFO - Setting up KITTI dataset at /teamspace/studios/this_studio/FinalYearProject/data/kitti
2025-03-18 00:37:38 - INFO - Using temporary directory: /teamspace/studios/this_studio/FinalYearProject/data/temp/kitti


Parameters set up ✓

Current parameters:
- Dataset save path: /teamspace/studios/this_studio/FinalYearProject/data/kitti
- Temporary directory: /teamspace/studios/this_studio/FinalYearProject/data/temp/kitti
- Force re-download: False
- Skip download: False
- Keep temporary files: False
- Keep archive files: False


In [6]:
# Cell 5: Download dataset (optional)
# ===================================
# If disk space is insufficient or download is interrupted, you can skip this step or continue later

if skip_download:
    print("Skipping download step")
else:
    # Best effort: a failed download is reported, not raised, so the remaining
    # cells can still be run against whatever is already on disk.
    try:
        os.makedirs(temp_dir, exist_ok=True)
        for url, filename in KITTI_URLS:
            download_path = os.path.join(temp_dir, filename)
            download_file(url, download_path, force_download=force_download)

        print("Dataset download completed ✓")
    except Exception as err:
        print(f"Error during download process: {err}")
        print("You can retry later or set skip_download = True and continue with the next steps")

2025-03-18 00:37:51 - INFO - Downloading https://s3.eu-central-1.amazonaws.com/avg-kitti/data_object_velodyne.zip to /teamspace/studios/this_studio/FinalYearProject/data/temp/kitti/data_object_velodyne.zip


data_object_velodyne.zip: 9% | 2575.9MB of 27418.8MB | 29.4 MB/s

KeyboardInterrupt: 

In [None]:
# Cell 6: Extract dataset (optional)
# ===================================
# If disk space is insufficient or extraction is interrupted, you can continue later

if skip_download:
    print("Skipping extraction step")
else:
    # Best effort: extraction errors are reported, not raised
    try:
        for _, filename in KITTI_URLS:
            input_path = os.path.join(temp_dir, filename)
            # Archives that are not in the temporary directory are silently skipped
            if not os.path.exists(input_path):
                continue
            extract_file(input_path, temp_dir, delete_after=not keep_archives)

        print("Dataset extraction completed ✓")
    except Exception as err:
        print(f"Error during extraction process: {err}")
        print("You can retry later")


In [None]:
# Cell 7: Create directory structure
# ===================================
# Build the empty training/testing/ImageSets tree below output_path;
# a failure is reported instead of stopping the notebook.
try:
    create_kitti_structure(output_path)
    print("KITTI directory structure created ✓")
except Exception as err:
    print(f"Error creating directory structure: {err}")

In [5]:
# Cell 8: Organize files and verify
# ===================================
try:
    # keep_temp=True leaves the temporary directory on disk after organizing
    organize_result = organize_kitti_files(output_path, temp_dir, immediate_cleanup=not keep_temp)
    print(
        "File organization and verification completed ✓"
        if organize_result
        else "File organization completed, but verification found issues"
    )
except Exception as err:
    print(f"Error organizing files: {err}")


2025-03-16 00:06:22 - INFO - Organizing KITTI files
2025-03-16 00:06:22 - INFO - Moving velodyne testing files


Moving velodyne testing files: 100%

2025-03-16 00:09:57 - INFO - Moving calib testing files



Moving calib testing files: 100%

2025-03-16 00:10:37 - INFO - Moving image_2 testing files



Moving image_2 testing files: 100%

2025-03-16 00:12:16 - INFO - KITTI files organized successfully
2025-03-16 00:12:16 - INFO - Checking for dataset files in the final location





2025-03-16 00:12:17 - INFO - Directory /workspace/FinalYearProject/data/kitti/training/velodyne contains 7481 files ✓
2025-03-16 00:12:18 - INFO - Directory /workspace/FinalYearProject/data/kitti/training/calib contains 7481 files ✓
2025-03-16 00:12:19 - INFO - Directory /workspace/FinalYearProject/data/kitti/training/label_2 contains 7481 files ✓
2025-03-16 00:12:21 - INFO - Directory /workspace/FinalYearProject/data/kitti/training/image_2 contains 7481 files ✓
2025-03-16 00:12:22 - INFO - Directory /workspace/FinalYearProject/data/kitti/testing/velodyne contains 7518 files ✓
2025-03-16 00:12:23 - INFO - Directory /workspace/FinalYearProject/data/kitti/testing/calib contains 7518 files ✓
2025-03-16 00:12:25 - INFO - Directory /workspace/FinalYearProject/data/kitti/testing/image_2 contains 7518 files ✓
2025-03-16 00:12:25 - INFO - Directory /workspace/FinalYearProject/data/kitti/ImageSets contains 3 files ✓
2025-03-16 00:12:25 - INFO - All dataset files verified successfully ✓
2025-03-

File organization and verification completed ✓


In [6]:
# Cell 9: Create dataset splits
# ===================================
# Write the train/val/test ID lists into ImageSets (existing files are kept);
# a failure is reported instead of stopping the notebook.
try:
    create_imagesets(output_path)
    print("Dataset splits created ✓")
except Exception as err:
    print(f"Error creating dataset splits: {err}")


2025-03-16 00:12:34 - INFO - Checking dataset splits
2025-03-16 00:12:34 - INFO - Dataset splits already exist, preserving existing files


Dataset splits created ✓


In [7]:
# Cell 10: Final verification
# ===================================
try:
    final_check = check_dataset_files(output_path)

    # One print call, one line per argument: status banner followed by usage hints
    print(
        "===============================",
        "KITTI dataset setup completed successfully!"
        if final_check
        else "KITTI dataset setup completed with warnings!",
        f"Dataset location: {output_path}",
        "===============================",
        "\nTo use this dataset with OpenPCDet:",
        "1. Ensure the pcdet package is installed",
        "2. Use the KITTI dataset configuration in your training/testing scripts",
        "3. Sample command: python tools/train.py --cfg_file tools/cfgs/kitti_models/second.yaml",
        sep="\n",
    )
except Exception as err:
    print(f"Error during final verification: {err}")

2025-03-16 00:12:37 - INFO - Checking for dataset files in the final location
2025-03-16 00:12:38 - INFO - Directory /workspace/FinalYearProject/data/kitti/training/velodyne contains 7481 files ✓
2025-03-16 00:12:39 - INFO - Directory /workspace/FinalYearProject/data/kitti/training/calib contains 7481 files ✓
2025-03-16 00:12:40 - INFO - Directory /workspace/FinalYearProject/data/kitti/training/label_2 contains 7481 files ✓
2025-03-16 00:12:42 - INFO - Directory /workspace/FinalYearProject/data/kitti/training/image_2 contains 7481 files ✓
2025-03-16 00:12:43 - INFO - Directory /workspace/FinalYearProject/data/kitti/testing/velodyne contains 7518 files ✓
2025-03-16 00:12:44 - INFO - Directory /workspace/FinalYearProject/data/kitti/testing/calib contains 7518 files ✓
2025-03-16 00:12:45 - INFO - Directory /workspace/FinalYearProject/data/kitti/testing/image_2 contains 7518 files ✓
2025-03-16 00:12:45 - INFO - Directory /workspace/FinalYearProject/data/kitti/ImageSets contains 3 files ✓
2

KITTI dataset setup completed successfully!
Dataset location: /workspace/FinalYearProject/data/kitti

To use this dataset with OpenPCDet:
1. Ensure the pcdet package is installed
2. Use the KITTI dataset configuration in your training/testing scripts
3. Sample command: python tools/train.py --cfg_file tools/cfgs/kitti_models/second.yaml
