### 1: Mount Drive, Install Libraries & Safety Checks

In [None]:
# --- CELL 1: SETUP ---
import os
import shutil
import kagglehub
import psutil  # To monitor RAM usage

def check_ram():
    """Print the current RAM usage percentage and the available memory (bytes / 1024**3, shown as GB)."""
    bytes_per_gb = 1024 ** 3
    mem_stats = psutil.virtual_memory()
    available_gb = mem_stats.available / bytes_per_gb
    print(f"RAM Used: {mem_stats.percent}% | Available: {available_gb:.2f} GB")

print("Installing libraries...")
!pip install -q ultralytics

from google.colab import drive
drive.mount('/content/drive')
check_ram()

Installing libraries...
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive
RAM Used: 9.0% | Available: 11.53 GB


### 2: Download & Move (The "Zero-RAM" Method)

In [None]:
# --- CELL 2: DOWNLOAD & STRUCTURE ---
target_dir = '/content/dataset'

# Safety check: clean up an old run BEFORE downloading, so the disk never has
# to hold the previous dataset and a fresh download at the same time.
if os.path.exists(target_dir):
    shutil.rmtree(target_dir)

print("Downloading dataset via KaggleHub...")
# kagglehub stores the dataset in its own cache folder and returns that path
# (under /root/.cache/kagglehub/... on Colab).
download_path = kagglehub.dataset_download("anulayakhare/crackathon-data")
print(f"Original Download Path: {download_path}")

print(f"Moving dataset to {target_dir}...")

# We use move, NOT copy, so the data is not duplicated on disk. shutil.move is
# a cheap rename when source and destination are on the same filesystem;
# otherwise it falls back to copy-then-delete.
shutil.move(download_path, target_dir)

check_ram()  # Verify RAM didn't spike

Downloading dataset via KaggleHub...
Downloading from https://www.kaggle.com/api/v1/datasets/download/anulayakhare/crackathon-data?dataset_version_number=1...


100%|██████████| 9.90G/9.90G [01:36<00:00, 110MB/s]

Extracting files...





Original Download Path: /root/.cache/kagglehub/datasets/anulayakhare/crackathon-data/versions/1
Moving dataset to /content/dataset...
RAM Used: 9.3% | Available: 11.49 GB


### 3: Verify & Fix Nesting

In [None]:
# --- CELL 3: FIX FOLDER STRUCTURE ---
# We need 'train' to be directly inside '/content/dataset'
contents = os.listdir(target_dir)
print(f"Folder contents: {contents}")

# If the data is nested inside another folder (e.g., 'randomized_dataset'), move it up
if 'train' not in contents:
    # Pick the subfolder that actually contains 'train', instead of blindly
    # taking whichever directory happens to be listed first.
    subfolder = next(
        (
            name for name in contents
            if os.path.isdir(os.path.join(target_dir, name, 'train'))
        ),
        None,
    )
    if subfolder is None:
        # Fail with a descriptive error rather than an opaque IndexError.
        raise FileNotFoundError(
            f"No 'train' folder found in {target_dir} or its immediate subfolders: {contents}"
        )
    subfolder_path = os.path.join(target_dir, subfolder)

    print(f"Data found nested in '{subfolder}'. Moving it up...")

    # Move contents up one level
    for item in os.listdir(subfolder_path):
        shutil.move(os.path.join(subfolder_path, item), target_dir)

    # Remove the now-empty shell folder (os.rmdir only succeeds if it is empty)
    os.rmdir(subfolder_path)

print("\nFinal Structure Check (Should see train, val, test):")
print(os.listdir(target_dir))

Folder contents: ['randomized_dataset']
Data found nested in 'randomized_dataset'. Moving it up...

Final Structure Check (Should see train, val, test):
['train', 'test', 'val']


### 4: Generate Config (data.yaml)

In [None]:
# --- CELL 4: CREATE & BACKUP CONFIG ---
import yaml
import os
import shutil

# 1. Define the Config Content
# Note: 'path' must be the absolute path in the COLAB environment
yaml_content = {
    'path': '/content/dataset',  # Where the images are located NOW (in Colab)
    'train': 'train/images',     # Relative path to train images
    'val': 'val/images',         # Relative path to val images
    'test': 'test/images',       # Relative path to test images

    # EXACT CLASS MAPPING (class index -> class name)
    'names': {
        0: 'Longitudinal Crack',
        1: 'Transverse Crack',
        2: 'Alligator Crack',
        3: 'Other Corruption',
        4: 'Pothole'
    }
}

# 2. Define Paths
local_yaml_path = '/content/data.yaml'           # Active copy (for YOLO to use now)
drive_config_dir = '/content/drive/MyDrive/Crackathon/04_configs' # Backup folder
drive_yaml_path = f'{drive_config_dir}/data.yaml' # Permanent copy

# 3. Create the file locally
# sort_keys=False keeps the keys in the order written above.
with open(local_yaml_path, 'w') as f:
    yaml.dump(yaml_content, f, sort_keys=False)

# 4. Backup to Drive
# exist_ok=True creates the folder only when missing, without a separate
# check-then-create step.
os.makedirs(drive_config_dir, exist_ok=True)

shutil.copy(local_yaml_path, drive_yaml_path)

print(f" Active config created at: {local_yaml_path}")
print(f" Permanent backup saved to: {drive_yaml_path}")

 Active config created at: /content/data.yaml
 Permanent backup saved to: /content/drive/MyDrive/Crackathon/04_configs/data.yaml
