# Language-Conditioned Segmentation for Drywall - Vast.ai Training Setup

This notebook prepares and launches training on Vast.ai.

## Steps:
1. Install dependencies (PyTorch with CUDA)
2. Verify GPU availability
3. Download datasets from Roboflow (COCO format)
4. Organize dataset directories and create `prompts.json`
5. Verify the setup (required files and directories)
6. Launch the training script


## 1. Install Dependencies


In [None]:
# Install PyTorch with CUDA support
# Check CUDA version first, then install appropriate PyTorch
import subprocess
import sys

# Check CUDA version
result = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
if result.returncode == 0:
    print("CUDA available. Installing PyTorch with CUDA support...")
    # Install PyTorch with CUDA 12.1 (compatible with most modern GPUs)
    !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
else:
    print("Warning: nvidia-smi not found. Installing CPU-only PyTorch.")
    !pip install torch torchvision torchaudio


In [None]:
# Install other dependencies from requirements.txt
!pip install pillow>=8.0.0 numpy>=1.21.0 pycocotools>=2.0.4 ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git
!pip install roboflow tensorboard


## 2. Verify GPU Availability


In [None]:
import torch
import sys

# Report which PyTorch build is active and whether it can see a CUDA device.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

if not torch.cuda.is_available():
    # No usable GPU: say so and stop this cell with a non-zero exit status.
    print("\n✗ Warning: CUDA not available. Training will be slow on CPU.")
    sys.exit(1)

# From here on at least one CUDA device is visible; list each one.
print(f"CUDA version: {torch.version.cuda}")
print(f"Number of GPUs: {torch.cuda.device_count()}")
for i in range(torch.cuda.device_count()):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
    # total_memory is divided by 1e9 for display in (decimal) gigabytes.
    print(f"  Memory: {torch.cuda.get_device_properties(i).total_memory / 1e9:.2f} GB")
print("\n✓ GPU verification successful!")


## 3. Download Datasets from Roboflow


In [None]:
import os
from pathlib import Path

# The Roboflow API key is read from the environment so that it never has to be
# written into (and saved with) the notebook. Set ROBOFLOW_API_KEY on the
# Vast.ai instance before starting the kernel.
api_key = os.environ.get("ROBOFLOW_API_KEY", "").strip()
if not api_key:
    # Fail here with an actionable message instead of continuing into a bare
    # KeyError when the key is looked up for the Roboflow client.
    raise RuntimeError(
        "ROBOFLOW_API_KEY is not set. Please set it as an environment variable "
        "before running this cell. "
        "You can get your API key from: https://app.roboflow.com/"
    )
print("✓ ROBOFLOW_API_KEY found")

from roboflow import Roboflow

# Initialize Roboflow
rf = Roboflow(api_key=api_key)

# Download the cracks dataset
# Update these values based on your Roboflow project
workspace_name = "Ayush-Lab"  # Update with your workspace
project_name = "cracks"  # Update with your project name
version = 1  # Update with your dataset version

print(f"\nDownloading dataset: {workspace_name}/{project_name} (version {version})")
project = rf.workspace(workspace_name).project(project_name)
# "coco" requests the COCO export format; `dataset.location` is used by the
# following cell to find the downloaded files.
dataset = project.version(version).download("coco")

print(f"\n✓ Dataset downloaded to: {dataset.location}")


In [None]:
import shutil
from pathlib import Path

# Layout this cell produces (prompts.json is written by a later cell):
# Dataset/
#   cracks/
#     train/
#       _annotations.coco.json
#       *.jpg
#     valid/
#       _annotations.coco.json
#       *.jpg
#   prompts.json


def _fresh_copy(src, dst):
    """Copy directory `src` to `dst`, deleting any existing `dst` first."""
    if dst.exists():
        shutil.rmtree(dst)
    shutil.copytree(src, dst)


# Locate the download made by the previous cell.
# Roboflow typically downloads to: {project_name}-{version}
dataset_dir = Path(dataset.location)
print(f"Dataset location: {dataset_dir}")

train_source = dataset_dir / "train"
valid_source = dataset_dir / "valid"

if not (train_source.exists() and valid_source.exists()):
    # Nothing to organise: show what the download actually contains.
    print("✗ Error: Train/valid splits not found in downloaded dataset")
    print(f"Available directories: {list(dataset_dir.iterdir())}")
else:
    print("✓ Train/valid splits found")

    # Destination root for both splits.
    target_dir = Path("Dataset/cracks")
    target_dir.mkdir(parents=True, exist_ok=True)

    # Replace any previous copy of each split with the fresh download.
    train_target = target_dir / "train"
    _fresh_copy(train_source, train_target)
    print(f"✓ Copied train split to {train_target}")

    valid_target = target_dir / "valid"
    _fresh_copy(valid_source, valid_target)
    print(f"✓ Copied valid split to {valid_target}")

    # Each split is expected to carry its COCO annotation file.
    train_annotations = train_target / "_annotations.coco.json"
    valid_annotations = valid_target / "_annotations.coco.json"

    if not train_annotations.exists():
        print(f"✗ Warning: Train annotations not found at {train_annotations}")
    else:
        print(f"✓ Train annotations found: {train_annotations}")

    if not valid_annotations.exists():
        print(f"✗ Warning: Valid annotations not found at {valid_annotations}")
    else:
        print(f"✓ Valid annotations found: {valid_annotations}")

    # Tally the .jpg and .png files sitting directly inside each split directory.
    train_images = [*train_target.glob("*.jpg"), *train_target.glob("*.png")]
    valid_images = [*valid_target.glob("*.jpg"), *valid_target.glob("*.png")]
    print(f"\nTrain images: {len(train_images)}")
    print(f"Valid images: {len(valid_images)}")


In [None]:
# Write a default Dataset/prompts.json unless one is already present.
import json

prompts_path = Path("Dataset/prompts.json")
prompts_dir = prompts_path.parent
prompts_dir.mkdir(parents=True, exist_ok=True)

if prompts_path.exists():
    # Never overwrite a prompts file that is already there.
    print(f"✓ prompts.json already exists at {prompts_path}")
else:
    # Default text prompts, keyed by class name.
    default_prompts = {
        "crack": [
            "segment crack",
            "segment cracks",
            "segment wall crack",
            "segment wall cracks",
            "segment surface crack",
            "segment surface cracks",
            "segment drywall crack",
            "segment drywall cracks",
            "find crack",
            "find cracks",
            "detect crack region",
            "detect crack regions"
        ],
        "taping_area": [
            "segment taping area",
            "segment drywall tape",
            "segment wall joint",
            "segment drywall seam",
            "segment taped joint",
            "segment joint area",
            "find drywall seam",
            "detect taping region"
        ]
    }

    # Serialise with a two-space indent so the file stays hand-editable.
    prompts_path.write_text(json.dumps(default_prompts, indent=2))
    print(f"✓ Created prompts.json at {prompts_path}")


## 5. Verify Setup


In [None]:
# Check that the dataset files and the training scripts are all in place.
required_paths = [
    Path(location)
    for location in (
        "Dataset/cracks/train/_annotations.coco.json",
        "Dataset/cracks/valid/_annotations.coco.json",
        "Dataset/prompts.json",
        "scripts/train_cracks.py",
        "scripts/model.py",
        "scripts/dataset.py",
        "scripts/losses.py",
        "scripts/utils.py",
    )
]

print("Verifying setup...")
missing_paths = []
for path in required_paths:
    if not path.exists():
        print(f"✗ Missing: {path}")
        missing_paths.append(path)
        continue
    print(f"✓ {path}")

# The setup is good only when nothing at all was reported missing.
all_good = not missing_paths

if not all_good:
    print("\n✗ Some files are missing. Please check your setup.")
else:
    print("\n✓ All required files found!")


## 6. Launch Training


In [None]:
# Settings for the training run. Insertion order is kept, and the launch cell
# iterates over this mapping to build the command line.
training_config = dict([
    ("data_root", "Dataset/cracks"),
    ("prompts_path", "Dataset/prompts.json"),
    ("batch_size", 8),  # Adjust based on GPU memory
    ("num_epochs", 50),
    ("learning_rate", 1e-4),
    ("seed", 42),
    ("clip_model", "ViT-B/32"),
    ("freeze_encoder", True),  # Recommended for initial training
    ("output_dir", "checkpoints"),
    ("save_predictions", False),  # Set to True to save predictions after training
    ("val_every_n_epochs", 1),  # Validate every N epochs
])

# Echo the configuration, one "key: value" line per setting.
print(
    "Training configuration:",
    *(f"  {key}: {value}" for key, value in training_config.items()),
    sep="\n",
)


In [None]:
# Build the training command from `training_config`, run it, and stream its
# output into the notebook until it finishes.
import shlex
import subprocess
import sys

# sys.executable is the interpreter running this kernel, so the script sees the
# packages installed from this notebook (a bare "python" may resolve to a
# different interpreter on PATH). `-u` turns off the child's stdout buffering;
# without it, output written to a pipe arrives in large delayed chunks instead
# of line by line.
cmd_parts = [sys.executable, "-u", "scripts/train_cracks.py"]

for key, value in training_config.items():
    if isinstance(value, bool):
        # Booleans become presence-only flags and False values are omitted
        # (presumably the script declares them as store_true options -- confirm).
        if value:
            cmd_parts.append(f"--{key}")
    else:
        cmd_parts.extend([f"--{key}", str(value)])

# Display string only (the process is started from the argument list, not from
# this string); quoted so it can be pasted into a shell as-is.
cmd = " ".join(shlex.quote(part) for part in cmd_parts)
print(f"\nLaunching training with command:\n{cmd}\n")

# Launch training. The context manager closes the pipe and waits for the child
# on exit, so `process.returncode` is populated afterwards.
with subprocess.Popen(
    cmd_parts,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,  # interleave stderr with stdout in one stream
    universal_newlines=True,
    bufsize=1,  # line-buffered reads on the parent side
) as process:
    try:
        # Stream output
        for line in process.stdout:
            print(line, end='')
    except KeyboardInterrupt:
        # Interrupting the cell should stop the training run as well, rather
        # than leaving it running in the background.
        process.terminate()
        raise

if process.returncode == 0:
    print("\n✓ Training completed successfully!")
else:
    print(f"\n✗ Training failed with exit code {process.returncode}")


## Alternative: Run Training in Background

If you want to run training in the background and continue using the notebook:


In [None]:
# Uncomment to run training in background
# NOTE: this snippet reuses `cmd_parts`, which is only defined by the cell that
# builds the training command, so that variable must exist before running it.
# Output (stdout and stderr together) goes to `log_file` instead of the notebook.
# import subprocess
# import os
# 
# log_file = "training.log"
# with open(log_file, 'w') as f:
#     process = subprocess.Popen(
#         cmd_parts,
#         stdout=f,
#         stderr=subprocess.STDOUT
#     )
# 
# print(f"Training started in background. Process ID: {process.pid}")
# print(f"Logs are being written to: {log_file}")
# print(f"Monitor progress with: tail -f {log_file}")


## Notes

- Make sure to set `ROBOFLOW_API_KEY` as an environment variable on Vast.ai
- Adjust `batch_size` based on your GPU memory (start with 4-8)
- Training logs will be saved to `runs/` directory (view with TensorBoard)
- Model checkpoints will be saved to `checkpoints/` directory
- Best model will be saved as `checkpoints/best_model.pth`
