# Construction Crack Detection - Data Exploration

This notebook explores the dataset for crack detection in construction images.

In [None]:
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import cv2
from pathlib import Path
import random
from tqdm import tqdm

# Add project root to path
sys.path.append('..')
from crackdetect.data.preprocessing import ImagePreprocessor
from crackdetect.utils.crack_analysis import CrackAnalyzer

## 1. Dataset Overview

Let's first define the paths to our dataset.

In [None]:
# Reproducibility: later cells pick sample images with the `random` module.
# Seeding it here makes Restart Kernel -> Run All show the same samples
# every time. Change SEED to look at a different selection.
SEED = 42
random.seed(SEED)

# Set paths
# All dataset locations are derived from data_dir, which is relative to the
# notebook's working directory.
data_dir = Path("../data")
train_dir = data_dir / "train"
val_dir = data_dir / "val"
test_dir = data_dir / "test"

train_images_dir = train_dir / "images"
train_masks_dir = train_dir / "masks"

# Check if directories exist
# Printed (rather than asserted) so a missing optional split does not stop
# the notebook; the cells below guard val/test with .exists() themselves.
print(f"Train images directory exists: {train_images_dir.exists()}")
print(f"Train masks directory exists: {train_masks_dir.exists()}")
print(f"Validation directory exists: {val_dir.exists()}")
print(f"Test directory exists: {test_dir.exists()}")

### Data Statistics

In [None]:
# Count images in each directory
#
# glob() gives no ordering guarantee, and each image list is the concatenation
# of two separate globs (*.jpg, then *.png). Later cells pair images and masks
# by list position, so every list is sorted by file stem to give images and
# masks one shared, deterministic order.
# NOTE(review): this lines images up with masks only if a mask carries the
# same stem as its image - confirm against the dataset's naming scheme.
def _by_stem(paths):
    """Return the paths sorted by (stem, suffix) for a stable, reproducible order."""
    return sorted(paths, key=lambda p: (p.stem, p.suffix))


train_images = _by_stem(list(train_images_dir.glob("*.jpg")) + list(train_images_dir.glob("*.png")))
train_masks = _by_stem(train_masks_dir.glob("*.png"))

print(f"Number of training images: {len(train_images)}")
print(f"Number of training masks: {len(train_masks)}")

# Index-based pairing silently goes wrong when the counts differ, so say so.
if len(train_images) != len(train_masks):
    print("Warning: training image and mask counts differ; index-based pairing may be misaligned.")

if val_dir.exists():
    val_images = _by_stem(list((val_dir / "images").glob("*.jpg")) + list((val_dir / "images").glob("*.png")))
    val_masks = _by_stem((val_dir / "masks").glob("*.png"))
    print(f"Number of validation images: {len(val_images)}")
    print(f"Number of validation masks: {len(val_masks)}")

if test_dir.exists():
    # Only test images are counted here; no test masks are read.
    test_images = _by_stem(list((test_dir / "images").glob("*.jpg")) + list((test_dir / "images").glob("*.png")))
    print(f"Number of test images: {len(test_images)}")

## 2. Image Visualization

Let's visualize some sample images and their masks to understand the dataset.

In [None]:
# Initialize preprocessor
# Created once with default settings and read as a global by the helper
# functions defined in the following cells.
preprocessor = ImagePreprocessor()

# Function to display images with masks
def display_images_with_masks(image_paths, mask_paths, num_samples=5):
    """Show randomly sampled images next to their masks and a blended overlay.

    Each sampled image gets one row of three panels: the image, its binarized
    mask, and the image with the mask pixels tinted.

    Args:
        image_paths: Sequence of image paths (each must expose ``.name``).
        mask_paths: Sequence of mask paths. Entry ``i`` is used as the mask of
            ``image_paths[i]``; an image whose index has no entry is shown
            with an all-zero mask.
        num_samples: Maximum number of images to show. Fewer rows are drawn
            when fewer images are available.

    Raises:
        OSError: If a mask file cannot be read by OpenCV.
    """
    # Draw only as many rows as there are samples, so a small dataset does not
    # leave empty axes at the bottom of the figure.
    sample_count = min(num_samples, len(image_paths))
    if sample_count <= 0:
        print("No images to display.")
        return

    # Randomly sample images
    indices = random.sample(range(len(image_paths)), sample_count)

    # squeeze=False keeps `axes` two-dimensional even for a single row, so
    # the axes[i, j] indexing below also works when only one image is shown.
    fig, axes = plt.subplots(sample_count, 3, figsize=(15, 5 * sample_count), squeeze=False)

    for i, idx in enumerate(indices):
        img_path = image_paths[idx]
        mask_path = mask_paths[idx] if idx < len(mask_paths) else None

        # Read image
        image = preprocessor.read_image(img_path)

        # Read mask if available
        if mask_path is not None:
            mask = cv2.imread(str(mask_path), cv2.IMREAD_GRAYSCALE)
            # cv2.imread signals failure by returning None instead of raising.
            if mask is None:
                raise OSError(f"Could not read mask file: {mask_path}")
            # Binarize: any non-zero pixel counts as crack.
            mask = (mask > 0).astype(np.uint8) * 255

            # Create overlay: paint mask pixels [255, 0, 0] (red if the image
            # is RGB - assumed, depends on preprocessor.read_image), then
            # blend 70% image with 30% painted copy.
            overlay = image.copy()
            overlay[mask > 0] = [255, 0, 0]
            overlay = cv2.addWeighted(image, 0.7, overlay, 0.3, 0)
        else:
            mask = np.zeros((image.shape[0], image.shape[1]), dtype=np.uint8)
            overlay = image.copy()

        # Display images
        axes[i, 0].imshow(image)
        axes[i, 0].set_title(f"Image: {img_path.name}")
        axes[i, 0].axis('off')

        axes[i, 1].imshow(mask, cmap='gray')
        axes[i, 1].set_title("Mask")
        axes[i, 1].axis('off')

        axes[i, 2].imshow(overlay)
        axes[i, 2].set_title("Overlay")
        axes[i, 2].axis('off')

    # Lay out after drawing so the titles are taken into account.
    fig.tight_layout(pad=3.0)
    plt.show()

# Display training images
# Images and masks are matched by list position: train_images[i] is shown
# with train_masks[i].
display_images_with_masks(train_images, train_masks, num_samples=5)

## 3. Image Preprocessing

Let's explore the preprocessing steps and see how they affect the images.

In [None]:
# Function to display preprocessing steps
def display_preprocessing_steps(image_path):
    """Plot every intermediate result of the preprocessing chain for one image.

    The image is read, denoised, contrast-enhanced and resized in sequence;
    the resized result is then both normalized and passed to edge extraction.
    All six images are shown in a 2x3 grid.

    Args:
        image_path: Path of the image to read via ``preprocessor.read_image``.
    """
    # Run the chain; each stage consumes the output of the previous one,
    # except that `normalized` and `edges` both start from `resized`.
    original = preprocessor.read_image(image_path)
    denoised = preprocessor.denoise(original)
    enhanced = preprocessor.enhance_contrast(denoised)
    resized = preprocessor.resize(enhanced)
    normalized = preprocessor.normalize(resized)
    edges = preprocessor.extract_edges(resized)

    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    fig.tight_layout(pad=3.0)

    # (image, title, extra imshow keyword arguments), in row-major grid order.
    panels = [
        (original, "Original Image", {}),
        (denoised, "Denoised", {}),
        (enhanced, "Contrast Enhanced", {}),
        (resized, f"Resized to {resized.shape[:2]}", {}),
        (normalized, "Normalized [0, 1]", {}),
        (edges, "Edge Detection", {'cmap': 'gray'}),
    ]

    # axes.flat walks the grid row by row, matching the order of `panels`.
    for ax, (panel_image, panel_title, imshow_kwargs) in zip(axes.flat, panels):
        ax.imshow(panel_image, **imshow_kwargs)
        ax.set_title(panel_title)
        ax.axis('off')

    plt.show()

# Display preprocessing steps for a sample image
# The image is picked at random from the training images, so a different one
# may be shown on each run.
sample_image = random.choice(train_images)
display_preprocessing_steps(sample_image)

## 4. Crack Analysis

Let's analyze the properties of cracks in our dataset.

In [None]:
# Initialize crack analyzer
# Shared global instance. Note that analyze_sample_cracks() overwrites
# analyzer.pixel_mm_ratio on every call, so the value set here only holds
# until that function is first used.
# NOTE(review): presumably 1.0 means one pixel per millimetre (or vice versa)
# - confirm the direction of the ratio against CrackAnalyzer.
analyzer = CrackAnalyzer(pixel_mm_ratio=1.0)

# Function to analyze cracks in an image
def analyze_sample_cracks(image_path, mask_path, pixel_mm_ratio=1.0):
    """Analyze the cracks of one image/mask pair, plot and print the results.

    Shows the original image, the binarized mask and the analyzer's
    visualization side by side, then prints the properties of every crack.

    Side effect: ``analyzer.pixel_mm_ratio`` is set to ``pixel_mm_ratio`` and
    stays at that value after the call, which affects any later use of the
    shared ``analyzer``.

    Args:
        image_path: Path of the image to read via ``preprocessor.read_image``.
        mask_path: Path of the mask image; non-zero pixels are treated as crack.
        pixel_mm_ratio: Value assigned to ``analyzer.pixel_mm_ratio`` before
            the analysis runs.

    Raises:
        OSError: If the mask file cannot be read by OpenCV.
    """
    # Read image and mask
    image = preprocessor.read_image(image_path)
    mask = cv2.imread(str(mask_path), cv2.IMREAD_GRAYSCALE)
    # cv2.imread signals failure by returning None instead of raising; fail
    # here with the offending path rather than with a TypeError further down.
    if mask is None:
        raise OSError(f"Could not read mask file: {mask_path}")
    # Binarize to a float mask of 0.0 / 1.0.
    mask = (mask > 0).astype(np.float32)

    # Preprocess image
    processed_image = preprocessor.preprocess(image)

    # Set pixel to mm ratio (persists on the shared analyzer, see docstring)
    analyzer.pixel_mm_ratio = pixel_mm_ratio

    # Analyze cracks
    crack_properties = analyzer.analyze_mask(mask)

    # Visualize results
    result_image = analyzer.visualize_analysis(processed_image, mask, crack_properties)

    # Display results
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))

    axes[0].imshow(image)
    axes[0].set_title("Original Image")
    axes[0].axis('off')

    axes[1].imshow(mask, cmap='gray')
    axes[1].set_title("Crack Mask")
    axes[1].axis('off')

    axes[2].imshow(result_image)
    axes[2].set_title("Crack Analysis")
    axes[2].axis('off')

    plt.tight_layout()
    plt.show()

    # Print crack properties
    print(f"Number of cracks detected: {len(crack_properties)}")
    for i, props in enumerate(crack_properties):
        print(f"\nCrack #{i+1}:")
        print(f"  Severity: {props.severity}")
        print(f"  Average Width: {props.width_avg:.2f} mm")
        print(f"  Maximum Width: {props.width_max:.2f} mm")
        print(f"  Length: {props.length:.2f} mm")
        print(f"  Area: {props.area:.2f} mm²")
        print(f"  Orientation: {props.orientation:.1f}°")

# Select a sample image with mask
# The same index is used for both lists, so it must be valid in both: draw it
# from the shorter list's range to avoid an IndexError when some images have
# no mask.
sample_idx = random.randint(0, min(len(train_images), len(train_masks)) - 1)
sample_image = train_images[sample_idx]
sample_mask = train_masks[sample_idx]

# Analyze sample cracks
# Note: this leaves analyzer.pixel_mm_ratio at 0.1 for later cells.
analyze_sample_cracks(sample_image, sample_mask, pixel_mm_ratio=0.1)

## 5. Dataset Statistics

Let's calculate some statistics about the dataset. To keep the run time short, only the first 50 image/mask pairs of the training set are analyzed.

In [None]:
# Function to calculate dataset statistics
def calculate_dataset_statistics(image_paths, mask_paths, max_images=50, pixel_mm_ratio=None):
    """Collect size, coverage and crack statistics over image/mask pairs.

    Images and masks are paired by position; pairing stops at the end of the
    shorter sequence.

    Args:
        image_paths: Iterable of image paths.
        mask_paths: Iterable of mask paths, aligned with ``image_paths``.
        max_images: Maximum number of pairs to process (the first ones are
            taken). ``None`` processes every pair.
        pixel_mm_ratio: If given, assigned to ``analyzer.pixel_mm_ratio``
            before the analysis (and left at that value afterwards). If
            ``None``, the analyzer's current ratio is used unchanged.

    Returns:
        dict with these keys:
            'image_sizes': one ``image.shape[:2]`` tuple per processed image.
            'mask_ratios': percentage of non-zero mask pixels per image.
            'crack_counts': number of cracks found per image.
            'crack_widths', 'crack_lengths', 'crack_areas': ``width_avg``,
                ``length`` and ``area`` of every crack, pooled over all
                processed images.

    Raises:
        OSError: If a mask file cannot be read by OpenCV.
    """
    # Statistics to collect
    image_sizes = []
    mask_ratios = []
    crack_counts = []
    crack_widths = []
    crack_lengths = []
    crack_areas = []

    # Materialize the pairs actually processed so the progress bar's total
    # reflects the real amount of work (after zip truncation and the cap).
    pairs = list(zip(image_paths, mask_paths))
    if max_images is not None:
        pairs = pairs[:max_images]

    if pixel_mm_ratio is not None:
        analyzer.pixel_mm_ratio = pixel_mm_ratio

    # Process each image
    for img_path, mask_path in tqdm(pairs):
        # Read image and mask
        image = preprocessor.read_image(img_path)
        mask = cv2.imread(str(mask_path), cv2.IMREAD_GRAYSCALE)
        # cv2.imread signals failure by returning None instead of raising.
        if mask is None:
            raise OSError(f"Could not read mask file: {mask_path}")
        mask = (mask > 0).astype(np.float32)

        # Image size
        image_sizes.append(image.shape[:2])

        # Mask ratio (% of pixels that are cracks): the mask is 0/1, so its
        # mean is the crack fraction.
        mask_ratio = np.mean(mask) * 100
        mask_ratios.append(mask_ratio)

        # Analyze cracks
        crack_properties = analyzer.analyze_mask(mask)

        # Crack count
        crack_counts.append(len(crack_properties))

        # Crack properties
        for props in crack_properties:
            crack_widths.append(props.width_avg)
            crack_lengths.append(props.length)
            crack_areas.append(props.area)

    return {
        'image_sizes': image_sizes,
        'mask_ratios': mask_ratios,
        'crack_counts': crack_counts,
        'crack_widths': crack_widths,
        'crack_lengths': crack_lengths,
        'crack_areas': crack_areas
    }

# Calculate statistics
# Uses the shared `analyzer`, so the crack dimensions collected here depend on
# whatever analyzer.pixel_mm_ratio is set to when this cell runs.
stats = calculate_dataset_statistics(train_images, train_masks)

In [None]:
# Visualize statistics
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# Image sizes
heights = [size[0] for size in stats['image_sizes']]
widths = [size[1] for size in stats['image_sizes']]
axes[0, 0].scatter(widths, heights)
axes[0, 0].set_title('Image Sizes')
axes[0, 0].set_xlabel('Width (pixels)')
axes[0, 0].set_ylabel('Height (pixels)')
axes[0, 0].grid(True)

# Crack counts get at least 10 bins, or one per count value when an image has
# more than 10 cracks. default=0 keeps this from raising on empty statistics.
crack_count_bins = max(10, max(stats['crack_counts'], default=0))

# Histogram panels: (axis, values, bins, title, x label, y label)
histogram_panels = [
    # Mask ratios
    (axes[0, 1], stats['mask_ratios'], 20,
     'Crack Coverage Distribution', 'Crack Coverage (%)', 'Number of Images'),
    # Crack counts
    (axes[0, 2], stats['crack_counts'], crack_count_bins,
     'Crack Count Distribution', 'Number of Cracks per Image', 'Number of Images'),
    # Crack widths
    (axes[1, 0], stats['crack_widths'], 20,
     'Crack Width Distribution', 'Crack Width (mm)', 'Frequency'),
    # Crack lengths
    (axes[1, 1], stats['crack_lengths'], 20,
     'Crack Length Distribution', 'Crack Length (mm)', 'Frequency'),
    # Crack areas
    (axes[1, 2], stats['crack_areas'], 20,
     'Crack Area Distribution', 'Crack Area (mm²)', 'Frequency'),
]

for ax, values, bins, title, xlabel, ylabel in histogram_panels:
    ax.hist(values, bins=bins)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.grid(True)

# Lay out after drawing so titles and axis labels are taken into account.
fig.tight_layout(pad=3.0)
plt.show()

# Print summary statistics
print("Dataset Statistics:")
print(f"Number of images analyzed: {len(stats['image_sizes'])}")

# Averages are only meaningful (and only free of NaN warnings) when at least
# one image was analyzed.
if stats['image_sizes']:
    print(f"Average image dimensions: {np.mean(heights):.1f} x {np.mean(widths):.1f} pixels")
    print(f"Average crack coverage: {np.mean(stats['mask_ratios']):.2f}%")
    print(f"Average number of cracks per image: {np.mean(stats['crack_counts']):.2f}")

# The three crack lists are filled together, so checking one covers all.
if stats['crack_widths']:
    print(f"Average crack width: {np.mean(stats['crack_widths']):.2f} mm")
    print(f"Average crack length: {np.mean(stats['crack_lengths']):.2f} mm")
    print(f"Average crack area: {np.mean(stats['crack_areas']):.2f} mm²")