In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import rasterio
from rasterio.plot import show
import pandas as pd
from tqdm import tqdm

In [None]:
def load_tiff_image(file_path):
    """Open the raster at ``file_path`` and return the data of all its bands.

    Args:
        file_path: Path to the TIFF file, passed straight to ``rasterio.open``.

    Returns:
        Whatever ``read()`` on the opened dataset yields when called without
        arguments (i.e. every band). Callers in this notebook index the
        result as ``[band, row, col]``.
    """
    with rasterio.open(file_path) as dataset:
        # Read while the dataset is still open; it is closed on leaving the block.
        bands = dataset.read()
    return bands

In [None]:
def _categorize_cloud_cover(cloud_percentage, clear_threshold=10, cloudy_threshold=50):
    """Map a cloud-cover percentage to one of the three category labels."""
    if cloud_percentage < clear_threshold:
        return "Cloud-free"
    if cloud_percentage < cloudy_threshold:
        return "Partially cloudy"
    return "Fully clouded"


def analyze_dataset(data_dir, clear_threshold=10, cloudy_threshold=50):
    """Compute per-image band statistics and per-mask cloud-cover statistics.

    Every ``*.tiff`` file in ``<data_dir>/images`` is paired with the file of
    the same name in ``<data_dir>/masks``. Files are processed in sorted
    filename order so the resulting row order is reproducible across runs
    and platforms (``os.listdir`` order is arbitrary).

    Args:
        data_dir: Directory containing the ``images`` and ``masks`` sub-folders.
        clear_threshold: Cloud percentage strictly below which an image is
            labelled ``"Cloud-free"``. Defaults to 10.
        cloudy_threshold: Cloud percentage strictly below which (and at or
            above ``clear_threshold``) an image is labelled
            ``"Partially cloudy"``; anything else is ``"Fully clouded"``.
            Defaults to 50.

    Returns:
        A tuple ``(image_stats, mask_stats)`` of DataFrames with one row per
        image. ``image_stats`` has ``filename`` plus ``<band>_mean`` and
        ``<band>_std`` for the bands ``red``, ``green``, ``blue``, ``ir``;
        ``mask_stats`` has ``filename``, ``cloud_percentage`` and
        ``cloud_category``. Both frames always carry these columns, even
        when no image files are found.

    Raises:
        IndexError: If an image has fewer than four bands.
    """
    # NOTE(review): band order red/green/blue/ir is taken from the original
    # column naming — confirm against the dataset specification.
    band_names = ('red', 'green', 'blue', 'ir')
    image_columns = (['filename']
                     + [f'{band}_mean' for band in band_names]
                     + [f'{band}_std' for band in band_names])
    mask_columns = ['filename', 'cloud_percentage', 'cloud_category']

    images_dir = os.path.join(data_dir, 'images')
    masks_dir = os.path.join(data_dir, 'masks')

    # Sort so that the output does not depend on filesystem enumeration order.
    image_files = sorted(f for f in os.listdir(images_dir) if f.endswith('.tiff'))

    image_stats = []
    mask_stats = []

    for img_file in tqdm(image_files):
        # The mask shares its filename with the image it belongs to.
        img = load_tiff_image(os.path.join(images_dir, img_file))
        mask = load_tiff_image(os.path.join(masks_dir, img_file))

        # Reduce over the two spatial axes, leaving one value per band.
        img_mean = np.mean(img, axis=(1, 2))
        img_std = np.std(img, axis=(1, 2))

        # Mean mask value scaled to a percentage.
        # NOTE(review): assumes mask pixels are 0 (clear) / 1 (cloud); a
        # 0/255 mask would yield values above 100 — TODO confirm.
        cloud_percentage = np.mean(mask) * 100

        record = {'filename': img_file}
        record.update({f'{band}_mean': img_mean[i] for i, band in enumerate(band_names)})
        record.update({f'{band}_std': img_std[i] for i, band in enumerate(band_names)})
        image_stats.append(record)

        mask_stats.append({
            'filename': img_file,
            'cloud_percentage': cloud_percentage,
            'cloud_category': _categorize_cloud_cover(
                cloud_percentage, clear_threshold, cloudy_threshold),
        })

    # Explicit columns keep the schema stable when the dataset is empty.
    return (pd.DataFrame(image_stats, columns=image_columns),
            pd.DataFrame(mask_stats, columns=mask_columns))

In [None]:
def visualize_sample(img_path, mask_path):
    """Plot the individual bands, the cloud mask and an RGB composite of a sample.

    Two figures are shown: a 1x5 grid with the first four image bands and
    the first mask band in greyscale, followed by a min-max normalised
    composite built from image bands 0, 1 and 2.

    Args:
        img_path: Path to the image TIFF; it must contain at least four bands.
        mask_path: Path to the matching mask TIFF.

    Returns:
        None. The figures are displayed via ``plt.show()``.
    """
    img = load_tiff_image(img_path)
    mask = load_tiff_image(mask_path)

    fig, axs = plt.subplots(1, 5, figsize=(20, 5))

    # Display each spectral band in its own panel.
    # NOTE(review): labels assume band order Red, Green, Blue, IR — confirm.
    for i, band in enumerate(['Red', 'Green', 'Blue', 'IR']):
        axs[i].imshow(img[i], cmap='gray')
        axs[i].set_title(f'{band} Band')
        axs[i].axis('off')

    # Display mask
    axs[4].imshow(mask[0], cmap='gray')
    axs[4].set_title('Cloud Mask')
    axs[4].axis('off')

    plt.tight_layout()
    plt.show()

    # RGB composite. Convert to float first so the subtraction below cannot
    # overflow for integer rasters.
    rgb = np.stack((img[0], img[1], img[2]), axis=2).astype(np.float64)
    lowest = rgb.min()
    value_range = rgb.max() - lowest
    if value_range > 0:
        rgb = (rgb - lowest) / value_range  # Min-max normalise to [0, 1]
    else:
        # Constant image: avoid 0/0 (NaN) and render it as uniform black.
        rgb = np.zeros_like(rgb)

    fig_rgb, ax_rgb = plt.subplots(figsize=(8, 8))
    ax_rgb.imshow(rgb)
    ax_rgb.set_title('RGB Composite')
    ax_rgb.axis('off')
    plt.show()