### 先檢查 training_imgs 與 training_masks 的檔案是否一一配對

In [2]:
import os

def check_files(image_dir, mask_dir):
    """
    Report which images and masks in two folders have no partner.

    An image named ``<name>.png`` in ``image_dir`` is considered paired with
    ``<name>_mask.png`` in ``mask_dir``. The result is printed to stdout;
    nothing is returned and no file is modified (the optional cleanup step
    below is disabled).

    Args:
        image_dir: Folder whose ``*.png`` files are treated as images.
        mask_dir: Folder whose ``*_mask.png`` files are treated as masks.

    Raises:
        OSError: Propagated from ``os.listdir`` when a folder cannot be read.
    """
    image_suffix = '.png'
    mask_suffix = '_mask.png'

    # Strip only the trailing suffix. str.replace() would also delete the same
    # text from the middle of a name (e.g. "a.png_v2.png") and break pairing.
    image_files = [f[:-len(image_suffix)] for f in os.listdir(image_dir) if f.endswith(image_suffix)]
    mask_files = [f[:-len(mask_suffix)] for f in os.listdir(mask_dir) if f.endswith(mask_suffix)]

    # Names present on one side only
    missing_masks = set(image_files) - set(mask_files)
    missing_images = set(mask_files) - set(image_files)

    print(f"\n{'='*50}")
    print("檔案配對檢查結果:")
    print(f"{'='*50}")
    print(f"訓練圖片數量: {len(image_files)}")
    print(f"Mask圖片數量: {len(mask_files)}")

    if missing_masks:
        print(f"\n缺少mask的檔案 ({len(missing_masks)} 個):")
        print(f"{'-'*40}")
        for f in sorted(missing_masks):
            print(f"- {f}.png -> 缺少 {f}_mask.png")

    if missing_images:
        print(f"\n缺少原始圖片的檔案 ({len(missing_images)} 個):")
        print(f"{'-'*40}")
        for f in sorted(missing_images):
            print(f"- {f}_mask.png -> 缺少 {f}.png")

    if not missing_masks and not missing_images:
        print("\n所有檔案都有正確配對！")

    # Optional cleanup (disabled): uncomment to delete unpaired files.
    # # Delete images that have no mask
    # for f in missing_masks:
    #     file_path = os.path.join(image_dir, f + '.png')
    #     if os.path.exists(file_path):
    #         os.remove(file_path)
    #         print(f"已刪除: {f}.png")
    #
    # # Delete masks that have no image
    # for f in missing_images:
    #     file_path = os.path.join(mask_dir, f + '_mask.png')
    #     if os.path.exists(file_path):
    #         os.remove(file_path)
    #         print(f"已刪除: {f}_mask.png")

    # Re-read the folders so the summary reflects what is on disk now; the
    # counts only differ from the ones above when the cleanup step is enabled.
    remaining_images = sum(1 for f in os.listdir(image_dir) if f.endswith(image_suffix))
    remaining_masks = sum(1 for f in os.listdir(mask_dir) if f.endswith(mask_suffix))
    print(f"\n{'='*50}")
    print("清理後的檔案統計:")
    print(f"{'='*50}")
    print(f"訓練圖片數量: {remaining_images}")
    print(f"Mask圖片數量: {remaining_masks}")

# Example usage
if __name__ == "__main__":
    # Folder locations: replace with your own training-image folder and
    # ground-truth (mask) folder.
    train_dir = r"C:\Users\Alan\Dropbox\Himawari8_Wave_Speed_Detection\datas\Band03IW\Original_imgs\NLM\Labeled\訓練資料\20241031\20241031_train_IW\imgs"
    gt_dir = r"C:\Users\Alan\Dropbox\Himawari8_Wave_Speed_Detection\datas\Band03IW\Original_imgs\NLM\Labeled\訓練資料\20241031\20241031_train_IW\mask"

    # Any failure (e.g. an unreadable folder) is reported instead of raised.
    try:
        check_files(image_dir=train_dir, mask_dir=gt_dir)
    except Exception as err:
        print(f"發生錯誤: {err!s}")




檔案配對檢查結果:
訓練圖片數量: 378
Mask圖片數量: 378

所有檔案都有正確配對！

清理後的檔案統計:
訓練圖片數量: 378
Mask圖片數量: 378


In [7]:
import os
import shutil
import random
from pathlib import Path

def verify_pairs(imgs_dir, masks_dir):
    """
    Check that every image has a mask and every mask has an image.

    Files are matched by stem: an image ``<name>.<ext>`` pairs with a mask
    whose stem is ``<name>_mask`` (a mask whose stem has no ``_mask`` suffix
    is matched by its stem as-is). Only ``.png``, ``.jpg`` and ``.jpeg``
    files are considered. Unpaired names are printed as warnings.

    Args:
        imgs_dir: Folder containing the images.
        masks_dir: Folder containing the masks.

    Returns:
        True when both folders pair up completely, otherwise False.

    Raises:
        OSError: Propagated from ``os.listdir`` when a folder cannot be read.
    """
    image_exts = ('.png', '.jpg', '.jpeg')
    mask_suffix = '_mask'

    def _strip_mask_suffix(stem):
        # Remove only a trailing "_mask". Replacing every occurrence would
        # corrupt stems that legitimately contain "_mask" (e.g. "sea_mask_01").
        if stem.endswith(mask_suffix):
            return stem[:-len(mask_suffix)]
        return stem

    # Collect stems (file names without extension) on each side
    img_files = {os.path.splitext(f)[0] for f in os.listdir(imgs_dir) if f.endswith(image_exts)}
    mask_files = {
        _strip_mask_suffix(os.path.splitext(f)[0])
        for f in os.listdir(masks_dir)
        if f.endswith(image_exts)
    }

    # Stems present on one side only
    imgs_without_masks = img_files - mask_files
    masks_without_imgs = mask_files - img_files

    if imgs_without_masks:
        print("警告：以下圖片沒有對應的遮罩：")
        for img in sorted(imgs_without_masks):
            print(f"  - {img}.png")

    if masks_without_imgs:
        print("警告：以下遮罩沒有對應的圖片：")
        for mask in sorted(masks_without_imgs):
            print(f"  - {mask}_mask.png")

    return not imgs_without_masks and not masks_without_imgs

def split_dataset(source_imgs_dir, source_masks_dir, output_base_dir, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15, random_seed=42):
    """
    Copy an image/mask dataset into train, val and test folders.

    The source folders are first checked with ``verify_pairs``. Image names
    are then sorted, shuffled with ``random_seed`` and cut into three
    consecutive slices. Each image and its ``<stem>_mask<ext>`` mask are
    copied to ``<output_base_dir>/<split>/imgs`` and
    ``<output_base_dir>/<split>/masks``.

    Args:
        source_imgs_dir: Folder with the source images.
        source_masks_dir: Folder with the source masks.
        output_base_dir: Folder under which the split folders are created.
        train_ratio: Fraction of images placed in the train split.
        val_ratio: Fraction of images placed in the val split.
        test_ratio: Only used in the printed report; the test split receives
            whatever remains after the train and val slices.
        random_seed: Seed passed to ``random.seed`` before shuffling.

    Raises:
        ValueError: If the source folders are not completely paired.
        FileNotFoundError: If an image or its mask is missing while copying.
    """
    # Refuse to split a dataset whose images and masks do not pair up
    print("驗證源資料配對...")
    source_is_paired = verify_pairs(source_imgs_dir, source_masks_dir)
    if not source_is_paired:
        raise ValueError("源資料中的圖片和遮罩配對不完整！請檢查資料集。")

    # Seed the global RNG so the shuffle below is reproducible
    random.seed(random_seed)

    # Make sure <output>/<split>/{imgs,masks} exists for every split
    splits = ['train', 'val', 'test']
    for split in splits:
        os.makedirs(os.path.join(output_base_dir, split, 'imgs'), exist_ok=True)
        os.makedirs(os.path.join(output_base_dir, split, 'masks'), exist_ok=True)

    # Sort first so the shuffle starts from a directory-order-independent list
    img_files = sorted(
        name
        for name in os.listdir(source_imgs_dir)
        if name.endswith(('.png', '.jpg', '.jpeg'))
    )
    random.shuffle(img_files)

    # Slice boundaries
    total_size = len(img_files)
    train_end = int(total_size * train_ratio)
    val_end = train_end + int(total_size * val_ratio)

    train_files = img_files[:train_end]
    val_files = img_files[train_end:val_end]
    test_files = img_files[val_end:]

    def copy_files(file_list, split_name):
        # Copy each image together with its "<stem>_mask<ext>" mask
        imgs_out = os.path.join(output_base_dir, split_name, 'imgs')
        masks_out = os.path.join(output_base_dir, split_name, 'masks')

        for filename in file_list:
            base_name, ext = os.path.splitext(filename)
            mask_filename = f"{base_name}_mask{ext}"

            src_img = os.path.join(source_imgs_dir, filename)
            src_mask = os.path.join(source_masks_dir, mask_filename)

            # Both files must exist before either one is copied
            if not os.path.exists(src_img):
                raise FileNotFoundError(f"找不到圖片：{src_img}")
            if not os.path.exists(src_mask):
                raise FileNotFoundError(f"找不到遮罩：{src_mask}")

            shutil.copy2(src_img, os.path.join(imgs_out, filename))
            shutil.copy2(src_mask, os.path.join(masks_out, mask_filename))

    # Copy the three splits in train, val, test order
    copy_plan = (
        ("正在複製訓練集...", 'train', train_files),
        ("正在複製驗證集...", 'val', val_files),
        ("正在複製測試集...", 'test', test_files),
    )
    for message, split_name, files in copy_plan:
        print(message)
        copy_files(files, split_name)

    # Re-check pairing inside each output split (warnings are printed only)
    print("\n驗證分割後的資料集...")
    for split in splits:
        print(f"\n檢查 {split} 集的配對...")
        verify_pairs(
            os.path.join(output_base_dir, split, 'imgs'),
            os.path.join(output_base_dir, split, 'masks'),
        )

    # Summary: counts are actual, percentages are the requested ratios
    print("\n資料集分割完成:")
    print(f"總圖片數量: {total_size}")
    print(f"訓練集: {len(train_files)} 張 ({train_ratio*100:.1f}%)")
    print(f"驗證集: {len(val_files)} 張 ({val_ratio*100:.1f}%)")
    print(f"測試集: {len(test_files)} 張 ({test_ratio*100:.1f}%)")

if __name__ == "__main__":
    # Source image folder, source mask folder, and the folder that will
    # receive the train/val/test split.
    source_imgs_dir = r"C:\Users\Alan\Dropbox\Himawari8_Wave_Speed_Detection\datas\Band03IW\Original_imgs\NLM\Labeled\訓練資料\20241031\20241031_train_noIW\imgs"
    source_masks_dir = r"C:\Users\Alan\Dropbox\Himawari8_Wave_Speed_Detection\datas\Band03IW\Original_imgs\NLM\Labeled\訓練資料\20241031\20241031_train_noIW\mask"
    output_base_dir = r"C:\Users\Alan\Dropbox\Himawari8_Wave_Speed_Detection\datas\Band03IW\Original_imgs\NLM\Labeled\訓練資料\20241031\20241031_train_noIW\split"

    # Split with the default ratios and seed
    split_dataset(
        source_imgs_dir=source_imgs_dir,
        source_masks_dir=source_masks_dir,
        output_base_dir=output_base_dir,
    )

驗證源資料配對...
正在複製訓練集...
正在複製驗證集...
正在複製測試集...

驗證分割後的資料集...

檢查 train 集的配對...

檢查 val 集的配對...

檢查 test 集的配對...

資料集分割完成:
總圖片數量: 1076
訓練集: 753 張 (70.0%)
驗證集: 161 張 (15.0%)
測試集: 162 張 (15.0%)


In [11]:
import os
import shutil
from pathlib import Path
import glob

def merge_image_folders(base_path, source_folders, destination_folder):
    """
    Copy the images and masks of several split datasets into one folder.

    For every source folder, the ``train``, ``val`` and ``test`` sub-folders
    are scanned; image files found in their ``imgs`` / ``masks`` folders are
    copied to ``<base_path>/<destination_folder>/imgs`` and ``.../masks``.
    Source files are never modified. Progress is printed to stdout.

    When the destination already holds a file with the same name, the copy is
    stored under a name tagged with ``_<source_folder>_<split>``. For masks
    the tag is inserted before the trailing ``_mask`` so that a renamed image
    ``x_<tag>.png`` still pairs with its mask ``x_<tag>_mask.png``.

    NOTE: any pre-existing destination file counts as a name conflict, so
    running the merge again into a populated destination stores a second,
    tagged copy of each file.

    Args:
        base_path: Folder containing the source folders and the destination.
        source_folders: Names of the source folders, relative to ``base_path``.
        destination_folder: Name of the destination folder, relative to
            ``base_path``; created if missing.

    Returns:
        A dict with ``total_files_moved`` (copied-file counts per ``imgs`` /
        ``masks``), ``files_per_source`` (the same counts per existing source
        folder) and ``file_types_found`` (set of lower-cased extensions seen).
    """
    print("\n=== 開始執行合併作業 ===")
    print(f"基礎路徑: {base_path}")
    print(f"目標資料夾: {destination_folder}")

    # Create the destination layout
    full_dest_path = os.path.join(base_path, destination_folder)
    dest_dirs = {
        'imgs': os.path.join(full_dest_path, 'imgs'),
        'masks': os.path.join(full_dest_path, 'masks'),
    }
    for dest_dir in dest_dirs.values():
        os.makedirs(dest_dir, exist_ok=True)

    # Accepted image extensions; compared against the lower-cased extension,
    # so upper-case variants (.PNG, .JPG, ...) are accepted as well.
    image_extensions = {'.jpg', '.jpeg', '.png', '.tif', '.tiff'}
    mask_suffix = '_mask'

    # Running statistics
    stats = {
        'total_files_moved': {'imgs': 0, 'masks': 0},
        'files_per_source': {},
        'file_types_found': set()
    }

    for source_folder in source_folders:
        full_source_path = os.path.join(base_path, source_folder)
        print(f"\n檢查來源資料夾: {source_folder}")

        if not os.path.exists(full_source_path):
            print(f"警告: 來源資料夾不存在 - {full_source_path}")
            continue

        stats['files_per_source'][source_folder] = {'imgs': 0, 'masks': 0}

        for split_folder in ('train', 'val', 'test'):
            split_path = os.path.join(full_source_path, split_folder)
            if not os.path.exists(split_path):
                continue

            print(f"\n處理 {split_folder} 資料夾:")

            for data_type in ('imgs', 'masks'):
                data_path = os.path.join(split_path, data_type)
                if not os.path.exists(data_path):
                    print(f"  {data_type} 資料夾不存在: {data_path}")
                    continue

                print(f"  檢查 {data_type} 資料夾: {data_path}")
                dest_path = dest_dirs[data_type]

                # Escape the directory so characters such as "[" in the path
                # are not interpreted as glob wildcards.
                pattern = os.path.join(glob.escape(data_path), '*.*')
                for file in glob.glob(pattern):
                    ext = os.path.splitext(file)[1].lower()
                    stats['file_types_found'].add(ext)

                    # Only image files are copied
                    if ext not in image_extensions:
                        continue

                    file_name = os.path.basename(file)
                    dest_file = os.path.join(dest_path, file_name)

                    # Name conflict: store the copy under a tagged name
                    if os.path.exists(dest_file):
                        stem, suffix = os.path.splitext(file_name)
                        tag = f"_{source_folder}_{split_folder}"
                        if data_type == 'masks' and stem.endswith(mask_suffix):
                            # Keep "_mask" last so image/mask pairing survives
                            new_name = f"{stem[:-len(mask_suffix)]}{tag}{mask_suffix}{suffix}"
                        else:
                            new_name = f"{stem}{tag}{suffix}"
                        dest_file = os.path.join(dest_path, new_name)

                    # Best effort: a failed copy is reported and skipped
                    try:
                        shutil.copy2(file, dest_file)
                    except OSError as e:
                        print(f"    錯誤: 複製檔案失敗 {file}: {str(e)}")
                        continue

                    stats['total_files_moved'][data_type] += 1
                    stats['files_per_source'][source_folder][data_type] += 1
                    print(f"    複製檔案: {file_name} → {os.path.basename(dest_file)}")

    return stats

def main():
    """
    Merge the configured source datasets and print a summary of the result.

    Calls ``merge_image_folders`` with the hard-coded base path, source
    folders and destination folder, then prints the total counts, the
    per-source counts and the file extensions encountered. Any exception is
    caught and reported on stdout.
    """
    # Base path and folder names
    base_path = r"C:\Users\Alan\Dropbox\Himawari8_Wave_Speed_Detection\datas\Band03IW\Original_imgs\NLM\Labeled\訓練資料\20241031"
    source_folders = ['20241031_train_IW', '20241031_train_noIW']
    destination_folder = '20241031dataset'

    try:
        stats = merge_image_folders(base_path, source_folders, destination_folder)

        # Nothing to report
        if not stats:
            return

        print("\n=== 處理結果統計 ===")
        print("總共移動的檔案:")
        totals = stats['total_files_moved']
        print(f"- 圖片 (imgs): {totals['imgs']} 個檔案")
        print(f"- 遮罩 (masks): {totals['masks']} 個檔案")

        print("\n各資料夾處理統計：")
        for source, counts in stats['files_per_source'].items():
            print(f"- {source}:")
            print(f"  - 圖片 (imgs): {counts['imgs']} 個檔案")
            print(f"  - 遮罩 (masks): {counts['masks']} 個檔案")

        found_types = stats['file_types_found']
        if found_types:
            print("\n發現的檔案類型：")
            for ext in sorted(found_types):
                print(f"- {ext}")

    except Exception as err:
        print(f"\n錯誤: 執行過程中發生錯誤: {err!s}")

if __name__ == "__main__":
    main()


=== 開始執行合併作業 ===
基礎路徑: C:\Users\Alan\Dropbox\Himawari8_Wave_Speed_Detection\datas\Band03IW\Original_imgs\NLM\Labeled\訓練資料\20241031
目標資料夾: 20241031dataset

檢查來源資料夾: 20241031_train_IW

處理 train 資料夾:
  檢查 imgs 資料夾: C:\Users\Alan\Dropbox\Himawari8_Wave_Speed_Detection\datas\Band03IW\Original_imgs\NLM\Labeled\訓練資料\20241031\20241031_train_IW\train\imgs
    複製檔案: 201905080700.png → 201905080700.png
    複製檔案: 201905080710.png → 201905080710.png
    複製檔案: 201905080720.png → 201905080720.png
    複製檔案: 201905080740.png → 201905080740.png
    複製檔案: 201905080750.png → 201905080750.png
    複製檔案: 201905080800.png → 201905080800.png
    複製檔案: 201905080810.png → 201905080810.png
    複製檔案: 201905080820.png → 201905080820.png
    複製檔案: 201905080830.png → 201905080830.png
    複製檔案: 201905080840.png → 201905080840.png
    複製檔案: 201905080850.png → 201905080850.png
    複製檔案: 201905090740.png → 201905090740.png
    複製檔案: 201906040250.png → 201906040250.png
    複製檔案: 201906040300.png → 201906040300.png
    複製