<a href="https://colab.research.google.com/github/Fiyuudump/simple-cnn/blob/main/Code_Splitting_DeepL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Environment Setup - Detect if running on Google Colab or locally
import os
import sys
from pathlib import Path

# Runtime detection: a successful import of google.colab is treated as the
# signal that this notebook is executing on Google Colab.
try:
    import google.colab
except ImportError:
    IS_COLAB = False
    print("Running locally")
else:
    IS_COLAB = True
    print("Running on Google Colab")

if not IS_COLAB:
    # Local run: work from the current directory; no temp area is configured
    BASE_PATH = Path.cwd()
    TEMP_PATH = None
    print(f"Base path set to: {BASE_PATH}")
else:
    # Colab run: mount Google Drive, then point the paths at Drive and at a
    # scratch directory under /content.
    from google.colab import drive
    drive.mount('/content/drive')
    BASE_PATH = Path('/content/drive/MyDrive/grid')  # Adjust this to your own Google Drive folder
    TEMP_PATH = Path('/content/temp_data')  # Session storage used on Colab
    TEMP_PATH.mkdir(exist_ok=True)
    print(f"Base path set to: {BASE_PATH}")
    print(f"Temp path set to: {TEMP_PATH}")

print("Environment setup completed!")

Running on Google Colab
Mounted at /content/drive
Base path set to: /content/drive/MyDrive/grid
Temp path set to: /content/temp_data
Environment setup completed!


In [2]:
from pathlib import Path
from PIL import Image

def _image_size(png_file):
    """Return the (width, height) of the image at png_file, closing it afterwards."""
    with Image.open(png_file) as img:
        return img.size


root = Path(BASE_PATH)  # root folder holding train/test/ground_truth etc.

# Distinct sizes across every PNG found recursively under root
sizes = {_image_size(png_file) for png_file in root.rglob("*.png")}

print(sizes)  # shows whether every image is (1024, 1024) or the sizes vary


{(1024, 1024)}


In [None]:
import shutil

print("Starting cleanup process...")

if IS_COLAB:
    # Colab: empty the session-storage directory (TEMP_PATH); the directory
    # itself is kept.
    cleaned_folders = []
    if TEMP_PATH.exists():
        for item in TEMP_PATH.iterdir():
            if item.is_dir() and not item.is_symlink():
                shutil.rmtree(item)
                cleaned_folders.append(item.name)
                print(f"Removed session folder: {item.name}")
            elif item.is_file() or item.is_symlink():
                # Symlinks are unlinked instead of being handed to rmtree,
                # which refuses a symbolic link to a directory.
                item.unlink()
                print(f"Removed session file: {item.name}")
        print("Session storage cleaned!")
        if cleaned_folders:
            print(f"Cleaned folders: {', '.join(cleaned_folders)}")
    else:
        print("No session storage to clean.")
else:
    # Local: remove from train/ every folder whose name matches a non-'good'
    # folder in test/. train/good is never touched.
    train_dir = BASE_PATH / 'train'
    cleaned_folders = []
    if train_dir.exists():
        # The folder names under test/ (except 'good') decide what to clean.
        # A missing test/ directory means there is nothing to match, so fall
        # through to the "nothing to clean" message instead of crashing.
        test_dir = BASE_PATH / 'test'
        if test_dir.is_dir():
            test_folders = [f for f in test_dir.iterdir() if f.is_dir() and f.name != 'good']
        else:
            test_folders = []

        # Remove the matching defect folders from train
        for folder in test_folders:
            train_folder_path = train_dir / folder.name
            if train_folder_path.exists():
                shutil.rmtree(train_folder_path)
                cleaned_folders.append(folder.name)
                print(f"Removed local folder: train/{folder.name}")

        if cleaned_folders:
            print("\nLocal train folder cleaned!")
            print(f"Cleaned folders: {', '.join(cleaned_folders)}")
            print("Preserved folder: good")
        else:
            print("No defect folders found to clean.")
    else:
        print("No train folder to clean.")

print("\nCleanup completed!")

Starting cleanup process...
Removed session folder: train
Session storage cleaned!
Cleaned folders: train

Cleanup completed!


In [None]:
import shutil
import random

# Fix the state of Python's global RNG before any shuffling happens
random.seed(42)

# Inputs are always read from under BASE_PATH (Google Drive or local)
test_dir = BASE_PATH / 'test'
train_dir = BASE_PATH / 'train'

# Output goes to session storage on Colab, to the local train directory otherwise
dest_train_dir = (TEMP_PATH if IS_COLAB else BASE_PATH) / 'train'

# Collect every sub-directory of test/ whose name is not 'good'
test_folders = []
for entry in test_dir.iterdir():
    if entry.name != 'good' and entry.is_dir():
        test_folders.append(entry)

print("Starting data splitting process...")
print(f"Reading from: {test_dir}")
print(f"Writing to: {dest_train_dir}")
print(f"Found folders to process: {[d.name for d in test_folders]}\n")

# Share of each defect folder that is copied into train
TRAIN_FRACTION = 0.8

# Process the defect folders in name order. Directory listing order is
# OS-dependent, and the global RNG is consumed folder by folder, so both the
# folder order and the file order must be fixed for the seeded shuffle to
# select the same files on every run and machine.
# NOTE: sorting changes which files are selected compared with earlier runs.
for folder in sorted(test_folders, key=lambda p: p.name):
    folder_name = folder.name

    # Regular files directly inside this test folder, in name order
    test_files = sorted(
        (f for f in folder.glob('*') if f.is_file()),
        key=lambda p: p.name,
    )
    total_files = len(test_files)

    # Number of files that go to train (fraction rounded down)
    train_count = int(total_files * TRAIN_FRACTION)

    # Shuffle in place, then take the first train_count files
    random.shuffle(test_files)
    files_to_copy = test_files[:train_count]

    # Create the matching folder in the destination train directory
    train_folder_path = dest_train_dir / folder_name
    train_folder_path.mkdir(parents=True, exist_ok=True)

    # NOTE(review): files are copied, not moved, so test/ still contains
    # every file selected for train. The "remaining in test" figure printed
    # below only counts files that were NOT copied. Confirm that evaluation
    # excludes the copied files, otherwise train and test overlap.
    copied_count = 0
    for file_path in files_to_copy:
        dest_path = train_folder_path / file_path.name
        try:
            # Remove an existing destination file first to avoid permission errors
            if dest_path.exists():
                dest_path.unlink()
            shutil.copy2(file_path, dest_path)
            copied_count += 1
        except OSError as e:
            # Best effort: report the failure and continue with the next file
            print(f"Warning: Could not copy {file_path.name}: {e}")

    print(f"Folder '{folder_name}':")
    print(f"  - Total files in test: {total_files}")
    print(f"  - Files copied to train ({TRAIN_FRACTION:.0%}): {copied_count}")
    print(f"  - Files remaining in test ({1 - TRAIN_FRACTION:.0%}): {total_files - copied_count}\n")

# The 'good' folder is treated differently per environment
if not IS_COLAB:
    # Local: train/good already sits in the destination; just report on it
    good_folder = dest_train_dir / 'good'
    if not good_folder.exists():
        print("Note: 'train/good' folder not found.\n")
    else:
        good_file_count = sum(1 for _ in good_folder.glob('*'))
        print("Using existing 'good' folder in train:")
        print(f"  - Total files: {good_file_count}\n")
else:
    # Colab: bring train/good over from Google Drive into session storage
    source_good_folder = train_dir / 'good'
    dest_good_folder = dest_train_dir / 'good'

    if not source_good_folder.exists():
        print("Note: 'train/good' folder not found in Google Drive.\n")
    else:
        # Start from a clean destination so copytree does not hit an existing directory
        if dest_good_folder.exists():
            shutil.rmtree(dest_good_folder)
        shutil.copytree(source_good_folder, dest_good_folder)
        good_file_count = sum(1 for _ in dest_good_folder.glob('*'))
        print("Copied 'good' folder from Drive to session storage:")
        print(f"  - Total files: {good_file_count}\n")

print("Data splitting completed successfully!")
print(f"Results saved to: {dest_train_dir}")

Starting data splitting process...
Reading from: /content/drive/MyDrive/grid/test
Writing to: /content/temp_data/train
Found folders to process: ['bent', 'metal_contamination', 'broken', 'glue', 'thread']

Folder 'bent':
  - Total files in test: 12
  - Files copied to train (80%): 9
  - Files remaining in test (20%): 3

Folder 'metal_contamination':
  - Total files in test: 11
  - Files copied to train (80%): 8
  - Files remaining in test (20%): 3

Folder 'broken':
  - Total files in test: 12
  - Files copied to train (80%): 9
  - Files remaining in test (20%): 3

Folder 'glue':
  - Total files in test: 11
  - Files copied to train (80%): 8
  - Files remaining in test (20%): 3

Folder 'thread':
  - Total files in test: 11
  - Files copied to train (80%): 8
  - Files remaining in test (20%): 3

Copied 'good' folder from Drive to session storage:
  - Total files: 264

Data splitting completed successfully!
Results saved to: /content/temp_data/train
