# Data Preparation
### Augmenting Original Images
### Handling class imbalance: the Negative class has fewer instances than the Positive class

In [4]:
from PIL import Image, ImageEnhance
import os
import numpy as np
from tqdm import tqdm  # Progress bar goodness

# Original data directory (class folders are plural here)
data_dir = "Data"
positive_dir = os.path.join(data_dir, "Positives")
negative_dir = os.path.join(data_dir, "Negatives")

# Augmented output directory.
# The class folders are singular ("Positive" / "Negative") because the
# train/val/test split cell further down reads Pro_Data/Positive and
# Pro_Data/Negative; writing to the plural names left that cell without input.
pro_data_dir = "Pro_Data"
pro_positive_dir = os.path.join(pro_data_dir, "Positive")
pro_negative_dir = os.path.join(pro_data_dir, "Negative")
os.makedirs(pro_positive_dir, exist_ok=True)
os.makedirs(pro_negative_dir, exist_ok=True)

# Function to augment an image
def augment_image(img):
    """Return a randomly perturbed copy of ``img``.

    The steps are applied in this order:

    1. rotate by an angle drawn uniformly between -30 and 30 degrees;
    2. mirror once, either left-right or top-bottom (a flip always happens,
       the random draw only selects the axis);
    3. rescale brightness, then contrast, then colour saturation, each by its
       own factor drawn uniformly between 0.8 and 1.2.

    All random numbers come from NumPy's global ``np.random`` state.
    """
    result = img.rotate(np.random.uniform(-30, 30))

    flip_axis = (
        Image.FLIP_LEFT_RIGHT if np.random.rand() > 0.5 else Image.FLIP_TOP_BOTTOM
    )
    result = result.transpose(flip_axis)

    # Each enhancer wraps the output of the previous one, so order matters.
    for enhancer in (ImageEnhance.Brightness, ImageEnhance.Contrast, ImageEnhance.Color):
        result = enhancer(result).enhance(np.random.uniform(0.8, 1.2))
    return result

# Write every source image plus its augmented variants into Pro_Data.
#
# NOTE(review): originals and their augmented variants land in the same
# folder, and the train/val/test split cell later shuffles files one by one,
# so variants of a single source picture can end up in different splits.
# Consider splitting the originals first and augmenting only the train part.
def _augment_class(src_dir, dst_dir, prefix, n_augmented, desc):
    """Save each .jpg from ``src_dir`` into ``dst_dir`` with augmented variants.

    With ``count`` source files, the j-th file (1-based, in sorted name order)
    is written as ``{prefix}{j}.jpg`` and its k-th augmented variant as
    ``{prefix}{k * count + j}.jpg``.

    Args:
        src_dir: folder holding the source ``.jpg`` files.
        dst_dir: existing folder that receives the output files.
        prefix: string put in front of the running number in output names.
        n_augmented: number of augmented variants produced per source image.
        desc: label shown on the progress bar.

    Returns:
        The sorted list of source file names that were processed.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # source-file -> output-number mapping stable between runs.
    files = sorted(f for f in os.listdir(src_dir) if f.endswith(".jpg"))
    count = len(files)
    for i, img_file in enumerate(tqdm(files, desc=desc)):
        # The context manager closes the underlying file handle, which a bare
        # Image.open() leaves open for every image in the loop.
        with Image.open(os.path.join(src_dir, img_file)) as img:
            # Note: save() re-encodes the picture rather than copying the file.
            img.save(os.path.join(dst_dir, f"{prefix}{i + 1}.jpg"))  # Original
            for k in range(1, n_augmented + 1):
                aug_img = augment_image(img)
                aug_img.save(os.path.join(dst_dir, f"{prefix}{k * count + i + 1}.jpg"))
    return files


# Process Positive images (double it: 1 original + 1 augmented)
print("Processing Positive images...")
positive_files = _augment_class(positive_dir, pro_positive_dir, "", 1, "Positive")
positive_count = len(positive_files)

# Process Negative images (triple it: 1 original + 2 augmented)
print("Processing Negative images...")
negative_files = _augment_class(negative_dir, pro_negative_dir, "N_", 2, "Negative")
negative_count = len(negative_files)

print("All set! Check Pro_Data—Negative’s boosted and progress was tracked!")

Processing Positive images...


Positive: 100%|██████████| 422/422 [01:04<00:00,  6.55it/s]


Processing Negative images...


Negative: 100%|██████████| 147/147 [00:11<00:00, 12.45it/s]

All set! Check Pro_Data—Negative’s boosted and progress was tracked!





# Data Preparation: Train-Test Splits

### Folder Structure

```bash
Final_Data/
├── train/
│   ├── Positive/
│   │   ├── 1.jpg
│   │   ├── 2.jpg
│   │   ├── ...
│   ├── Negative/
│   │   ├── N_1.jpg
│   │   ├── N_2.jpg
│   │   ├── ...
├── val/
│   ├── Positive/
│   │   ├── 11.jpg
│   │   ├── 22.jpg
│   │   ├── ...
│   ├── Negative/
│   │   ├── N_11.jpg
│   │   ├── N_22.jpg
│   │   ├── ...
├── test/
│   ├── Positive/
│   │   ├── 111.jpg
│   │   ├── 222.jpg
│   │   ├── ...
│   ├── Negative/
│   │   ├── N_111.jpg
│   │   ├── N_222.jpg
│   │   ├── ...
```


In [6]:
import os
import shutil
from tqdm import tqdm
import random

def create_directory_structure(base_path):
    """Create the ``<split>/<class>`` folder grid below ``base_path``.

    Splits are ``train``, ``val`` and ``test``; classes are ``Positive`` and
    ``Negative``. Folders that already exist are left as they are.
    """
    for split_name in ('train', 'val', 'test'):
        for class_name in ('Positive', 'Negative'):
            os.makedirs(os.path.join(base_path, split_name, class_name), exist_ok=True)

def split_and_move_files(source_dir, dest_dir, split_ratio=0.8, test_size=20, seed=None):
    """Copy each class's .jpg files into test/train/val folders under ``dest_dir``.

    For the ``Positive`` and ``Negative`` sub-folders of ``source_dir`` the
    file names are sorted, shuffled and cut into three parts: the first
    ``test_size`` files go to ``test``; of the remainder, ``split_ratio``
    (rounded down) goes to ``train`` and the rest to ``val``. Files are copied
    with ``shutil.copy2``, so despite the function name nothing is removed
    from ``source_dir``.

    NOTE(review): files are assigned to splits one by one. If ``source_dir``
    holds several augmented variants of the same picture (as the augmentation
    cell of this notebook produces), those variants can land in different
    splits; consider splitting before augmenting.

    Args:
        source_dir: folder containing ``Positive`` and ``Negative`` sub-folders.
        dest_dir: root that receives ``<split>/<class>`` folders; they are
            created if missing.
        split_ratio: fraction (0..1) of the non-test files that goes to train.
        test_size: number of files per class reserved for the test split. If a
            class has fewer files, all of them go to test.
        seed: optional seed for a private random generator, making the split
            repeatable. With ``None`` the global ``random`` module is used.

    Raises:
        ValueError: if ``split_ratio`` is outside [0, 1] or ``test_size`` is
            negative.
    """
    if not 0 <= split_ratio <= 1:
        raise ValueError(f"split_ratio must be between 0 and 1, got {split_ratio!r}")
    if test_size < 0:
        raise ValueError(f"test_size must be non-negative, got {test_size!r}")

    # A private generator keeps a seeded split from disturbing global state.
    rng = random.Random(seed) if seed is not None else random

    # List and shuffle both classes before copying anything, so a missing
    # class folder fails before any file has been written.
    files_by_class = []
    for class_name in ('Positive', 'Negative'):
        class_src = os.path.join(source_dir, class_name)
        # os.listdir order is arbitrary; sort first so a given seed always
        # yields the same shuffle.
        files = sorted(f for f in os.listdir(class_src) if f.endswith('.jpg'))
        rng.shuffle(files)
        files_by_class.append((class_name, class_src, files))

    for class_name, class_src, files in files_by_class:
        test_files = files[:test_size]
        remaining_files = files[test_size:]
        train_size = int(len(remaining_files) * split_ratio)
        partitions = (
            ('test', test_files),
            ('train', remaining_files[:train_size]),
            ('val', remaining_files[train_size:]),
        )

        print(f"\nProcessing {class_name} class:")
        for split_name, split_files in partitions:
            split_dst = os.path.join(dest_dir, split_name, class_name)
            os.makedirs(split_dst, exist_ok=True)
            for file in tqdm(split_files, desc=f"Moving {class_name} {split_name} files"):
                shutil.copy2(os.path.join(class_src, file), os.path.join(split_dst, file))

def main():
    """Build the train/val/test folders in ``Final_Data`` from ``Pro_Data``.

    When ``Pro_Data`` does not exist, a message is printed and nothing else
    happens.
    """
    source_dir, dest_dir = 'Pro_Data', 'Final_Data'

    if os.path.exists(source_dir):
        # Lay out the destination folders first, then fill them.
        print("Creating directory structure...")
        create_directory_structure(dest_dir)

        print("Starting file organization...")
        split_and_move_files(source_dir, dest_dir)

        print("\nData organization completed successfully!")
    else:
        print(f"Source directory '{source_dir}' not found!")

if __name__ == "__main__":
    main()

Creating directory structure...
Starting file organization...

Processing Positive class:


Moving Positive test files:   0%|          | 0/20 [00:00<?, ?it/s]

Moving Positive test files: 100%|██████████| 20/20 [00:00<00:00, 807.01it/s]
Moving Positive train files: 100%|██████████| 659/659 [00:00<00:00, 1348.57it/s]
Moving Positive val files: 100%|██████████| 165/165 [00:00<00:00, 1307.12it/s]



Processing Negative class:


Moving Negative test files: 100%|██████████| 20/20 [00:00<00:00, 1250.17it/s]
Moving Negative train files: 100%|██████████| 336/336 [00:00<00:00, 1193.60it/s]
Moving Negative val files: 100%|██████████| 85/85 [00:00<00:00, 1167.99it/s]


Data organization completed successfully!



