In [1]:
import cv2
import numpy as np
import os
import albumentations as A
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import multiprocessing
from functools import partial
import random

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
# Define constants
INPUT_DIR = r"C:\Users\Ayush\OneDrive\Documents\ml\krishi.ai\ml\data"
OUTPUT_DIR = r"C:\Users\Ayush\OneDrive\Documents\ml\krishi.ai\ml\output_data"
IMG_SIZE = 256
AUGMENTATION_PROBABILITY = 0.6
NUM_WORKERS = multiprocessing.cpu_count()  

os.makedirs(OUTPUT_DIR, exist_ok=True)

def basic_preprocessing(image):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = cv2.resize(image, (IMG_SIZE, IMG_SIZE))
    image = cv2.GaussianBlur(image, (3, 3), 0)
    image = cv2.fastNlMeansDenoisingColored(image, None, 3, 3, 7, 21)
    image = cv2.convertScaleAbs(image, alpha=1.0, beta=0)
    
    return image

In [10]:
# Define weather augmentation pipeline
weather_aug = A.Compose([
    A.RandomSunFlare(flare_roi=(0.9, 0, 1, 0.5), angle_lower=0.3, p=AUGMENTATION_PROBABILITY),
    A.RandomSunFlare(flare_roi=(0.0, 0.0, 1.0, 0.1), angle_lower=0.3, p=AUGMENTATION_PROBABILITY),
    A.RandomRain(drop_length=1.0, drop_width=1, drop_color=(200, 200, 200), blur_value=3, brightness_coefficient=0.6, p=AUGMENTATION_PROBABILITY),
    A.RandomShadow(num_shadows_lower=1, num_shadows_upper=5, shadow_dimension=6, shadow_roi=(0, 0.5, 1, 1), p=AUGMENTATION_PROBABILITY),
    A.RandomFog(fog_coef_lower=0.25, fog_coef_upper=0.8, alpha_coef=0.3, p=AUGMENTATION_PROBABILITY)
])

In [11]:
def process_image(args):
    img_path, output_class_dir, num_augmentations = args
    image = cv2.imread(img_path)
    
    processed_image = basic_preprocessing(image)
    
    img_name = os.path.basename(img_path)
    cv2.imwrite(os.path.join(output_class_dir, f"proc_{img_name}"), cv2.cvtColor(processed_image, cv2.COLOR_RGB2BGR))
    for i in range(num_augmentations):
        augmented = weather_aug(image=processed_image)
        aug_image = augmented['image']
        cv2.imwrite(os.path.join(output_class_dir, f"aug_{i}_{img_name}"), cv2.cvtColor(aug_image, cv2.COLOR_RGB2BGR))

In [12]:
def process_dataset(dataset_name, num_augmentations):
    dataset_dir = os.path.join(INPUT_DIR, dataset_name)
    output_dataset_dir = os.path.join(OUTPUT_DIR, dataset_name)
    os.makedirs(output_dataset_dir, exist_ok=True)
    
    all_images = []
    
    for class_name in os.listdir(dataset_dir):
        class_dir = os.path.join(dataset_dir, class_name)
        output_class_dir = os.path.join(output_dataset_dir, class_name)
        os.makedirs(output_class_dir, exist_ok=True)
        
        for img_name in os.listdir(class_dir):
            img_path = os.path.join(class_dir, img_name)
            all_images.append((img_path, output_class_dir, num_augmentations))
    
    # Shuffle the images to distribute workload evenly
    random.shuffle(all_images)
    
    # Process images in parallel
    with multiprocessing.Pool(NUM_WORKERS) as pool:
        list(tqdm(pool.imap(process_image, all_images), total=len(all_images), desc=f"Processing {dataset_name}"))

In [13]:
def split_dataset(dataset_name, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1):
    dataset_dir = os.path.join(OUTPUT_DIR, dataset_name)
    train_dir = os.path.join(OUTPUT_DIR, f"{dataset_name}_train")
    val_dir = os.path.join(OUTPUT_DIR, f"{dataset_name}_val")
    test_dir = os.path.join(OUTPUT_DIR, f"{dataset_name}_test")
    
    for dir in [train_dir, val_dir, test_dir]:
        os.makedirs(dir, exist_ok=True)
    
    all_images = []
    for class_name in os.listdir(dataset_dir):
        class_dir = os.path.join(dataset_dir, class_name)
        images = os.listdir(class_dir)
        all_images.extend([(os.path.join(class_dir, img), class_name) for img in images])
    
    # Split the data
    train_val, test = train_test_split(all_images, test_size=test_ratio, random_state=42)
    train, val = train_test_split(train_val, test_size=val_ratio/(train_ratio+val_ratio), random_state=42)
    
    # Move images to respective directories
    for subset, subset_images in [("train", train), ("val", val), ("test", test)]:
        for img_path, class_name in tqdm(subset_images, desc=f"Moving {subset} images"):
            subset_dir = os.path.join(eval(f"{subset}_dir"), class_name)
            os.makedirs(subset_dir, exist_ok=True)
            img_name = os.path.basename(img_path)
            os.rename(img_path, os.path.join(subset_dir, img_name))

In [16]:
process_dataset("test", 6)

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'C:\\Users\\Ayush\\OneDrive\\Documents\\ml\\krishi.ai\\ml\\data\\test'

In [None]:

# Process datasets
process_dataset("Katra-Twelve", 2)
process_dataset("BARI-Sunflower", 2)
process_dataset("FGVC8", 6)



# Split datasets
split_dataset("Katra-Twelve")
split_dataset("BARI-Sunflower")
split_dataset("FGVC8")

print("Data preprocessing and splitting completed.")