In [1]:
import os

# Root folder that holds every dataset split produced for Joe's work
base_dir = 'JoeData'

# One sub-folder per split: GAN training, classifier training, classifier testing
gan_data_dir, classifier_train_dir, classifier_test_dir = (
    os.path.join(base_dir, split_name)
    for split_name in ('GAN_Training', 'Classifier_Training', 'Classifier_Testing')
)

# Create the root first, then each split folder (no error if one already exists)
for folder in (base_dir, gan_data_dir, classifier_train_dir, classifier_test_dir):
    os.makedirs(folder, exist_ok=True)

In [2]:
# 'competition_data' lives under <parent of the current working directory>/data
parent_of_cwd = os.path.dirname(os.getcwd())
competition_data_dir = os.path.join(parent_of_cwd, 'data', 'competition_data')

# Report the resolved path and whether it is present on disk
print("Path to competition_data:", competition_data_dir)
if os.path.exists(competition_data_dir):
    print("The competition_data directory was found.")
else:
    print("The competition_data directory does not exist!")

Path to competition_data: f:\Grad_School\CSCE_5215\GroupProject\Git\BrainSynergy\data\competition_data
The competition_data directory was found.


**Data Plan**
* Step 1: Reserve Anonymous Kaggle Test Set
• 20% of 3,096 total images: 619 images (Reserved for Kaggle competition)
* Step 2: Remaining Dataset
• 80% of 3,096 total images: 2,477 images (For our GAN and classifier)
* Step 3: GAN Training Subset
• 70% of 2,477: 1,732 images
• After a single horizontal flip: 3,464 images for GAN training
* Step 4: Classifier Training + Testing
• 30% of 2,477: 743 images
Training: 80% of 743 = 594 images
Testing: 20% of 743 = 149 images

**Summary**
* Anonymous Kaggle Test Set: 619 images
* GAN Training: 3,464 images (after a single data augmentation)
* Classifier Training: 594 images
* Classifier Testing: 149 images

In [3]:
from sklearn.model_selection import train_test_split
import shutil


# Class sub-folders expected under competition_data_dir (one folder per label).
# Only the keys are used below; the lists are left empty.
data = {
    'glioma_tumor': [],
    'meningioma_tumor': [],
    'pituitary_tumor': [],
    'normal': []
}

# Flat, index-aligned lists: image_paths[i] belongs to class labels[i]
image_paths = []
labels = []

for label in data.keys():
    class_dir = os.path.join(competition_data_dir, label)
    if os.path.isdir(class_dir):
        # os.listdir returns entries in arbitrary order, so sort them to keep
        # the seeded splits below reproducible. Sub-directories are skipped
        # because shutil.copy can only copy regular files.
        images = sorted(
            entry for entry in os.listdir(class_dir)
            if os.path.isfile(os.path.join(class_dir, entry))
        )
        image_paths += [os.path.join(class_dir, image) for image in images]
        labels += [label] * len(images)
    else:
        print(f"Directory not found: {class_dir}")

# Every split is copied into a flat folder keyed only by file name, so two
# source images that share a name can overwrite each other. Warn about it
# instead of losing files silently.
seen_names = set()
duplicate_names = set()
for path in image_paths:
    name = os.path.basename(path)
    if name in seen_names:
        duplicate_names.add(name)
    seen_names.add(name)
if duplicate_names:
    print(
        f"Warning: {len(duplicate_names)} file name(s) occur in more than one class folder; "
        "copies that land in the same split may overwrite each other."
    )

# Split data for GAN and classifier with stratification
# 70% for GAN training; remaining 30% for the classifier
gan_images, classifier_images, gan_labels, classifier_labels = train_test_split(
    image_paths, labels, test_size=0.3, stratify=labels, random_state=42
)

# Further split classifier images into training and testing with stratification
# 80% of the classifier share for training, 20% for testing
classifier_train, classifier_test, classifier_train_labels, classifier_test_labels = train_test_split(
    classifier_images, classifier_labels, test_size=0.2, stratify=classifier_labels, random_state=42
)


def _copy_images(paths, dest_dir):
    """Copy every file in `paths` into `dest_dir`, keeping its base name."""
    for img_path in paths:
        shutil.copy(img_path, os.path.join(dest_dir, os.path.basename(img_path)))


# Copy (not move) each split into its directory; the source data stays untouched
_copy_images(gan_images, gan_data_dir)
_copy_images(classifier_train, classifier_train_dir)
_copy_images(classifier_test, classifier_test_dir)

In [4]:
# Sanity check: does a directory hold the expected number of image files?
def verify_image_counts(directory, expected_count):
    """Print whether `directory` contains exactly `expected_count` images.

    A file counts as an image when its name ends with '.png', '.jpg' or
    '.jpeg' (case-sensitive match). Nothing is returned; the result is
    reported via print only.
    """
    image_extensions = ('.png', '.jpg', '.jpeg')
    actual_count = sum(1 for entry in os.listdir(directory) if entry.endswith(image_extensions))

    # Report the mismatch first and bail out; otherwise fall through to success
    if actual_count != expected_count:
        print(f"Verification failed: {directory} contains {actual_count} images, but expected {expected_count}.")
        return
    print(f"Verification passed: {directory} contains {actual_count} images as expected.")

# Expected per-directory counts right after the split, taken from the data plan
expected_split_counts = [
    (gan_data_dir, 1732),
    (classifier_train_dir, 594),
    (classifier_test_dir, 149),
]
for split_dir, expected in expected_split_counts:
    verify_image_counts(split_dir, expected)


Verification passed: JoeData\GAN_Training contains 1732 images as expected.
Verification passed: JoeData\Classifier_Training contains 594 images as expected.
Verification passed: JoeData\Classifier_Testing contains 149 images as expected.


In [5]:
# Helper that empties a directory while keeping the directory itself
def clear_directory(directory):
    """Delete every entry inside `directory`.

    Real sub-directories are removed recursively; regular files and symbolic
    links are unlinked. A failure on one entry is printed and the loop moves
    on to the next entry.
    """
    for entry_name in os.listdir(directory):
        target = os.path.join(directory, entry_name)
        try:
            if os.path.isdir(target) and not os.path.islink(target):
                # A real directory (not a link to one): remove the whole tree
                shutil.rmtree(target)
            elif os.path.isfile(target) or os.path.islink(target):
                # Files and links (including links to directories) are unlinked
                os.unlink(target)
        except Exception as err:
            print('Failed to delete %s. Reason: %s' % (target, err))

# Function calls below for testing code 
# clear_directory(gan_data_dir)
# clear_directory(classifier_train_dir)
# clear_directory(classifier_test_dir)

In [6]:
from PIL import Image

# Function to perform augmentation (horizontal flip here)
def augment_data(image_paths, save_dir, suffix='_f'):
    """Write a horizontally flipped copy of every image into `save_dir`.

    Args:
        image_paths: Iterable of paths to the source image files.
        save_dir: Directory that receives the flipped copies; created if it
            does not exist yet.
        suffix: Text inserted between the original base name and its
            extension to name the flipped copy (e.g. ``scan.jpg`` becomes
            ``scan_f.jpg`` with the default suffix).
    """
    os.makedirs(save_dir, exist_ok=True)

    for img_path in image_paths:
        # Build the output name: <base><suffix><extension>
        base, extension = os.path.splitext(os.path.basename(img_path))
        new_name = f"{base}{suffix}{extension}"

        # The context manager closes the source file as soon as the flipped
        # copy has been produced, instead of leaving that to garbage collection.
        with Image.open(img_path) as img:
            flipped_img = img.transpose(Image.FLIP_LEFT_RIGHT)

        # Save flipped image with newly created filename
        flipped_img.save(os.path.join(save_dir, new_name))

# Augment GAN data, effectively doubling it to 3,464 examples (1,732 originals + 1,732 flipped copies)
augment_data(gan_images, gan_data_dir)

In [7]:
# Expected per-directory counts after augmentation: only the GAN folder grows
expected_counts_after_augmentation = [
    (gan_data_dir, 3464),
    (classifier_train_dir, 594),
    (classifier_test_dir, 149),
]
for split_dir, expected in expected_counts_after_augmentation:
    verify_image_counts(split_dir, expected)

Verification passed: JoeData\GAN_Training contains 3464 images as expected.
Verification passed: JoeData\Classifier_Training contains 594 images as expected.
Verification passed: JoeData\Classifier_Testing contains 149 images as expected.
