In [1]:
# First Cell: Importing Libraries
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt

# File-management utilities (e.g., copying or removing image folders)
import shutil



In [2]:
# Second Cell: Load and Resize Images from Folder
def load_images_from_folder(folder_path, target_size=(256, 256)):
    """
    Load every readable image in a folder as grayscale and resize it.

    Args:
        folder_path: Directory to scan (not recursive).
        target_size: Size handed to ``cv2.resize`` as its ``dsize`` argument,
            i.e. ``(width, height)``.

    Returns:
        A numpy array stacking the resized images in sorted filename order.
        When no image could be loaded, an empty ``uint8`` array of shape
        ``(0, height, width)`` is returned so the result can still be
        concatenated with non-empty image arrays.
    """
    images = []
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # resulting array reproducible between runs and machines.
    for filename in sorted(os.listdir(folder_path)):
        img_path = os.path.join(folder_path, filename)
        if not os.path.isfile(img_path):
            continue  # sub-directories cannot be decoded as images
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)  # Load as grayscale
        if img is None:
            continue  # cv2.imread returns None for unreadable / non-image files
        images.append(cv2.resize(img, target_size))

    if not images:
        # np.array([]) would have shape (0,), which cannot be concatenated
        # with (N, height, width) image stacks.
        width, height = target_size
        return np.empty((0, height, width), dtype=np.uint8)
    return np.array(images)


# Load images from your "tumor" and "no-tumor" folders.
# These are machine-specific absolute Windows paths (raw strings, so the
# backslashes are taken literally); edit them before running elsewhere.
tumor_dir = r"C:\Users\anura\Desktop\REBIRTHTHENEWHOPE\Deep-Fake-Detection\data\DATA\Training\tumor"  # Update with the correct path
no_tumor_dir = r"C:\Users\anura\Desktop\REBIRTHTHENEWHOPE\Deep-Fake-Detection\data\DATA\Training\notumor"  # Update with the correct path

# Both classes are loaded with the default target_size of (256, 256).
tumor_images = load_images_from_folder(tumor_dir)
no_tumor_images = load_images_from_folder(no_tumor_dir)

# Report the per-class counts so class imbalance is visible before balancing.
print(f"Loaded {len(tumor_images)} tumor images and {len(no_tumor_images)} no-tumor images.")


Loaded 4117 tumor images and 1595 no-tumor images.


In [9]:
import random

# Number of images each class should contain after balancing; passed to
# oversample_images below. This is unrelated to the `target_size` parameter of
# load_images_from_folder, which is an image-size tuple.
# NOTE(review): 6380 equals 4 x 1595 (the no-tumor count printed by the loading
# cell) -- confirm that is the intended derivation.
target_size = 6380  # Desired size per class

# Oversampling function
def oversample_images(images, target_size, rng=None):
    """
    Grow an image collection to ``target_size`` by duplicating random entries.

    Args:
        images: Array-like of images indexed along the first axis (a numpy
            array or a list of equally shaped arrays).
        target_size: Desired number of images after oversampling.
        rng: Optional seed or ``numpy.random.Generator`` for reproducible
            draws. When omitted, the global ``np.random`` state is used, so
            ``np.random.seed`` still controls the result.

    Returns:
        ``images`` unchanged when it already holds at least ``target_size``
        entries. Otherwise a numpy array of exactly ``target_size`` images:
        the originals first, followed by duplicates drawn with replacement.

    Raises:
        ValueError: If ``images`` is empty while ``target_size`` is positive,
            because there is nothing to duplicate.
    """
    current_size = len(images)
    if current_size >= target_size:
        return images
    if current_size == 0:
        raise ValueError("Cannot oversample an empty image collection.")

    # Fancy indexing below needs an ndarray; plain lists would raise TypeError.
    images = np.asarray(images)
    n_extra = target_size - current_size
    if rng is None:
        indices = np.random.randint(0, current_size, size=n_extra)
    else:
        # default_rng accepts either a seed or an existing Generator.
        indices = np.random.default_rng(rng).integers(0, current_size, size=n_extra)
    return np.concatenate((images, images[indices]), axis=0)

# Undersampling function
def undersample_images(images, target_size, rng=None):
    """
    Shrink an image collection to ``target_size`` by random selection.

    Args:
        images: Array-like of images indexed along the first axis (a numpy
            array or a list of equally shaped arrays).
        target_size: Desired number of images after undersampling.
        rng: Optional seed or ``numpy.random.Generator`` for reproducible
            draws. When omitted, the global ``np.random`` state is used, so
            ``np.random.seed`` still controls the result.

    Returns:
        ``images`` unchanged when it holds at most ``target_size`` entries.
        Otherwise a numpy array of ``target_size`` distinct entries chosen
        without replacement (in the order they were drawn).
    """
    current_size = len(images)
    if current_size <= target_size:
        return images

    # Fancy indexing below needs an ndarray; plain lists would raise TypeError.
    images = np.asarray(images)
    if rng is None:
        indices = np.random.choice(current_size, target_size, replace=False)
    else:
        # default_rng accepts either a seed or an existing Generator.
        indices = np.random.default_rng(rng).choice(
            current_size, size=target_size, replace=False
        )
    return images[indices]

# Apply balancing: both classes are oversampled up to `target_size`
# (undersample_images is defined above but not used here).
# The names are rebound, so the unbalanced arrays are no longer reachable
# after this cell; re-run the loading cell to get them back.
no_tumor_images = oversample_images(no_tumor_images, target_size)
tumor_images = oversample_images(tumor_images, target_size)

# Confirm that both classes now have the same number of images.
print(f"Balanced: {len(no_tumor_images)} no-tumor images, {len(tumor_images)} tumor images")


Balanced: 6380 no-tumor images, 6380 tumor images


In [11]:
# Fourth Cell: Save the Balanced Dataset to Disk

def save_images(images, save_dir, prefix):
    """
    Write each image to ``save_dir`` as ``<prefix>_<index>.png``.

    Args:
        images: Iterable of images accepted by ``cv2.imwrite``.
        save_dir: Destination directory; created (with parents) if missing.
        prefix: Filename prefix (e.g., 'no_tumor').

    Raises:
        OSError: If ``cv2.imwrite`` reports that an image could not be written.
    """
    # exist_ok avoids the check-then-create race and is a no-op if the
    # directory is already there.
    os.makedirs(save_dir, exist_ok=True)

    for idx, img in enumerate(images):
        img_path = os.path.join(save_dir, f"{prefix}_{idx}.png")
        # cv2.imwrite signals failure by returning False rather than raising,
        # so the result must be checked to avoid silently missing files.
        if not cv2.imwrite(img_path, img):
            raise OSError(f"Failed to write image to {img_path}")

# Define the directories to save your balanced data.
# These are machine-specific absolute Windows paths located next to the source
# class folders under "Training"; edit them before running elsewhere.
save_no_tumor_dir = r"C:\Users\anura\Desktop\REBIRTHTHENEWHOPE\Deep-Fake-Detection\data\DATA\Training\notumor_1_SAVE"  # Update with the correct path
save_tumor_dir = r"C:\Users\anura\Desktop\REBIRTHTHENEWHOPE\Deep-Fake-Detection\data\DATA\Training\tumor_1_SAVE"  # Update with the correct path

# Save the images as no_tumor_<idx>.png / tumor_<idx>.png.
# If the balancing cell has run, these arrays contain oversampled duplicates,
# which are written out as separate files.
save_images(no_tumor_images, save_no_tumor_dir, "no_tumor")
save_images(tumor_images, save_tumor_dir, "tumor")

print(f"Balanced dataset saved: No-Tumor images in {save_no_tumor_dir}, Tumor images in {save_tumor_dir}.")


Balanced dataset saved: No-Tumor images in C:\Users\anura\Desktop\REBIRTHTHENEWHOPE\Deep-Fake-Detection\data\DATA\Training\notumor_1_SAVE, Tumor images in C:\Users\anura\Desktop\REBIRTHTHENEWHOPE\Deep-Fake-Detection\data\DATA\Training\tumor_1_SAVE.


In [13]:
# Fifth Cell: Prepare Dataset for PyTorch Training (Optional)

import torch
from torch.utils.data import Dataset, DataLoader

class TumorDataset(Dataset):
    """Binary-labelled dataset combining tumor (1) and no-tumor (0) images."""

    def __init__(self, tumor_images, no_tumor_images):
        """
        Keep both inputs and build one combined image array plus its labels.

        Tumor images come first in the combined array, followed by the
        no-tumor images; ``labels`` follows the same order.
        """
        self.tumor_images = tumor_images
        self.no_tumor_images = no_tumor_images

        # 1 marks a tumor image, 0 marks a no-tumor image.
        positive_labels = [1] * len(tumor_images)
        negative_labels = [0] * len(no_tumor_images)
        self.labels = positive_labels + negative_labels
        self.images = np.concatenate((tumor_images, no_tumor_images), axis=0)

    def __len__(self):
        """Total number of images across both classes."""
        return len(self.images)

    def __getitem__(self, idx):
        """
        Return ``(image, label)`` for position ``idx``.

        The image is converted to a float32 tensor with one extra leading
        axis; the label is the stored integer.
        """
        pixels = torch.tensor(self.images[idx], dtype=torch.float32)
        return pixels.unsqueeze(0), self.labels[idx]

# Create the dataset from whatever `tumor_images` / `no_tumor_images` currently
# hold (the balanced arrays if the balancing cell has been run).
dataset = TumorDataset(tumor_images, no_tumor_images)

# Create DataLoader for batching: batches of 32 samples, with shuffling
# enabled so the two classes are mixed rather than served tumor-first.
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Sanity check: should equal the sum of both class sizes.
print(f"Dataset size: {len(dataset)} images")


Dataset size: 12760 images
