In [1]:
!pip install -q kaggle

In [2]:
!mkdir ~/.kaggle

In [3]:
!mv /content/kaggle.json ~/.kaggle/


mv: cannot stat '/content/kaggle.json': No such file or directory


In [4]:
# start the download
!kaggle datasets download 'artyomkolas/3-kinds-of-pneumonia'

Dataset URL: https://www.kaggle.com/datasets/artyomkolas/3-kinds-of-pneumonia
License(s): Attribution 4.0 International (CC BY 4.0)
Downloading 3-kinds-of-pneumonia.zip to /content
 99% 3.47G/3.49G [00:34<00:00, 184MB/s]
100% 3.49G/3.49G [00:34<00:00, 109MB/s]


In [5]:
!unzip 3-kinds-of-pneumonia.zip -d dataset/

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: dataset/Curated X-Ray Dataset/Normal/Normal (691).jpg  
  inflating: dataset/Curated X-Ray Dataset/Normal/Normal (692).jpg  
  inflating: dataset/Curated X-Ray Dataset/Normal/Normal (693).jpg  
  inflating: dataset/Curated X-Ray Dataset/Normal/Normal (694).jpg  
  inflating: dataset/Curated X-Ray Dataset/Normal/Normal (695).jpg  
  inflating: dataset/Curated X-Ray Dataset/Normal/Normal (696).jpg  
  inflating: dataset/Curated X-Ray Dataset/Normal/Normal (697).jpg  
  inflating: dataset/Curated X-Ray Dataset/Normal/Normal (698).jpg  
  inflating: dataset/Curated X-Ray Dataset/Normal/Normal (699).jpg  
  inflating: dataset/Curated X-Ray Dataset/Normal/Normal (7).jpg  
  inflating: dataset/Curated X-Ray Dataset/Normal/Normal (70).jpg  
  inflating: dataset/Curated X-Ray Dataset/Normal/Normal (700).jpg  
  inflating: dataset/Curated X-Ray Dataset/Normal/Normal (701).jpg  
  inflating: dataset/Curated X-Ray Datase

In [6]:
import os
import random
import shutil
from pathlib import Path
import zipfile

def create_balanced_dataset(source_path, target_path, n_samples=1656):
    """
    Create a balanced dataset by sampling equal numbers of images from each relevant directory.

    Args:
        source_path (str): Path to the main directory containing subdirectories
        target_path (str): Path where the balanced dataset will be created
        n_samples (int): Number of images to sample from each directory
    """
    # Create target directory if it doesn't exist
    os.makedirs(target_path, exist_ok=True)

    # Explicitly specify the directories we want to process
    desired_dirs = ['Normal', 'Pneumonia-Bacterial', 'Pneumonia-Viral']

    print("Starting to process directories...")

    for subdir in desired_dirs:
        print(f"\nProcessing {subdir}...")

        # Create corresponding directory in target path
        subdir_target = os.path.join(target_path, subdir)
        os.makedirs(subdir_target, exist_ok=True)

        # Get list of all images in the source subdirectory
        source_dir = os.path.join(source_path, subdir)

        # Check if directory exists
        if not os.path.exists(source_dir):
            print(f"Warning: Directory {subdir} not found at {source_dir}")
            continue

        all_images = [f for f in os.listdir(source_dir)
                     if f.lower().endswith(('.png', '.jpg', '.jpeg'))]

        print(f"Found {len(all_images)} images in {subdir}")

        if len(all_images) < n_samples:
            print(f"Warning: Directory {subdir} has fewer images ({len(all_images)}) than requested ({n_samples})")

        # Randomly sample n_samples images
        selected_images = random.sample(all_images, min(n_samples, len(all_images)))

        # Copy selected images to target directory
        successful_copies = 0
        for img in selected_images:
            try:
                shutil.copy2(
                    os.path.join(source_dir, img),
                    os.path.join(subdir_target, img)
                )
                successful_copies += 1
            except Exception as e:
                print(f"Error copying {img}: {str(e)}")

        print(f"Successfully copied {successful_copies} images from {subdir}")

def create_zip_file(source_dir, zip_name="balanced_dataset.zip"):
    """
    Create a zip file from the balanced dataset directory.
    """
    if not os.path.exists(source_dir):
        print(f"Error: Directory {source_dir} does not exist!")
        return

    with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
        file_count = 0
        for root, _, files in os.walk(source_dir):
            for file in files:
                file_path = os.path.join(root, file)
                arcname = os.path.relpath(file_path, source_dir)
                zipf.write(file_path, arcname)
                file_count += 1

    zip_size = os.path.getsize(zip_name) / (1024 * 1024)  # Convert to MB
    print(f"\nCreated zip file: {zip_name}")
    print(f"Total files in zip: {file_count}")
    print(f"Zip file size: {zip_size:.2f} MB")

# Example usage
source_path = "/content/dataset/Curated X-Ray Dataset/"
target_path = "/content/balanced_dataset"
final_zip = "balanced_dataset.zip"

print("Starting the process...")
print(f"Source path: {source_path}")
print(f"Target path: {target_path}")

# Create balanced dataset
create_balanced_dataset(source_path, target_path)

# Create zip file
create_zip_file(target_path, final_zip)

Starting the process...
Source path: /content/dataset/Curated X-Ray Dataset/
Target path: /content/balanced_dataset
Starting to process directories...

Processing Normal...
Found 3270 images in Normal
Successfully copied 1656 images from Normal

Processing Pneumonia-Bacterial...
Found 3001 images in Pneumonia-Bacterial
Successfully copied 1656 images from Pneumonia-Bacterial

Processing Pneumonia-Viral...
Found 1656 images in Pneumonia-Viral
Successfully copied 1656 images from Pneumonia-Viral

Created zip file: balanced_dataset.zip
Total files in zip: 4968
Zip file size: 1646.35 MB


In [6]:
import os
import random
import shutil
import zipfile

# Paths to the original dataset directories
dir1_path = "/content/dataset/Curated X-Ray Dataset/Normal"
dir2_path = "/content/dataset/Curated X-Ray Dataset/Pneumonia-Bacterial"  # Pneumonia type 1
dir3_path = "/content/dataset/Curated X-Ray Dataset/Pneumonia-Viral"  # Pneumonia type 2

balanced_dataset_path = "/content/"
os.makedirs(balanced_dataset_path, exist_ok=True)

# Number of images required in each category
target_num_images = 1656

# Set a random seed for reproducibility
random.seed(42)

# Function to select images and copy them to the new directory
def select_and_copy_images(source_dir, target_dir, num_images):
    # Get all image filenames
    image_files = os.listdir(source_dir)

    # Select num_images randomly if more than required
    if len(image_files) > num_images:
        image_files = random.sample(image_files, num_images)

    # Create the target sub-directory
    os.makedirs(target_dir, exist_ok=True)

    # Copy selected images to target directory
    for image_file in image_files:
        shutil.copy(os.path.join(source_dir, image_file), target_dir)

# Select and copy images from each category
select_and_copy_images(dir1_path, os.path.join(balanced_dataset_path, "Normal"), target_num_images)
select_and_copy_images(dir2_path, os.path.join(balanced_dataset_path, "Pneumonia_Type1"), target_num_images)
select_and_copy_images(dir3_path, os.path.join(balanced_dataset_path, "Pneumonia_Type2"), target_num_images)

# Create zip file without compression
zip_path = "/content/balanced_dataset.zip"
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_STORED) as zipf:
    for root, _, files in os.walk(balanced_dataset_path):
        for file in files:
            file_path = os.path.join(root, file)
            zipf.write(file_path, os.path.relpath(file_path, balanced_dataset_path))
            print(f"Added {file_path} to zip.")  # Optional: Progress update

# Clear the balanced dataset directory if needed to free up space
shutil.rmtree(balanced_dataset_path)  # Uncomment if you want to delete the temporary files

print("Balanced dataset created and zipped successfully at:", zip_path)

Added /content/3-kinds-of-pneumonia.zip to zip.


KeyboardInterrupt: 

In [None]:
#download the zipped file from here
from google.colab import files
files.download('/content/balanced_dataset.zip')