<a href="https://colab.research.google.com/github/AnovaYoung/AI-System-for-Image-Restoration-and-Enhancement/blob/Data-Cleaning-and-Preparation/Cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
import zipfile
import os

# Mount Google Drive
drive.mount('/content/drive')

# Path to the zip file in Google Drive
zip_file_path = "/content/drive/My Drive/unified_dataset.zip"

# Directory to extract the zip file
extraction_dir = "/content/unified_dataset_extracted"

# Check if the file exists
if os.path.exists(zip_file_path):
    print("Zip file found! Extracting...")

    # Extract the zip file
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extraction_dir)
    print("Extraction complete!")

    # Verify contents
    print(f"Number of files in extracted dataset: {len(os.listdir(extraction_dir))}")
    print("Sample directories and files:")
    for root, dirs, files in os.walk(extraction_dir):
        print(f"Directory: {root}, Files: {files[:5]}")
        break
else:
    print("Zip file not found. Please check the path.")


Mounted at /content/drive
Zip file found! Extracting...
Extraction complete!
Number of files in extracted dataset: 4
Sample directories and files:
Directory: /content/unified_dataset_extracted, Files: []


I believe the way I structured this was that the Directory holds other Directories and the root files are within those, so lets explore further.

In [None]:
# Function to list files and subdirectories in the dataset
def explore_directory(directory, depth=1):
    for root, dirs, files in os.walk(directory):
        # Limit the exploration depth to avoid unnecessary details
        current_depth = root[len(directory):].count(os.sep)
        if current_depth < depth:
            print(f"Directory: {root}")
            print(f"Subdirectories: {dirs[:5]}")  # Show a sample of subdirectories
            print(f"Files: {files[:5]}")  # Show a sample of files
            print("-" * 40)

# Explore the extracted directory
explore_directory(extraction_dir, depth=2)


Directory: /content/unified_dataset_extracted
Subdirectories: ['super_resolution_hr', 'data', 'super_resolution_lr', 'content']
Files: []
----------------------------------------
Directory: /content/unified_dataset_extracted/super_resolution_hr
Subdirectories: []
Files: ['0482.img', '0079.img', '0200.img', '0426.img', '0189.img']
----------------------------------------
Directory: /content/unified_dataset_extracted/data
Subdirectories: ['cifar']
Files: []
----------------------------------------
Directory: /content/unified_dataset_extracted/super_resolution_lr
Subdirectories: []
Files: ['0482.img', '0079.img', '0200.img', '0426.img', '0189.img']
----------------------------------------
Directory: /content/unified_dataset_extracted/content
Subdirectories: ['data', 'unified_dataset_extracted']
Files: []
----------------------------------------


In [None]:
import os

# Define the path to the `/content` directory within the extracted dataset
content_dir_path = "/content/unified_dataset_extracted/content"

# Traverse the `/content` directory to analyze its structure
for root, dirs, files in os.walk(content_dir_path):
    print(f"Directory: {root}")
    print(f"Subdirectories: {dirs}")
    print(f"Files: {files}")
    print("-" * 40)


In [5]:
base_dir = "/content/unified_dataset_extracted"

# List to store paths of .txt files
txt_files = []

# Traverse the directory structure to find all .txt files
for root, _, files in os.walk(base_dir):
    for file in files:
        if file.endswith('.txt'):
            txt_files.append(os.path.join(root, file))

# Print the total count of .txt files found
print(f"Total .txt files found: {len(txt_files)}")

# Print first 10 .txt files as a sample
print("Sample .txt file paths:")
print(txt_files[:10])

# Function to display content of a .txt file
def preview_txt_file(file_path, num_lines=10):
    print(f"\nContents of {file_path}:")
    try:
        with open(file_path, 'r') as f:
            for i, line in enumerate(f):
                if i >= num_lines:
                    break
                print(line.strip())
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")

# Preview content of the first few .txt files
for file_path in txt_files[:5]:
    preview_txt_file(file_path)

Total .txt files found: 812
Sample .txt file paths:
['/content/unified_dataset_extracted/content/data/tiny_imagenet/tiny-imagenet-200/wnids.txt', '/content/unified_dataset_extracted/content/data/tiny_imagenet/tiny-imagenet-200/words.txt', '/content/unified_dataset_extracted/content/data/tiny_imagenet/tiny-imagenet-200/val/val_annotations.txt', '/content/unified_dataset_extracted/content/data/tiny_imagenet/tiny-imagenet-200/tiny-imagenet-200/wnids.txt', '/content/unified_dataset_extracted/content/data/tiny_imagenet/tiny-imagenet-200/tiny-imagenet-200/words.txt', '/content/unified_dataset_extracted/content/data/tiny_imagenet/tiny-imagenet-200/tiny-imagenet-200/val/val_annotations.txt', '/content/unified_dataset_extracted/content/data/tiny_imagenet/tiny-imagenet-200/tiny-imagenet-200/train/n01768244/n01768244_boxes.txt', '/content/unified_dataset_extracted/content/data/tiny_imagenet/tiny-imagenet-200/tiny-imagenet-200/train/n02002724/n02002724_boxes.txt', '/content/unified_dataset_extract

# Breakdown of .txt File

1. **wnids.txt**

Content: A list of WordNet IDs (wnid), likely corresponding to the categories or labels in the Tiny ImageNet dataset.

Use: These IDs map to specific object categories.

Example Content:

n02124075

n04067472

n04540053
...
2. **words.txt**

Content: A mapping of WordNet IDs (wnid) to their respective English labels or descriptions.

Use: Provides a human-readable explanation for the categories in wnids.txt.

Example Content:

n00001740	entity

n00001930	physical entity

n00002137	abstraction, abstract entity
...
3. **val_annotations.txt**

Content: Annotation data for the validation set, including:
1. Image name
2. Category ID (wnid)
3. Bounding box coordinates (x1, y1, x2, y2)

Use: Useful for tasks like object localization or evaluation.

Example Content:

val_0.JPEG	n03444034	0	32	44	62

val_1.JPEG	n04067472	52	55	57	59

...

Summary of .txt File Roles in Tiny ImageNet:

**wnids.txt:** Lists the categories available.

**words.txt**: Maps category IDs to descriptions.

**val_annotations.txt**: Provides validation image metadata, including bounding boxes.


# Explore /content/unified_dataset_extracted/content for duplicate or redundant data.

In [None]:
from collections import defaultdict

def explore_content_dir(base_dir):
    print(f"Exploring: {base_dir}")
    dir_structure = defaultdict(list)
    for root, dirs, files in os.walk(base_dir):
        relative_root = os.path.relpath(root, base_dir)
        if relative_root != ".":
            dir_structure[relative_root] = dirs + files

    for dir_path, items in dir_structure.items():
        print(f"Directory: {dir_path}")
        print(f"Contents ({len(items)} items): {', '.join(items[:10])}{'...' if len(items) > 10 else ''}")
        print("-" * 40)

content_dir = "/content/unified_dataset_extracted/content"
explore_content_dir(content_dir)


# Check CIFAR Directory

In [7]:
def explore_cifar_structure(cifar_dir):
    print(f"Exploring: {cifar_dir}")
    for root, dirs, files in os.walk(cifar_dir):
        print(f"Directory: {root}")
        print(f"Subdirectories: {dirs}")
        print(f"Files ({len(files)}): {files[:10]}")
        print("-" * 40)

cifar_dir = "/content/unified_dataset_extracted/data/cifar"
explore_cifar_structure(cifar_dir)


Exploring: /content/unified_dataset_extracted/data/cifar
Directory: /content/unified_dataset_extracted/data/cifar
Subdirectories: ['cifar10_lr', 'cifar10_hr', 'cifar100_lr', 'cifar100_hr']
Files (0): []
----------------------------------------
Directory: /content/unified_dataset_extracted/data/cifar/cifar10_lr
Subdirectories: []
Files (50000): ['23095_lr.img', '601_lr.img', '1158_lr.img', '1577_lr.img', '34179_lr.img', '34011_lr.img', '45339_lr.img', '13748_lr.img', '31112_lr.img', '4361_lr.img']
----------------------------------------
Directory: /content/unified_dataset_extracted/data/cifar/cifar10_hr
Subdirectories: []
Files (50000): ['1373_hr.img', '34796_hr.img', '29068_hr.img', '40732_hr.img', '15178_hr.img', '44114_hr.img', '30262_hr.img', '43943_hr.img', '36923_hr.img', '6911_hr.img']
----------------------------------------
Directory: /content/unified_dataset_extracted/data/cifar/cifar100_lr
Subdirectories: []
Files (50000): ['23095_lr.img', '601_lr.img', '1158_lr.img', '1577_

# Verify images align with intended tasks

In [8]:
from PIL import Image

def verify_images(base_dir, task_dirs):
    for task_dir in task_dirs:
        task_path = os.path.join(base_dir, task_dir)
        print(f"Verifying images in {task_dir}...")
        for img_file in os.listdir(task_path)[:10]:  # Checking a few images
            img_path = os.path.join(task_path, img_file)
            try:
                with Image.open(img_path) as img:
                    print(f"Image {img_file}: {img.size}, {img.mode}")
            except Exception as e:
                print(f"Error with file {img_file}: {e}")

task_base = "/content/unified_dataset_extracted"
tasks = ["super_resolution_hr", "super_resolution_lr"]
verify_images(task_base, tasks)


Verifying images in super_resolution_hr...
Image 0482.img: (2040, 1488), RGB
Image 0079.img: (2040, 1356), RGB
Image 0200.img: (2040, 1356), RGB
Image 0426.img: (2040, 1164), RGB
Image 0189.img: (2040, 1356), RGB
Image 0081.img: (2040, 1356), RGB
Image 0342.img: (2040, 1356), RGB
Image 0346.img: (2040, 1356), RGB
Image 0796.img: (2040, 1356), RGB
Image 0631.img: (2040, 1356), RGB
Verifying images in super_resolution_lr...
Image 0482.img: (510, 372), RGB
Image 0079.img: (510, 339), RGB
Image 0200.img: (510, 339), RGB
Image 0426.img: (510, 291), RGB
Image 0189.img: (510, 339), RGB
Image 0081.img: (510, 339), RGB
Image 0342.img: (510, 339), RGB
Image 0346.img: (510, 339), RGB
Image 0796.img: (510, 339), RGB
Image 0631.img: (510, 339), RGB


In [9]:
import shutil

def clean_redundancy(base_dir, save_space=False):
    unique_dirs = set()
    duplicates = []

    for root, dirs, files in os.walk(base_dir):
        dir_tuple = tuple(sorted(dirs + files))
        if dir_tuple in unique_dirs:
            duplicates.append(root)
        else:
            unique_dirs.add(dir_tuple)

    print(f"Found {len(duplicates)} duplicate directories.")
    if save_space:
        for dup in duplicates:
            shutil.rmtree(dup)
            print(f"Removed duplicate: {dup}")

content_dir = "/content/unified_dataset_extracted/content"
clean_redundancy(content_dir, save_space=False)


Found 1421 duplicate directories.


# Next Steps

**Handle Duplicates:**

Automate the removal of duplicate directories and verify integrity.

**Align Low-Res and High-Res Image Pairs:**

Check if each low-res image in the CIFAR and super-resolution datasets has a corresponding high-res image.

**Metadata Exploration:**

Investigate .txt files (_boxes.txt) to confirm their utility for tasks like inpainting or bounding box annotations.


# Step 1: Remove Duplicate Directories and Validate Integrity

In [10]:
from filecmp import dircmp

# Base directory
base_dir = "/content/unified_dataset_extracted/content"

# Function to compare and remove duplicate directories
def remove_duplicates(base_dir):
    seen_directories = {}
    duplicates_removed = 0

    for root, dirs, _ in os.walk(base_dir):
        for dir_name in dirs:
            dir_path = os.path.join(root, dir_name)
            # Check if this directory is identical to one we've already seen
            for seen_dir, seen_path in seen_directories.items():
                comparison = dircmp(dir_path, seen_path)
                if not comparison.left_only and not comparison.right_only and not comparison.diff_files:
                    print(f"Duplicate found: {dir_path} matches {seen_path}. Removing duplicate.")
                    shutil.rmtree(dir_path)
                    duplicates_removed += 1
                    break
            else:
                seen_directories[dir_name] = dir_path  # Add new unique directory

    print(f"Total duplicates removed: {duplicates_removed}")

# Execute duplicate removal
remove_duplicates(base_dir)


Duplicate found: /content/unified_dataset_extracted/content/data/tiny_imagenet/tiny-imagenet-200/val/organized matches /content/unified_dataset_extracted/content/data/tiny_imagenet/tiny-imagenet-200/train. Removing duplicate.
Duplicate found: /content/unified_dataset_extracted/content/data/tiny_imagenet/tiny-imagenet-200/tiny-imagenet-200/test matches /content/unified_dataset_extracted/content/data/tiny_imagenet/tiny-imagenet-200/test. Removing duplicate.
Duplicate found: /content/unified_dataset_extracted/content/data/tiny_imagenet/tiny-imagenet-200/tiny-imagenet-200/val matches /content/unified_dataset_extracted/content/data/tiny_imagenet/tiny-imagenet-200/val. Removing duplicate.
Duplicate found: /content/unified_dataset_extracted/content/data/tiny_imagenet/tiny-imagenet-200/tiny-imagenet-200/train matches /content/unified_dataset_extracted/content/data/tiny_imagenet/tiny-imagenet-200/train. Removing duplicate.
Duplicate found: /content/unified_dataset_extracted/content/unified_data

# Step 2: Verify Alignment of Low-Res and High-Res Image Pairs

In [11]:
import glob

# Paths for CIFAR and super-resolution datasets
cifar_paths = ["/content/unified_dataset_extracted/data/cifar/cifar10_lr",
               "/content/unified_dataset_extracted/data/cifar/cifar10_hr",
               "/content/unified_dataset_extracted/data/cifar/cifar100_lr",
               "/content/unified_dataset_extracted/data/cifar/cifar100_hr"]

super_res_paths = ["/content/unified_dataset_extracted/super_resolution_lr",
                   "/content/unified_dataset_extracted/super_resolution_hr"]

def check_alignment(low_res_dir, high_res_dir):
    low_res_files = set(os.path.basename(f) for f in glob.glob(f"{low_res_dir}/*.img"))
    high_res_files = set(os.path.basename(f) for f in glob.glob(f"{high_res_dir}/*.img"))

    # Match files
    missing_in_high_res = low_res_files - high_res_files
    missing_in_low_res = high_res_files - low_res_files

    print(f"Checking alignment between {low_res_dir} and {high_res_dir}")
    print(f"Missing in high-res: {len(missing_in_high_res)}")
    print(f"Missing in low-res: {len(missing_in_low_res)}")

    if missing_in_high_res:
        print(f"Sample missing in high-res: {list(missing_in_high_res)[:5]}")
    if missing_in_low_res:
        print(f"Sample missing in low-res: {list(missing_in_low_res)[:5]}")

# Verify CIFAR datasets
for i in range(0, len(cifar_paths), 2):
    check_alignment(cifar_paths[i], cifar_paths[i + 1])

# Verify super-resolution datasets
check_alignment(super_res_paths[0], super_res_paths[1])


Checking alignment between /content/unified_dataset_extracted/data/cifar/cifar10_lr and /content/unified_dataset_extracted/data/cifar/cifar10_hr
Missing in high-res: 50000
Missing in low-res: 50000
Sample missing in high-res: ['3401_lr.img', '39615_lr.img', '3560_lr.img', '11408_lr.img', '37560_lr.img']
Sample missing in low-res: ['3532_hr.img', '5586_hr.img', '29499_hr.img', '16280_hr.img', '39747_hr.img']
Checking alignment between /content/unified_dataset_extracted/data/cifar/cifar100_lr and /content/unified_dataset_extracted/data/cifar/cifar100_hr
Missing in high-res: 50000
Missing in low-res: 50000
Sample missing in high-res: ['3401_lr.img', '39615_lr.img', '3560_lr.img', '11408_lr.img', '37560_lr.img']
Sample missing in low-res: ['3532_hr.img', '5586_hr.img', '29499_hr.img', '16280_hr.img', '39747_hr.img']
Checking alignment between /content/unified_dataset_extracted/super_resolution_lr and /content/unified_dataset_extracted/super_resolution_hr
Missing in high-res: 0
Missing in l

In [12]:
txt_base_dir = "/content/unified_dataset_extracted"

def analyze_txt_files(base_dir):
    txt_files = glob.glob(f"{base_dir}/**/*.txt", recursive=True)
    print(f"Total .txt files found: {len(txt_files)}")

    for file_path in txt_files[:5]:  # Display the first 5 files as samples
        print(f"\nContents of {file_path}:")
        try:
            with open(file_path, "r") as f:
                content = f.readlines()
                print("".join(content[:10]))  # Show the first 10 lines
        except Exception as e:
            print(f"Error reading {file_path}: {e}")

analyze_txt_files(txt_base_dir)


Total .txt files found: 205

Contents of /content/unified_dataset_extracted/content/data/tiny_imagenet/tiny-imagenet-200/wnids.txt:
n02124075
n04067472
n04540053
n04099969
n07749582
n01641577
n02802426
n09246464
n07920052
n03970156


Contents of /content/unified_dataset_extracted/content/data/tiny_imagenet/tiny-imagenet-200/words.txt:
n00001740	entity
n00001930	physical entity
n00002137	abstraction, abstract entity
n00002452	thing
n00002684	object, physical object
n00003553	whole, unit
n00003993	congener
n00004258	living thing, animate thing
n00004475	organism, being
n00005787	benthos


Contents of /content/unified_dataset_extracted/content/data/tiny_imagenet/tiny-imagenet-200/val/val_annotations.txt:
val_0.JPEG	n03444034	0	32	44	62
val_1.JPEG	n04067472	52	55	57	59
val_2.JPEG	n04070727	4	0	60	55
val_3.JPEG	n02808440	3	3	63	63
val_4.JPEG	n02808440	9	27	63	48
val_5.JPEG	n04399382	7	0	59	63
val_6.JPEG	n04179913	0	0	63	56
val_7.JPEG	n02823428	5	0	57	63
val_8.JPEG	n04146614	0	31	60	60
val_9

**Missing Alignment Between Low-Res and High-Res for CIFAR:**

Both cifar10 and cifar100 exhibit a complete mismatch between their lr (low-resolution) and hr (high-resolution) datasets.


# Proposed Plan:

Select CIFAR-100 for Refinement:

Since CIFAR-100 contains more classes (100 vs. 10 in CIFAR-10), it's a richer dataset for demonstrating resolution-based image restoration tasks.
Let's focus on aligning HR and LR images for CIFAR-100.

Generate LR Images from HR:

Use the HR versions as the source dataset.
Downscale the HR images to a lower resolution programmatically, creating new matching LR counterparts.

Potentially drop CIFAR-10.


In [17]:
import numpy as np
from tensorflow.keras.datasets import cifar100
from sklearn.model_selection import train_test_split

hr_dir = "/content/unified_dataset_extracted/super_resolution_hr"
lr_dir = "/content/unified_dataset_extracted/super_resolution_lr"
os.makedirs(hr_dir, exist_ok=True)
os.makedirs(lr_dir, exist_ok=True)

# Load CIFAR-100 dataset
(x_train, _), (x_test, _) = cifar100.load_data()
x_data = np.concatenate([x_train, x_test], axis=0)

# Select 50,000 for training and 10,000 for testing
x_train, x_test = train_test_split(x_data, test_size=0.2, random_state=42)

# Ensure the counts align with your specifications
x_train = x_train[:50000]
x_test = x_test[:10000]

# Save high-resolution and corresponding low-resolution images
def save_images(images, hr_dir, lr_dir, start_idx=0):
    for i, img in enumerate(images):
        # Save high-resolution image
        hr_path = os.path.join(hr_dir, f"img_{start_idx + i:05d}_hr.png")
        Image.fromarray(img).save(hr_path)

        # Create and save low-resolution image
        lr_img = Image.fromarray(img).resize((16, 16), Image.BICUBIC).resize((32, 32), Image.BICUBIC)
        lr_path = os.path.join(lr_dir, f"img_{start_idx + i:05d}_lr.png")
        lr_img.save(lr_path)

    print(f"Saved {len(images)} high-resolution and low-resolution pairs.")

# Save training and testing pairs
save_images(x_train, hr_dir, lr_dir, start_idx=0)
save_images(x_test, hr_dir, lr_dir, start_idx=50000)

print("CIFAR-100 dataset preparation complete.")


Saved 48000 high-resolution and low-resolution pairs.
Saved 10000 high-resolution and low-resolution pairs.
CIFAR-100 dataset preparation complete.


VERIFY BEFORE CONCATANATION

In [20]:
print(f"Total files in HR directory: {len(hr_files)}")
print(f"Total files in LR directory: {len(lr_files)}")

# Split train and test by filename convention
train_hr_files = [f for f in hr_files if "train" in f]
train_lr_files = [f for f in lr_files if "train" in f]
test_hr_files = [f for f in hr_files if "test" in f]
test_lr_files = [f for f in lr_files if "test" in f]

print(f"Training HR files: {len(train_hr_files)}")
print(f"Training LR files: {len(train_lr_files)}")
print(f"Test HR files: {len(test_hr_files)}")
print(f"Test LR files: {len(test_lr_files)}")

# Check for missing files
missing_in_lr = [hr.replace("_hr", "_lr") for hr in train_hr_files if hr.replace("_hr", "_lr") not in train_lr_files]
missing_in_hr = [lr.replace("_lr", "_hr") for lr in train_lr_files if lr.replace("_lr", "_hr") not in train_hr_files]

# Print missing files for training
print(f"Missing in LR (train): {len(missing_in_lr)} files")
print(f"Missing in HR (train): {len(missing_in_hr)} files")

# Repeat for test files
missing_in_lr_test = [hr.replace("_hr", "_lr") for hr in test_hr_files if hr.replace("_hr", "_lr") not in test_lr_files]
missing_in_hr_test = [lr.replace("_lr", "_hr") for lr in test_lr_files if lr.replace("_lr", "_hr") not in test_hr_files]

# Print missing files for testing
print(f"Missing in LR (test): {len(missing_in_lr_test)} files")
print(f"Missing in HR (test): {len(missing_in_hr_test)} files")


Total files in HR directory: 58000
Total files in LR directory: 58000
Training HR files: 0
Training LR files: 0
Test HR files: 0
Test LR files: 0
Missing in LR (train): 0 files
Missing in HR (train): 0 files
Missing in LR (test): 0 files
Missing in HR (test): 0 files


In [21]:
# Split files based on indices
train_hr_files = sorted(hr_files)[:48000]
train_lr_files = sorted(lr_files)[:48000]
test_hr_files = sorted(hr_files)[48000:]
test_lr_files = sorted(lr_files)[48000:]

# Debugging counts
print(f"Training HR files: {len(train_hr_files)}")
print(f"Training LR files: {len(train_lr_files)}")
print(f"Test HR files: {len(test_hr_files)}")
print(f"Test LR files: {len(test_lr_files)}")

# Verify counts
assert len(train_hr_files) == len(train_lr_files) == 48000, "Training pairs mismatch!"
assert len(test_hr_files) == len(test_lr_files) == 10000, "Test pairs mismatch!"

# Verify filenames match
assert all(hr.replace("_hr", "_lr") == lr for hr, lr in zip(train_hr_files, train_lr_files)), "Training filenames do not match!"
assert all(hr.replace("_hr", "_lr") == lr for hr, lr in zip(test_hr_files, test_lr_files)), "Test filenames do not match!"


Training HR files: 48000
Training LR files: 48000
Test HR files: 10000
Test LR files: 10000


In [23]:
hr_dir = "/content/unified_dataset_extracted/super_resolution_hr"
lr_dir = "/content/unified_dataset_extracted/super_resolution_lr"

# Load filenames
hr_files = sorted(os.listdir(hr_dir))
lr_files = sorted(os.listdir(lr_dir))

# Verify counts
assert len(hr_files) == len(lr_files) == 58000, "Total pairs mismatch!"

# Split into training and testing
train_hr_files = hr_files[:48000]
train_lr_files = lr_files[:48000]
test_hr_files = hr_files[48000:]
test_lr_files = lr_files[48000:]

# Verify training counts
assert len(train_hr_files) == len(train_lr_files) == 48000, "Training pairs mismatch!"
# Verify testing counts
assert len(test_hr_files) == len(test_lr_files) == 10000, "Test pairs mismatch!"

# Verify filenames match
train_mismatched = [
    (hr, lr) for hr, lr in zip(train_hr_files, train_lr_files)
    if hr.replace("_hr", "_lr") != lr
]
test_mismatched = [
    (hr, lr) for hr, lr in zip(test_hr_files, test_lr_files)
    if hr.replace("_hr", "_lr") != lr
]

if not train_mismatched and not test_mismatched:
    print("All pairs match!")
else:
    print(f"Training mismatches: {len(train_mismatched)}")
    print(f"Test mismatches: {len(test_mismatched)}")
    if train_mismatched:
        print("Sample training mismatches:", train_mismatched[:5])
    if test_mismatched:
        print("Sample test mismatches:", test_mismatched[:5])


All pairs match!


In [24]:
# Paths to CIFAR-10 and CIFAR-100 in the unified dataset
unified_base_dir = "/content/unified_dataset_extracted"
cifar10_dir = os.path.join(unified_base_dir, "data/cifar/cifar10")
cifar10_lr_dir = os.path.join(unified_base_dir, "data/cifar/cifar10_lr")
cifar10_hr_dir = os.path.join(unified_base_dir, "data/cifar/cifar10_hr")
cifar100_lr_dir = os.path.join(unified_base_dir, "data/cifar/cifar100_lr")
cifar100_hr_dir = os.path.join(unified_base_dir, "data/cifar/cifar100_hr")

# List of directories to remove
dirs_to_remove = [cifar10_dir, cifar10_lr_dir, cifar10_hr_dir, cifar100_lr_dir, cifar100_hr_dir]

# Remove directories if they exist
for dir_path in dirs_to_remove:
    if os.path.exists(dir_path):
        shutil.rmtree(dir_path)
        print(f"Deleted: {dir_path}")
    else:
        print(f"Directory not found: {dir_path}")

print("Old CIFAR datasets removed.")


Directory not found: /content/unified_dataset_extracted/data/cifar/cifar10
Deleted: /content/unified_dataset_extracted/data/cifar/cifar10_lr
Deleted: /content/unified_dataset_extracted/data/cifar/cifar10_hr
Deleted: /content/unified_dataset_extracted/data/cifar/cifar100_lr
Deleted: /content/unified_dataset_extracted/data/cifar/cifar100_hr
Old CIFAR datasets removed.


In [25]:
base_dir = "/content/unified_dataset_extracted"

# Function to collect statistics
def get_dataset_statistics(base_dir):
    dataset_stats = {}
    for root, dirs, files in os.walk(base_dir):
        if files:
            dataset_stats[root] = len(files)
    return dataset_stats

# Collect statistics
dataset_stats = get_dataset_statistics(base_dir)

# Display summary
total_files = sum(dataset_stats.values())
print(f"Total files in unified dataset: {total_files}")
print("\nSample dataset statistics (first 10):")
for path, file_count in list(dataset_stats.items())[:10]:
    print(f"{path}: {file_count} files")

# Check for CIFAR-related directories
print("\nChecking for CIFAR-related directories...")
cifar_keywords = ["cifar", "cifar10", "cifar100"]
cifar_dirs = [path for path in dataset_stats if any(keyword in path.lower() for keyword in cifar_keywords)]

if cifar_dirs:
    print("CIFAR-related directories found:")
    for path in cifar_dirs:
        print(f"- {path}")
else:
    print("No CIFAR-related directories found in the dataset.")

Total files in unified dataset: 544492

Sample dataset statistics (first 10):
/content/unified_dataset_extracted/super_resolution_hr: 58000 files
/content/unified_dataset_extracted/super_resolution_lr: 58000 files
/content/unified_dataset_extracted/content/data/coco/train2017: 118287 files
/content/unified_dataset_extracted/content/data/tiny_imagenet/tiny-imagenet-200: 2 files
/content/unified_dataset_extracted/content/data/tiny_imagenet/tiny-imagenet-200/test/images: 10000 files
/content/unified_dataset_extracted/content/data/tiny_imagenet/tiny-imagenet-200/val: 1 files
/content/unified_dataset_extracted/content/data/tiny_imagenet/tiny-imagenet-200/tiny-imagenet-200: 2 files
/content/unified_dataset_extracted/content/data/tiny_imagenet/tiny-imagenet-200/train/n01768244: 1 files
/content/unified_dataset_extracted/content/data/tiny_imagenet/tiny-imagenet-200/train/n01768244/images: 500 files
/content/unified_dataset_extracted/content/data/tiny_imagenet/tiny-imagenet-200/train/n02002724:

# Now Integrate New CIFAR-100 Data

In [27]:
new_cifar_hr_dir = "/content/unified_dataset_extracted/super_resolution_hr"
new_cifar_lr_dir = "/content/unified_dataset_extracted/super_resolution_lr"

# Final target paths for integration with the unified dataset
final_unified_hr_dir = "/content/unified_dataset_extracted/content/super_resolution_hr"
final_unified_lr_dir = "/content/unified_dataset_extracted/content/super_resolution_lr"

# Ensure target directories exist
os.makedirs(final_unified_hr_dir, exist_ok=True)
os.makedirs(final_unified_lr_dir, exist_ok=True)

# Function that moves files to final dataset
def integrate_cifar_data(source_dir, target_dir):
    for filename in os.listdir(source_dir):
        source_path = os.path.join(source_dir, filename)
        target_path = os.path.join(target_dir, filename)
        shutil.move(source_path, target_path)
    print(f"Integrated {len(os.listdir(target_dir))} files into {target_dir}")

# Integrate HR and LR datasets into their respective final locations
integrate_cifar_data(new_cifar_hr_dir, final_unified_hr_dir)
integrate_cifar_data(new_cifar_lr_dir, final_unified_lr_dir)

print("\nCIFAR-100 dataset successfully integrated into the unified dataset structure!")


Integrated 118000 files into /content/unified_dataset_extracted/content/super_resolution_hr
Integrated 58000 files into /content/unified_dataset_extracted/content/super_resolution_lr

CIFAR-100 dataset successfully integrated into the unified dataset structure!


# VERIFY

In [29]:
# List files that don't match the CIFAR-100 naming convention
extra_hr_files = [f for f in hr_files if not f.endswith("_hr.png")]

print(f"Number of extra files in HR directory: {len(extra_hr_files)}")
if extra_hr_files:
    print(f"Sample extra files: {extra_hr_files[:10]}")


Number of extra files in HR directory: 60000
Sample extra files: ['cifar100_hr_0.png', 'cifar100_hr_1.png', 'cifar100_hr_10.png', 'cifar100_hr_100.png', 'cifar100_hr_1000.png', 'cifar100_hr_10000.png', 'cifar100_hr_10001.png', 'cifar100_hr_10002.png', 'cifar100_hr_10003.png', 'cifar100_hr_10004.png']


In [30]:
import os

# Remove extra CIFAR-100 files from HR directory
extra_hr_files = [f for f in hr_files if f.startswith("cifar100_hr_")]

for file in extra_hr_files:
    os.remove(os.path.join(hr_dir, file))

print(f"Removed {len(extra_hr_files)} extra files from HR directory.")


Removed 60000 extra files from HR directory.


In [31]:
# Reload the directories after cleanup
hr_files = sorted(os.listdir(hr_dir))
lr_files = sorted(os.listdir(lr_dir))

print(f"Total files in HR directory after cleanup: {len(hr_files)}")
print(f"Total files in LR directory after cleanup: {len(lr_files)}")


Total files in HR directory after cleanup: 58000
Total files in LR directory after cleanup: 58000


In [32]:
# Verify matching pairs
assert len(hr_files) == len(lr_files), "Mismatch in number of HR and LR files!"
assert all(hr.replace("_hr", "_lr") == lr for hr, lr in zip(hr_files, lr_files)), "Filenames do not match between HR and LR!"

print("All HR and LR pairs match perfectly!")

All HR and LR pairs match perfectly!


# Find DIV2K

In [34]:
base_dir = "/content/unified_dataset_extracted/content"

# Function to locate DIV2K data within the dataset
def find_div2k_data(base_dir):
    div2k_dirs = []
    for root, dirs, files in os.walk(base_dir):
        if "div2k" in root.lower():
            div2k_dirs.append(root)

    return div2k_dirs

# Locate DIV2K directories
div2k_dirs = find_div2k_data(base_dir)

# Check results
if div2k_dirs:
    print(f"Found DIV2K data in {len(div2k_dirs)} directories:")
    for dir_path in div2k_dirs:
        print(f"Directory: {dir_path}")
        sample_files = [f for f in os.listdir(dir_path) if f.endswith(".img")]
        print(f"Sample .img files ({len(sample_files)} found): {sample_files[:5]}")
else:
    print("No DIV2K data found in the unified dataset.")


No DIV2K data found in the unified dataset.


In [35]:
# Function to locate any files or directories with "DIV2K" in their name
def find_div2k_references(base_dir):
    div2k_references = []
    for root, dirs, files in os.walk(base_dir):
        for name in dirs + files:
            if "div2k" in name.lower():
                div2k_references.append(os.path.join(root, name))

    return div2k_references

# Search for any DIV2K-related references
div2k_references = find_div2k_references(base_dir)

# Display results
if div2k_references:
    print(f"Found {len(div2k_references)} DIV2K-related files or directories:")
    for ref in div2k_references[:10]:  # Show first 10 references
        print(ref)
else:
    print("No DIV2K-related files or directories found in the unified dataset.")


No DIV2K-related files or directories found in the unified dataset.


In [None]:
base_dir = "/content/unified_dataset_extracted"

def summarize_data_sources(base_dir):
    dataset_summary = {}

    # Walk through the base directory
    for root, dirs, files in os.walk(base_dir):
        if files:
            # Filter file extensions and count the number of files
            img_files = [file for file in files if file.endswith('.img')]
            txt_files = [file for file in files if file.endswith('.txt')]

            # Store in summary
            dataset_summary[root] = {
                "total_files": len(files),
                "image_files": len(img_files),
                "text_files": len(txt_files),
                "other_files": len(files) - len(img_files) - len(txt_files)
            }

    return dataset_summary

# Run the summary function and display results
dataset_summary = summarize_data_sources(base_dir)

# Print summary of datasets
print(f"Data Sources in {base_dir}:")
for path, counts in dataset_summary.items():
    print(f"Directory: {path}")
    print(f"  Total Files: {counts['total_files']}")
    print(f"  Image Files (.img): {counts['image_files']}")
    print(f"  Text Files (.txt): {counts['text_files']}")
    print(f"  Other Files: {counts['other_files']}")
    print("-" * 40)


In [37]:
# Path to mystery directory
mystery_dir = "/content/unified_dataset_extracted/content/unified_dataset_extracted"

# List a sample of files
mystery_files = sorted(os.listdir(mystery_dir))[:10]
print("Sample Files in Mystery Directory:")
print(mystery_files)

# Check properties of a few files
print("\nFile Properties:")
for file in mystery_files:
    file_path = os.path.join(mystery_dir, file)
    try:
        # Load the .img file as a numpy array to check dimensions
        img = np.fromfile(file_path, dtype=np.uint8)
        print(f"{file}: Shape={img.shape}")
    except Exception as e:
        print(f"Error reading {file}: {e}")


Sample Files in Mystery Directory:
['coco_000000000009.img', 'coco_000000000030.img', 'coco_000000000036.img', 'coco_000000000042.img', 'coco_000000000049.img', 'coco_000000000064.img', 'coco_000000000071.img', 'coco_000000000073.img', 'coco_000000000074.img', 'coco_000000000077.img']

File Properties:
coco_000000000009.img: Shape=(224297,)
coco_000000000030.img: Shape=(71463,)
coco_000000000036.img: Shape=(260207,)
coco_000000000042.img: Shape=(213308,)
coco_000000000049.img: Shape=(124619,)
coco_000000000064.img: Shape=(220869,)
coco_000000000071.img: Shape=(214185,)
coco_000000000073.img: Shape=(383651,)
coco_000000000074.img: Shape=(176151,)
coco_000000000077.img: Shape=(159213,)


In [38]:
# List the last 10 files in the mystery directory
mystery_files_last = sorted(os.listdir(mystery_dir))[-10:]
print("Last 10 Files in Mystery Directory:")
print(mystery_files_last)

# Check properties of the last 10 files
print("\nFile Properties for Last 10 Files:")
for file in mystery_files_last:
    file_path = os.path.join(mystery_dir, file)
    try:
        # Load the .img file as a numpy array to check dimensions
        img = np.fromfile(file_path, dtype=np.uint8)
        print(f"{file}: Shape={img.shape}")
    except Exception as e:
        print(f"Error reading {file}: {e}")


Last 10 Files in Mystery Directory:
['tiny_imagenet_n12267677_90.img', 'tiny_imagenet_n12267677_91.img', 'tiny_imagenet_n12267677_92.img', 'tiny_imagenet_n12267677_93.img', 'tiny_imagenet_n12267677_94.img', 'tiny_imagenet_n12267677_95.img', 'tiny_imagenet_n12267677_96.img', 'tiny_imagenet_n12267677_97.img', 'tiny_imagenet_n12267677_98.img', 'tiny_imagenet_n12267677_99.img']

File Properties for Last 10 Files:
tiny_imagenet_n12267677_90.img: Shape=(1858,)
tiny_imagenet_n12267677_91.img: Shape=(1564,)
tiny_imagenet_n12267677_92.img: Shape=(1491,)
tiny_imagenet_n12267677_93.img: Shape=(2675,)
tiny_imagenet_n12267677_94.img: Shape=(1917,)
tiny_imagenet_n12267677_95.img: Shape=(2630,)
tiny_imagenet_n12267677_96.img: Shape=(2293,)
tiny_imagenet_n12267677_97.img: Shape=(1877,)
tiny_imagenet_n12267677_98.img: Shape=(1624,)
tiny_imagenet_n12267677_99.img: Shape=(2090,)


In [40]:
import os

# Define paths
root_hr_dir = "/content/unified_dataset_extracted/super_resolution_hr"
root_lr_dir = "/content/unified_dataset_extracted/super_resolution_lr"
nested_hr_dir = "/content/unified_dataset_extracted/content/super_resolution_hr"
nested_lr_dir = "/content/unified_dataset_extracted/content/super_resolution_lr"

# Function to list files in a directory
def list_files(directory):
    if not os.path.exists(directory):
        return []
    return sorted([os.path.join(directory, f) for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f))])

# List files in each directory
root_hr_files = list_files(root_hr_dir)
root_lr_files = list_files(root_lr_dir)
nested_hr_files = list_files(nested_hr_dir)
nested_lr_files = list_files(nested_lr_dir)

# Compare HR directories
print(f"Total HR files in root directory: {len(root_hr_files)}")
print(f"Total HR files in nested directory: {len(nested_hr_files)}")
print("HR directories match!" if root_hr_files == nested_hr_files else "HR directories do not match!")

# Compare LR directories
print(f"Total LR files in root directory: {len(root_lr_files)}")
print(f"Total LR files in nested directory: {len(nested_lr_files)}")
print("LR directories match!" if root_lr_files == nested_lr_files else "LR directories do not match!")

# Function to find mismatched files
def find_mismatched_files(files1, files2):
    set1 = set(os.path.basename(f) for f in files1)
    set2 = set(os.path.basename(f) for f in files2)
    return list(set1 - set2), list(set2 - set1)

# Find mismatched HR files
hr_missing_in_nested, hr_missing_in_root = find_mismatched_files(root_hr_files, nested_hr_files)
print(f"HR files missing in nested: {len(hr_missing_in_nested)}")
print(f"HR files missing in root: {len(hr_missing_in_root)}")

# Find mismatched LR files
lr_missing_in_nested, lr_missing_in_root = find_mismatched_files(root_lr_files, nested_lr_files)
print(f"LR files missing in nested: {len(lr_missing_in_nested)}")
print(f"LR files missing in root: {len(lr_missing_in_root)}")

# Print sample mismatched files
print("Sample HR files missing in nested:", hr_missing_in_nested[:10])
print("Sample HR files missing in root:", hr_missing_in_root[:10])
print("Sample LR files missing in nested:", lr_missing_in_nested[:10])
print("Sample LR files missing in root:", lr_missing_in_root[:10])


Total HR files in root directory: 0
Total HR files in nested directory: 58000
HR directories do not match!
Total LR files in root directory: 0
Total LR files in nested directory: 58000
LR directories do not match!
HR files missing in nested: 0
HR files missing in root: 58000
LR files missing in nested: 0
LR files missing in root: 58000
Sample HR files missing in nested: []
Sample HR files missing in root: ['img_20089_hr.png', 'img_38419_hr.png', 'img_32981_hr.png', 'img_01878_hr.png', 'img_30333_hr.png', 'img_19770_hr.png', 'img_09495_hr.png', 'img_58484_hr.png', 'img_52961_hr.png', 'img_29142_hr.png']
Sample LR files missing in nested: []
Sample LR files missing in root: ['img_37131_lr.png', 'img_18486_lr.png', 'img_13046_lr.png', 'img_26742_lr.png', 'img_33234_lr.png', 'img_00251_lr.png', 'img_09394_lr.png', 'img_14866_lr.png', 'img_29245_lr.png', 'img_35958_lr.png']


In [41]:
root_hr_dir = "/content/unified_dataset_extracted/super_resolution_hr"
root_lr_dir = "/content/unified_dataset_extracted/super_resolution_lr"

# Remove the empty directories
if os.path.exists(root_hr_dir):
    shutil.rmtree(root_hr_dir)
    print(f"Deleted empty directory: {root_hr_dir}")

if os.path.exists(root_lr_dir):
    shutil.rmtree(root_lr_dir)
    print(f"Deleted empty directory: {root_lr_dir}")

print("Environment cleaned up successfully.")


Deleted empty directory: /content/unified_dataset_extracted/super_resolution_hr
Deleted empty directory: /content/unified_dataset_extracted/super_resolution_lr
Environment cleaned up successfully.


In [42]:
combined_dir = "/content/unified_dataset_extracted/content/unified_dataset_extracted"
coco_dir = "/content/unified_dataset_extracted/content/data/coco"
tiny_imagenet_dir = "/content/unified_dataset_extracted/content/data/tiny_imagenet"

# Function to compare files between directories
def compare_directories(dir1, dir2):
    files1 = set(os.listdir(dir1)) if os.path.exists(dir1) else set()
    files2 = set(os.listdir(dir2)) if os.path.exists(dir2) else set()
    only_in_dir1 = files1 - files2
    only_in_dir2 = files2 - files1
    common_files = files1 & files2
    return only_in_dir1, only_in_dir2, common_files

# Compare combined with separate folders
print("Comparing Combined Dataset with Separate Folders...")

# COCO
coco_combined, coco_separate, coco_common = compare_directories(combined_dir, coco_dir)
print(f"COCO - Only in Combined: {len(coco_combined)}")
print(f"COCO - Only in Separate: {len(coco_separate)}")
print(f"COCO - Common: {len(coco_common)}")

# Tiny ImageNet
tinynet_combined, tinynet_separate, tinynet_common = compare_directories(combined_dir, tiny_imagenet_dir)
print(f"Tiny ImageNet - Only in Combined: {len(tinynet_combined)}")
print(f"Tiny ImageNet - Only in Separate: {len(tinynet_separate)}")
print(f"Tiny ImageNet - Common: {len(tinynet_common)}")


Comparing Combined Dataset with Separate Folders...
COCO - Only in Combined: 200001
COCO - Only in Separate: 1
COCO - Common: 0
Tiny ImageNet - Only in Combined: 200001
Tiny ImageNet - Only in Separate: 1
Tiny ImageNet - Common: 0


In [43]:
separate_coco_dir = "/content/unified_dataset_extracted/content/data/coco"
separate_tinynet_dir = "/content/unified_dataset_extracted/content/data/tiny_imagenet"

# Remove the incomplete separate folders
if os.path.exists(separate_coco_dir):
    shutil.rmtree(separate_coco_dir)
    print(f"Removed incomplete COCO folder: {separate_coco_dir}")

if os.path.exists(separate_tinynet_dir):
    shutil.rmtree(separate_tinynet_dir)
    print(f"Removed incomplete Tiny ImageNet folder: {separate_tinynet_dir}")

# Check the remaining structure
print("\nFinal Dataset Structure:")
for root, dirs, files in os.walk("/content/unified_dataset_extracted", topdown=True):
    print(f"Directory: {root}")
    print(f"  Subdirectories: {dirs}")
    print(f"  Files: {len(files)}")


Removed incomplete COCO folder: /content/unified_dataset_extracted/content/data/coco
Removed incomplete Tiny ImageNet folder: /content/unified_dataset_extracted/content/data/tiny_imagenet

Final Dataset Structure:
Directory: /content/unified_dataset_extracted
  Subdirectories: ['data', 'content']
  Files: 0
Directory: /content/unified_dataset_extracted/data
  Subdirectories: ['cifar']
  Files: 0
Directory: /content/unified_dataset_extracted/data/cifar
  Subdirectories: []
  Files: 0
Directory: /content/unified_dataset_extracted/content
  Subdirectories: ['super_resolution_hr', 'data', 'super_resolution_lr', 'unified_dataset_extracted']
  Files: 0
Directory: /content/unified_dataset_extracted/content/super_resolution_hr
  Subdirectories: []
  Files: 58000
Directory: /content/unified_dataset_extracted/content/data
  Subdirectories: []
  Files: 0
Directory: /content/unified_dataset_extracted/content/super_resolution_lr
  Subdirectories: []
  Files: 58000
Directory: /content/unified_datase

In [44]:
# Original path
old_path = "/content/unified_dataset_extracted/content/unified_dataset_extracted"
# New path
new_path = "/content/unified_dataset_extracted/content/tinynet_coco_combined"

# Rename the directory
if os.path.exists(old_path):
    os.rename(old_path, new_path)
    print(f"Renamed directory:\nFrom: {old_path}\nTo: {new_path}")
else:
    print(f"Directory not found: {old_path}")


Renamed directory:
From: /content/unified_dataset_extracted/content/unified_dataset_extracted
To: /content/unified_dataset_extracted/content/tinynet_coco_combined


In [45]:
source_path = "/content/unified_dataset_extracted/data/cifar"
destination_path = "/content/unified_dataset_extracted/content/cifar"

# Check if source exists
if os.path.exists(source_path):
    # Move the directory
    shutil.move(source_path, destination_path)
    print(f"Moved 'cifar' directory:\nFrom: {source_path}\nTo: {destination_path}")
else:
    print(f"Source directory not found: {source_path}")

# Check if the source directory is now empty and remove it if needed
parent_path = "/content/unified_dataset_extracted/data"
if os.path.exists(parent_path) and not os.listdir(parent_path):
    os.rmdir(parent_path)
    print(f"Removed empty parent directory: {parent_path}")
else:
    print(f"Parent directory still contains files or does not exist: {parent_path}")


Moved 'cifar' directory:
From: /content/unified_dataset_extracted/data/cifar
To: /content/unified_dataset_extracted/content/cifar
Removed empty parent directory: /content/unified_dataset_extracted/data


In [46]:
import zipfile
# Path to the unified dataset folder
unified_dataset_path = "/content/unified_dataset_extracted/content"

# Path for the new zip file
zip_file_path = "/content/drive/My Drive/perfectly_structured_dataset.zip"

# Function to create a zip file
def create_zip_file(folder_path, zip_file_path):
    with zipfile.ZipFile(zip_file_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                arcname = os.path.relpath(file_path, folder_path)
                zipf.write(file_path, arcname)
    print(f"Zip file created successfully: {zip_file_path}")

# Create the zip file
create_zip_file(unified_dataset_path, zip_file_path)


Zip file created successfully: /content/drive/My Drive/perfectly_structured_dataset.zip


# Convert CIFAR-100 .png Files to .img for Uniformity

In [48]:
import numpy as np
cifar_hr_dir = "/content/unified_dataset_extracted/content/super_resolution_hr"
cifar_lr_dir = "/content/unified_dataset_extracted/content/super_resolution_lr"

# Function to convert .png to .img
def convert_png_to_img(input_dir):
    for file in os.listdir(input_dir):
        if file.endswith(".png"):
            png_path = os.path.join(input_dir, file)
            img_path = png_path.replace(".png", ".img")
            # Read PNG file
            with open(png_path, "rb") as f:
                data = f.read()
            # Write to .img file
            with open(img_path, "wb") as f:
                f.write(data)
            # Remove original .png file
            os.remove(png_path)
    print(f"Converted all .png files to .img in {input_dir}")

# Convert HR and LR CIFAR data
convert_png_to_img(cifar_hr_dir)
convert_png_to_img(cifar_lr_dir)


Converted all .png files to .img in /content/unified_dataset_extracted/content/super_resolution_hr
Converted all .png files to .img in /content/unified_dataset_extracted/content/super_resolution_lr
