In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array, array_to_img
import os
from PIL import Image
import shutil

2025-10-23 13:16:22.153907: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761225382.174107   58320 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761225382.180605   58320 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1761225382.197075   58320 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1761225382.197092   58320 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1761225382.197094   58320 computation_placer.cc:177] computation placer alr

In [2]:
# Load the Excel file
df = pd.read_excel('/home/ubuntu/deep_learning_exam1/excel/training/train_test_cleaned.xlsx')


def _report_class_distribution(frame, heading, total_label, blank_line_before=True):
    """Print a banner, the per-class counts of `frame['target']`, and the row total.

    frame: DataFrame with a 'target' column.
    heading: text printed between the two 60-character rules.
    total_label: label printed in front of the row count.
    blank_line_before: when True, an empty line is emitted before the banner.

    Returns the counts as a Series indexed by class label (sorted by label).
    """
    rule = "=" * 60
    print(("\n" if blank_line_before else "") + rule)
    print(heading)
    print(rule)
    counts = frame['target'].value_counts().sort_index()
    print(counts)
    print(f"\n{total_label}: {len(frame)}")
    return counts


# Calculate class distribution for entire dataset
class_dist_all = _report_class_distribution(
    df, "CLASS DISTRIBUTION - ENTIRE DATASET", "Total images", blank_line_before=False
)

# Calculate class distribution for training set only
train_df = df[df['split'] == 'train']
class_dist_train = _report_class_distribution(
    train_df, "CLASS DISTRIBUTION - TRAINING SET ONLY", "Total training images"
)

# Calculate class distribution for test set only
test_df = df[df['split'] == 'test']
class_dist_test = _report_class_distribution(
    test_df, "CLASS DISTRIBUTION - TEST SET ONLY", "Total test images"
)

# Create a summary DataFrame for better visualization
print("\n" + "="*60)
print("SUMMARY TABLE")
print("="*60)
# Build the table from the two Series so pandas aligns them on the class label.
# Combining the raw `.values` arrays would pair counts by position, which is
# wrong (or raises on a length mismatch) whenever a class is absent from one split.
summary = (
    pd.DataFrame({'Training Count': class_dist_train, 'Test Count': class_dist_test})
    .fillna(0)      # a class missing from one split counts as 0 there
    .astype(int)
)
summary['Total Count'] = summary['Training Count'] + summary['Test Count']
summary = summary.rename_axis('Class').reset_index()
print(summary.to_string(index=False))

# Calculate imbalance ratio (training split only)
max_count = class_dist_train.max()
min_count = class_dist_train.min()
imbalance_ratio = max_count / min_count
print(f"\nImbalance Ratio (max/min): {imbalance_ratio:.2f}:1")
print(f"Most frequent class: {class_dist_train.idxmax()} ({max_count} images)")
print(f"Least frequent class: {class_dist_train.idxmin()} ({min_count} images)")


CLASS DISTRIBUTION - ENTIRE DATASET
target
class1        24
class10      834
class2      2619
class3      4998
class4      1372
class5     12773
class6        72
class7      1347
class8      5218
class9      1047
Name: count, dtype: int64

Total images: 30304

CLASS DISTRIBUTION - TRAINING SET ONLY
target
class1        22
class10      732
class2      2325
class3      4437
class4      1207
class5     11354
class6        63
class7      1199
class8      4666
class9       935
Name: count, dtype: int64

Total training images: 26940

CLASS DISTRIBUTION - TEST SET ONLY
target
class1        2
class10     102
class2      294
class3      561
class4      165
class5     1419
class6        9
class7      148
class8      552
class9      112
Name: count, dtype: int64

Total test images: 3364

SUMMARY TABLE
  Class  Training Count  Test Count  Total Count
 class1              22           2           24
class10             732         102          834
 class2            2325         294         2619
 c

In [3]:
# Reproducibility: seed NumPy's global random number generator
np.random.seed(666)

# Number of training images each class should have after balancing
TARGET_COUNT = 8000

# Paths - MODIFY THESE TO YOUR ACTUAL PATHS
IMAGE_PATH = '/home/ubuntu/deep_learning_exam1/Data/'  # Source images directory
OUTPUT_PATH = '/home/ubuntu/deep_learning_exam1/Full_Balanced_Data/'  # Output directory for balanced dataset

# Independent copy of the training rows only
is_train_row = df['split'] == 'train'
train_df = df.loc[is_train_row].copy()

# Make sure the output directory exists (no error if it is already there)
os.makedirs(OUTPUT_PATH, exist_ok=True)

In [4]:
# Augmentation settings, collected in one mapping so they can be inspected
# or logged before the generator is built.
augmentation_settings = dict(
    rotation_range=30,
    width_shift_range=0.15,
    height_shift_range=0.15,
    shear_range=0.15,
    zoom_range=0.15,
    horizontal_flip=True,
    vertical_flip=False,
    fill_mode='nearest',
    brightness_range=[0.8, 1.2],
)

# Generator used to produce the augmented copies
datagen = ImageDataGenerator(**augmentation_settings)

In [5]:
balanced_data = []

# Every image id recorded in balanced_data, mirrored in a set so the
# uniqueness check for new augmented file names is a constant-time lookup
# instead of rebuilding a list of all ids for each generated image.
used_image_ids = set()

# Global counter for ALL augmented images across all classes
global_aug_counter = 0


def _copy_originals(rows, class_name, method):
    """Copy the images listed in `rows` from IMAGE_PATH to OUTPUT_PATH and record them.

    rows: DataFrame with an 'id' column holding the image file names.
    class_name: value stored under 'target' in each record.
    method: value stored under 'method' ('original' or 'original_sampled').

    A failed copy is reported and skipped (best effort); only images that were
    actually copied are appended to balanced_data / used_image_ids.
    Returns the number of images copied.
    """
    copied = 0
    for image_id in rows['id']:
        src_path = os.path.join(IMAGE_PATH, image_id)
        dst_path = os.path.join(OUTPUT_PATH, image_id)
        try:
            shutil.copy2(src_path, dst_path)
        except Exception as e:
            print(f"  Error copying {image_id}: {str(e)}")
            continue
        balanced_data.append({
            'id': image_id,
            'target': class_name,
            'split': 'train',
            'method': method
        })
        used_image_ids.add(image_id)
        copied += 1
    return copied


print("="*70)
print(f"BALANCING DATASET TO {TARGET_COUNT} IMAGES PER CLASS WITH UNIQUE IDs")
print("="*70)

# Process each class
for class_name in ['class1', 'class2', 'class3', 'class4', 'class5',
                   'class6', 'class7', 'class8', 'class9', 'class10']:

    # Get images for this class
    class_images = train_df[train_df['target'] == class_name].copy()
    current_count = len(class_images)

    print(f"\n{class_name}:")
    print(f"  Current count: {current_count}")

    # Nothing to copy or augment from; also avoids dividing by zero below
    if current_count == 0:
        print("  Skipping: no training images for this class")
        continue

    # Case 1: Class has MORE than target count - randomly sample
    if current_count > TARGET_COUNT:
        print(f"  Action: Randomly selecting {TARGET_COUNT} images")

        # Randomly sample TARGET_COUNT images and copy them to the output directory
        selected_images = class_images.sample(n=TARGET_COUNT, random_state=666)
        _copy_originals(selected_images, class_name, 'original_sampled')

        print(f"  Selected: {len(selected_images)} images")
        continue

    # Case 2: Class has at most the target count - use all originals + augment
    augment_count = TARGET_COUNT - current_count
    print(f"  Action: Using all {current_count} originals + generating {augment_count} augmented")

    # First, copy all original images
    _copy_originals(class_images, class_name, 'original')

    # Upper bound on augmentations any single original receives
    augmentations_per_image = int(np.ceil(augment_count / current_count))
    print(f"  Augmentations per original image: ~{augmentations_per_image}")

    # Spread the augmentations evenly: every original gets `base_quota`, and the
    # first `extra_quota` images of the shuffled order get one more. The total is
    # exactly augment_count, and no original is skipped while others are
    # augmented several times.
    base_quota, extra_quota = divmod(augment_count, current_count)

    generated_count = 0

    # Shuffle the class_images to vary which images get the extra augmentation
    class_images_shuffled = class_images.sample(frac=1, random_state=999).reset_index(drop=True)

    for position, image_id in enumerate(class_images_shuffled['id']):
        quota = base_quota + (1 if position < extra_quota else 0)
        # Quotas are non-increasing along the shuffled order, so a zero quota
        # means every remaining image also has nothing left to generate.
        if quota == 0 or generated_count >= augment_count:
            break

        image_path = os.path.join(IMAGE_PATH, image_id)

        try:
            # Load image and add a leading batch axis of size 1
            img = load_img(image_path)
            x = img_to_array(img)
            x = x.reshape((1,) + x.shape)

            # One source image per iterator, so each next() yields one augmented image
            aug_iter = datagen.flow(x, batch_size=1, seed=None)

            for _ in range(quota):
                if generated_count >= augment_count:
                    break

                # Generate augmented image
                aug_img = next(aug_iter)[0].astype('uint8')

                # Create UNIQUE filename with class name and GLOBAL counter
                # Format: classname_aug_globalcounter.jpg
                extension = '.jpg'
                new_image_id = f"{class_name}_aug_{global_aug_counter:06d}{extension}"
                new_image_path = os.path.join(OUTPUT_PATH, new_image_id)

                # Double check uniqueness - if the name is taken on disk or already
                # recorded, advance the counter until a free name is found
                while os.path.exists(new_image_path) or new_image_id in used_image_ids:
                    global_aug_counter += 1
                    new_image_id = f"{class_name}_aug_{global_aug_counter:06d}{extension}"
                    new_image_path = os.path.join(OUTPUT_PATH, new_image_id)

                # Save augmented image. scale=False keeps the pixel values as
                # generated; the default (scale=True) would min-max stretch each
                # image to 0-255 and alter its contrast/brightness.
                array_to_img(aug_img, scale=False).save(new_image_path)

                # Store information
                balanced_data.append({
                    'id': new_image_id,
                    'target': class_name,
                    'split': 'train',
                    'method': 'augmented',
                    'source_image': image_id
                })
                used_image_ids.add(new_image_id)

                generated_count += 1
                global_aug_counter += 1  # Increment global counter for next augmented image

        except Exception as e:
            # Best effort: report the failing source image and move on to the next one
            print(f"  Error processing {image_id}: {str(e)}")

    if generated_count < augment_count:
        print(f"  WARNING: generated only {generated_count} of {augment_count} augmented images")

    print(f"  Total images: {current_count} original + {generated_count} augmented = {current_count + generated_count}")
    print(f"  Global augmented counter now at: {global_aug_counter}")

# Create DataFrame with balanced data
balanced_df = pd.DataFrame(balanced_data)

BALANCING DATASET TO 8000 IMAGES PER CLASS WITH UNIQUE IDs

class1:
  Current count: 22
  Action: Using all 22 originals + generating 7978 augmented
  Augmentations per original image: ~363
  Total images: 22 original + 7978 augmented = 8000
  Global augmented counter now at: 7978

class2:
  Current count: 2325
  Action: Using all 2325 originals + generating 5675 augmented
  Augmentations per original image: ~3
  Total images: 2325 original + 5675 augmented = 8000
  Global augmented counter now at: 13653

class3:
  Current count: 4437
  Action: Using all 4437 originals + generating 3563 augmented
  Augmentations per original image: ~1
  Total images: 4437 original + 3563 augmented = 8000
  Global augmented counter now at: 17216

class4:
  Current count: 1207
  Action: Using all 1207 originals + generating 6793 augmented
  Augmentations per original image: ~6
  Total images: 1207 original + 6793 augmented = 8000
  Global augmented counter now at: 24009

class5:
  Current count: 11354
  

In [6]:
# Sanity check: every row of balanced_df should carry a distinct image id
total_ids = len(balanced_df)
unique_ids = balanced_df['id'].nunique()
# All rows whose id occurs more than once (every occurrence is kept)
duplicates = balanced_df[balanced_df.duplicated(subset=['id'], keep=False)]

banner = "=" * 70
print("\n" + banner)
print("UNIQUENESS CHECK")
print(banner)
print(f"Total images: {total_ids}")
print(f"Unique IDs: {unique_ids}")
if unique_ids != total_ids:
    print(f"✗ WARNING: {total_ids - unique_ids} duplicate IDs found!")
    print("\nDuplicate IDs:")
    print(duplicates[['id', 'target', 'method']].sort_values('id'))
else:
    print("✓ SUCCESS: All IDs are unique!")



UNIQUENESS CHECK
Total images: 80000
Unique IDs: 80000
✓ SUCCESS: All IDs are unique!


In [7]:
# Report, per class, how many balanced rows are originals and how many are augmented
banner = "=" * 70
print("\n" + banner)
print("FINAL BALANCED CLASS DISTRIBUTION")
print(banner)

# Row masks that do not depend on the class, computed once
is_original_row = balanced_df['method'].isin(['original', 'original_sampled'])
is_augmented_row = balanced_df['method'] == 'augmented'

for class_name in train_df['target'].unique():
    in_class = balanced_df['target'] == class_name
    count = int(in_class.sum())
    original_count = int((in_class & is_original_row).sum())
    augmented_count = int((in_class & is_augmented_row).sum())
    print(f"{class_name}: {count} total (Original: {original_count}, Augmented: {augmented_count})")

balanced_df




FINAL BALANCED CLASS DISTRIBUTION
class4: 8000 total (Original: 1207, Augmented: 6793)
class5: 8000 total (Original: 8000, Augmented: 0)
class2: 8000 total (Original: 2325, Augmented: 5675)
class8: 8000 total (Original: 4666, Augmented: 3334)
class9: 8000 total (Original: 935, Augmented: 7065)
class10: 8000 total (Original: 732, Augmented: 7268)
class3: 8000 total (Original: 4437, Augmented: 3563)
class7: 8000 total (Original: 1199, Augmented: 6801)
class6: 8000 total (Original: 63, Augmented: 7937)
class1: 8000 total (Original: 22, Augmented: 7978)


Unnamed: 0,id,target,split,method,source_image
0,img_19717.jpg,class1,train,original,
1,img_29529.jpg,class1,train,original,
2,img_24038.jpg,class1,train,original,
3,img_13153.jpg,class1,train,original,
4,img_16478.jpg,class1,train,original,
...,...,...,...,...,...
79995,class10_aug_056409.jpg,class10,train,augmented,img_15191.jpg
79996,class10_aug_056410.jpg,class10,train,augmented,img_15191.jpg
79997,class10_aug_056411.jpg,class10,train,augmented,img_15191.jpg
79998,class10_aug_056412.jpg,class10,train,augmented,img_15191.jpg


In [8]:
# Independent copy of the test rows
test_df = df.loc[df['split'] == 'test'].copy()

banner = "=" * 70
print(banner)
print("COPYING TEST IMAGES")
print(banner)
print(f"Total test images to copy: {len(test_df)}")

# Copy test images and track them
test_data = []
copied_count = 0
error_count = 0

for image_id, target in zip(test_df['id'], test_df['target']):
    src_path = os.path.join(IMAGE_PATH, image_id)
    dst_path = os.path.join(OUTPUT_PATH, image_id)

    try:
        shutil.copy2(src_path, dst_path)
    except Exception as e:
        # Best effort: report the failure, count it, and keep going
        print(f"  Error copying {image_id}: {str(e)}")
        error_count += 1
    else:
        # Record only images that were actually copied
        test_data.append({
            'id': image_id,
            'target': target,
            'split': 'test',
            'method': 'original'
        })
        copied_count += 1

        # Progress line every 500 copied images
        if copied_count % 500 == 0:
            print(f"  Copied {copied_count} images...")

print(f"\nSuccessfully copied: {copied_count} test images")
print(f"Errors: {error_count} images")

COPYING TEST IMAGES
Total test images to copy: 3364
  Copied 500 images...
  Copied 1000 images...
  Copied 1500 images...
  Copied 2000 images...
  Copied 2500 images...
  Copied 3000 images...

Successfully copied: 3364 test images
Errors: 0 images


In [9]:
# Create DataFrame with test data
test_data_df = pd.DataFrame(test_data)

# Combine balanced training data with test data
combined_df = pd.concat([balanced_df, test_data_df], ignore_index=True)

# One-hot label for each class name (position i is set for class i+1)
class_to_onehot = {
    'class1': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    'class2': [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
    'class3': [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
    'class4': [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
    'class5': [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
    'class6': [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
    'class7': [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
    'class8': [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
    'class9': [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
    'class10': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
}

# Add the target_class column with one-hot encoded arrays
combined_df['target_class'] = combined_df['target'].map(class_to_onehot)

# .map() yields NaN for any target that is not a key of class_to_onehot;
# surface that instead of silently writing empty labels to the file.
unmapped_targets = combined_df.loc[combined_df['target_class'].isna(), 'target'].unique()
if len(unmapped_targets) > 0:
    print(f"WARNING: no one-hot encoding for targets: {list(unmapped_targets)}")

# Reorder columns to match the format: id, target, split, target_class, (other columns)
all_columns = combined_df.columns.tolist()
desired_order = ['id', 'target', 'split', 'target_class']
# Any remaining columns keep their existing relative order after the fixed ones
remaining_columns = [col for col in all_columns if col not in desired_order]
final_column_order = desired_order + remaining_columns
combined_df = combined_df[final_column_order]

# Save the combined Excel file; the same path is reused in the summary below so
# the reported location always matches the file that was actually written.
balanced_excel_path = '/home/ubuntu/deep_learning_exam1/excel/training/train_test_balanced_full.xlsx'
combined_df.to_excel(balanced_excel_path, index=False)

print("\n" + "="*70)
print("FINAL SUMMARY")
print("="*70)
print(f"Training images: {len(balanced_df)}")
print(f"Test images: {len(test_data_df)}")
print(f"Total images: {len(combined_df)}")

# Show split distribution
print("\nSplit Distribution:")
print(combined_df['split'].value_counts())

# Show class distribution for test set
print("\nTest Set Class Distribution:")
test_class_dist = test_data_df['target'].value_counts().sort_index()
for class_name, count in test_class_dist.items():
    print(f"  {class_name}: {count}")

print(f"\nAll images saved to: {OUTPUT_PATH}")
print(f"Complete metadata saved to: {balanced_excel_path}")
print("="*70)


FINAL SUMMARY
Training images: 80000
Test images: 3364
Total images: 83364

Split Distribution:
split
train    80000
test      3364
Name: count, dtype: int64

Test Set Class Distribution:
  class1: 2
  class10: 102
  class2: 294
  class3: 561
  class4: 165
  class5: 1419
  class6: 9
  class7: 148
  class8: 552
  class9: 112

All images saved to: /home/ubuntu/deep_learning_exam1/Full_Balanced_Data/
Complete metadata saved to: train_test_balanced.xlsx


In [None]:
# Read back the balanced metadata written earlier
df_bal = pd.read_excel('/home/ubuntu/deep_learning_exam1/excel/training/train_test_balanced_full.xlsx')

# Class name -> one-hot list; position i is set for class i+1 (class1 ... class10)
class_to_onehot = {
    'class1':  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    'class2':  [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
    'class3':  [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
    'class4':  [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
    'class5':  [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
    'class6':  [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
    'class7':  [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
    'class8':  [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
    'class9':  [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
    'class10': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
}

# (Re)build the target_class column from the target labels
df_bal['target_class'] = df_bal['target'].map(class_to_onehot)

# Column layout: id, target, split, target_class first, then everything else
# in its existing order
desired_order = ['id', 'target', 'split', 'target_class']
all_columns = df_bal.columns.tolist()
remaining_columns = [col for col in all_columns if col not in desired_order]
final_column_order = desired_order + remaining_columns
df_bal = df_bal[final_column_order]

# Write the result (path is relative to the current working directory)
df_bal.to_excel('train_test_balanced_new.xlsx', index=False)

banner = "=" * 70

print(banner)
print("ONE-HOT ENCODING ADDED")
print(banner)
print("\nSample of the updated DataFrame:")
print(df_bal[['id', 'target', 'split', 'target_class']].head(10))

print("\n" + banner)
print("VERIFICATION")
print(banner)
print(f"Total rows: {len(df_bal)}")
print(f"Columns: {df_bal.columns.tolist()}")

# Show the encoding of the first row found for each class
print("\nOne-hot encoding verification:")
for class_name in ['class1', 'class2', 'class3', 'class4', 'class5',
                   'class6', 'class7', 'class8', 'class9', 'class10']:
    sample = df_bal.loc[df_bal['target'] == class_name, 'target_class'].iloc[0]
    print(f"{class_name}: {sample}")

print("\n" + banner)
print(f"Updated file saved as: train_test_balanced_new.xlsx")
print(banner)
