## IMAGE PREPROCESSING

In [None]:
import os
import cv2
import numpy as np

# Root directory containing the raw X-ray images. The preprocessing code
# further down in this cell walks it recursively and also uses it as the base
# for the relative paths of the saved results.
dataset_folder = "Cleaned Images"


def preprocess_image(image_path, target_size=(224, 224)):
    """Load an image and return a denoised, contrast-equalized grayscale copy.

    Pipeline: read -> resize -> grayscale -> scale to [0, 1] -> 5x5 Gaussian
    blur -> back to 8-bit -> histogram equalization.

    Args:
        image_path: Path of the image file to load.
        target_size: ``(width, height)`` passed to ``cv2.resize``. Defaults to
            ``(224, 224)``.

    Returns:
        The equalized single-channel 8-bit image as a NumPy array.

    Raises:
        ValueError: If OpenCV cannot read or decode ``image_path``.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports a missing/corrupt file by returning None rather
        # than raising, which would otherwise surface as an opaque error
        # inside cv2.resize.
        raise ValueError(f"Could not read image: {image_path}")

    # Bring every image to the same spatial size
    image = cv2.resize(image, target_size)

    # Collapse the BGR channels into a single grayscale channel
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Normalize pixel values to the range [0, 1]
    normalized_image = gray_image / 255.0

    # Apply Gaussian blur for noise reduction
    blurred_image = cv2.GaussianBlur(normalized_image, (5, 5), 0)

    # equalizeHist needs 8-bit input. Round (instead of truncating) so that
    # floating-point error such as 254.9999 does not drop a grey level.
    eight_bit_image = np.clip(np.rint(blurred_image * 255), 0, 255).astype(np.uint8)

    # Perform contrast enhancement using histogram equalization
    enhanced_image = cv2.equalizeHist(eight_bit_image)

    return enhanced_image

def preprocess_images_in_directory(directory):
    """Preprocess every ``.jpg``/``.png`` image under *directory*, recursively.

    Each result is written below the module-level ``preprocessed_folder``,
    at the same path the source file has relative to the module-level
    ``dataset_folder``. Both names must be defined before this is called.

    Args:
        directory: Directory to walk; sub-directories are processed too.

    Raises:
        OSError: If OpenCV reports that a preprocessed image was not written.
    """
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)

        if os.path.isdir(filepath):
            # Descend into sub-directories
            preprocess_images_in_directory(filepath)
            continue

        # Match the extension case-insensitively so files such as "IMG.JPG"
        # are not silently skipped.
        if not filename.lower().endswith((".jpg", ".png")):
            continue

        preprocessed_image = preprocess_image(filepath)

        # Mirror the source layout below the output folder
        relative_path = os.path.relpath(filepath, dataset_folder)
        preprocessed_image_path = os.path.join(preprocessed_folder, relative_path)
        os.makedirs(os.path.dirname(preprocessed_image_path), exist_ok=True)

        # cv2.imwrite returns False on failure instead of raising
        if not cv2.imwrite(preprocessed_image_path, preprocessed_image):
            raise OSError(f"Could not write image: {preprocessed_image_path}")

# Create the directory that stores the preprocessed images once, up front.
# preprocess_images_in_directory reads `preprocessed_folder` as a module-level
# name, so it must exist before the walk starts.
preprocessed_folder = "PreprocessedDataset"
os.makedirs(preprocessed_folder, exist_ok=True)

# Preprocess all images in the dataset folder (including subfolders).
# The function recurses on its own, so a single call on the root covers every
# sub-folder; stray non-directory entries at the top level (which previously
# raised NotADirectoryError inside os.listdir) are handled by its file branch.
preprocess_images_in_directory(dataset_folder)
    





In [None]:
import os
import matplotlib.pyplot as plt
# Training split of the preprocessed dataset; its immediate sub-directories
# are treated as classes by count_images_per_class below.
dataset_folder = "PreprocessedDataset/train"

def count_images_per_class(dataset_folder):
    """Map each class sub-directory of *dataset_folder* to its file count.

    Only immediate sub-directories are treated as classes, and only entries
    directly inside them that ``os.path.isfile`` accepts are counted (no
    recursion). Non-directory entries of *dataset_folder* are ignored.
    """
    def _file_total(folder):
        # Number of plain files sitting directly in `folder`
        return sum(
            1
            for entry in os.listdir(folder)
            if os.path.isfile(os.path.join(folder, entry))
        )

    candidates = (
        (label, os.path.join(dataset_folder, label))
        for label in os.listdir(dataset_folder)
    )
    return {
        label: _file_total(path)
        for label, path in candidates
        if os.path.isdir(path)
    }

# Count the images in each class. Nothing is printed here; the resulting
# dict (class name -> number of files) is what the next cell plots.
class_counts = count_images_per_class(dataset_folder)


In [None]:
# Bar chart of the per-class image counts held in `class_counts`
plt.figure(figsize=(10, 6))
plt.bar(
    list(class_counts),
    list(class_counts.values()),
    color="skyblue",
)
plt.title("Number of Images in Each Class")
plt.xlabel("Severity Grading")
plt.ylabel("Number of Images")
# Slanted, right-aligned tick labels keep long class names readable
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()

## SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
from keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing.image import array_to_img
import numpy as np
import os

# Load the training images in directory order (shuffle=False), 64 per batch,
# resized to 224x224. class_mode="categorical" is expected to yield one-hot
# label rows — the saving step below relies on that via np.argmax.
train_generator = ImageDataGenerator().flow_from_directory(
    "PreprocessedDataset/train",
    class_mode="categorical",
    shuffle=False,
    batch_size=64,
    target_size=(224, 224),
    seed=42
)

# Pull exactly one pass over the data (len(generator) batches) and stack the
# batches into single arrays.
x = []
y = []
for _ in range(len(train_generator)):
    batch_x, batch_y = next(train_generator)
    x.append(batch_x)
    y.append(batch_y)
x = np.concatenate(x)
y = np.concatenate(y)

# Flatten every image to one row: SMOTE expects (n_samples, n_features)
X_train = x.reshape(x.shape[0], -1)

# Apply SMOTE; random_state only fixes the seed of the synthetic sampling
sm = SMOTE(random_state=2)
X_smote, y_smote = sm.fit_resample(X_train, y)

# Output root for the resampled images. exist_ok avoids the
# check-then-create race of a separate os.path.exists() test.
train_sep_dir = 'smotefolder'
os.makedirs(train_sep_dir, exist_ok=True)

def save_images(X_smote, y_smote, train_sep_dir, start_index=0, image_shape=(224, 224, 3)):
    """Write flattened samples to ``<train_sep_dir>/<label>/smote_<n>.jpg``.

    Args:
        X_smote: Flattened images, one per row; every row must reshape to
            ``image_shape``.
        y_smote: Labels aligned with ``X_smote``. One-hot rows are reduced
            with ``argmax``; scalar class ids are used as they are.
        train_sep_dir: Output root. One sub-directory per class label is
            created on demand.
        start_index: Offset added to each sample's position to build its
            file name, so that successive batches do not overwrite each other.
        image_shape: Shape each row is reshaped to before saving. Defaults to
            ``(224, 224, 3)``.
    """
    for i, flat_image in enumerate(X_smote):
        target = y_smote[i]
        # One-hot row -> index of its maximum; scalar label -> itself
        label = int(np.argmax(target)) if np.ndim(target) > 0 else int(target)

        label_dir = os.path.join(train_sep_dir, str(label))
        # exist_ok makes this safe to repeat and also creates a missing root
        os.makedirs(label_dir, exist_ok=True)

        img = array_to_img(flat_image.reshape(image_shape))
        img.save(os.path.join(label_dir, f'smote_{start_index + i}.jpg'))

# Write the resampled set to disk in slices of `batch_size` samples. Each
# call gets the global offset of its slice so file names stay unique.
batch_size = 1000
for i in range(0, len(X_smote), batch_size):
    save_images(
        X_smote[i:i + batch_size],
        y_smote[i:i + batch_size],
        train_sep_dir,
        start_index=i,
    )


## RESAMPLING

In [None]:
import os
import matplotlib.pyplot as plt
import numpy as np

# Root of the images written by the SMOTE cell above ('smotefolder'), with one
# sub-directory per class index.
dataset_folder = "smotefolder"

def count_images_per_class(dataset_folder):
    """Map each class sub-directory of *dataset_folder* to its file count.

    Only immediate sub-directories are treated as classes, and only entries
    directly inside them that ``os.path.isfile`` accepts are counted (no
    recursion). Non-directory entries of *dataset_folder* are ignored.
    """
    def _file_total(folder):
        # Number of plain files sitting directly in `folder`
        return sum(
            1
            for entry in os.listdir(folder)
            if os.path.isfile(os.path.join(folder, entry))
        )

    candidates = (
        (label, os.path.join(dataset_folder, label))
        for label in os.listdir(dataset_folder)
    )
    return {
        label: _file_total(path)
        for label, path in candidates
        if os.path.isdir(path)
    }

def random_oversample_class_2(dataset_folder, oversample_ratio, class_name="2"):
    """Randomly duplicate images of one class directory, in place.

    Picks ``int((oversample_ratio - 1) * n)`` images (with replacement) from
    the ``n`` files currently in ``<dataset_folder>/<class_name>`` and copies
    each pick to a new, unused ``oversampled_<k>_<original name>`` file in the
    same directory.

    Args:
        dataset_folder: Root directory holding one sub-directory per class.
        oversample_ratio: Target size as a multiple of the current size;
            values <= 1 (or an empty class directory) add nothing.
        class_name: Sub-directory to oversample. Defaults to ``"2"``.
    """
    # Local import: the enclosing cell does not import shutil.
    import shutil

    class_2_path = os.path.join(dataset_folder, class_name)
    images_class_2 = [
        name
        for name in os.listdir(class_2_path)
        if os.path.isfile(os.path.join(class_2_path, name))
    ]

    # Calculate number of images to add
    num_oversample = int((oversample_ratio - 1) * len(images_class_2))
    if num_oversample <= 0:
        return

    # Randomly select images to duplicate (repeats allowed, so ratios > 2 work)
    images_to_duplicate = np.random.choice(images_class_2, size=num_oversample, replace=True)

    # Every copy gets its own name. Without the counter, an image picked
    # twice would map to the same destination and be overwritten, leaving
    # fewer new files than requested.
    taken_names = set(os.listdir(class_2_path))
    copy_index = 0
    for image_name in images_to_duplicate:
        image_name = str(image_name)
        dst_name = f"oversampled_{copy_index}_{image_name}"
        while dst_name in taken_names:
            copy_index += 1
            dst_name = f"oversampled_{copy_index}_{image_name}"
        copy_index += 1
        taken_names.add(dst_name)

        # shutil.copy is portable and safe for paths containing spaces,
        # unlike shelling out to the Windows-only `copy` command.
        shutil.copy(
            os.path.join(class_2_path, image_name),
            os.path.join(class_2_path, dst_name),
        )

# Snapshot and report the class distribution before touching the dataset
class_counts_before = count_images_per_class(dataset_folder)
print(
    "Number of images in each class before oversampling:",
    class_counts_before,
    sep="\n",
)

# Randomly oversample class 2 (increase to 2 times)
random_oversample_class_2(dataset_folder, oversample_ratio=2)

# Recount and report the distribution after oversampling
class_counts_after = count_images_per_class(dataset_folder)
print(
    "\nNumber of images in each class after oversampling:",
    class_counts_after,
    sep="\n",
)

# Bar chart of the post-oversampling distribution
plt.figure(figsize=(10, 6))
plt.bar(
    list(class_counts_after),
    list(class_counts_after.values()),
    color="skyblue",
)
plt.title("Number of Images in Each Class after Random Oversampling for Class 2")
plt.xlabel("Severity Grading")
plt.ylabel("Number of Images")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()


##  UNDERSAMPLING

In [1]:
import os
import shutil
import random

# Source dataset and the directory that receives the downsampled copy
original_data_dir = 'PreprocessedDataset/train'
downsampled_data_dir = 'DownsampledDataset2'

# Number of samples to keep for class 2
desired_samples_class_2 = 500

# Create the downsampled data directory. exist_ok replaces the separate
# os.path.exists() check, which was racy and redundant.
os.makedirs(downsampled_data_dir, exist_ok=True)

# Class sub-directories to process
classes = ['0', '1', '2']

# Fraction to keep for classes 0 and 1, assuming each initially holds 2000
# samples; multiplied back by 2000 this yields the class-2 target.
downsampling_ratio = desired_samples_class_2 / 2000.0

# Copy a random subset of every class into the downsampled dataset.
# NOTE: files copied by an earlier run are not removed, so re-running this
# cell can leave more than the desired number of files per class.
for class_name in classes:
    # Create the subdirectory for the current class in the downsampled data directory
    class_dir = os.path.join(downsampled_data_dir, class_name)
    os.makedirs(class_dir, exist_ok=True)

    # Determine the number of samples to keep for the current class
    if class_name == '2':
        desired_samples = desired_samples_class_2
    else:
        desired_samples = int(2000 * downsampling_ratio)

    # List only regular files: a nested directory would make shutil.copy fail
    source_dir = os.path.join(original_data_dir, class_name)
    files = [
        name
        for name in os.listdir(source_dir)
        if os.path.isfile(os.path.join(source_dir, name))
    ]

    # random.sample raises ValueError when asked for more items than exist,
    # so cap the request at the number of files actually available.
    selected_files = random.sample(files, min(desired_samples, len(files)))

    # Copy the selected files to the downsampled data directory
    for file in selected_files:
        src = os.path.join(source_dir, file)
        dst = os.path.join(class_dir, file)
        shutil.copy(src, dst)
