In [16]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import numpy as np
import pandas as pd
import seaborn as sns
from PIL import Image
import os
from tqdm import tqdm
import cv2

import shutil

In [17]:
from google.colab import files

In [18]:
# Copy the Kaggle API token into ~/.kaggle.
# NOTE(review): expects kaggle.json in the current working directory — presumably
# uploaded beforehand; confirm.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write.
!chmod 600 ~/.kaggle/kaggle.json

In [19]:
# Download the competition archive (umor-classification.zip) with the Kaggle CLI.
!kaggle competitions download -c umor-classification

umor-classification.zip: Skipping, found more recently modified local copy (use --force to force download)


In [20]:
!unzip umor-classification.zip

Archive:  umor-classification.zip
replace images/images/1006.png? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

### Load the CSV files containing the data:

In [21]:
# Folder holding the extracted competition images; files are named "<Id>.png".
# NOTE(review): absolute path — presumably the Colab working directory; confirm before
# running elsewhere.
image_folder   = '/content/images/images/'

In [22]:
# Count every entry in the image folder (training and test images are read from the same directory).
print(f"There are {len(os.listdir(image_folder ))} images in  dataset")

There are 775 images in  dataset


In [23]:
# Metadata tables: both are read for their 'Id' column; the training table also
# carries the 'Class' label.
train_df  = pd.read_csv('trainset.csv')
test_df   = pd.read_csv('testset.csv')

In [24]:
# Class distribution of the training labels.
train_df['Class'].value_counts()

Benign       300
Malignant    148
Normal        91
Name: Class, dtype: int64

In [25]:
# Preview the first 50 rows of the training table.
train_df[:50]

Unnamed: 0.1,Unnamed: 0,Id,Class
0,211,8602,Malignant
1,315,7464,Malignant
2,165,5832,Malignant
3,188,2843,Malignant
4,710,5272,Benign
5,285,9436,Malignant
6,34,4050,Normal
7,708,7653,Benign
8,334,7048,Malignant
9,776,2885,Benign


In [26]:
# All entries in the image folder.
# NOTE(review): this name is rebuilt (filtered to .png files) in a later cell before
# any visible use of this value.
image_filenames = os.listdir(image_folder)

In [27]:
# Lookup table: image Id -> class label, built from the training table.
id_to_label = dict(zip(train_df['Id'], train_df['Class']))

In [28]:
def load_images_with_labels(df):
    """Load every readable image listed in ``df`` together with its class label.

    For each value of ``df['Id']`` that is a key of the module-level ``id_to_label``
    mapping, the file ``<image_folder>/<Id>.png`` is read with ``cv2.imread``.
    Ids without a label are skipped silently; missing or unreadable files are
    skipped with a printed warning.

    Args:
        df: DataFrame with an ``'Id'`` column.

    Returns:
        Tuple ``(images, labels)`` of NumPy arrays with equal length. ``images``
        is a regular stacked array when all images share one shape (or when no
        image was loaded); otherwise it is a 1-D ``object`` array with one image
        array per element.
    """
    images = []
    labels = []
    # Iterate the 'Id' column directly: DataFrame.iterrows() does not preserve dtypes
    # across a row, so integer ids can become floats (and "<Id>.png" turn into
    # "<Id>.0.png") when every column is numeric.
    for image_id in df['Id']:
        if image_id not in id_to_label:
            continue
        label = id_to_label[image_id]

        image_path = os.path.join(image_folder, f"{image_id}.png")
        if not os.path.exists(image_path):
            print(f"Warning: Image not found: {image_path}")
            continue

        image = cv2.imread(image_path)
        if image is None:  # cv2.imread signals a failed read by returning None
            print(f"Warning: Unable to load image: {image_path}")
            continue

        images.append(image)
        labels.append(label)

    # np.array() on images of differing shapes raises (or, on old NumPy, silently
    # builds an object array with a warning). Stack only when that is well defined.
    if len({image.shape for image in images}) <= 1:
        return np.array(images), np.array(labels)

    ragged = np.empty(len(images), dtype=object)
    for position, image in enumerate(images):
        ragged[position] = image
    return ragged, np.array(labels)

In [None]:
# Load the training images that can be read, with their labels.
X_train, y_train = load_images_with_labels(train_df)

In [None]:
# Inspect the labels returned by the loader.
y_train

In [None]:
# PNG files present in the image folder.
image_filenames = list(filter(lambda name: name.endswith(".png"), os.listdir(image_folder)))
# Integer ids taken from the numeric file stems (file name minus the 4-character ".png").
image_ids = list(map(int, filter(str.isnumeric, (name[:-4] for name in image_filenames))))

# Ids that belong to the training set, kept as a set for fast membership tests.
train_image_ids = set(train_df['Id'].tolist())

In [None]:
# Ids listed in the test CSV, as a plain Python list.
test_image_ids = test_df['Id'].tolist()

In [None]:
# Test ids that do not also appear in the training set (original order preserved).
remaining_test_image_ids = list(
    filter(lambda image_id: image_id not in train_image_ids, test_image_ids)
)

In [None]:
def load_remaining_test_images(image_ids):
    """Load ``<image_folder>/<Id>.png`` for every id in ``image_ids``.

    Images that ``cv2.imread`` cannot read are skipped with a printed warning
    (the same message the training loader prints). When that happens the result
    no longer lines up one-to-one with ``image_ids``.

    Args:
        image_ids: Iterable of image ids.

    Returns:
        NumPy array of the images that loaded, in input order. It is a regular
        stacked array when all images share one shape (or none loaded);
        otherwise a 1-D ``object`` array with one image array per element.
    """
    images = []
    for image_id in image_ids:
        image_path = os.path.join(image_folder, f"{image_id}.png")
        image = cv2.imread(image_path)
        if image is None:  # missing or unreadable file
            print(f"Warning: Unable to load image: {image_path}")
            continue
        images.append(image)

    # np.array() on images of differing shapes raises (or, on old NumPy, silently
    # builds an object array with a warning). Stack only when that is well defined.
    if len({image.shape for image in images}) <= 1:
        return np.array(images)

    ragged = np.empty(len(images), dtype=object)
    for position, image in enumerate(images):
        ragged[position] = image
    return ragged

In [None]:
# Load the test images whose ids are not part of the training set.
X_test = load_remaining_test_images(remaining_test_image_ids)

In [None]:
# Shape of the loaded test image array.
X_test.shape

In [None]:
from sklearn.utils import shuffle
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# One bucket per class label.
benign_images, malignant_images, normal_images = [], [], []
images_by_class = {
    "Benign": benign_images,
    "Malignant": malignant_images,
    "Normal": normal_images,
}

# Route each training image to the bucket matching its label; any other label is ignored.
for image, label in zip(X_train, y_train):
    if label in images_by_class:
        images_by_class[label].append(image)

In [None]:
# Show how many benign images were collected instead of dumping every pixel array.
len(benign_images)

In [None]:
# Show how many malignant images were collected instead of dumping every pixel array.
len(malignant_images)

In [None]:
# Show how many normal images were collected instead of dumping every pixel array.
len(normal_images)

In [None]:
# Define the image ID you want to plot
image_id_to_plot = 557

# Resolve the image by its ID, not by position: a row index of train_df is not a valid
# index into a per-class image list (that list only holds one class and skips images
# that failed to load), and the class must come from the table rather than be assumed.
matching_classes = train_df.loc[train_df['Id'] == image_id_to_plot, 'Class']
image_to_plot = None
if len(matching_classes) > 0:
    image_to_plot = cv2.imread(os.path.join(image_folder, f"{image_id_to_plot}.png"))

if image_to_plot is not None:
    plt.figure(figsize=(5, 5))
    # cv2.imread returns channels in BGR order; matplotlib expects RGB.
    plt.imshow(cv2.cvtColor(image_to_plot, cv2.COLOR_BGR2RGB))
    plt.title(f"Image ID: {image_id_to_plot} - Class: {matching_classes.iloc[0]}")
    plt.axis('off')
    plt.show()
else:
    print(f"Image ID not found: {image_id_to_plot}")

In [None]:
# Turn each per-class collection into a NumPy array.
benign_images, malignant_images, normal_images = (
    np.array(class_images)
    for class_images in (benign_images, malignant_images, normal_images)
)

In [None]:
# Number of images per class (length of each array's first axis).
benign_count = len(benign_images)
malignant_count = len(malignant_images)
normal_count = len(normal_images)


In [None]:
# Size of the largest class.
max_class_count = max([benign_count, malignant_count, normal_count])

In [None]:
# Size of the largest class; the minority classes are augmented up to this count.
max_class_count

In [None]:
# Function to perform data augmentation for medical images
def perform_data_augmentation(images, desired_count):
    """Create randomly transformed copies of ``images`` to reach ``desired_count``.

    Source images are drawn uniformly at random (with replacement) and passed
    through ``ImageDataGenerator.random_transform``. Only the newly generated
    images are returned; the originals are not included.

    Args:
        images: NumPy array of images, indexed along its first axis.
        desired_count: Target total number of images (originals + augmented).

    Returns:
        Array with ``max(desired_count - len(images), 0)`` augmented images. It
        has the same layout as ``images`` (a 1-D object array for object input,
        a stacked array otherwise), so it can be concatenated with ``images``
        even when it is empty.

    Raises:
        ValueError: If augmentation is needed but ``images`` is empty.
    """
    missing = desired_count - images.shape[0]
    if missing <= 0:
        # Empty slice keeps dtype and trailing dimensions, unlike np.array([]).
        return images[:0].copy()
    if images.shape[0] == 0:
        raise ValueError("cannot augment an empty image array")

    # The generator's settings never change, so build it once instead of per sample.
    datagen = ImageDataGenerator(
        rotation_range=15,
        width_shift_range=0.1,
        height_shift_range=0.1,
        shear_range=0.1,
        zoom_range=0.1,
        horizontal_flip=True,
        fill_mode='nearest'
    )

    augmented_images = []
    for _ in range(missing):
        random_index = np.random.randint(0, images.shape[0])
        augmented_images.append(datagen.random_transform(images[random_index]))

    if images.dtype == object:
        # Images may differ in shape: keep one array per element.
        ragged = np.empty(len(augmented_images), dtype=object)
        for position, augmented_image in enumerate(augmented_images):
            ragged[position] = augmented_image
        return ragged
    return np.array(augmented_images)


In [None]:
# Generate augmented images for the two minority classes, enough to bring each
# class up to max_class_count once combined with its original images.
augmented_malignant = perform_data_augmentation(malignant_images, max_class_count)
augmented_normal = perform_data_augmentation(normal_images, max_class_count)


In [None]:
# Concatenate the balanced data.
# Each minority class contributes its ORIGINAL images plus the augmented ones; keeping
# only the augmented copies would discard the real samples and leave the classes
# unbalanced (max_class_count - count images instead of max_class_count).
balanced_X_train = np.concatenate((
    benign_images,
    malignant_images, augmented_malignant,
    normal_images, augmented_normal,
))
# Label counts are derived from the arrays actually concatenated above so images and
# labels cannot drift apart.
balanced_y_train = np.concatenate((
    np.array(["Benign"] * len(benign_images)),
    np.array(["Malignant"] * (len(malignant_images) + len(augmented_malignant))),
    np.array(["Normal"] * (len(normal_images) + len(augmented_normal))),
))
assert len(balanced_X_train) == len(balanced_y_train), "images and labels are misaligned"


In [None]:
# Shuffle images and labels with the same permutation. A fixed random_state makes the
# order repeatable, matching the seed used for train_test_split later in the notebook.
balanced_X_train, balanced_y_train = shuffle(balanced_X_train, balanced_y_train, random_state=42)


In [None]:
# Inspect the shuffled label array.
balanced_y_train

In [None]:
# Show a handful of randomly chosen images from the balanced set, one figure per class.
unique_classes, class_counts = np.unique(balanced_y_train, return_counts=True)
num_images_to_visualize = 5

for class_name in unique_classes:
    # Positions of this class in the balanced set, then a random sample without repeats.
    class_indices = np.flatnonzero(balanced_y_train == class_name)
    random_indices = np.random.choice(class_indices, size=num_images_to_visualize, replace=False)

    class_fig = plt.figure(figsize=(15, 5))
    class_fig.suptitle(f"Augmented Images for Class: {class_name}", fontsize=16)

    for i, index in enumerate(random_indices):
        panel = class_fig.add_subplot(1, num_images_to_visualize, i + 1)
        panel.imshow(balanced_X_train[index])
        panel.set_title(f"Image {index}")
        panel.axis('off')

    plt.show()



In [None]:
import random

# Draw the same number of random positions from the raw and the balanced image arrays.
num_samples_to_visualize = 5
random_indices_original = random.sample(range(X_train.shape[0]), num_samples_to_visualize)
random_indices_balanced = random.sample(range(balanced_X_train.shape[0]), num_samples_to_visualize)

comparison_fig = plt.figure(figsize=(15, 10))
comparison_fig.suptitle("Comparison of Original and Augmented Images", fontsize=16)

for i, (original_index, balanced_index) in enumerate(
    zip(random_indices_original, random_indices_balanced)
):
    # Top row: image from the raw training array.
    top_panel = comparison_fig.add_subplot(2, num_samples_to_visualize, i + 1)
    top_panel.imshow(X_train[original_index])
    top_panel.set_title(f"Original Image {original_index}")
    top_panel.axis('off')

    # Bottom row: image from the balanced array.
    bottom_panel = comparison_fig.add_subplot(
        2, num_samples_to_visualize, num_samples_to_visualize + i + 1
    )
    bottom_panel.imshow(balanced_X_train[balanced_index])
    bottom_panel.set_title(f"Augmented Image {balanced_index}")
    bottom_panel.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Shape of the first image in the balanced set (before resizing).
balanced_X_train[0].shape

In [None]:
def _resize_and_normalize_image(image, desired_size):
    """Resize one image to ``desired_size`` and scale its values by 1/255.

    NOTE(review): the per-image helper this notebook previously called
    (``resize_and_normalize_image``) is not defined in any visible cell, so the
    batch function raised NameError on a fresh kernel. This implementation is
    the conventional reading of that name — confirm it matches the intended
    preprocessing.

    Args:
        image: Image array as loaded by ``cv2.imread`` / produced by augmentation.
        desired_size: ``(width, height)`` tuple as expected by ``cv2.resize``.

    Returns:
        ``float32`` array of the resized image divided by 255.
    """
    resized = cv2.resize(image, desired_size)
    # float32 halves the memory of the default float64 for the stacked dataset.
    return resized.astype(np.float32) / 255.0


def resize_and_normalize_images(image_array, desired_size):
    """Resize and normalize every image in ``image_array``.

    Args:
        image_array: Iterable of image arrays (shapes may differ between images).
        desired_size: ``(width, height)`` target size applied to every image.

    Returns:
        NumPy array stacking the processed images in input order; an empty
        array when ``image_array`` is empty.
    """
    return np.array(
        [_resize_and_normalize_image(image, desired_size) for image in image_array]
    )

# Target size handed to resize_and_normalize_images for both datasets.
desired_image_size = (256, 256)  # Adjust the desired size

# Resize and normalize the balanced training images.
normalized_balanced_X_train = resize_and_normalize_images(balanced_X_train, desired_image_size)

# Resize and normalize the test images with the same target size.
normalized_X_test = resize_and_normalize_images(X_test, desired_image_size)

In [None]:
# Inspect the first resized, normalized training image.
normalized_balanced_X_train[0]

In [None]:
le = LabelEncoder()
# Encode the string class names as integer ids. The input is balanced_y_train; the
# previously referenced name y_balanced_y_train is never defined and raised NameError.
# NOTE: this rebinds balanced_y_train, so re-running the cell re-fits the encoder on
# integers; restart from the balancing cell to get the class names back in le.classes_.
balanced_y_train = le.fit_transform(balanced_y_train)


In [None]:
# Hold out 20% of the balanced data for validation. stratify keeps the class
# proportions identical in both splits.
# NOTE(review): the balanced set already contains augmented copies created before this
# split, so a transformed variant of a training image can end up in the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    normalized_balanced_X_train,
    balanced_y_train,
    test_size=0.2,
    random_state=42,
    stratify=balanced_y_train,
)

In [None]:
# Shapes of the train/validation features and labels.
X_train.shape,X_val.shape,y_train.shape,y_val.shape

In [None]:
# Flattening: turn every image into one feature vector while keeping the sample axis.
# ndarray.flatten() would merge ALL images into a single 1-D vector, leaving the
# features out of step with the labels. reshape(n, -1) is also safe to re-run.
X_train = X_train.reshape(len(X_train), -1)
X_val = X_val.reshape(len(X_val), -1)
X_train.shape, X_val.shape

In [None]:
# Shape of the raw (not resized or normalized) test image array.
X_test.shape

In [None]:
# Flattening: one feature vector per test image, keeping the sample axis.
# ndarray.flatten() would merge all test images into a single 1-D vector.
normalized_X_test = normalized_X_test.reshape(len(normalized_X_test), -1)
normalized_X_test.shape