In [1]:
!pip install opencv-python




In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt
import matplotlib as mpl
from pylab import rcParams
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve
from skimage.io import imread, imshow
from skimage.transform import resize
from skimage.feature import hog
from skimage import exposure, filters, color
from keras.models import Sequential
from keras.metrics import categorical_accuracy
from keras import losses
from keras.layers import Conv2D, Dense, GlobalMaxPooling2D, MaxPooling2D, BatchNormalization
from keras.layers import Add, Input, ZeroPadding2D, Activation, Flatten, Dropout
from keras.models import Sequential, Model
import keras.backend as K
import tensorflow as tf
import random
from keras.applications.vgg16 import VGG16

%config InlineBackend.figure_format = 'retina'
%matplotlib inline


In [4]:
'''
LOADING DATA AND PREPROCESSING
'''

def load_data(dir_data, dir_labels, training=True):
    labels_pd = pd.read_csv(dir_labels)
    ids = labels_pd.id.values
    data = []
    for identifier in ids:
        fname = dir_data + identifier.astype(str) + '.tif'
        image = mpl.image.imread(fname)
        data.append(image)
    data = np.array(data)
    if training:
        labels = labels_pd.label.values
        return data, labels
    else:
        return data, ids

# Define dataset directories
DIR_TRAIN_IMAGES = "/content/drive/MyDrive/Data_Science/01_Projects/02_VGG_CNN_KNN_LR/Data/training/"
DIR_TEST_IMAGES  = "/content/drive/MyDrive/Data_Science/01_Projects/02_VGG_CNN_KNN_LR/Data/testing/"
DIR_TRAIN_LABELS = "/content/drive/MyDrive/Data_Science/01_Projects/02_VGG_CNN_KNN_LR/Data/labels_training.csv"
DIR_TEST_IDS     = "/content/drive/MyDrive/Data_Science/01_Projects/02_VGG_CNN_KNN_LR/Data/sample_submission.csv"

# Load train and test data
X, y = load_data(DIR_TRAIN_IMAGES, DIR_TRAIN_LABELS)
X_test, test_ids = load_data(DIR_TEST_IMAGES, DIR_TEST_IDS, training=False)

# Resize each image in X_train to (100, 100)
X = [cv2.resize(img, (100, 100)) for img in X]
X = np.array(X)

# Print data shape
print('X shape:\n', X.shape)
print('X_test shape:\n', X_test.shape)

# Check the number of samples in each class
print('Distribution of y', np.bincount(y))

# Scale pixel values between 0 and 1
X = X / 255.0
X_test = X_test / 255.0




X shape:
 (1500, 100, 100, 3)
X_test shape:
 (558, 101, 101, 3)
Distribution of y [995 505]


In [10]:
from sklearn.utils import resample

# Check the number of samples in each class
class_counts = np.bincount(y)
print('Distribution of y:', class_counts)

# Determine the minority class size
minority_class_size = np.min(class_counts)

# Balance the data using random selection
balanced_X = []
balanced_y = []

for class_label in np.unique(y):
    class_indices = np.where(y == class_label)[0]
    sampled_indices = resample(class_indices, n_samples=minority_class_size, random_state=42)
    balanced_X.extend(X[sampled_indices])
    balanced_y.extend(y[sampled_indices])

# Convert balanced data to NumPy arrays
X = np.array(balanced_X)
y = np.array(balanced_y)

# Shuffle the data
shuffle_indices = np.arange(len(y))
np.random.shuffle(shuffle_indices)
X = X[shuffle_indices]
y = y[shuffle_indices]

# Now the data is balanced
print('Balanced X shape:', X.shape)
print('Balanced y distribution:', np.bincount(y))

# Check the number of samples in each class
class_counts = np.bincount(y)
print('Balanced y distribution:', class_counts)

# Check if the data is balanced
is_balanced = np.all(class_counts == class_counts[0])

if is_balanced:
    print('The data is balanced.')
else:
    print('The data is imbalanced.')


Distribution of y: [505 505]
Balanced X shape: (1010, 100, 100, 3)
Balanced y distribution: [505 505]
Balanced y distribution: [505 505]
The data is balanced.


In [None]:
import matplotlib.pyplot as plt

def show_images(image_ids, num_images, title):
    '''
    Display a subset of images from the image_ids data
    '''
    rcParams['figure.figsize'] = 20, 4
    plt.figure()
    n = 1
    for i in image_ids[0:num_images]:
        plt.subplot(1, num_images, n)
        plt.imshow(X[i, :, :, :])
        plt.title('Image id: ' + str(i))
        plt.xticks([])
        plt.yticks([])
        n += 1

    plt.suptitle(title, fontsize=15)
    plt.show()

# You can then use this function to visualize images as shown in your code.

In [11]:
from keras.preprocessing.image import ImageDataGenerator

# Initialize the ImageDataGenerator with your desired augmentations
datagen = ImageDataGenerator(
    rotation_range=40,  # Random rotation in the range [0, 40] degrees
    width_shift_range=0.2,  # Random horizontal shift by up to 20% of the image width
    height_shift_range=0.2,  # Random vertical shift by up to 20% of the image height
    shear_range=0.2,  # Shear intensity (shear angle in radians)
    zoom_range=0.2,  # Random zoom in the range [0.8, 1.2]
    horizontal_flip=True,  # Randomly flip images horizontally
    fill_mode='nearest'  # Filling mode for pixel values outside the boundary
)

# Specify the directory where the images are located
image_directory = "/content/drive/MyDrive/Data_Science/01_Projects/02_VGG_CNN_KNN_LR/Data/training/"

# Load and preprocess your data (X and y) if not done already

# Specify where the augmented images should be saved
save_directory = "/content/drive/MyDrive/Data_Science/01_Projects/02_VGG_CNN_KNN_LR/Data/Augmented_images/"

# Define the number of augmented images to generate per original image
num_augmented_images = 5

# Apply data augmentation and save augmented images
for i in range(len(X)):
    img = X[i]
    img = img.reshape((1,) + img.shape)  # Reshape to (1, height, width, channels)

    for j, _ in enumerate(datagen.flow(img, batch_size=1)):
        if j >= num_augmented_images:
            break  # Stop after generating the specified number of augmented images

        # Convert augmented image to the original shape
        augmented_img = _[0].reshape(img.shape[1:])

        # Save the augmented image
        mpl.image.imsave(f"{save_directory}/augmented_image_{i}_{j}.png", augmented_img)

# The augmented images will be saved in the 'save_directory' with filenames like 'augmented_image_0_0.png', 'augmented_image_0_1.png', etc.


In [12]:
from keras.preprocessing.image import ImageDataGenerator

# Initialize the ImageDataGenerator with your desired augmentations
datagen = ImageDataGenerator(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Load and preprocess your data (X and y) if not done already

# Count the number of original images
num_original_images = len(X)

# Define the number of augmented images to generate per original image
num_augmented_images = 5

# Calculate the total number of images before augmentation
total_images_before_augmentation = num_original_images

# Calculate the total number of images after augmentation
total_images_after_augmentation = num_original_images * (1 + num_augmented_images)

# Print the counts
print("Number of original images:", num_original_images)
print("Number of images to generate per original image:", num_augmented_images)
print("Total number of images before augmentation:", total_images_before_augmentation)
print("Total number of images after augmentation:", total_images_after_augmentation)


Number of original images: 1010
Number of images to generate per original image: 5
Total number of images before augmentation: 1010
Total number of images after augmentation: 6060
