### Importing Libraries

In [2]:
import os
import cv2
import pandas as pd
import cupy as cp  # CuPy for GPU-based NumPy operations
import numpy as np
import tensorflow as tf
import scipy
from skimage.feature import local_binary_pattern
from skimage.filters import gabor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from tensorflow.keras.preprocessing.image import ImageDataGenerator

### Feature Extraction

**Define the paths for the images**

In [3]:
# Define paths
dataset_dir = "textures 3"
categories = ['cotton', 'corduroy', 'denim', 'linin', 'wool']

**Canny Edge Detection**

In [4]:
def extract_canny_edge_detection(image):
    """ image must be passes to the function in grayscale"""
    # Step 1: Enhance contrast (optional)
    equalized_image = cp.asarray(cv2.equalizeHist(cp.asnumpy(image)))
    
    # Step 2: Apply Gaussian Blur to reduce noise
    blurred_image = cp.asarray(cv2.GaussianBlur(cp.asnumpy(equalized_image), (3, 3), 1))
    
    # Step 3: Apply Canny Edge Detection with adjusted thresholds (convert back and forth)
    edges = cp.asarray(cv2.Canny(cp.asnumpy(blurred_image), 30, 30))

    return edges

**Gabor Filtering**

In [5]:
def extract_gabor_filters(image):
    """ image must be in grayscale"""
    
    def build_kernels():
        # Parameters
        gabor_kernels = []
        angles = [0, cp.pi/4, cp.pi/2, 3*cp.pi/4]  # Use CuPy for angles
        ksize = 31  # Size of the filter
        sigma = 4.0  # Standard deviation of the Gaussian envelope
        lambd = 10.0  # Wavelength of the sinusoidal factor
        gamma = 0.5  # Spatial aspect ratio
        psi = 0  # Phase offset

        # Create Gabor kernels
        for theta in np.deg2rad([45, 135]):  # Convert degrees to radians
            kernel = cp.asarray(cv2.getGaborKernel((ksize, ksize), sigma, theta, lambd, gamma, psi, ktype=cv2.CV_32F)) # Using Cupy array
            gabor_kernels.append(kernel)

        return gabor_kernels


    gabor_kernels = build_kernels()
    
    gabor_features = []

    for kernel in gabor_kernels:
        fimg = cp.asarray(cv2.filter2D(cp.asnumpy(image), cv2.CV_8UC3, cp.asnumpy(kernel)))
        gabor_features.append(fimg)

    gabor_features = cp.array(gabor_features).flatten()

    return gabor_features

**Local Binary Pattern**

In [6]:
def extract_local_binary_pattern(image):

    # Parameters 
    radius = 1
    n_points = 8 * radius

    
    lbp = local_binary_pattern(cp.asnumpy(image), n_points, radius, method="uniform")
    (hist, _) = cp.histogram(cp.asarray(lbp).ravel(), bins=cp.arange(0, n_points + 3),
                             range=(0, n_points + 2))
    hist = hist.astype("float")
    hist /= (hist.sum() + 1e-6)

    return hist

**Feature Extraction**

In [7]:
# Function to extract features from an image
def extract_features(image):
    # Convert to grayscale using CuPy arrays
    gray = cp.asarray(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY))

    # Canny edge detection
    edges = extract_canny_edge_detection(gray)
    
    # Gabor Filter responses
    gabor_features=  extract_gabor_filters(gray)
    
    # Local Binary Patterns (LBP)
    hist = extract_local_binary_pattern(gray)
    
    # Combine features: edges, Gabor, and LBP
    features = cp.hstack([edges.flatten(), gabor_features, hist])
    features = cp.asnumpy(features)
    
    return features # Return features back as NumPy array for further processing

### **Feature Extraction Method 1**


1. **Data Structures**
  - `data`: Stores extracted features from augmented images.
  - `labels`: Stores corresponding labels for each image.

2. **Feature Extraction and Augmentation**

- For each image:
  - Resizes to 128x128 pixels and prepares for augmentation.
  - Initializes an augmentation iterator.
  - Generates 4 augmented versions
  - **Features are extracted only from the augmented images** and appending them to `data` and `labels`.

In [None]:
# Prepare dataset and labels
data = []
labels = []

# Image Augmentation using TensorFlow
datagen = ImageDataGenerator(
    #rotation_range=15,        # Random rotations up to 15 degrees
    #width_shift_range=0.1,    # Horizontal shifts
    #height_shift_range=0.1,   # Vertical shifts
    horizontal_flip=True,     # Flip images horizontally
    vertical_flip=True,       # Flip images vertically
    #zoom_range=0.2,           # Random zoom
    brightness_range=[0.8, 1.2], # Brightness adjustment
    shear_range=0.4           # Shear transformation
)

for num,category in enumerate(categories):
    path = os.path.join(dataset_dir, category)
    label = category
    
    for count,img_name in enumerate(os.listdir(path),start=1):

        if count==2:
            print(f"Shape of the features: {features.shape}")


        img_path = os.path.join(path, img_name)
        image = cv2.imread(img_path)
        
        try:
            # Apply data augmentation and extract features
            image = cv2.resize(image, (128, 128))  # Resize to a fixed size
            image = np.expand_dims(image, axis=0)  # Prepare for augmentation
            aug_iter = datagen.flow(image, batch_size=1)

            # Perform 4 augmentations per image
            for _ in range(4):
                aug_img = next(aug_iter)[0].astype(np.uint8)
                features = extract_features(aug_img)
                data.append(features)
                labels.append(label)


        except Exception as e:
            print(img_path,img_name)


### **Feature Extraction Method 2**

1. **Data Structures**
- **`data`**: Stores extracted features for both original and augmented images.
- **`labels`**: Stores labels corresponding to each image.
- **`groups`**: Tracks group IDs, assigning each original image and its augmentations the same group ID.


2. **Group ID Management**
- A unique `group_id` is assigned to each original image and its corresponding augmentations.
- The `group_id` is incremented for each new image processed.
- This is required in the step of nested cross validation to allow the same group of images to be either in the training folds or the testing fold , not both.

3. **Feature Extraction**
- **Features are extracted for the original images as well as their augmented versions.**
- For each original image, we have 4 augmentations.




In [8]:
# Prepare dataset, labels, and groups
data = []
labels = []
groups = []  # This will store the group IDs

# Image Augmentation using TensorFlow
datagen = ImageDataGenerator(
    horizontal_flip=True,     
    vertical_flip=True,       
    brightness_range=[0.8, 1.2], 
    shear_range=0.4           
)

group_id = 0  # Initialize group ID

for category in categories:
    path = os.path.join(dataset_dir, category)
    label = category
    
    for count, img_name in enumerate(os.listdir(path), start=1):
        
        img_path = os.path.join(path, img_name)
        image = cv2.imread(img_path)
        
        try:
            # Apply data augmentation and extract features
            image = cv2.resize(image, (128, 128))  # Resize to a fixed size
            
            # Extract features from the original image
            original_features = extract_features(image)
            data.append(original_features)
            labels.append(label)
            groups.append(group_id)  # Assign the group ID to the original image

            # Prepare for augmentation
            image = np.expand_dims(image, axis=0)
            aug_iter = datagen.flow(image, batch_size=1)

            # Perform 4 augmentations per image
            for _ in range(4):
                aug_img = next(aug_iter)[0].astype(np.uint8)
                features = extract_features(aug_img)
                data.append(features)
                labels.append(label)
                groups.append(group_id)  # Assign the same group ID to the augmentations

            # Increment group ID for the next image and its augmentations
            group_id += 1

        except Exception as e:
            print(f"Error processing image: {img_path}, {img_name}")


print()

Error processing image: textures 3/corduroy/.ipynb_checkpoints, .ipynb_checkpoints


Storing the data, labels and groups as memory mapped arrays

In [9]:
# Convert lists to NumPy arrays
final_data = np.array(data)
final_labels = np.array(labels)
final_groups = np.array(groups)


if not os.path.exists("Extracted_features"):
    os.makedirs("Extracted_features")


data_mmap = np.memmap('Extracted_features\\final_data_mmap.dat', dtype='float32', mode='w+', shape=final_data.shape)
labels_mmap = np.memmap('Extracted_features\\final_labels_mmap.dat', dtype='int', mode='w+', shape=final_labels.shape)
groups_mmap = np.memmap('Extracted_features\\final_groups_mmap.dat', dtype='int', mode='w+', shape=final_groups.shape)

# Copy data to memory-mapped files (you can do this in batches)
data_mmap[:] = final_data[:]
labels_mmap[:] = final_labels[:]
groups_mmap[:] = final_groups[:]

# Flush to disk
data_mmap.flush()
labels_mmap.flush()
groups_mmap.flush()

: 

### **Feature Extraction Method 3**


Storing the extracted features in batches

This script processes images in batches for efficient memory usage, performing image augmentation, feature extraction, and saving the data incrementally using NumPy arrays.


1. **Batch Saving**
- Data is saved in batches once `batch_size` (1000 images) is reached:
  1. Saves the current batch of data to a compressed `.npz` file using `np.savez_compressed()`.
  2. **Memory Management**: Clears the `data`, `labels`, and `groups` lists, and forces garbage collection (`gc.collect()`) to free memory before processing the next batch.

2. **Final Data Save**
- After processing all images, if any remaining data exists that hasn't been saved, the script writes the final batch to disk.

3. **Summary**
This script efficiently processes images by augmenting them, extracting features, and saving the data in batches to avoid memory overload. It also ensures proper memory management using garbage collection after each batch.


In [10]:
import numpy as np
import os
import cv2
import gc
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Initialize batch size for saving
batch_size = 1000  # You can adjust this based on memory capacity
save_count = 0  # To track saved files

# Initialize arrays
data = []
labels = []
groups = []  # This will store the group IDs

group_id = 0  # Initialize group ID

# Image Augmentation using TensorFlow
datagen = ImageDataGenerator(
    horizontal_flip=True,     
    vertical_flip=True,       
    brightness_range=[0.8, 1.2], 
    shear_range=0.4           
)

for category in categories:
    path = os.path.join(dataset_dir, category)
    label = category

    for count, img_name in enumerate(os.listdir(path), start=1):
        
        img_path = os.path.join(path, img_name)
        image = cv2.imread(img_path)
        
        try:
            # Apply data augmentation and extract features
            image = cv2.resize(image, (128, 128))  # Resize to a fixed size
            
            # Extract features from the original image
            original_features = extract_features(image)
            data.append(original_features)
            labels.append(label)
            groups.append(group_id)  # Assign the group ID to the original image

            # Prepare for augmentation
            image = np.expand_dims(image, axis=0)
            aug_iter = datagen.flow(image, batch_size=1)

            # Perform 4 augmentations per image
            for _ in range(4):
                aug_img = next(aug_iter)[0].astype(np.uint8)
                features = extract_features(aug_img)
                data.append(features)
                labels.append(label)
                groups.append(group_id)  # Assign the same group ID to the augmentations

            # Increment group ID for the next image and its augmentations
            group_id += 1

            # Save data in batches
            if len(data) >= batch_size:
                # Save current batch to an .npz file
                np.savez_compressed(f'Extracted_features\\dataset_batch_{save_count}.npz', data=np.array(data), labels=np.array(labels), groups=np.array(groups))
                save_count += 1

                # Clear memory by resetting the arrays and forcing garbage collection
                data.clear()
                labels.clear()
                groups.clear()
                gc.collect()  # Force garbage collection to free memory

        except Exception as e:
            print(f"Error processing image: {img_path}, {img_name}")

# Save any remaining data if exists after loop ends
if data:
    np.savez_compressed(f'Extracted_features\\dataset_batch_{save_count}.npz', data=np.array(data), labels=np.array(labels), groups=np.array(groups))
    data.clear()
    labels.clear()
    groups.clear()
    gc.collect()  # Clean up the remaining memory


Error processing image: textures 3/corduroy/.ipynb_checkpoints, .ipynb_checkpoints


**Loading the data, labels and groups**

In [None]:
import os
import numpy as np

# Directory where your .npz files are stored
npz_directory = './'  # Replace with the correct directory path if necessary

# Automatically list all .npz files in the directory
npz_files = [f for f in os.listdir(npz_directory) if f.endswith('.npz')]

# Initialize empty lists to store the extracted data
all_data = []
all_labels = []
all_groups = []

# Iterate through each saved .npz file and load the arrays
for file in npz_files:
    file_path = os.path.join(npz_directory, file)
    # Load the saved .npz file
    with np.load(file_path) as data:
        all_data.append(data['data'])    # Append 'data' array
        all_labels.append(data['labels']) # Append 'labels' array
        all_groups.append(data['groups']) # Append 'groups' array

# Now concatenate all arrays to create final datasets
final_data = np.concatenate(all_data, axis=0)
final_labels = np.concatenate(all_labels, axis=0)
final_groups = np.concatenate(all_groups, axis=0)

# Your final arrays are now ready to use
print("Final data shape:", final_data.shape)
print("Final labels shape:", final_labels.shape)
print("Final groups shape:", final_groups.shape)


: 

--------------------------------End----------------------------------

MOVE TO model_training.ipynb

**Loading the saved data and labels**

In [None]:
# Load from the .npz file
loaded_data = np.load('Extracted_features\\data.npz')
loaded_labels = np.load('Extracted_features\\labels.npz')
loaded_groups = np.load('Extracted_features\\groups.npz')

data_loaded = loaded_data["data"]
labels_loaded = loaded_labels["labels"]
groups_loaded = loaded_labels["groups"]

### **Feature Extraction Method 4**

(using data,labels and groups directly as ndarrays rather than lists to reduce RAM usage for conversion to arrays from lists)

The ndarrays are memory mapped reducing memory usage

**Key Feature: Memory-Mapped Arrays**

- Creates a folder for storing the features, labels, and group IDs if it doesn't exist.
- Preallocates memory-mapped arrays for efficient handling of large datasets:
  - **`data_file`**: Stores image features.
  - **`labels_file`**: Stores image labels.
  - **`groups_file`**: Stores group IDs to keep track of the relationship between original and augmented images.
- The total number of images (`total_images = 17,780`) and the feature size (`49162`) are predefined.


This first snippet counts the number of orginal images, number of augmented images and the total no of features extracted from each image.<br>
Only when we know the feature size can we create a memory mapped array in advance with a defined shape.

In [2]:

# Counting the number of rows(images) and features
import os

# Initialize counters
total_original_images = 0
total_augmented_images = 0
augmentation_per_image = 4  # As you're doing 4 augmentations per image
feature_size = None  # To store the size of features (columns)

for category in categories:
    path = os.path.join(dataset_dir, category)
    
    for count, img_name in enumerate(os.listdir(path), start=1):
        img_path = os.path.join(path, img_name)
        image = cv2.imread(img_path)
        
        try:
            image = cv2.resize(image, (128, 128))  # Resize to a fixed size
            original_features = extract_features(image)
            
            # Determine feature size (columns) from the first image processed
            if feature_size is None:
                feature_size = original_features.shape[0]  # Assuming 1D feature vector
            
            # Increment counts for original and augmented images
            total_original_images += 1
            total_augmented_images += augmentation_per_image

        except Exception as e:
            print(f"Error processing image: {img_path}, {img_name}")

# Total number of images = original + augmented
total_images = total_original_images + total_augmented_images

# Print the results
print(f"Total original images: {total_original_images}")
print(f"Total augmented images: {total_augmented_images}")
print(f"Total images including augmentations: {total_images}")
print(f"Number of features per image: {feature_size}")


NameError: name 'categories' is not defined

This sceond snippet performs the real task of feature extraction storing the data, labels and groups as memory mapped arrays.<br>
Shape of data - (total_images,feature_size)<br>
Shape of labels - (total_images,)<br>
Shape of groups - (total_images,)

In [8]:
# Image Augmentation using TensorFlow
datagen = ImageDataGenerator(
    horizontal_flip=True,     
    vertical_flip=True,       
    brightness_range=[0.8, 1.2], 
    shear_range=0.4           
)

# Step 1: Create "Extracted_features" folder if it doesn't exist
output_dir = 'Extracted_features'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    print(f"Created directory: {output_dir}")

# Step 2: Define file paths for the memory-mapped arrays inside the new folder
data_file = os.path.join(output_dir, 'data_file.dat')
labels_file = os.path.join(output_dir, 'labels_file.dat')
groups_file = os.path.join(output_dir, 'groups_file.dat')
# Known values
total_images = 17780  # Total original images + augmentations
feature_size = 49162   # Number of features per image

# Step 1: Preallocate memory-mapped arrays
data = np.memmap(data_file, dtype='float32', mode='w+', shape=(total_images, feature_size))
labels = np.memmap(labels_file, dtype='object', mode='w+', shape=(total_images,))
groups = np.memmap(groups_file, dtype='int32', mode='w+', shape=(total_images,))

# Step 2: Feature Extraction with memory mapping
group_id = 0  # Initialize group ID
image_counter = 0  # Keep track of which row we are filling in the arrays

for category in categories:
    path = os.path.join(dataset_dir, category)
    label = category
    
    for count, img_name in enumerate(os.listdir(path), start=1):
        img_path = os.path.join(path, img_name)
        image = cv2.imread(img_path)
        
        try:
            image = cv2.resize(image, (128, 128))  # Resize to a fixed size
            
            # Extract features from the original image
            original_features = extract_features(image)
            
            # Store the original image's features, label, and group ID in the memory-mapped arrays
            data[image_counter, :] = original_features  # Fill in the row corresponding to the current image
            labels[image_counter] = label
            groups[image_counter] = group_id
            image_counter += 1  # Increment counter to fill the next row

            # Prepare for augmentation
            image = np.expand_dims(image, axis=0)
            aug_iter = datagen.flow(image, batch_size=1)

            # Perform 4 augmentations per image
            for _ in range(4):
                aug_img = next(aug_iter)[0].astype(np.uint8)
                features = extract_features(aug_img)
                
                # Store augmented image features, label, and group ID
                data[image_counter, :] = features
                labels[image_counter] = label
                groups[image_counter] = group_id
                image_counter += 1  # Increment counter for the next augmented image

            # Increment group ID for the next set of images
            group_id += 1

        except Exception as e:
            print(e)

# Flush the changes to disk to ensure everything is saved
data.flush()
labels.flush()
groups.flush()


OpenCV(4.10.0) /io/opencv/modules/imgproc/src/resize.cpp:4152: error: (-215:Assertion failed) !ssize.empty() in function 'resize'



**Freeing up the memory**

In [11]:
import gc
gc.collect()
del data
del labels
del groups

**Load the data,labels and groups**

In [3]:
import numpy as np
import os

# Memory-mapped file paths
output_dir = 'Extracted_features'
data_file = os.path.join(output_dir, 'data_file.dat')
labels_file = os.path.join(output_dir, 'labels_file.dat')
groups_file = os.path.join(output_dir, 'groups_file.dat')

# Known values
total_images = 17780  # Total original images + augmentations
feature_size = 49162   # Number of features per image 

# Load the memory-mapped arrays for reading
data = np.memmap(data_file, dtype='float32', mode='r', shape=(total_images, feature_size))
labels = np.memmap(labels_file, dtype='object', mode='r', shape=(total_images,))
groups = np.memmap(groups_file, dtype='int32', mode='r', shape=(total_images,))

In [4]:
print(data.shape)

(17780, 49162)


**Data** contains the input data (X) <br>
**Labels** contains the output data (Y) 

In [4]:
# Checking the shape of the dataset and the labels
print(f"Dataset shape: {data_loaded.shape}")
print(f"Labels shape: {labels_loaded.shape}")
print(f"Groups shape: {groups_loaded.shape}")

Dataset shape: (17780, 49162)
Labels shape: (17780,)
