### Imports

In [7]:
import numpy as np
import matplotlib.pyplot as plt
import glob
import cv2
import os
import seaborn as sns
import pandas as pd
from skimage.filters import sobel
import shutil
import random
import warnings
from sklearn import preprocessing
import itertools

### Paths
After downloading the files from the link: 
change the root of each path below to match where you saved the files, and leave the trailing `\painting-to-artist\dataset` or `\painting-to-artist\workspace` part as is.

In [8]:
# Folder holding the downloaded dataset; sample_images reads one sub-folder per artist from it.
dataset_path = os.path.abspath(r'D:\Program Files (x86)\painting-to-artist\dataset')
# Folders that receive the sampled copies used for training and for testing.
training_path = os.path.abspath(r'D:\Program Files (x86)\painting-to-artist\workspace\train')
testing_path = os.path.abspath(r'D:\Program Files (x86)\painting-to-artist\workspace\test')

### Sampling the Images

In [9]:
def sample_images(data_path, train_path, test_path, n, ratio=0.8):
    """
    Samples images from the dataset path into the training path and the testing path.

    Every sub-directory of ``data_path`` is treated as one artist. For each artist the
    image files are shuffled and then copied into ``train_path/<artist>`` and
    ``test_path/<artist>``. Entries of ``data_path`` that are not directories, and entries
    of an artist folder that are not regular files, are ignored.

    :param data_path: The path to the data (one sub-directory per artist).
    :param train_path: The path to which the training images will be copied.
    :param test_path: The path to which the testing images will be copied.
    :param n: The number of images aimed to be sampled per artist (training + testing).
    :param ratio: The wanted ratio from the sampled data to be used for training. The rest will be used for testing.
    :return: None
    :raises ValueError: If ``ratio`` is not strictly between 0 and 1, or if ``n`` is negative.
    """

    if ratio >= 1 or ratio <= 0:
        raise ValueError("The ratio should be between 0 and 1 (non-inclusive)")

    if n < 0:
        raise ValueError("The number of images to sample cannot be negative")

    if ratio < 0.5:
        warnings.warn("The ratio is less than 0.5, not advised for good training")

    for artist in os.listdir(data_path):
        artist_path = os.path.join(data_path, artist)

        # Stray files next to the artist folders (readme, csv, ...) are not artists.
        if not os.path.isdir(artist_path):
            continue

        # Only regular files can be copied with shutil.copy; nested folders are skipped.
        images = [
            name for name in os.listdir(artist_path)
            if os.path.isfile(os.path.join(artist_path, name))
        ]
        random.shuffle(images)  # shuffle in place so the split below is random

        ## Adjust the number of images based on availability
        n_train = min(int(n * ratio), int(len(images) * ratio))  # training share, capped by what exists
        n_test = min(n - n_train, len(images) - n_train)         # remainder, capped by what is left

        ## Partition the images to training and testing (disjoint slices of the shuffled list)
        train_images = images[:n_train]
        test_images = images[n_train:n_train + n_test]

        ## Create the directories for training and testing
        artist_train_path = os.path.join(train_path, artist)
        artist_test_path = os.path.join(test_path, artist)
        os.makedirs(artist_train_path, exist_ok=True)
        os.makedirs(artist_test_path, exist_ok=True)

        ## Copy the selected images into their destination directories
        for image in train_images:
            shutil.copy(os.path.join(artist_path, image), os.path.join(artist_train_path, image))

        for image in test_images:
            shutil.copy(os.path.join(artist_path, image), os.path.join(artist_test_path, image))

In [23]:
# Sample the dataset: aim for 50 images per artist, 80% of them for training.
sample_images(dataset_path, training_path, testing_path, n=50, ratio=0.8)

In [11]:
def clear_files(directory):
    """
    Deletes every file found under the given directory, at any depth.

    The tree is walked with os.walk, so files placed directly inside ``directory`` are
    removed as well as files inside its subdirectories. The directories themselves are kept.
    :param directory: The directory whose files should be removed.
    :return: None
    """
    for parent, _subdirs, filenames in os.walk(directory):
        for filename in filenames:
            os.remove(os.path.join(parent, filename))

In [49]:
# Delete the previously sampled files (the artist folders themselves are kept).
clear_files(directory=training_path)
clear_files(directory=testing_path)

### Data Preprocessing
This section includes methods used to preprocess the data before feeding it to the model of our choice.
The result will be a list of vectorized images and their corresponding labels.

In [12]:
def _load_resized_images(root_path, size):
    """
    Reads the ``*.jpg`` files of every sub-folder of ``root_path`` and resizes them.
    :param root_path: The folder containing one sub-folder per label (artist).
    :param size: The row/column size to which the images will be resized.
    :return: A numpy array of the resized images and a numpy array of their labels (same order).
    """
    images = []
    labels = []

    for directory_path in glob.glob(os.path.join(root_path, '*')):
        # The folder name is the artist name. basename handles the separator of the
        # running platform, unlike splitting on a hard-coded backslash.
        label = os.path.basename(os.path.normpath(directory_path))

        for img_path in glob.glob(os.path.join(directory_path, "*.jpg")):
            img = cv2.imread(img_path, cv2.IMREAD_COLOR)  # Read the image in color (BGR format)
            if img is None:
                # cv2.imread returns None instead of raising when a file cannot be decoded.
                warnings.warn("Skipping unreadable image: {}".format(img_path))
                continue
            images.append(cv2.resize(img, (size, size)))  # Resize with OpenCV's default interpolation
            labels.append(label)                          # Same index as the image it describes

    return np.array(images), np.array(labels)


def resize_and_split(train_path, test_path, size=128):
    """
    Resizes the images to the given size and loads them as the training and testing datasets.

    Both paths are expected to contain one sub-folder per artist; the sub-folder name is
    used as the label of every ``*.jpg`` image inside it. Images that cannot be read are
    skipped with a warning.
    :param train_path: The path from which the training images will be taken.
    :param test_path: The path from which the testing images will be taken.
    :param size: The row/column size to which the images will be resized.
    :return: Numpy arrays of the training images, training labels, testing images and testing labels.
    """
    train_images, train_labels = _load_resized_images(train_path, size)
    test_images, test_labels = _load_resized_images(test_path, size)

    return train_images, train_labels, test_images, test_labels

In [25]:
# Load the sampled images, resized to 128x128, together with their artist labels.
x_train, y_train, x_test, y_test = resize_and_split(training_path, testing_path, size=128)

In [26]:
def label_data(train_labels, test_labels):
    """
    Converts the labels to numbers for the model to be able to process them, utilizing sklearn LabelEncoder.

    A single encoder is fitted on the labels of both datasets, so the same artist is
    always mapped to the same number in the training and in the testing labels, even
    when one of the datasets is missing some artists.
    :param train_labels: The labels of the images in the training dataset.
    :param test_labels: The labels of the images in the testing dataset.
    :return: The encoded labels for both the training and testing datasets.
    """
    le = preprocessing.LabelEncoder()

    # Fit once on the union of labels: fitting separately per dataset yields two
    # unrelated mappings whenever the two label sets differ.
    le.fit(list(train_labels) + list(test_labels))

    train_labels_encoded = le.transform(train_labels)
    test_labels_encoded = le.transform(test_labels)

    return train_labels_encoded, test_labels_encoded

In [27]:
# Encode the artist names as integers.
y_train_encoded, y_test_encoded = label_data(train_labels=y_train, test_labels=y_test)

In [28]:
def _minmax_scale_image(img):
    """
    Scales a single image to the range [0, 1] using its own minimum and maximum.
    :param img: The image to scale.
    :return: The scaled image as float32. A constant image is returned as all zeros.
    """
    img = np.asarray(img, dtype=np.float32)
    min_val = img.min()
    value_range = img.max() - min_val

    # A constant image has no range; dividing would give 0/0 = NaN for every pixel.
    if value_range == 0:
        return np.zeros_like(img)

    return (img - min_val) / value_range


def minmax_normalize(train_images, test_images):
    """
    Normalizes the data to be between 0 and 1 using the min-max normalization.

    Each image is normalized independently, using the minimum and maximum taken over
    all of its values.
    :param train_images: The training dataset.
    :param test_images: The testing dataset.
    :return: Normalized training and testing datasets (float32 numpy arrays).
    """
    train_images_normalized = [_minmax_scale_image(img) for img in train_images]
    test_images_normalized = [_minmax_scale_image(img) for img in test_images]

    return np.array(train_images_normalized), np.array(test_images_normalized)

In [29]:
# Rescale every image to the [0, 1] range.
x_train, x_test = minmax_normalize(train_images=x_train, test_images=x_test)

In [46]:
def _build_gabor_kernels():
    """
    Creates the bank of Gabor kernels used by feature_extraction.

    One kernel is built for every combination of the parameters below, in the order
    produced by itertools.product, so kernel number k always corresponds to the
    column 'Gabor<k>'.
    :return: A list of Gabor kernels (float32).
    """
    f  = [0.1, 0.5, 1.0, 1.5, 2.0] # Passed as `lambd`, OpenCV's wavelength of the sine component
    o  = [0, 30, 60, 90, 120, 150] # Orientation of the filter, in degrees
    sa = [0.5, 0.75, 1.0]          # Spatial aspect ratio of the filter
    sd = [1.0, 2.0, 3.0]           # Standard deviation of the gaussian envelope
    p  = [0, 1*np.pi/2]            # Phase offset of the filter
    ks = [2, 4, 6]                 # Kernel size of the filter (K x K)

    kernels = []
    for freq, orient, aspect, std_dev, phase_offset, kernel_size in itertools.product(f, o, sa, sd, p, ks):
        # OpenCV takes the orientation in radians, so the degree values are converted.
        theta = np.deg2rad(orient)
        kernels.append(
            cv2.getGaborKernel((kernel_size, kernel_size), std_dev, theta, freq, aspect, phase_offset, ktype=cv2.CV_32F)
        )
    return kernels


def feature_extraction(dataset):
    """
    Vectorizes the images by extracting features from them and aligning them in a dataframe.

    Every row of the result is one value position of one image, and every column is one feature:
    'Pixel_Value' (the flattened image), 'Gabor1'..'GaborN' (the flattened response to each
    Gabor kernel, capturing texture and directionality such as brush strokes) and 'Sobel'
    (the flattened Sobel edge response). The rows of each image keep their own 0-based index.

    NOTE: the filter bank holds 5*6*3*3*2*3 = 1620 kernels, so the resulting dataframe
    has 1622 columns per image value; expect a long run time and a large memory footprint.
    :param dataset: The images, expected to be a 4 dimensional array of colored images
                    (images, rows, columns, channels).
    :return: A dataframe with the features of all the images, or an empty dataframe for an empty dataset.
    """
    # Nothing to extract; pd.concat would raise on an empty list of frames.
    if dataset.shape[0] == 0:
        return pd.DataFrame()

    # The kernels do not depend on the image, so they are built once for the whole dataset.
    gabor_kernels = _build_gabor_kernels()

    per_image_frames = []
    for img in dataset:
        # All the columns of one image are gathered first and turned into a dataframe in
        # one step, which avoids inserting columns into a dataframe one at a time.

        # >> Feature no. 1 - Pixel Values <<
        features = {'Pixel_Value': img.reshape(-1)}

        # >> Feature no. 2 - Gabor Filter Responses <<
        for count, gabor_filter in enumerate(gabor_kernels, start=1):
            gabor_response = cv2.filter2D(img, cv2.CV_32F, gabor_filter)  # Apply the filter to the image
            features['Gabor' + str(count)] = gabor_response.reshape(-1)

        # >> Feature no. 3 - Sobel Edge <<
        # NOTE(review): sobel is applied to the full color array, not per channel - confirm this is intended.
        features['Sobel'] = sobel(img).reshape(-1)

        per_image_frames.append(pd.DataFrame(features))

    # A single concatenation replaces the repeated DataFrame.append, which copied the
    # accumulated data on every image and no longer exists in pandas 2.x.
    return pd.concat(per_image_frames)

In [48]:
# Extract the feature table of the training images (left commented out; the last run was interrupted manually).
## img_features = feature_extraction(x_train)

KeyboardInterrupt: 