In [4]:
from read_datasets import read_data 
from read_datasets import DATA_PATH, ICMC_PATH, ORIGNAL_PATH

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', 50)

from skimage.feature import hog

In [5]:
# Load both image sets up front. `size` is forwarded to read_data unchanged;
# presumably (subjects, images per subject, height, width) — TODO confirm
# against read_datasets.read_data.
icmc_x, icmc_y = read_data(
    DATA_PATH + ICMC_PATH,
    size=(20, 10, 300, 200),
)
original_x, original_y = read_data(
    DATA_PATH + ORIGNAL_PATH,
    size=(20, 10, 112, 92),
)

# Apply HOG

In [6]:
def apply_hog(imgs, testing=False):
    """Apply HOG (Histogram of Oriented Gradients) to an array of images.

    Parameters
    ----------
    imgs : numpy.array
        Contains the images; each element along the first axis is passed
        to ``hog`` on its own.

    testing : boolean
        Whether or not to use less data (for testing purposes). When True,
        only the first image is processed and ``hog`` is also asked for the
        visualization of the image.

    Returns
    -------
    numpy.array
        ``testing=False``: the HOG feature vectors, one row per image.
        ``testing=True``: an object array of shape ``(n, 2)`` whose rows are
        ``(feature_vector, visualization)``, so ``result[0]`` unpacks into
        the two pieces.
    """
    if not testing:
        return np.array(
            [hog(img, visualize=False, feature_vector=True) for img in imgs]
        )

    # With visualize=True, hog returns a (features, image) pair whose members
    # have different shapes. Handing that ragged list to np.array raises on
    # recent NumPy releases, so the pairs are stored in an explicit object
    # array instead.
    pairs = [hog(img, visualize=True, feature_vector=True) for img in imgs[:1]]
    result = np.empty((len(pairs), 2), dtype=object)
    for row, (features, hog_image) in enumerate(pairs):
        result[row, 0] = features
        result[row, 1] = hog_image
    return result

# Save data

In [17]:
def save_dataset(data_x, data_y, path):
    """Write the features and their targets to one CSV file per target value.

    Parameters
    ----------
    data_x : array-like
        Feature rows; anything ``pd.DataFrame`` accepts.

    data_y : array-like
        One target per row of ``data_x``; stored in a ``"target"`` column.

    path : str
        Base output path. The target value is inserted just before the file
        extension, e.g. ``"data/icmc.csv"`` -> ``"data/icmc0.csv"``,
        ``"data/icmc1.csv"``, ...
    """
    import os  # local import: only needed to split the extension off `path`

    # assign() builds a new frame, so data_x is never modified even when it
    # already is a DataFrame.
    full_data = pd.DataFrame(data_x).assign(target=data_y)

    # Split only the trailing extension. A plain str.replace(".csv", ...)
    # would also rewrite ".csv" inside directory names, and would make every
    # group overwrite the same file when the path has no ".csv" at all.
    root, ext = os.path.splitext(path)

    # Split a large csv file into multiple small csvs
    # We always split, even if the size is small, for cleaner code
    for name, group in full_data.groupby("target"):
        group.to_csv(f"{root}{name}{ext}", index=False)


In [19]:
RE_RUN = True    # Regenerate all data; set to False to skip the rebuild
DEBUG = False    # Show debug info (plots and sizes); only checked when RE_RUN is False

if __name__ == "__main__":
    if RE_RUN:
        # Regenerate all data: read each image folder, convert it to HOG
        # features and write the per-target CSV files next to the data.
        img_folders = [ICMC_PATH, ORIGNAL_PATH]
        file_names = ["icmc.csv", "original.csv"]
        sizes = [(20, 10, 300, 200), (20, 10, 112,  92)]

        for folder, file_name, size in zip(img_folders, file_names, sizes):
            data_x, data_y = read_data(DATA_PATH + folder, size=size)
            hog_data = apply_hog(data_x)
            save_dataset(hog_data, data_y, DATA_PATH + file_name)

    elif DEBUG:
        # Plot of a converted image.
        # Dedicated names are used so the debug run does not rebind the
        # notebook-level original_x / original_y to ICMC data.
        debug_x, _ = read_data(DATA_PATH + ICMC_PATH,
                               size=(20, 10, 300, 200))
        # Call hog directly: with visualize=True it returns the
        # (feature vector, visualization image) pair for this one image.
        img, show_img = hog(debug_x[0], visualize=True, feature_vector=True)
        plt.imshow(show_img, cmap=plt.cm.gray)
        plt.show()

        # Distribution of data
        imgs = apply_hog(debug_x)

        # Get random columns and describe them.
        # randint's `high` is exclusive, so passing the feature count itself
        # lets every column (including the last one) be sampled.
        n_features = len(imgs[0])
        cols = np.random.randint(0, high=n_features, size=100)
        # A bare expression inside an if-block is not displayed by the
        # notebook, so the summary is printed explicitly.
        print(pd.DataFrame(imgs).iloc[:, cols].describe().T[["min", "max"]])
        print("Therefore we don't need to standardize since the min/max",
              "is already very close")

        print("Size of a feature vector:", img.shape)

    else:
        print("We are not running anything")