# Code to Extract ColorHistograms for Database

#### Author: Nikolas Hülsmann
#### Date: 2015-11-22

## Functions for Extracting Data

### Function to iterate through a given directory and return image paths and class labels

In [1]:
def imgCrawl(path):
    """Walk ``path`` recursively and list every file with its class label.

    The class label of a file is the name of the directory that directly
    contains it.

    Args:
        path: Path to the 'highest' (root) folder to walk.

    Returns:
        pandas.DataFrame with one row per file found and the columns
        ``classLabel`` and ``pathOfFile``, in that order. The frame has
        zero rows when no file is found.
    """
    # Rows are collected in a plain list and the frame is built once:
    # growing a DataFrame row by row copies it on every step, and
    # DataFrame.append is no longer available in current pandas.
    records = []

    for subdir, _dirs, files in os.walk(path):  # loop through subdirectories
        for fileName in files:
            pathOfFile = os.path.join(subdir, fileName)
            # name of the directory containing the file is the class label
            classLabel = os.path.basename(os.path.dirname(pathOfFile))
            records.append((classLabel, pathOfFile))

    # explicit column order: callers read the label at position 0 and the
    # path at position 1 of each row
    return pd.DataFrame(records, columns=['classLabel', 'pathOfFile'])

### Function to determine Class-Labels with Integer representation

In [3]:
# function to determine Class-labels and return Series
def getClassLabels(path):
    """Map every class directory directly below ``path`` to an integer label.

    Args:
        path: Root directory whose immediate subdirectories are the classes.

    Returns:
        pandas.Series whose values are the subdirectory names in sorted
        order and whose index (0 .. n-1) is the integer label of each class.
    """
    # os.listdir also returns plain files and gives no ordering guarantee:
    # keep directories only and sort them so the integer labels are
    # reproducible from run to run.
    classNames = sorted(
        entry for entry in os.listdir(path)
        if os.path.isdir(os.path.join(path, entry))
    )

    return pd.Series(classNames, index=range(len(classNames)))

### Function to calculate the ColorHistogram for given Images 

In [4]:
#### Calculate ColorHistograms for all images

# path_: path to the highest (root) folder
# dfImages: Dataframe with paths to all images - use function imgCrawl
# sClassLabel: Series with ClassLabels - use function getClassLabels
def calcColorHisto(path_, dfImages_, sClassLabels_):
    """Compute a min-max normalised colour histogram for every image.

    Each image is split into its colour channels, a 16-bin histogram is
    calculated per channel, scaled to the range 0..1 and the per-channel
    histograms are concatenated into one feature list.

    Args:
        path_: Path to the highest folder. Not used by the computation;
            kept so that existing callers keep working.
        dfImages_: DataFrame whose rows hold the class name at position 0
            and the image path at position 1 (use function imgCrawl).
        sClassLabels_: Series whose values are the class names and whose
            index is the integer label (use function getClassLabels).

    Returns:
        pandas.DataFrame with one row per readable image and the columns
        ``ColHisto`` (list of histogram values) and ``classLabel``
        (integer label of the image's class).

    Raises:
        IndexError: If the class name of an image does not occur in
            ``sClassLabels_``.

    Files that cv2.imread cannot decode are skipped with a warning.
    """
    import warnings  # local import: only needed for the skip notice below

    # Build the name -> integer label table once instead of filtering the
    # Series for every image. The first occurrence of a name wins, which
    # matches taking the first hit of a filtered lookup.
    labelOfClass = {}
    for label, className in zip(sClassLabels_.index, sClassLabels_.values):
        labelOfClass.setdefault(className, label)

    colors = ("b", "g", "r")
    rows = []

    for imageInfo in dfImages_.values:
        className, imagePath = imageInfo[0], imageInfo[1]

        image = cv2.imread(imagePath)
        if image is None:
            # cv2.imread reports unreadable / non-image files by returning
            # None; passing that on to cv2.split would raise.
            warnings.warn(
                "calcColorHisto: skipping unreadable image %r" % (imagePath,)
            )
            continue

        features = []

        # loop over the image channels
        for chan, _color in zip(cv2.split(image), colors):
            # Calculate Color Histogram - 16 bins cf. paper
            hist = cv2.calcHist([chan], [0], None, [16], [0, 256])

            # to get raw values
            hist = hist[:, 0]

            # Normalize with MinMax from 0 to 1 -> feature scaling
            cv2.normalize(hist, hist, 0, 1, cv2.NORM_MINMAX)
            features.extend(hist)

        # assign integer label for dataframe
        if className not in labelOfClass:
            raise IndexError(
                "calcColorHisto: class %r of image %r is not contained in "
                "the class labels" % (className, imagePath)
            )

        rows.append({'classLabel': labelOfClass[className],
                     'ColHisto': features})

    # Build the frame once; growing it row by row copies it on every step
    # and DataFrame.append is no longer available in current pandas.
    return pd.DataFrame(rows, columns=['ColHisto', 'classLabel'])

### Function to export calculated Data to csv 

In [5]:
#### Export ColorHistogram to csv
def exportToCSV(pandasSorDF, filename):
    """Write a Series or DataFrame to ``<filename>.csv`` without overwriting.

    If ``<filename>.csv`` already exists, the first free name of the form
    ``<filename>-<i>.csv`` with i = 1, 2, 3, ... is used instead.

    Args:
        pandasSorDF: Object providing ``to_csv(path)``, i.e. a pandas
            Series or DataFrame.
        filename: Target file name without the ``.csv`` extension. A
            relative name is resolved against the current working
            directory.
    """
    # Relative paths are checked and written against the current working
    # directory, so no manual "cwd + separator" concatenation is needed.
    # That keeps the function independent of os.getcwdu (Python 2 only)
    # and of the Windows-only "\\" separator.
    target = filename + ".csv"
    suffix = 0

    # Count upwards until a free name is found. There is deliberately no
    # upper bound: giving up after a fixed number of attempts would drop
    # the data without any notice.
    while os.path.isfile(target):
        suffix += 1
        target = filename + "-" + str(suffix) + ".csv"

    pandasSorDF.to_csv(target)


## Main Program


In [6]:
# Imports
import os           # for iteration throug directories
import pandas as pd # for Series and DataFrames
import cv2          # for OpenCV 
import datetime     # for TimeStamp in CSVFile

In [7]:
#### Calculate Color Histogram
# Raw string: '\C' is not a valid escape sequence in a normal string
# literal. The resulting value is unchanged.
path = r'D:\Caltech'
dfImages = imgCrawl(path)
sClassLabels = getClassLabels(path)
dfColorHistogram = calcColorHisto(path, dfImages, sClassLabels)

# Take the date stamp once so that both output files carry the same date,
# even if the export happens to run across midnight.
dateStamp = datetime.datetime.now().strftime("%Y_%m_%d")

# features: one colour histogram per image
fileNameColorHis = dateStamp + "-Features" + "-ColorHistogram"
exportToCSV(dfColorHistogram, fileNameColorHis)

# lookup table: integer label -> class name
fileNameClassLabels = dateStamp + "-ClassLabels" + "-Caltech"
exportToCSV(sClassLabels, fileNameClassLabels)
