In [19]:
import pandas as pd
#import xlrd #do not need to explicitly import, but needed for pandas.read_excel
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import mahotas
import cv2
import os
import h5py

In [40]:
# Folder that holds the cartridge images together with the cell-count workbook.
data_dir = './7Feb2019_DeMMOcartridgesFebNovAug17'
# The workbook lives directly inside the image folder.
label_file = data_dir + '/DeMMO_Biofilm_CellCounts.xlsx'

# Read the whole workbook first, then keep only the rows that have no missing
# values in any column (dropna() keeps the original row labels, so gaps in the
# index mark the rows that were discarded).
raw_labels = pd.read_excel(label_file)
data_labels = raw_labels.dropna()
data_labels

Unnamed: 0,Site,Tube,Date Deployed,Date Collected,Mineral,Experiment,Type,Duration.days,Cell.density.sq.mm,cells.counted
0,DeMMO 1,7 Large,2017-08-31,2018-04-18,calcite,mineral,Exp,231,447.065598,29.00
1,DeMMO 1,7 Large,2017-08-31,2018-04-18,calcite,mineral,Rep,231,2016.095890,192.00
2,DeMMO 1,7 Large,2017-08-31,2018-04-18,calcite,control,Internal Control,231,875.396810,81.00
3,DeMMO 1,9 Large,2017-08-31,2018-04-18,muscovite,mineral,Exp,231,6939.881357,449.00
4,DeMMO 1,9 Large,2017-08-31,2018-04-18,muscovite,mineral,Rep,231,5523.557849,298.00
5,DeMMO 1,9 Large,2017-08-31,2018-04-18,muscovite,control,Internal Control,231,937.544592,75.00
6,DeMMO 1,12 Large,2017-08-31,2018-04-18,control sand,Control,Control,231,48192.761850,1559.00
8,DeMMO 1,SC 1,2017-08-31,2018-04-18,pyrolusite,mineral,exp,231,52349.678599,4155.00
9,DeMMO 1,SC 1,2017-08-31,2018-04-18,pyrolusite,control,Internal Control,231,21472.987787,1684.00
10,DeMMO 1,SC 2,2017-08-31,2018-04-18,siderite,mineral,Exp,231,10737.033003,472.00


In [24]:
# Size passed to cv2.resize for every image. These are the lowest dimensions
# found in the data set (D1T12Large_glass_Aug2017.tif).
fixed_size = (768, 551)

# Number of histogram bins per channel used by fd_histogram.
bins = 10

In [25]:
# feature-descriptor-1: Hu Moments
def fd_hu_moments(image):
    """Return the Hu moments of ``image`` as a flat array.

    The image is converted from BGR to grayscale, the moments of that
    single-channel image are computed with ``cv2.moments``, and the result of
    ``cv2.HuMoments`` on those moments is flattened before being returned.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    raw_moments = cv2.moments(gray)
    hu = cv2.HuMoments(raw_moments)
    return hu.flatten()


# feature-descriptor-2: Haralick Texture
def fd_haralick(image):
    """Return the Haralick texture features of ``image``.

    The image is converted from BGR to grayscale and handed to
    ``mahotas.features.haralick``; the result is averaged along axis 0
    (presumably one row per direction -- confirm against the mahotas docs).
    """
    return mahotas.features.haralick(
        cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    ).mean(axis=0)


# Color Histogram -- Maybe not helpful?
def fd_histogram(image, mask = None):
    """Return a flattened, normalized 3-D HSV color histogram of ``image``.

    Parameters
    ----------
    image : BGR image as accepted by ``cv2.cvtColor``.
    mask : optional mask forwarded to ``cv2.calcHist``. With the default
        ``None`` the whole image contributes to the histogram.

    Returns
    -------
    1-D array with ``bins ** 3`` entries (``bins`` is the module-level
    bin count, applied to each of the three channels).
    """
    # convert the image to HSV color-space
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    # Compute the color histogram over all three channels. The mask argument
    # is forwarded here; previously it was accepted but ignored because None
    # was hard-coded in this call.
    # NOTE(review): every channel uses the range [0, 256). For 8-bit images
    # OpenCV's hue channel is believed to span only 0-179, which would leave
    # the top hue bins empty -- left unchanged so existing features stay
    # comparable; confirm before changing.
    hist = cv2.calcHist([hsv], [0, 1, 2], mask, [bins, bins, bins], [0, 256, 0, 256, 0, 256])
    # normalize the histogram in place (hist is both source and destination)
    cv2.normalize(hist, hist)
    # return the histogram as a flat feature vector
    return hist.flatten()

In [44]:
# Build one feature vector and one label per row of data_labels.
global_features = []
labels = []
# os.listdir() returns entries in arbitrary order, so sort the names to make
# the file <-> row pairing reproducible, and exclude the Excel workbook by
# extension instead of assuming it happens to be the last entry.
data_list = sorted(
    name for name in os.listdir(data_dir)
    if not name.lower().endswith('.xlsx')
)
for i in data_labels.index:
    # NOTE(review): row label i is paired with the i-th file name in sorted
    # order -- confirm this matches the row order of the workbook.
    file = os.path.join(data_dir, data_list[i])
    # .loc avoids chained indexing and looks the value up by row label.
    current_label = int(data_labels.loc[i, "cells.counted"])
    image = cv2.imread(file)
    if image is None:
        # cv2.imread signals failure by returning None; fail here with the
        # file name instead of with an opaque error inside cv2.resize.
        raise IOError("Could not read image file: " + file)
    image = cv2.resize(image, fixed_size)
    fv_hu_moments = fd_hu_moments(image)
    fv_haralick = fd_haralick(image)
    fv_histogram  = fd_histogram(image)
    # Concatenate the three descriptors into one flat feature vector.
    global_feature = np.hstack([fv_histogram, fv_haralick, fv_hu_moments])
    labels.append(current_label)
    global_features.append(global_feature)

In [52]:
# Scale every feature column to [0, 1] and persist features and labels.
target = labels
scaler = MinMaxScaler(feature_range=(0, 1))
rescaled_features = scaler.fit_transform(global_features)

# Context managers close each HDF5 file even if create_dataset raises, so a
# failed write no longer leaves an open handle behind.
with h5py.File('data.h5', 'w') as h5f_data:
    h5f_data.create_dataset('dataset_1', data=np.array(rescaled_features))

with h5py.File('labels.h5', 'w') as h5f_label:
    h5f_label.create_dataset('dataset_1', data=np.array(target))