Dealing with class imbalance:
- resampling techniques
    - under or over sampling random vs informed
    - SMOTE synthetic minor ...
- kappa statistics/ MCC Metric
- multiclass mcc "comparing two k-category assignments by a k-category correlation coeeficient"


spatial pyramid pooling in deep convolutional networks for visual recognition

In [131]:
from keras import optimizers
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import regularizers
from pathlib import Path
import keras
import random
from keras.utils import Sequence
import skimage
from skimage.io import imread
from skimage.transform import resize
from skimage.util import pad
from skimage.util import crop
import numpy as np

In [141]:
class MY_Gen(Sequence):

    def __init__(self, image_filenames, labels, batch_size, shuffle):
        self.image_filenames, self.labels = image_filenames, labels
        self.batch_size = batch_size
        self.num_labels = len(np.unique(labels))
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        return int(np.ceil(len(self.image_filenames) / float(self.batch_size)))
    
    def crop_or_pad(self, image, dim):
        x, y, _ = image.shape
        if y < dim and x < dim:
            image = pad(image, ((math.ceil((dim - image.shape[0])/2),math.floor((dim - image.shape[0])/2)),
                                (math.ceil((dim - image.shape[1])/2),math.floor((dim - image.shape[1])/2)), 
                                (0,0)), 'constant', constant_values = 255)
        elif y > dim and x > dim:
            rand1 = random.randint(1,x-dim)
            rand2 = random.randint(1,y-dim)
            image = crop(image, ((rand1,x-dim-rand1),(rand2,y-dim-rand2),(0,0)))
        elif x > dim:
            rand1 = random.randint(1,x-dim)
            image = pad(image, ((0,0),
                                (math.ceil((dim - y)/2),math.floor((dim - y)/2)), 
                                (0,0)), 'constant', constant_values = 255)
            image = crop(image, ((rand1,x-dim-rand1),
                                (0,0), (0,0)))
        else:
            rand2 = random.randint(1,y-dim)
            image = pad(image, ((math.ceil((dim - x)/2),math.floor((dim - x)/2)),
                                (0,0), 
                                (0,0)), 'constant', constant_values = 255)
            image = crop(image, ((0,0),
                                 (rand2,y-dim-rand2), (0,0)))
        return image
    
    def read_im(self, filename, dim):
        image = imread(filename)
        #image = resize(image, (dim,dim), anti_aliasing = True, mode = "reflect")
        image = skimage.color.gray2rgb(image)
        image = self.crop_or_pad(image, dim)
        return image


    def __getitem__(self, idx):
        batch_x = self.image_filenames[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_y = self.labels[idx * self.batch_size:(idx + 1) * self.batch_size]
        image = [self.read_im(filename, 299) for filename in batch_x]
        image = (image-np.amin(image))/(np.amax(image)-np.amin(image))
        batch_y = keras.utils.to_categorical(batch_y, self.num_labels)
        return np.array(image), np.array(batch_y)
    
    def on_epoch_end(self):
        if self.shuffle == True:
            fnames_and_labels = list(zip(self.image_filenames, self.labels))
            random.shuffle(fnames_and_labels)
            self.image_filenames, self.labels = zip(*fnames_and_labels)

In [218]:

path = './data/test/'
def fetch_data_set(path):
    p = Path(path)
    files = list(p.glob('**/*.jpg'))
    classes = str(files).split('/')
    classes = [classes[i] for i in list(range(2,len(classes),3)) ]
    classnames, indices = np.unique(classes, return_inverse=True)
    dict_classes = dict(zip(classnames, list(range(0,len(classes)))))
    return files, classes, dict_classes

#np.array([dict_[i] for i in classes])

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from itertools import chain

def over_under_sample(files, classes, dict_classes, num_to_undersample = 15000, num_to_oversample = 7500):
    files_array = np.array(files).reshape(-1,1)
    dic = {}
    for i in list(classes):
        dic[i] = dic.get(i,0) + 1

    classes_to_oversample = dict((k, v) for k, v in dic.items() if v < num_to_oversample)
    classes_to_undersample = dict((k, v) for k, v in dic.items() if v > num_to_undersample)    
    
    for key, value in classes_to_undersample.items():
        classes_to_undersample[key] = num_to_undersample
    for key, value in classes_to_oversample.items():
        classes_to_oversample[key] = num_to_oversample
    
    ros = RandomOverSampler(sampling_strategy = classes_to_oversample)
    rus = RandomUnderSampler(sampling_strategy = classes_to_undersample)
    x_over, y_over = ros.fit_resample(files_array, classes)
    x_under, y_under = rus.fit_resample(x_over, y_over)
    x_under = list(chain(*x_under.tolist()))
    return x_under, y_under
    
from sklearn.model_selection import train_test_split

def split_data_train_validation_test(data_X, data_Y, test_percent, validation_percent, seed): 
    assert (test_percent < 1) and (0 < validation_percent) and (validation_percent < 1)
    X_tmp, X_val, Y_tmp, Y_val = train_test_split(data_X, data_Y, test_size=validation_percent, shuffle=True, random_state=seed)
    
    if test_percent != 0:
        relative_test_percent = test_percent / (1 - validation_percent)
        X_train, X_test, Y_train, Y_test = train_test_split(X_tmp, Y_tmp, test_size=relative_test_percent, shuffle=True, random_state=seed)
        split_data = [X_train, Y_train, X_val, Y_val, X_test, Y_test]
    else:
        X_train, Y_train = X_tmp, Y_tmp
        split_data = [X_train, Y_train, X_val, Y_val]

    return split_data  
    

In [255]:
#files, classes, dict_classes = fetch_data_set("./data/imgs/")

#x_re, y_re = over_under_sample(files, classes, dict_classes)

split = split_data_train_validation_test(files, classes, 0.05, .25, random.randint(1,10000))

#from collections import Counter
#print(sorted(Counter(classes).items()))
#print(sorted(Counter(y_re).items()))

In [256]:
c,_= np.unique(split[5], return_inverse=True)
len(c)
b = {}
for i in list(split[5]):
    b[i] = b.get(i,0) + 1

counts = dict((k, v) for k, v in b.items() if v > 1)
counts

{'Annelida': 121,
 'Bivalvia__Mollusca': 183,
 'Brachyura': 263,
 'Candaciidae': 105,
 'Cavoliniidae': 133,
 'Centropagidae': 254,
 'Corycaeidae': 555,
 'Coscinodiscus': 678,
 'Decapoda': 194,
 'Doliolida': 154,
 'Eucalanidae': 225,
 'Euchaetidae': 104,
 'Evadne': 1056,
 'Foraminifera': 289,
 'Fritillariidae': 128,
 'Haloptilus': 253,
 'Harpacticoida': 153,
 'Limacinidae': 595,
 'Noctiluca': 341,
 'Oikopleuridae': 979,
 'Oncaeidae': 657,
 'Ostracoda': 688,
 'Penilia': 329,
 'Phaeodaria': 806,
 'Salpida': 421,
 'Temoridae': 463,
 'calyptopsis': 216,
 'cyphonaute': 451,
 'egg__Actinopterygii': 100,
 'egg__other': 847,
 'eudoxie__Diphyidae': 162,
 'gonophore__Diphyidae': 155,
 'multiple__Copepoda': 143,
 'multiple__other': 673,
 'nauplii__Cirripedia': 303,
 'nauplii__Crustacea': 488,
 'nectophore__Diphyidae': 281,
 'tail__Appendicularia': 245,
 'tail__Chaetognatha': 129,
 'zoea__Decapoda': 328}

In [225]:
p = Path('./data/test/') 
files = list(p.glob('**/*.jpg'))
classes = str(files).split('/')
classes = [classes[i] for i in list(range(2,len(classes),3)) ]
classnames, indices = np.unique(classes, return_inverse=True)
labels = keras.utils.to_categorical(indices, len(np.unique(indices)))

In [156]:
p = Path('./data/imgs/') 
files = list(p.glob('**/*.jpg'))
classes = str(files).split('/')
classes = [classes[i] for i in list(range(2,len(classes),3)) ]
classnames, indices = np.unique(classes, return_inverse=True)

In [163]:
b = {}
for i in list(classes):
    b[i] = b.get(i,0) + 1

counts = dict((k, v) for k, v in b.items() if v > 1)

In [182]:
np.std(list(counts.values()))
counts
classes_to_oversample = dict((k, v) for k, v in b.items() if v < 7500)
classes_to_undersample = dict((k, v) for k, v in b.items() if v > 15000)
#assign new target values for sampling
for key, value in classes_to_undersample.items():
    classes_to_undersample[key] = 15000
classes_to_undersample.values()

dict_values([15000, 15000, 15000, 15000])

In [185]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [194]:
from sklearn.model_selection import train_test_split

def split_data_train_validation_test(data_X, data_Y, test_percent, validation_percent, seed): 
    assert (test_percent < 1) and (0 < validation_percent) and (validation_percent < 1)
    X_tmp, X_val, Y_tmp, Y_val = train_test_split(data_X, data_Y, test_size=validation_percent, shuffle=True, random_state=seed)
    
    if test_percent != 0:
        relative_test_percent = test_percent / (1 - validation_percent)
        X_train, X_test, Y_train, Y_test = train_test_split(X_tmp, Y_tmp, test_size=relative_test_percent, shuffle=True, random_state=seed)
        split_data = [X_train, Y_train, X_val, Y_val, X_test, Y_test]
    else:
        X_train, Y_train = X_tmp, Y_tmp
        split_data = [X_train, Y_train, X_val, Y_val]

    return split_data

split = split_data_train_validation_test(files, classes, 0, .25, 1190)

In [147]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
d = {0: 150, 1: 300}
ros = RandomOverSampler(random_state = 0, sampling_strategy = d)
x_re, y_re = ros.fit_resample(np.array(split[0]).reshape(-1,1), split[1])
from collections import Counter
print(sorted(Counter(split[1]).items()))
print(sorted(Counter(y_re).items()))

[(0, 75), (1, 78), (2, 72)]
[(0, 150), (1, 300), (2, 72)]


  n_samples_majority))
  n_samples_majority))


In [153]:
from itertools import chain

x_re = list(chain(*x_re.tolist()))
y_re


array([1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 2, 2, 2, 0, 0, 0, 1, 2, 0,
       1, 0, 1, 1, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 2, 0, 2, 2, 1, 0, 2,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 2, 1, 1, 1, 2, 2, 2, 0, 2, 1, 1, 1,
       2, 0, 0, 0, 0, 1, 2, 0, 2, 1, 1, 2, 2, 1, 0, 2, 2, 1, 2, 0, 1, 0,
       0, 2, 1, 2, 1, 1, 0, 1, 2, 1, 2, 0, 0, 1, 2, 0, 0, 2, 2, 1, 0, 1,
       0, 2, 2, 2, 0, 2, 1, 1, 2, 2, 2, 2, 0, 0, 1, 0, 2, 2, 1, 2, 0, 1,
       1, 2, 2, 0, 0, 1, 0, 2, 1, 2, 1, 2, 0, 1, 2, 2, 1, 2, 0, 0, 0, 0,
       2, 0, 2, 1, 2, 0, 0, 2, 0, 0, 2, 0, 0, 1, 1, 1, 2, 2, 0, 1, 1, 1,
       1, 2, 1, 1, 2, 0, 1, 0, 1, 1, 1, 2, 0, 2, 0, 1, 2, 1, 1, 1, 0, 1,
       1, 2, 1, 0, 1, 2, 2, 0, 2, 0, 2, 1, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2,
       2, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,

In [16]:
#import keras_metrics
#metrics=[keras_metrics.precision(), keras_metrics.recall()
b = {}
for i in list(np.array(x_re)):
    b[i] = b.get(i,0) + 1

dict((k, v) for k, v in b.items() if v > 1)

#x_re

TypeError: unhashable type: 'numpy.ndarray'

In [154]:
from keras.applications.inception_v3 import InceptionV3
from keras.preprocessing import image
from keras.models import Model
from keras.layers import Dense, GlobalAveragePooling2D
import keras_metrics
from keras import backend as K

base_model = InceptionV3(weights='imagenet', include_top=False)


x = base_model.output
x = GlobalAveragePooling2D()(x)

x = Dense(1024, activation='relu')(x)

predictions = Dense(3, activation='softmax')(x)

model = Model(inputs=base_model.input, outputs=predictions)

for layer in base_model.layers:
    layer.trainable = False

In [155]:
batch_size=30
num_training_samples=len(files)

my_training_batch_generator = MY_Gen(x_re, y_re, batch_size, shuffle = True) #files, indices
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=[keras_metrics.precision(), keras_metrics.recall(),'accuracy'])

model.fit_generator(generator=my_training_batch_generator,
                                          steps_per_epoch=(num_training_samples // batch_size),
                                          epochs=10,
                                          verbose=1,
                                          use_multiprocessing=False,
                                          workers=16,
                                          max_queue_size=32)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f52ee6e8cf8>

In [144]:
json_string = model.to_json()
with open("data/output/models/pretra_INC.json", "w") as json_file:
    json_file.write(json_string)

model.save_weights("data/output/models/pretra_INC_weights.hdf5")

In [None]:
import tensorflow as tf
outputTensor = model.output
listOfVariableTensors = model.trainable_weights
gradients = K.gradients(outputTensor, listOfVariableTensors)

trainingExample = np.random.random((1,299,299,3))
sess = K.get_session()
sess.run(tf.initialize_all_variables())
evaluated_gradients = sess.run(gradients,feed_dict={model.input:trainingExample})

In [None]:
(evaluated_gradients)

In [None]:
#im=np.asarray(Image.open(files[1]).resize([299,299]))
#im = im/np.amax(im)
#import matplotlib.pyplot as plt
#plt.imshow(image[7])
#plt.show()
#ind = np.arange(105)
#isinstance(classes, list)
#len(classes)
import random
c=list(zip(files,classes))
random.shuffle(c)
files,classes = zip (*c)

In [None]:
import skimage
im = resize(imread(files[1]), (100, 100))
im = skimage.color.gray2rgb(im)
im.shape

In [None]:
#im = np.array([(Image.open(file_name).resize([299,299])) for file_name in files])
#im2 = np.array([
           # resize(imread(file_name), (299, 299))
            #   for file_name in files])
#len(im2)
#im2.shape
#im.shape
def norm_im(filename, dim):
    image = imread(filename)
    image = resize(image, (dim,dim), mode = "edge")
    image = (image-np.amin(image))/(np.amax(image)-np.amin(image))
    return image
image = np.array([norm_im(filename, 100) for filename in files])


image.shape

In [33]:
#files[range(1,10)]
list(indices)
keras.utils.to_categorical(indices, 5)
len(np.unique(indices))

3

In [None]:
from keras.applications.inception_v3 import InceptionV3
from keras.preprocessing import image
from keras.models import Model
from keras.layers import Dense, GlobalAveragePooling2D
from keras import backend as K

# create the base pre-trained model
base_model = InceptionV3(weights='imagenet', include_top=False)

# add a global spatial average pooling layer
x = base_model.output
x = GlobalAveragePooling2D()(x)
# let's add a fully-connected layer
x = Dense(1024, activation='relu')(x)
# and a logistic layer -- let's say we have 200 classes
predictions = Dense(3, activation='softmax')(x)

# this is the model we will train
model = Model(inputs=base_model.input, outputs=predictions)

# first: train only the top layers (which were randomly initialized)
# i.e. freeze all convolutional InceptionV3 layers
for layer in base_model.layers:
    layer.trainable = False

# compile the model (should be done *after* setting layers to non-trainable)
# model.compile(optimizer='rmsprop', loss='categorical_crossentropy')




In [None]:
batch_size=100
num_training_samples=len(files)
# compile the model (should be done *after* setting layers to non-trainable)
my_training_batch_generator = MY_Gen(files, labels, batch_size)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics = ['accuracy'])
# train the model on the new data for a few epochs
model.fit_generator(generator=my_training_batch_generator,
                                          steps_per_epoch=(num_training_samples // batch_size),
                                          epochs=10,
                                          verbose=1,
                                          use_multiprocessing=False,
                                          workers=16,
                                          max_queue_size=32,
                             shuffle = True)

# at this point, the top layers are well trained and we can start fine-tuning
# convolutional layers from inception V3. We will freeze the bottom N layers
# and train the remaining top layers.



In [None]:
# let's visualize layer names and layer indices to see how many layers
# we should freeze:
for i, layer in enumerate(base_model.layers):
   print(i, layer.name)

# we chose to train the top 2 inception blocks, i.e. we will freeze
# the first 249 layers and unfreeze the rest:
for layer in model.layers[:249]:
   layer.trainable = False
for layer in model.layers[249:]:
   layer.trainable = True

# we need to recompile the model for these modifications to take effect
# we use SGD with a low learning rate
from keras.optimizers import SGD
model.compile(optimizer=SGD(lr=0.0001, momentum=0.9), loss='categorical_crossentropy')

# we train our model again (this time fine-tuning the top 2 inception blocks
# alongside the top Dense layers
model.fit_generator(...)

In [None]:
from pathlib import Path
p = Path('./data/imgs/') 
classes = [x for x in p.iterdir() if x.is_dir()]
files = list(p.glob('**/*.jpg'))

In [None]:
str(files[200000])

In [None]:
# euc laptop
from pathlib import Path
p = Path('./data/') 
#classes = [x for x in p.iterdir() if x.is_dir()]
files = list(p.glob('**/*.png'))
classes = str(files).split('/')
classes = [ classes[i] for i in list(range(1,len(classes),2)) ]

In [None]:
import matplotlib.pyplot as plt
#plt.imshow(plt.imread(str(files[1])))
#plt.show()

In [None]:
from PIL import Image
im = Image.open(str(files[1])).resize([28,28])
#im = im.resize([28,50])
plt.imshow(im)
plt.show()

In [None]:
import numpy as np
im = np.asarray(im)
im.shape

In [None]:
import keras
keras.utils.to_categorical(classes, len(np.unique(classes)))

In [None]:
classnames, indices = np.unique(classes, return_inverse=True)
keras.utils.to_categorical(indices, len(np.unique(indices)))

In [None]:
from PIL import Image

In [None]:
model.summary()