In [1]:
import glob
import numpy as np
import cv2
from PIL import Image
import sklearn 
from numpy import asarray
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import random
from sklearn.model_selection import cross_val_score
import skimage

# preparing folders

In [2]:
# Root of the transformed image data; each class has its own sub-folder.
folder = 'transformed_zoom500_dim130x100/data/'
highRMFolder = f'{folder}HighRm/'
lowRMFolder = f'{folder}LowRm/'

In [3]:
# Paths of all PNG images found in each class folder.
low = glob.glob(f'{lowRMFolder}*.png')
high = glob.glob(f'{highRMFolder}*.png')

Dataset size: we have many more LowRm samples than HighRm samples.

In [4]:
# Number of available images per class, printed one per line (low first).
low_samples_cnt, high_samples_cnt = len(low), len(high)
print(low_samples_cnt, high_samples_cnt, sep='\n')

11356
1511


We prepare a subset of the data with the same number of samples from both classes to train and test our model. On a machine with enough resources this can be raised up to min(low_samples_cnt, high_samples_cnt), or even all data can be loaded; we subsample already at the image-loading stage because loading images is resource-intensive.

In [5]:
# Images drawn *per class*: the training set ends up with 2 * train_size
# images and the test set with 2 * test_size images.
train_size, test_size = 200, 25

In [6]:
# Fix the seed so the same subset is drawn on every run; change it to resample.
random.seed(0)

# Draw the train and test images of each class in ONE call: random.sample
# picks without replacement, so slicing its result gives disjoint train and
# test sets.  Sampling them in two separate calls could put the same image
# into both sets and inflate the test score.  The paths are sorted first
# because glob does not guarantee any particular ordering.
low_pick = random.sample(sorted(low), train_size + test_size)
high_pick = random.sample(sorted(high), train_size + test_size)

# Order matters: low images first, high images second (the label lists
# built later rely on exactly this layout).
train_images = low_pick[:train_size] + high_pick[:train_size]
test_images = low_pick[train_size:] + high_pick[train_size:]

In [7]:
# One flattened pixel vector per image; each row of X / X_test is a sample.
X = np.array([np.asarray(Image.open(path)).ravel() for path in train_images])
X_test = np.array([np.asarray(Image.open(path)).ravel() for path in test_images])

In [8]:
# Labels mirror the image lists: the first half is class 0 (low),
# the second half is class 1 (high).
y = [0] * train_size + [1] * train_size
y_test = [0] * test_size + [1] * test_size

# simple SVC

In [9]:
# Baseline: an SVC on the raw pixel vectors with no preprocessing step.
baseline_svc = SVC(gamma='auto')
clf = make_pipeline(baseline_svc)
clf.fit(X, y)

Pipeline(memory=None,
         steps=[('svc',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='auto', kernel='rbf', max_iter=-1, probability=False,
                     random_state=None, shrinking=True, tol=0.001,
                     verbose=False))],
         verbose=False)

In [10]:
# Score of the baseline pipeline on the test images.
clf.score(X=X_test, y=y_test)

0.56

In [11]:
# Average of the 10 fold scores obtained on the training subset.
cross_val_score(clf, X, y, cv=10).mean()

0.5875

# with scaling

Scaling improved the result to nearly 90 percent.

In [12]:
# Same SVC as before, but the pixel features go through StandardScaler first.
scaled_steps = (StandardScaler(), SVC(gamma='auto'))
clf = make_pipeline(*scaled_steps)
clf.fit(X, y)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('svc',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='auto', kernel='rbf', max_iter=-1, probability=False,
                     random_state=None, shrinking=True, tol=0.001,
                     verbose=False))],
         verbose=False)

In [13]:
# Score of the scaled pipeline on the test images.
clf.score(X=X_test, y=y_test)

0.84

In [14]:
# Mean 10-fold cross-validation score of the scaled pipeline.
fold_scores = cross_val_score(clf, X, y, cv=10)
fold_scores.mean()

0.8699999999999999

# moments

Image moments did not bring any improvement.
https://en.wikipedia.org/wiki/Image_moment

For most randomly chosen training samples they gave worse performance, but sometimes they achieved a higher cross-validation score.

In [15]:
import skimage.io
import skimage.measure


def _moment_features(paths):
    """Return one row of flattened image moments per image path."""
    return np.array([skimage.measure.moments(skimage.io.imread(p)).ravel() for p in paths])


# X / X_test now hold moment features instead of raw pixels.
X = _moment_features(train_images)
X_test = _moment_features(test_images)

In [16]:
# Class labels in the same order as the image lists: all 0s, then all 1s.
y = [label for label in (0, 1) for _ in range(train_size)]
y_test = [label for label in (0, 1) for _ in range(test_size)]

In [17]:
# Scaled SVC, this time trained on the moment features.
moment_svc = SVC(gamma='auto')
clf = make_pipeline(StandardScaler(), moment_svc)
clf.fit(X, y)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('svc',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='auto', kernel='rbf', max_iter=-1, probability=False,
                     random_state=None, shrinking=True, tol=0.001,
                     verbose=False))],
         verbose=False)

In [18]:
# Score of the moment-based pipeline on the test images.
clf.score(X=X_test, y=y_test)

0.82

In [19]:
# Mean 10-fold cross-validation score for the moment features.
cross_val_score(clf, X, y, cv=10).mean()

0.8724999999999999

# hu moments 
https://medium.com/@dataturks/understanding-svms-for-image-classification-cf4f01232700

In [20]:
from skimage.color import rgb2gray

def _hu_features(paths):
    """Return one row of Hu moments per image path (image converted with rgb2gray first)."""
    return np.array([skimage.measure.moments_hu(rgb2gray(skimage.io.imread(p))).ravel() for p in paths])


# X / X_test now hold Hu-moment features.
X = _hu_features(train_images)
X_test = _hu_features(test_images)

In [21]:
# Labels: 0 for the first train_size / test_size entries, 1 for the rest.
y = [int(i >= train_size) for i in range(2 * train_size)]
y_test = [int(i >= test_size) for i in range(2 * test_size)]

In [22]:
# The Hu moments are fed to the SVC directly, without a scaling step.
hu_svc = SVC(gamma='auto')
clf = make_pipeline(hu_svc)
clf.fit(X, y)

Pipeline(memory=None,
         steps=[('svc',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='auto', kernel='rbf', max_iter=-1, probability=False,
                     random_state=None, shrinking=True, tol=0.001,
                     verbose=False))],
         verbose=False)

In [23]:
# Score of the Hu-moment pipeline on the test images.
clf.score(X=X_test, y=y_test)

0.82

In [24]:
# Mean 10-fold cross-validation score for the Hu-moment features.
hu_fold_scores = cross_val_score(clf, X, y, cv=10)
hu_fold_scores.mean()

0.8225

# further tests

After trying the above methods, a simple SVC with scaling gave the best score (sometimes plain moments were better), which is why we focused on improving its results.

In [25]:
def _pixel_rows(paths):
    """Return one flattened pixel vector per image path as a NumPy array."""
    return np.array([asarray(Image.open(p)).ravel() for p in paths])


# Switch X / X_test back from moment features to the flattened raw pixels.
X = _pixel_rows(train_images)
X_test = _pixel_rows(test_images)

# Rebuild the labels alongside the features: class 0 block, then class 1 block.
y = [0] * train_size + [1] * train_size
y_test = [0] * test_size + [1] * test_size

# Compare the SVC kernels on the scaled raw pixels with 10-fold
# cross-validation.  cross_val_score fits its own clones of the pipeline on
# each fold, so the extra `clf.fit(X, y)` that used to run on every iteration
# did not influence the printed score and has been dropped to save time.
for k in ['linear', 'poly', 'rbf', 'sigmoid']:
    clf = make_pipeline(StandardScaler(), SVC(gamma='auto', kernel=k))
    print('kernel is: ' + k)
    print('mean score: ' + str(np.mean(cross_val_score(clf, X, y, cv=10))))

kernel is: linear
mean score: 0.8200000000000001
kernel is: poly
mean score: 0.7449999999999999
kernel is: rbf
mean score: 0.8699999999999999
kernel is: sigmoid
mean score: 0.8724999999999999


# try other features

In [26]:
# Scaled pipeline with SVC probability estimates switched on.
prob_svc = SVC(gamma='auto', probability=True)
clf = make_pipeline(StandardScaler(), prob_svc)
clf.fit(X, y)
cross_val_score(clf, X, y, cv=10).mean()

0.8699999999999999

In [28]:
# Scaled pipeline with a larger SVC kernel cache (cache_size=1000).
big_cache_svc = SVC(gamma='auto', cache_size=1000)
clf = make_pipeline(StandardScaler(), big_cache_svc)
clf.fit(X, y)
cross_val_score(clf, X, y, cv=10).mean()

0.8699999999999999

In [29]:
# Scaled pipeline with the one-vs-one decision function shape.
ovo_svc = SVC(gamma='auto', decision_function_shape='ovo')
clf = make_pipeline(StandardScaler(), ovo_svc)
clf.fit(X, y)
cross_val_score(clf, X, y, cv=10).mean()

0.8699999999999999

# trying full dataset for moments (faster computation)

In [30]:
# Moment features for every available image (all low paths, then all high paths).
X = np.array([skimage.measure.moments(skimage.io.imread(im)).ravel() for im in low + high])
# Labels follow the `low + high` order: 0 for low, 1 for high.
# (Fixes the duplicated `y = y = ...` assignment.)
y = [0] * low_samples_cnt + [1] * high_samples_cnt
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(X, y)
# NOTE(review): unlike the balanced subsets used earlier, the classes are
# imbalanced here (11356 low vs 1511 high in the recorded run), so always
# predicting the low class would already be right for ~88% of the images.
# Read the score below against that baseline.
np.mean(cross_val_score(clf, X, y, cv=10))

0.9273324222002295

# final conclusion

SVC + StandardScaler work great on data processed by our data manipulator, but for some random inputs they resulted in a worse score than using image moments.

SVC works best with the default rbf kernel, but sometimes sigmoid returned a slightly better result.

Moments and Hu moments resulted in similar scores, but they are much faster to process. That is why even on slow PCs it is easy to process the whole dataset, which gave the best score among all tries.

Final result: ~93% mean score with 10-fold cross-validation.