# Assignment 3<br>
## 1 Image Classification using RF and SVM


### Antoine Wang 260766084

In [15]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

In [16]:
# Define the path for the input images
# please change this part if directory is different
path = ''

In [17]:
train_images = np.load(path + 'flower_subset.npz')['train_images']
train_labels = np.load(path + 'flower_subset.npz')['train_labels']
test_images = np.load(path + 'flower_subset.npz')['test_images']
test_labels = np.load(path + 'flower_subset.npz')['test_labels']

print(type(train_images))
print(train_images.data.shape)
print(train_labels.shape)

<class 'numpy.ndarray'>
(1556, 128, 128)
(1556,)


#### 1.1 Resize the train/test images to 64 * 64 and compute HoG features

In [18]:
img_size = (64, 64) # h x w in pixels
cell_size = (8, 8)  # h x w in pixels
block_size = (4, 4)  # h x w in cells
nbins = 4  # number of orientation bins


def generate_HoG(dataset,img_size = (64, 64),cell_size = (8, 8),
                 block_size = (4, 4), nbins = 4 ):
    hog = cv2.HOGDescriptor(_winSize=(img_size[1] // cell_size[1] * cell_size[1],
                                  img_size[0] // cell_size[0] * cell_size[0]),
                        _blockSize=(block_size[1] * cell_size[1],
                                    block_size[0] * cell_size[0]),
                        _blockStride=(cell_size[1], cell_size[0]),
                        _cellSize=(cell_size[1], cell_size[0]),
                        _nbins=nbins)
    
    
    features = []
    # normalizing the value of image from 0 - 255
    dataset = dataset*255
    for i in range(dataset.data.shape[0]):
        resized = cv2.resize(dataset[i], img_size, interpolation = cv2.INTER_AREA)
        features.append(hog.compute(resized .astype(np.uint8)).reshape(1, -1))
    features = np.vstack(features)
    
    return features

In [19]:
features = generate_HoG(train_images)
print(features.shape)
print(features[0])

(1556, 1600)
[0.04604047 0.03783537 0.03697053 ... 0.04313596 0.02390664 0.03033774]


#### 1.2 Fit a non-linear SVM classifier (use RBF kernel with gamma=`auto' and (C=1)

In [20]:
clf = svm.SVC(gamma='auto', C=1.) 
clf.fit(features, train_labels)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

#### 1.3 Predict labels of the test images by feeding the test features to the trained classifier and calculate classification accuracy.

In [21]:
test_feature = generate_HoG(test_images)
prediction = clf.predict(test_feature)
print("there are " + str(prediction.shape) + " in the predicted labels")
print("there are " + str(test_labels.shape) + " in the testing labels/teacher")


correct_count = 0;
for i in range(0, prediction.shape[0]):
    if prediction[i] == test_labels[i]:
        correct_count = correct_count + 1
print("number of correct count: " + str(correct_count))

there are (90,) in the predicted labels
there are (90,) in the testing labels/teacher
number of correct count: 10


#### 1.4 Tune values of hyperparameters 'gamma' and 'C' (Get 50%)
The final chosen value is 100. Gamma is kept at "auto";

In [22]:
clf = svm.SVC(gamma='auto', C=100.) 
clf.fit(features, train_labels)
test_feature = generate_HoG(test_images)
prediction = clf.predict(test_feature)
print("there are " + str(prediction.shape) + " in the predicted labels")
print("there are " + str(test_labels.shape) + " in the testing labels/teacher")


correct_count = 0;
for i in range(0, prediction.shape[0]):
    if prediction[i] == test_labels[i]:
        correct_count = correct_count + 1
print("number of correct count: " + str(correct_count))
print("accuracy of the test is: " + str(correct_count/prediction.shape[0]))

there are (90,) in the predicted labels
there are (90,) in the testing labels/teacher
number of correct count: 50
accuracy of the test is: 0.5555555555555556


#### 1.5 Fit a Random Forest(RF) classifier (set n estimators=10, max depth=5 and criterion='entropy')

In [23]:
rf_clf = RandomForestClassifier(n_estimators=10, max_depth=5, criterion='entropy')
features = generate_HoG(train_images)
rf_clf.fit(features,train_labels)


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

#### 1.6 Predict labels of the test images by feeding the test features to the trained classifier and calculate classification accuracy.

In [24]:
test_feature = generate_HoG(test_images)

predict_rf = rf_clf.predict(test_feature)

print("there are " + str(predict_rf.shape) + " in the predicted labels")
print("there are " + str(test_labels.shape) + " in the testing labels/teacher")
correct_count = 0;
for i in range(0, predict_rf.shape[0]):
    if predict_rf[i] == test_labels[i]:
        correct_count = correct_count + 1
print("number of correct count: " + str(correct_count))

there are (90,) in the predicted labels
there are (90,) in the testing labels/teacher
number of correct count: 31


##### setting n_estimators=10, max_depth=5 and criterion='entropy' gives a accuracy way lower than 50%. Below is the tweaked version.
When there are 100 estimator in the forest, each has a depth of 10, the performance is boosted above 50%

In [29]:
rf_clf = RandomForestClassifier(n_estimators=100, max_depth=10, criterion='entropy')
features = generate_HoG(train_images)
rf_clf.fit(features,train_labels)

test_feature = generate_HoG(test_images)

predict_rf = rf_clf.predict(test_feature)

print("there are " + str(predict_rf.shape) + " in the predicted labels")
print("there are " + str(test_labels.shape) + " in the testing labels/teacher")
correct_count = 0;
for i in range(0, predict_rf.shape[0]):
    if predict_rf[i] == test_labels[i]:
        correct_count = correct_count + 1
print("number of correct count: " + str(correct_count))
print("accuracy of the test is: " + str(correct_count/predict_rf.shape[0]))

there are (90,) in the predicted labels
there are (90,) in the testing labels/teacher
number of correct count: 46
accuracy of the test is: 0.5111111111111111


#### 1.7 Compare results of SVM and RF classifiers.

In [30]:
def svm_with_random_state(random_state_n, C_n=100.):
    features = generate_HoG(train_images)
    clf = svm.SVC(gamma='auto', C=C_n, random_state = random_state_n) 
    clf.fit(features, train_labels)
    
    test_feature = generate_HoG(test_images)
    prediction = clf.predict(test_feature)
    print("Testing random state for SVM with random state = " + str(random_state_n))


    correct_count = 0;
    for i in range(0, prediction.shape[0]):
        if prediction[i] == test_labels[i]:
            correct_count = correct_count + 1
    print("number of correct count: " + str(correct_count))
    print("accuracy of the test is: " + str(correct_count/prediction.shape[0]))
    print("--------------------------------------------------------------\n")
    
    
def rf_with_random_state(random_state_n, n_estimators_n=100, max_depth_n=10):
    features = generate_HoG(train_images)
    rf_clf = RandomForestClassifier(n_estimators=n_estimators_n, 
                                    max_depth=n_estimators_n, 
                                    criterion='entropy')
    rf_clf.fit(features,train_labels)
    
    test_feature = generate_HoG(test_images)
    predict_rf = rf_clf.predict(test_feature)
    print("Testing random state for RF with random state = " + str(random_state_n))
    correct_count = 0;
    for i in range(0, predict_rf.shape[0]):
        if predict_rf[i] == test_labels[i]:
            correct_count = correct_count + 1
    print("number of correct count: " + str(correct_count))
    print("accuracy of the test is: " + str(correct_count/predict_rf.shape[0]))
    print("--------------------------------------------------------------\n")

In [31]:
for i in range(1, 10):
    svm_with_random_state(i)

Testing random state for SVM with random state = 1
number of correct count: 50
accuracy of the test is: 0.5555555555555556
--------------------------------------------------------------

Testing random state for SVM with random state = 2
number of correct count: 50
accuracy of the test is: 0.5555555555555556
--------------------------------------------------------------

Testing random state for SVM with random state = 3
number of correct count: 50
accuracy of the test is: 0.5555555555555556
--------------------------------------------------------------

Testing random state for SVM with random state = 4
number of correct count: 50
accuracy of the test is: 0.5555555555555556
--------------------------------------------------------------

Testing random state for SVM with random state = 5
number of correct count: 50
accuracy of the test is: 0.5555555555555556
--------------------------------------------------------------

Testing random state for SVM with random state = 6
number of corr

In [32]:
for i in range(1, 10):
    rf_with_random_state(i)

Testing random state for RF with random state = 1
number of correct count: 46
accuracy of the test is: 0.5111111111111111
--------------------------------------------------------------

Testing random state for RF with random state = 2
number of correct count: 48
accuracy of the test is: 0.5333333333333333
--------------------------------------------------------------

Testing random state for RF with random state = 3
number of correct count: 44
accuracy of the test is: 0.4888888888888889
--------------------------------------------------------------

Testing random state for RF with random state = 4
number of correct count: 43
accuracy of the test is: 0.4777777777777778
--------------------------------------------------------------

Testing random state for RF with random state = 5
number of correct count: 46
accuracy of the test is: 0.5111111111111111
--------------------------------------------------------------

Testing random state for RF with random state = 6
number of correct co

Above are the result of running 9 different random states. From the result, **SVM is more robost with changing random states** since the accuracy rate keeps unchanged at 55%. Random forest returns slightly different result with different random states, sometimes the accuracy rate even drop slightly below 50%.