In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import cv2
from sklearn.metrics import accuracy_score

In [2]:
import operator


class KNeighborsClassifier():
    
    def __init__(self, n_neighbors=5):
        self.neighbors = n_neighbors #this will initialize the number od neighbors to 5
        
    def fit():
        pass
    
    def predict():
        pass
    
    def euclidian_dist(self, point_1, point_2): #this function will calculate the euclidian distance between two points
        dist = 0.0
        for i in range(len(point_1) - 1):
            dist += pow(point_1[i] - point_2[i], 2)
        return np.sqrt(dist)
    
    def calc_distances(self, data, new_point): #this function will calculate the distance between the new point and all the points in the dataset
        distances = []
        neighbors = []
        for i in data:
            distances.append((i, self.euclidian_dist(new_point, i))) #this will append the distance between the new point and the point in the dataset
        distances.sort(key=operator.itemgetter(1)) #this will sort the distances in ascending order
        for i in range(self.neighbors):
            neighbors.append(distances[i][0]) #this will append the neighbors to the neighbors list
        return neighbors
    
    def find_majority(self, neighbors, train_X, train_y): #this function will find the majority of the neighbors
        iter_y = [] 
        for i in neighbors:
            iter_y.append(train_y[np.where(train_X == i)[0][0]]) #this will append the class of the neighbor to the iter_y list i.e if the neighbor is a cat then it will append cat to the list
        return max(iter_y)
    
    def fit(self, train_X, train_y): #this will fit the model
        set_of_classes = set(train_y) #this will find the unique classes in the dataset
        self.classes = 0;
        for i in set_of_classes: #this will count the number of classes in the dataset
            self.classes += 1
        self.X = train_X #this will store the training data
        self.y = train_y #this will store the training labels
        self.data_len = len(train_X) #this will store the length of the training data
        
    def predict(self, test_y):#this will predict the class of the test
        y_pred = []  #this will store the predicted classes
        neighbors = [] #this will store the neighbors
        for i in test_y:
            neighbors = self.calc_distances(self.X, i) #this will find the neighbors
            y_pred.append(self.find_majority(neighbors, self.X, self.y)) #this will find the majority of the neighbors
        return y_pred #and we will return the predicted classes

In [3]:
class Kmeans():
    
    def __init__(self, n_clusters=5):
        self.clusters = n_clusters

    def fit(self, X):
        self.X = X
        self.data_len = len(X)
        self.centroids = []
        for i in range(self.clusters):
            self.centroids.append(X[i])
        self.centroids = np.array(self.centroids)
        self.clusters = np.zeros(self.data_len)

    def euclidian_dist(self, point_1, point_2):
        dist = 0.0
        for i in range(len(point_1)):
            dist += pow(point_1[i] - point_2[i], 2)
        return np.sqrt(dist)

    def calc_distances(self, data, new_point):
        distances = []
        for i in data:
            distances.append(self.euclidian_dist(new_point, i))
        return distances

    def find_nearest_centroid(self, new_point):
        distances = self.calc_distances(self.centroids, new_point)
        return np.argmin(distances)

    def update_centroids(self):
        for i in range(self.clusters):
            points = [self.X[j] for j in range(len(self.X)) if self.clusters[j] == i]
            self.centroids[i] = np.mean(points, axis=0)
            
    def predict(self, test_X):
        y_pred = []
        for i in test_X:
            y_pred.append(self.find_nearest_centroid(i))
        return y_pred

        

In [4]:
#importing the data
data_dir = '/home/kalyan/DataSets/DogsandCats/random_images'
train_dir = os.path.join(data_dir, 'training_set/training_set/')
test_dir = os.path.join(data_dir, 'test_set/test_set')
train_cats_dir = os.path.join(train_dir, 'cats')
train_dogs_dir = os.path.join(train_dir, 'dogs')
test_cats_dir = os.path.join(test_dir, 'cats')
test_dogs_dir = os.path.join(test_dir, 'dogs')


#checking the number of images in each folder
print('Total training cat images:', len(os.listdir(train_cats_dir)))
print('Total training dog images:', len(os.listdir(train_dogs_dir)))
print('Total test cat images:', len(os.listdir(test_cats_dir)))
print('Total test dog images:', len(os.listdir(test_dogs_dir)))


Total training cat images: 25
Total training dog images: 25
Total test cat images: 50
Total test dog images: 50


In [5]:
#taking svd of the images and to truncatate first 10 features of the images as it is most contributing to the images
'''
Input : An image and number of features to be truncated
Output : Truncated US matrix
'''
def svd_truncate(img, n):
    #converting images to grayscale
    img = img.mean(axis=2) # convert to grayscale
    U, s, V = np.linalg.svd(img)
    U = U[:, :n]
    s = s[:n]
    us = np.dot(U, np.diag(s))
    return us
#getting svd of all images
'''
Input : Directory of the images and number of features to be truncated
Output : List of truncated US matrix
'''
def get_svd_images(img_dir, n):
    img_files = os.listdir(img_dir)
    img_files = [os.path.join(img_dir, f) for f in img_files]
    #img_files = [plt.imread(f) for f in img_files]
    # read images from file, resize them into 100x100, store in single array
    img_files = [cv2.resize(plt.imread(f), (200, 200)) for f in img_files]
    svd_images = [svd_truncate(img, n) for img in img_files]
    return svd_images

In [6]:
'''
Train Images
Input : Directory of the images and number of features to be truncated
Output : Truncated US matrix of cats and dogs
'''
cat_images = get_svd_images(train_cats_dir, 5)
cat_images = np.array(cat_images)
dog_images = get_svd_images(train_dogs_dir, 5)
dog_images = np.array(dog_images)
#concatenating the images
train_images = np.concatenate((cat_images, dog_images), axis=0)
print(train_images.shape)

(50, 200, 5)


In [7]:
#creating labels for the train images
train_labels = np.concatenate((np.zeros(len(cat_images)), np.ones(len(dog_images))), axis=0)
print(train_labels.shape)

(50,)


In [8]:
'''
Test Images
Input : Directory of the images and number of features to be truncated
Output : Truncated US matrix of cats and dogs
'''
test_cat_images = get_svd_images(test_cats_dir, 5)
test_cat_images = np.array(test_cat_images)
print(test_cat_images.shape)
#test for dogs
test_dog_images = get_svd_images(test_dogs_dir, 5)
test_dog_images = np.array(test_dog_images)
print(test_dog_images.shape)
#concatenating the test images
test_images = np.concatenate((test_cat_images, test_dog_images), axis=0)
print(test_images.shape)

(50, 200, 5)
(50, 200, 5)
(100, 200, 5)


In [9]:
train_image_1d = np.array([img.flatten() for img in train_images])
test_image_1d = np.array([img.flatten() for img in test_images])
print(train_image_1d.shape)
print(test_image_1d.shape)

(50, 1000)
(100, 1000)


In [10]:
#creating labels for the test images
test_labels = np.concatenate((np.zeros(len(test_cat_images)), np.ones(len(test_dog_images))), axis=0)
print(test_labels.shape)

(100,)


In [11]:
knn = KNeighborsClassifier()
knn.fit(train_image_1d,train_labels)
y_pred = knn.predict(test_image_1d)
print(accuracy_score(test_labels, y_pred))


0.47


In [14]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(train_image_1d, train_labels)
y_pred = knn.predict(test_image_1d)
print(accuracy_score(test_labels, y_pred))

0.47


In [15]:
#get f1 score,precision recall and confusion matrix
#for knn

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(test_labels, y_pred))
print(confusion_matrix(test_labels, y_pred))


              precision    recall  f1-score   support

         0.0       0.41      0.14      0.21        50
         1.0       0.48      0.80      0.60        50

    accuracy                           0.47       100
   macro avg       0.45      0.47      0.41       100
weighted avg       0.45      0.47      0.41       100

[[ 7 43]
 [10 40]]


In [16]:
kmenas= Kmeans(5)
kmenas.fit(train_image_1d)
y_pred = kmenas.predict(test_image_1d)
print(accuracy_score(test_labels, y_pred))


0.43


In [17]:
#get f1 score,precision recall and confusion matrix
#for knn

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(test_labels, y_pred))
print(confusion_matrix(test_labels, y_pred))


              precision    recall  f1-score   support

         0.0       0.52      0.54      0.53        50
         1.0       0.59      0.32      0.42        50
         2.0       0.00      0.00      0.00         0
         3.0       0.00      0.00      0.00         0
         4.0       0.00      0.00      0.00         0

    accuracy                           0.43       100
   macro avg       0.22      0.17      0.19       100
weighted avg       0.56      0.43      0.47       100

[[27 11  2  4  6]
 [25 16  2  1  6]
 [ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0  0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
#using HyperoptEstimator
from hpsklearn import HyperoptEstimator, any_classifier, any_preprocessing
from hyperopt import tpe
import numpy as np

estim = HyperoptEstimator(classifier=any_classifier("my_clf"),
                              preprocessing=any_preprocessing("my_pre"),
                              algo=tpe.suggest,
                              max_evals=100,
                              trial_timeout=120)

estim.fit(train_image_1d, train_labels)
print(estim.score(test_image_1d, test_labels))

100%|██████████| 1/1 [00:00<00:00,  1.59trial/s, best loss: 1.0]
100%|██████████| 2/2 [00:01<00:00,  1.07s/trial, best loss: 0.8]
100%|██████████| 3/3 [00:01<00:00,  1.33s/trial, best loss: 0.6]
100%|██████████| 4/4 [00:00<00:00,  6.12trial/s, best loss: 0.5]
100%|██████████| 5/5 [00:00<00:00,  2.20trial/s, best loss: 0.5]
100%|██████████| 6/6 [00:00<00:00,  3.63trial/s, best loss: 0.5]
100%|██████████| 7/7 [00:09<00:00,  9.08s/trial, best loss: 0.5]
100%|██████████| 8/8 [00:00<00:00, 11.73trial/s, best loss: 0.5]
100%|██████████| 9/9 [00:02<00:00,  2.70s/trial, best loss: 0.5]
100%|██████████| 10/10 [00:06<00:00,  6.79s/trial, best loss: 0.5]
100%|██████████| 11/11 [00:02<00:00,  2.32s/trial, best loss: 0.5]
100%|██████████| 12/12 [00:03<00:00,  3.84s/trial, best loss: 0.5]
100%|██████████| 13/13 [00:00<00:00,  5.62trial/s, best loss: 0.5]
100%|██████████| 14/14 [00:00<00:00,  4.97trial/s, best loss: 0.5]
100%|██████████| 15/15 [00:00<00:00,  5.46trial/s, best loss: 0.5]
100%|████████




100%|██████████| 24/24 [00:00<00:00,  2.10trial/s, best loss: 0.5]
100%|██████████| 25/25 [00:00<00:00,  7.46trial/s, best loss: 0.30000000000000004]
100%|██████████| 26/26 [00:00<00:00,  7.25trial/s, best loss: 0.30000000000000004]
100%|██████████| 27/27 [00:00<00:00,  4.34trial/s, best loss: 0.30000000000000004]
100%|██████████| 28/28 [00:00<00:00,  7.87trial/s, best loss: 0.30000000000000004]
100%|██████████| 29/29 [00:00<00:00,  7.02trial/s, best loss: 0.30000000000000004]
100%|██████████| 30/30 [00:00<00:00,  4.46trial/s, best loss: 0.19999999999999996]
100%|██████████| 31/31 [00:00<00:00,  7.90trial/s, best loss: 0.19999999999999996]
100%|██████████| 32/32 [00:00<00:00,  4.52trial/s, best loss: 0.19999999999999996]
100%|██████████| 33/33 [00:00<00:00,  7.39trial/s, best loss: 0.19999999999999996]
100%|██████████| 34/34 [00:00<00:00,  7.41trial/s, best loss: 0.19999999999999996]
100%|██████████| 35/35 [00:00<00:00,  3.56trial/s, best loss: 0.19999999999999996]
100%|██████████| 36/

  + estimator_weight * incorrect * (sample_weight > 0)

  return super().fit(X, y, sample_weight)



100%|██████████| 47/47 [00:00<00:00,  3.28trial/s, best loss: 0.19999999999999996]
100%|██████████| 48/48 [02:00<00:00, 120.22s/trial, best loss: 0.19999999999999996]
100%|██████████| 49/49 [00:00<00:00,  6.53trial/s, best loss: 0.0]
100%|██████████| 50/50 [00:00<00:00,  6.55trial/s, best loss: 0.0]
100%|██████████| 51/51 [00:00<00:00,  7.04trial/s, best loss: 0.0]
100%|██████████| 52/52 [00:03<00:00,  3.13s/trial, best loss: 0.0]
100%|██████████| 53/53 [00:03<00:00,  3.93s/trial, best loss: 0.0]
100%|██████████| 54/54 [00:00<00:00,  1.32trial/s, best loss: 0.0]
100%|██████████| 55/55 [00:00<00:00,  1.04trial/s, best loss: 0.0]
100%|██████████| 56/56 [00:01<00:00,  1.37s/trial, best loss: 0.0]
100%|██████████| 57/57 [00:00<00:00,  1.04trial/s, best loss: 0.0]
100%|██████████| 58/58 [00:00<00:00,  8.29trial/s, best loss: 0.0]
100%|██████████| 59/59 [00:00<00:00,  3.92trial/s, best loss: 0.0]
100%|██████████| 60/60 [00:00<00:00,  4.06trial/s, best loss: 0.0]
100%|██████████| 61/61 [00:00

In [11]:
print(estim.best_model())

{'learner': SVC(C=0.563836014184182, coef0=0.6591710063188843,
    decision_function_shape='ovo', degree=2, gamma='auto', kernel='poly',
    random_state=4, tol=3.721312395841434e-05), 'preprocs': (PCA(n_components=40, whiten=True),), 'ex_preprocs': ()}
