In [1]:
import pandas as pd

In [2]:
from imutils import paths
from scipy.spatial.distance import minkowski

# Load the feature table from CSV. The preview below shows that, next to the
# feature columns and the "labels" column, the file also carries an
# "Unnamed: 0" column holding 0, 1, 2, ...
my_data = pd.read_csv("800ImagesFeatures.csv")
# Last expression of the cell: displays the first rows as the cell output.
my_data.head()

Unnamed: 0,contour_points,amount_contours,rect_area,hull_area,approximation_area,contour_perimeters,corners,harris_corners,ratio_wide_length,contour_length_area_ratio,contour_length_rect_area_ratio,contour_length_hull_area_ratio,contour_rect_length_ratio,contour_hull_length_ratio,extent,solidity,hull_rectangle_ratio,labels
0,676,3,20729.545245,14977.0,10152.5,789.494509,53,901,3.399329,0.07731,0.038085,0.052714,1.149038,1.318301,0.49263,0.681845,0.722495,0
1,1725,3,65025.0,65025.0,48154.5,1847.192994,92,1582,1.0,0.038508,0.028407,0.028407,1.810974,1.810974,0.737693,0.737693,1.0,0
2,514,2,18142.0,14201.0,11280.5,561.220343,62,305,0.487047,0.049727,0.030935,0.03952,0.977736,1.134376,0.622092,0.794733,0.782769,0
3,492,83,6807.69467,4847.5,1323.5,598.867093,52,15,0.124905,0.447584,0.087969,0.123541,1.140181,1.214194,0.196542,0.276019,0.712062,0
4,988,5,54314.985718,33316.0,25395.5,1076.641697,54,1504,1.197183,0.042451,0.019822,0.032316,1.150258,1.313623,0.466943,0.761256,0.613385,0


In [3]:
# Separate the feature columns from the target column.
# "Unnamed: 0" looks like the row index that was saved into the CSV (it holds
# 0, 1, 2, ... in the preview). It is not an image feature, and leaving it in
# would let the classifier use row order as a predictor, so it is removed as
# well. errors='ignore' keeps the cell working if the CSV has no such column.
data_val = (
    my_data
    .drop(columns='labels')
    .drop(columns='Unnamed: 0', errors='ignore')
)
labels = my_data.labels

In [4]:
# Calculate distance between two points

def minkowski_distance(a, b, p=1):
    """Return the Minkowski distance of order ``p`` between two points.

    The distance is ``(sum(|a_i - b_i| ** p)) ** (1 / p)``; ``p=1`` gives the
    Manhattan distance and ``p=2`` the Euclidean distance.

    Parameters
    ----------
    a, b : iterable of numbers
        Coordinates of the two points (list, tuple, NumPy array, pandas
        Series, ...). Coordinates are paired by position, so a pandas Series
        is handled correctly whatever its index labels are.
    p : number, default 1
        Order of the distance; must be greater than 0.

    Returns
    -------
    number
        The distance between ``a`` and ``b``.

    Raises
    ------
    ValueError
        If ``p`` is not positive or the points have different dimensions.
    """
    if p <= 0:
        raise ValueError("p must be greater than 0")

    # Iterate over values rather than indexing with a[d]: on a pandas Series
    # integer indexing is label-based and fails (or is deprecated) when the
    # index is not 0..n-1.
    coords_a = list(a)
    coords_b = list(b)
    if len(coords_a) != len(coords_b):
        raise ValueError(
            "points must have the same number of dimensions "
            "({} != {})".format(len(coords_a), len(coords_b))
        )

    # Accumulate |a_i - b_i| ** p over all dimensions, then take the p-th root
    total = sum(abs(x - y) ** p for x, y in zip(coords_a, coords_b))
    return total ** (1 / p)


# Test the function: p=1 (sum of absolute differences) between the first two feature rows

minkowski_distance(a=data_val.iloc[0], b=data_val.iloc[1], p=1)

135176.3583782622

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Split the data - 85% train, 15% test (test_size=0.15); random_state is fixed
# so the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(data_val, labels, test_size=0.15,
                                                   random_state=1)

In [6]:
# Standardise the X data: the scaler is fitted on the training split only,
# and that same fitted scaler is then applied to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [7]:
def knn_predict(X_train, X_test, y_train, y_test, k, p):
    """Classify each row of ``X_test`` by majority vote of its k nearest
    training rows, using the Minkowski distance of order ``p``.

    Parameters
    ----------
    X_train : array-like of feature vectors
        Training rows (NumPy array, list of rows or pandas DataFrame).
    X_test : array-like of feature vectors
        Rows to classify, same layout as ``X_train``.
    y_train : array-like
        One label per training row, matched to ``X_train`` by position.
        A pandas Series, NumPy array or plain list are all accepted; the
        index labels of a Series are ignored.
    y_test : any
        Unused. Kept so existing calls keep working.
    k : int
        Number of neighbours that vote; must be at least 1. Values larger
        than the number of training rows use every training row.
    p : number
        Order of the Minkowski distance handed to scipy's ``minkowski``.

    Returns
    -------
    list
        One predicted label per row of ``X_test``, in the same order.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, the training set is empty, or
        ``X_train`` and ``y_train`` differ in length.
    """
    # Counter to help with label voting
    from collections import Counter

    # Iterating a DataFrame yields its column names, not its rows, so work on
    # the underlying array instead.
    if isinstance(X_train, pd.DataFrame):
        X_train = X_train.to_numpy()
    if isinstance(X_test, pd.DataFrame):
        X_test = X_test.to_numpy()

    # Labels are matched to training rows by position. This avoids relying on
    # y_train having an index at all, and on that index being unique.
    train_labels = list(y_train)
    n_train = len(train_labels)
    if n_train == 0:
        raise ValueError("knn_predict needs at least one training sample")
    if len(X_train) != n_train:
        raise ValueError(
            "X_train and y_train must have the same length "
            "({} != {})".format(len(X_train), n_train)
        )
    if k < 1:
        raise ValueError("k must be at least 1")

    # Need output of 1 prediction per test data point
    y_hat_test = []

    for test_point in X_test:
        distances = [minkowski(test_point, train_point, p)
                     for train_point in X_train]

        # Positions of the k closest training rows. sorted() is stable, so
        # rows at exactly the same distance keep their training order and the
        # result is deterministic.
        nearest = sorted(range(n_train), key=distances.__getitem__)[:k]

        # Count the labels of the k closest neighbours
        votes = Counter(train_labels[i] for i in nearest)

        # most_common lists equal counts in first-seen order, so a tied vote
        # goes to the label that appears first among the nearest neighbours
        y_hat_test.append(votes.most_common(1)[0][0])

    return y_hat_test

In [8]:
# Classify the held-out rows of the feature table with k=5 neighbours and p=1
y_hat_test = knn_predict(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    k=5,
    p=1,
)

print(y_hat_test)

[0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0]


In [9]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Accuracy, per-class report and confusion matrix in a single print call;
# sep="\n" puts each on its own line, exactly as three separate prints would
print(
    accuracy_score(y_test, y_hat_test),
    classification_report(y_test, y_hat_test, target_names=["brush", "comb"]),
    confusion_matrix(y_test, y_hat_test),
    sep="\n",
)

0.8166666666666667
              precision    recall  f1-score   support

       brush       0.78      0.88      0.83        60
        comb       0.87      0.75      0.80        60

    accuracy                           0.82       120
   macro avg       0.82      0.82      0.82       120
weighted avg       0.82      0.82      0.82       120

[[53  7]
 [15 45]]


In [10]:
from preprocessing import ImageResizer
from preprocessing import  SimpleDatasetLoader

In [11]:
# Settings for the image experiment: the "dataset", "neighbors" and "jobs" entries
args = dict(dataset="resources", neighbors=5, jobs=-1)

In [12]:
print("[INFO] loading images...")
# Collect the paths of all images found under the configured dataset directory
imagePaths = list(paths.list_images(args["dataset"]))

# initialize the image preprocessor, load the dataset from disk,
# and reshape the data matrix
# NOTE(review): 3072 == 32 * 32 * 3, so the reshape presumably assumes
# 3-channel images after ImageResizer(32, 32) - confirm against the loader
sp = ImageResizer(32, 32)
sdl = SimpleDatasetLoader(preprocessors=[sp])
(data, lbls) = sdl.load(imagePaths, verbose=500)
data = data.reshape((data.shape[0], 3072))

# show some information on memory consumption of the images
# (nbytes converted from bytes to MB)
print("[INFO] features matrix: {:.1f}MB".format(
	data.nbytes / (1024 * 1024.0)))

[INFO] loading images...
[INFO] processed 500/800
[INFO] features matrix: 2.3MB


In [13]:
# encode the labels as integers
# NOTE: lbls is overwritten with its encoded form, so re-running this cell
# without re-running the loading cell would fit the encoder on the integer
# codes instead of the original labels
le = LabelEncoder()
lbls = le.fit_transform(lbls)
# classes_ lists the original class values the encoder was fitted on
print(le.classes_)

['Brush' 'Comb']


In [14]:
# partition the data into training and testing splits using 90% of
# the data for training and the remaining 10% for testing (test_size=0.10)
(trainX, testX, trainY, testY) = train_test_split(data, lbls, test_size=0.10, random_state=42)

array([1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1], dtype=int64)

In [15]:
# Wrap the training labels in a pandas Series, the same kind of label object
# (a DataFrame column) that the feature-table experiment hands to knn_predict
trainY = pd.Series(data=trainY)

In [16]:
# Standardise the pixel features: the scaler is fitted on the training images
# only, and that same fitted scaler is then applied to both splits
scaler = StandardScaler().fit(trainX)
trainX = scaler.transform(trainX)
testX = scaler.transform(testX)

In [17]:
# Classify the test images. k is read from the run configuration rather than
# repeating the same number as a second hard-coded literal.
prediction_2 = knn_predict(trainX, testX, trainY, testY, k=args["neighbors"], p=1)
print(prediction_2)

[1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1]


In [18]:
# Accuracy, per-class report and confusion matrix for the image experiment in
# a single print call; sep="\n" keeps each on its own line
print(
    accuracy_score(testY, prediction_2),
    classification_report(testY, prediction_2, target_names=["brush", "comb"]),
    confusion_matrix(testY, prediction_2),
    sep="\n",
)


0.8
              precision    recall  f1-score   support

       brush       0.80      0.80      0.80        41
        comb       0.79      0.79      0.79        39

    accuracy                           0.80        80
   macro avg       0.80      0.80      0.80        80
weighted avg       0.80      0.80      0.80        80

[[33  8]
 [ 8 31]]
