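Based on the PyImageSearch tutorial "k-NN classifier for image classification" (this notebook reuses its color-histogram features but trains an SVM classifier instead):
https://www.pyimagesearch.com/2016/08/08/k-nn-classifier-for-image-classification/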

In [1]:
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from imutils import paths
import numpy as np
import argparse
import imutils
import cv2
import os
import pandas as pd

In [2]:
# Display the installed OpenCV version string for reference.
cv2.__version__

'4.1.2'

In [3]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem at /content/gdrive.
drive.mount('/content/gdrive')
# Project folder, written relative to the current working directory.
root_path = 'gdrive/My Drive/MLforphysicist'
# Same folder with the space backslash-escaped (presumably for shell-style
# use; it is not referenced by the visible cells). A raw string keeps the
# backslash literally: "\ " is not a valid escape sequence in a normal string
# literal and triggers a DeprecationWarning/SyntaxWarning.
root_path2 = r'gdrive/My\ Drive/MLforphysicist'

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
def extract_color_histogram(image, bins=(8, 8, 8)):
    """Describe an image by a flattened, normalized 3D HSV color histogram.

    Args:
        image: Image array; it is converted with ``cv2.COLOR_BGR2HSV``, so it
            is interpreted as BGR.
        bins: Number of histogram bins for the H, S and V channels.

    Returns:
        The normalized histogram flattened to one dimension.

    Raises:
        ValueError: If ``image`` is ``None`` (what ``cv2.imread`` returns
            when a file is missing or cannot be decoded).
    """
    # Fail early with an actionable message instead of an opaque OpenCV
    # assertion error raised from inside cvtColor.
    if image is None:
        raise ValueError(
            "image is None; cv2.imread returns None when the file is "
            "missing or unreadable"
        )
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    # Joint histogram over all three channels; the ranges are given as
    # [0, 180) for hue and [0, 256) for saturation and value.
    hist = cv2.calcHist([hsv], [0, 1, 2], None, bins,
                        [0, 180, 0, 256, 0, 256])
    if imutils.is_cv2():
        # OpenCV 2.4.x: normalize returns the normalized array.
        hist = cv2.normalize(hist)
    else:
        # OpenCV 3+: the result is written into the destination argument,
        # so the histogram is normalized in place.
        cv2.normalize(hist, hist)
    # Flatten the 3D histogram into the feature vector.
    return hist.flatten()

In [5]:
# Dataset variant to load; it is substituted into the CSV file names below.
choose_size = "large"
train_path = root_path+"/cropped_images/"+"{}_images_train.csv".format(choose_size)
eval_path = root_path+"/cropped_images/"+"{}_images_eval.csv".format(choose_size)
test_path = root_path+"/cropped_images/"+"{}_images_test.csv".format(choose_size)

# Column dtypes enforced while parsing. The builtin `str` is used instead of
# `np.str`: that name was only an alias of `str`, was deprecated in
# NumPy 1.20 and removed in NumPy 1.24 (where it raises AttributeError).
type_dict = {"filename": str, "xmin": np.int64, "ymin": np.int64, "xmax": np.int64, "ymax": np.int64, "class": str}
# One table per split, all parsed with the same settings.
df_train = pd.read_csv(train_path, encoding='utf-8', engine='c', header=0, dtype=type_dict)
df_eval = pd.read_csv(eval_path, encoding='utf-8', engine='c', header=0, dtype=type_dict)
df_test = pd.read_csv(test_path, encoding='utf-8', engine='c', header=0, dtype=type_dict)


In [6]:
# Per-split accumulators (train / eval / test): raw-pixel vectors, feature
# vectors and class labels. Every name is bound to its own empty list.
train_rawImages, train_features, train_labels = [], [], []
eval_rawImages, eval_features, eval_labels = [], [], []
test_rawImages, test_features, test_labels = [], [], []

In [7]:
# Build the training features: one color histogram per row of df_train.
# The lists are (re)created here so that re-running this cell neither appends
# duplicates nor fails after the names have been rebound to NumPy arrays.
train_features = []
train_labels = []
# Only the two columns that are used get iterated; the label comes from the
# CSV "class" column.
for filename, label in zip(df_train["filename"], df_train["class"]):
    image_path = root_path+"/cropped_images/"+filename
    image = cv2.imread(image_path)
    # cv2.imread does not raise on a missing/unreadable file; it returns None.
    if image is None:
        raise FileNotFoundError("could not read image: " + image_path)
    # Color histogram characterizing the color distribution of the image.
    train_features.append(extract_color_histogram(image))
    train_labels.append(label)

In [8]:
# Build the eval features: one color histogram per row of df_eval.
# The lists are (re)created here so that re-running this cell neither appends
# duplicates nor fails after the names have been rebound to NumPy arrays.
eval_features = []
eval_labels = []
# Only the two columns that are used get iterated; the label comes from the
# CSV "class" column.
for filename, label in zip(df_eval["filename"], df_eval["class"]):
    image_path = root_path+"/cropped_images/"+filename
    image = cv2.imread(image_path)
    # cv2.imread does not raise on a missing/unreadable file; it returns None.
    if image is None:
        raise FileNotFoundError("could not read image: " + image_path)
    # Color histogram characterizing the color distribution of the image.
    eval_features.append(extract_color_histogram(image))
    eval_labels.append(label)

In [9]:
# Build the test features: one color histogram per row of df_test.
# The lists are (re)created here so that re-running this cell neither appends
# duplicates nor fails after the names have been rebound to NumPy arrays.
test_features = []
test_labels = []
# Only the two columns that are used get iterated; the label comes from the
# CSV "class" column.
for filename, label in zip(df_test["filename"], df_test["class"]):
    image_path = root_path+"/cropped_images/"+filename
    image = cv2.imread(image_path)
    # cv2.imread does not raise on a missing/unreadable file; it returns None.
    if image is None:
        raise FileNotFoundError("could not read image: " + image_path)
    # Color histogram characterizing the color distribution of the image.
    test_features.append(extract_color_histogram(image))
    test_labels.append(label)

In [10]:
# Convert the accumulated Python lists of every split into NumPy arrays.
# The same names are rebound, so later cells see arrays instead of lists.
train_features, train_labels = np.array(train_features), np.array(train_labels)
eval_features, eval_labels = np.array(eval_features), np.array(eval_labels)
test_features, test_labels = np.array(test_features), np.array(test_labels)

In [11]:
# probability=True makes SVC fit an additional, internally cross-validated
# calibration so that predict_proba is available. That step shuffles the data
# with a random number generator, so random_state is pinned to make the
# predicted probabilities reproducible from run to run.
model = SVC(probability=True, random_state=0)
model.fit(train_features, train_labels)
# Mean accuracy on the eval split, then on the test split.
print(model.score(eval_features, eval_labels))
print(model.score(test_features, test_labels))

0.922077922077922
0.9240506329113924


In [12]:
# Spot check on the first test sample: the class order, the predicted
# per-class probabilities, and the true label.
first_sample = test_features[:1]  # slice keeps a single-row 2D input
print(model.classes_)
print(model.predict_proba(first_sample))
print(test_labels[0])

['A' 'C' 'F' 'H' 'ID2' 'ID3' 'R']
[[1.54998294e-02 9.59924603e-01 8.33241789e-03 9.46355027e-03
  4.07779916e-04 5.51431496e-03 8.57504579e-04]]
C


In [13]:
# Predict class probabilities for the whole test split and attach them to
# df_test, one new column per class named after the class label.
predictions = model.predict_proba(test_features)
for signal_type, class_probabilities in zip(model.classes_, predictions.T):
    df_test[signal_type] = class_probabilities

In [14]:
# Persist the test table (including the per-class probability columns).
output_dir = root_path+"/prediction/"
# to_csv does not create missing parent directories, so make sure it exists.
os.makedirs(output_dir, exist_ok=True)
df_test.to_csv(output_dir+"pred_for_{}_on_test.csv".format(choose_size),index=False)
# A bare expression is only rendered when it is the last statement of a cell;
# show a short preview rather than the whole frame.
df_test.head()