## Load Dataset

In [1]:
import os
%matplotlib inline
import cv2
import numpy as np
import pandas as pd
from glob import glob
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier

In [2]:
# Root folder of the plant-seedlings-classification input data.
data_dir = "../input/plant-seedlings-classification/"
# The train and test splits live in sibling sub-folders of the root.
train_dir, test_dir = (os.path.join(data_dir, split) for split in ("train", "test"))

In [4]:
# https://www.kaggle.com/gaborvecsei/plant-seedlings-fun-with-computer-vision/notebook

# Map every class folder name under train_dir to the list of images
# (as returned by cv2.imread in colour mode) found in that folder.
images_per_class = {
    class_label: [
        cv2.imread(png_path, cv2.IMREAD_COLOR)
        for png_path in glob(os.path.join(train_dir, class_label, "*.png"))
    ]
    for class_label in os.listdir(train_dir)
}

## Image Segmentation: Pre-Processing Functions

In [5]:
# https://www.kaggle.com/gaborvecsei/plant-seedlings-fun-with-computer-vision/notebook

import cv2
import numpy as np

def create_mask_for_plant(image):
    """Build a single-channel mask of the pixels inside a fixed HSV band.

    The BGR input is converted to HSV and thresholded with ``cv2.inRange``
    to hue 60 +/- 35, saturation 100..255 and value 50..255. The raw mask is
    then morphologically closed with an 11x11 elliptical kernel.

    Parameters
    ----------
    image : BGR image (it is passed to ``cv2.cvtColor`` with COLOR_BGR2HSV).

    Returns
    -------
    The closed mask produced from ``cv2.inRange``.
    """
    hue_center, hue_tolerance = 60, 35
    hsv_floor = np.array([hue_center - hue_tolerance, 100, 50])
    hsv_ceiling = np.array([hue_center + hue_tolerance, 255, 255])

    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    raw_mask = cv2.inRange(hsv, hsv_floor, hsv_ceiling)

    # Closing (dilate then erode) with an elliptical structuring element.
    closing_kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (11, 11))
    return cv2.morphologyEx(raw_mask, cv2.MORPH_CLOSE, closing_kernel)

def segment_plant(image):
    """Return ``image`` restricted to the pixels selected by its plant mask.

    The mask comes from ``create_mask_for_plant`` and is applied with
    ``cv2.bitwise_and(image, image, mask=...)``.
    """
    plant_mask = create_mask_for_plant(image)
    return cv2.bitwise_and(image, image, mask=plant_mask)

def sharpen_image(image):
    """Sharpen ``image`` by blending it with a Gaussian-blurred copy.

    The result is ``1.5 * image - 0.5 * blurred``, where ``blurred`` is the
    input blurred with sigma 3 (kernel size derived from sigma).
    """
    blurred = cv2.GaussianBlur(image, ksize=(0, 0), sigmaX=3)
    return cv2.addWeighted(src1=image, alpha=1.5, src2=blurred, beta=-0.5, gamma=0)

In [6]:
# Test image to see the changes made by each pre-processing step

# https://www.kaggle.com/gaborvecsei/plant-seedlings-fun-with-computer-vision/notebook
image = images_per_class["Cleavers"][0]

image_mask = create_mask_for_plant(image)
image_segmented = segment_plant(image)
image_sharpen = sharpen_image(image_segmented)

# cv2.imread yields BGR channel order while matplotlib interprets 3-channel
# arrays as RGB, so colour images are converted before being shown.
fig, axs = plt.subplots(1, 4, figsize=(20, 20))
axs[0].imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
axs[1].imshow(image_mask, cmap="gray")  # single-channel mask: show as black/white
axs[2].imshow(cv2.cvtColor(image_segmented, cv2.COLOR_BGR2RGB))
axs[3].imshow(cv2.cvtColor(image_sharpen, cv2.COLOR_BGR2RGB))

for ax, title in zip(axs, ["Original", "Mask", "Segmented", "Segmented + sharpened"]):
    ax.set_title(title)
    ax.axis("off")

## Load Images and Labels

### 1. Gray scaling, segmentation (and sharpening)

In [None]:
# Training set, variant 1: resize -> segment -> sharpen -> gray scale -> flatten.
images_gray, labels_gray = [], []

for class_folder_name in os.listdir(train_dir):
    class_folder_path = os.path.join(train_dir, class_folder_name)

    for image_path in glob(os.path.join(class_folder_path, "*.png")):
        image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        if image is None:
            # cv2.imread returns None for unreadable files instead of raising.
            raise IOError("Could not read image: {}".format(image_path))
        image_resized = cv2.resize(image, (299, 299))
        image_segmented = segment_plant(image_resized)
        image_sharpened = sharpen_image(image_segmented)
        # Colour conversion keeps the 299x299 size, so no second resize is needed.
        image_gray = cv2.cvtColor(image_sharpened, cv2.COLOR_BGR2GRAY)
        images_gray.append(image_gray.flatten())
        labels_gray.append(class_folder_name)

# One row per image, one label (the class folder name) per row.
images_gray = np.array(images_gray)
labels_gray = np.array(labels_gray)

In [None]:
np.unique(labels_gray)

In [None]:
# Test set, variant 1: resize -> segment -> sharpen -> gray scale -> flatten.
images_test_gray = []

for image_path in glob(os.path.join(test_dir, "*.png")):
    image = cv2.imread(image_path, cv2.IMREAD_COLOR)
    if image is None:
        # Fail loudly: silently skipping a test image would shift every
        # following prediction relative to the list of test file names.
        raise IOError("Could not read image: {}".format(image_path))
    image_resized = cv2.resize(image, (299, 299))
    image_segmented = segment_plant(image_resized)
    image_sharpened = sharpen_image(image_segmented)
    # Colour conversion keeps the 299x299 size, so no second resize is needed.
    image_gray = cv2.cvtColor(image_sharpened, cv2.COLOR_BGR2GRAY)
    images_test_gray.append(image_gray.flatten())

images_test_gray = np.array(images_test_gray)

### 2. Segmentation (and sharpening)

In [7]:
# Training set, variant 2: resize -> segment -> sharpen -> flatten (colour kept).
images, labels = [], []

for class_folder_name in os.listdir(train_dir):
    class_folder_path = os.path.join(train_dir, class_folder_name)

    for image_path in glob(os.path.join(class_folder_path, "*.png")):
        image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        if image is None:
            # cv2.imread returns None for unreadable files instead of raising.
            raise IOError("Could not read image: {}".format(image_path))
        image_resized = cv2.resize(image, (299, 299))
        image_segmented = segment_plant(image_resized)
        # Segmentation and sharpening keep the 299x299 size; no second resize needed.
        image_sharpened = sharpen_image(image_segmented)
        images.append(image_sharpened.flatten())
        labels.append(class_folder_name)

# One row per image, one label (the class folder name) per row.
images = np.array(images)
labels = np.array(labels)

In [8]:
np.unique(labels)

In [9]:
# Test set, variant 2: resize -> segment -> sharpen -> flatten (colour kept).
images_test = []

for image_path in glob(os.path.join(test_dir, "*.png")):
    image = cv2.imread(image_path, cv2.IMREAD_COLOR)
    if image is None:
        # Fail loudly: silently skipping a test image would shift every
        # following prediction relative to the list of test file names.
        raise IOError("Could not read image: {}".format(image_path))
    image_resized = cv2.resize(image, (299, 299))
    image_segmented = segment_plant(image_resized)
    # Segmentation and sharpening keep the 299x299 size; no second resize needed.
    image_sharpened = sharpen_image(image_segmented)
    images_test.append(image_sharpened.flatten())

images_test = np.array(images_test)

### 3. No pre-processing

In [17]:
# Training set, variant 3: resize -> flatten, with no other pre-processing.
images_basic, labels_basic = [], []

for class_folder_name in os.listdir(train_dir):
    class_folder_path = os.path.join(train_dir, class_folder_name)

    for image_path in glob(os.path.join(class_folder_path, "*.png")):
        image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        if image is None:
            # cv2.imread returns None for unreadable files instead of raising.
            raise IOError("Could not read image: {}".format(image_path))
        image_resized = cv2.resize(image, (299, 299))
        images_basic.append(image_resized.flatten())
        labels_basic.append(class_folder_name)

# One row per image, one label (the class folder name) per row.
images_basic = np.array(images_basic)
labels_basic = np.array(labels_basic)

In [18]:
np.unique(labels_basic)

In [51]:
# Test set, variant 3: resize -> flatten, with no other pre-processing.
images_test_basic = []

for image_path in glob(os.path.join(test_dir, "*.png")):
    image = cv2.imread(image_path, cv2.IMREAD_COLOR)
    if image is None:
        # Fail loudly: silently skipping a test image would shift every
        # following prediction relative to the list of test file names.
        raise IOError("Could not read image: {}".format(image_path))
    image_resized = cv2.resize(image, (299, 299))
    images_test_basic.append(image_resized.flatten())

images_test_basic = np.array(images_test_basic)

## Label Pre-Processing

In [10]:
# Give every distinct text label an integer id: its position in the sorted
# array returned by np.unique. The ids are used for the prediction scores.
unique_text_labels = np.unique(labels)
text_label_to_num_label_dict = dict(zip(unique_text_labels, range(len(unique_text_labels))))
text_label_to_num_label_dict

In [11]:
# Inverse lookup: numerical id -> text label.
num_label_to_text_label_dict = dict(
    zip(text_label_to_num_label_dict.values(), text_label_to_num_label_dict.keys())
)
num_label_to_text_label_dict

In [None]:
# Numeric ground-truth ids for the gray-scale training images, in loading order.
truth_labels_gray = np.array(list(map(text_label_to_num_label_dict.__getitem__, labels_gray)))
truth_labels_gray

In [12]:
# Numeric ground-truth ids for the segmented training images, in loading order.
truth_labels = np.array(list(map(text_label_to_num_label_dict.__getitem__, labels)))
truth_labels

In [23]:
# Numeric ground-truth ids for the unprocessed training images, in loading order.
truth_labels_basic = np.array(list(map(text_label_to_num_label_dict.__getitem__, labels_basic)))
truth_labels_basic

In [24]:
# All three loaders walk the same folders, so their numeric labels should be
# identical. np.array_equal also copes with arrays of different length, where
# an element-wise `==` followed by `.all()` would not give a clean answer.
if np.array_equal(truth_labels_gray, truth_labels) and np.array_equal(truth_labels_basic, truth_labels):
    print("Same!")
else:
    print("WARNING: the label arrays of the three pre-processing variants differ!")

## PCA
##### Code reference: 
https://www.kaggle.com/code/tomras/sentiment-analysis-of-tweets-using-pca-and-ml/notebook

### 1. Segmented Images (with sharpening)

In [None]:
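# # image size: 45, 45, 3 (stale: the loading cells now resize to 299x299, i.e. 299*299*3 = 268203 values per image, so the 6075-column reshape below no longer matches)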
# image_data = images.flatten().reshape(4750, 6075)

# # 2 principal components
# pca = PCA(n_components = 2)
# pca_data = pca.fit_transform(image_data)

# pca_data.shape

In [112]:
# df_pca_data = pd.DataFrame(pca_data)
# df_labels = pd.DataFrame(truth_labels)
# df_pca_data = pd.concat([df_pca_data, df_labels], axis=1, ignore_index=True)
# df_pca_data.columns = ['pca_1', 'pca_2', 'target']
# df_pca_data.describe(include='all')

In [113]:
# df_text_labels = pd.DataFrame(labels)
# df_pca_data = pd.concat([df_pca_data, df_text_labels], axis=1, ignore_index=True)
# df_pca_data.columns = ['pca_1', 'pca_2', 'target','text']

In [114]:
# plt.figure(figsize=(16,10))
# sns.scatterplot(
#     x="pca_1", y="pca_2",
#     hue="text",
#     data=df_pca_data,
#     legend="full",
#     palette='colorblind'
# )

### 2. No pre-processing

In [None]:
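# # image size: 45, 45, 3 (stale: the loading cells now resize to 299x299, i.e. 299*299*3 = 268203 values per image, so the 6075-column reshape below no longer matches)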
# image_data = images_basic.flatten().reshape(4750, 6075)

# # 2 principal components
# pca = PCA(n_components = 2)
# pca_data = pca.fit_transform(image_data)

# pca_data.shape

In [116]:
# df_pca_data = pd.DataFrame(pca_data)
# df_labels = pd.DataFrame(truth_labels)
# df_pca_data = pd.concat([df_pca_data, df_labels], axis=1, ignore_index=True)
# df_pca_data.columns = ['pca_1', 'pca_2', 'target']
# df_pca_data.describe(include='all')

In [117]:
# df_text_labels = pd.DataFrame(labels)
# df_pca_data = pd.concat([df_pca_data, df_text_labels], axis=1, ignore_index=True)
# df_pca_data.columns = ['pca_1', 'pca_2', 'target','text']

In [118]:
# plt.figure(figsize=(16,10))
# sns.scatterplot(
#     x="pca_1", y="pca_2",
#     hue="text",
#     data=df_pca_data,
#     legend="full",
#     palette='colorblind'
# )

## K-Means

##### Sources: 
https://medium.com/@joel_34096/k-means-clustering-for-image-classification-a648f28bdc47

https://www.analyticsvidhya.com/blog/2021/06/k-means-clustering-and-transfer-learning-for-image-classification/

In [13]:
# source: https://medium.com/@joel_34096/k-means-clustering-for-image-classification-a648f28bdc47

def retrieve_info(cluster_labels, y_train):
    """
    Associate each cluster id with the most frequent true label among its members.

    Parameters
    ----------
    cluster_labels : sequence of int
        Cluster id assigned to every sample (e.g. ``KMeans.labels_``).
    y_train : sequence of non-negative int
        True numeric label of every sample, aligned with ``cluster_labels``.

    Returns
    -------
    dict
        ``{cluster_id: majority_label}`` with one entry per cluster id that
        actually occurs in ``cluster_labels``.
    """
    # Accept plain lists as well as arrays; boolean masking needs ndarrays.
    cluster_labels = np.asarray(cluster_labels)
    y_train = np.asarray(y_train)

    reference_labels = {}
    # Iterate over the ids that really occur rather than range(k): if an id is
    # missing, range(k) would visit an empty cluster and argmax of an empty
    # bincount raises, while the largest ids would never be mapped.
    for cluster_id in np.unique(cluster_labels):
        member_labels = y_train[cluster_labels == cluster_id]
        reference_labels[int(cluster_id)] = np.bincount(member_labels).argmax()
    return reference_labels

In [14]:
# fit k-means with one cluster per distinct label
def kmeans(input_labels, input_images, random_state=42):
    """
    Fit a KMeans model with as many clusters as there are distinct labels.

    Parameters
    ----------
    input_labels : array-like
        Labels of the samples; only the number of distinct values is used.
    input_images : array-like
        Feature matrix the model is fitted on (one row per sample).
    random_state : int or None, default 42
        Seed for the random centroid initialisation. A fixed seed keeps the
        clustering, and the accuracy derived from it, reproducible across
        runs. Pass ``None`` for a non-deterministic fit.

    Returns
    -------
    The fitted ``KMeans`` model.
    """
    n_clusters = len(np.unique(input_labels))
    model_kmeans = KMeans(n_clusters=n_clusters, init='random', random_state=random_state)
    model_kmeans.fit(input_images)
    return model_kmeans

In [15]:
def map_kmeans_to_actual_labels(input_labels, reference_labels):
    """
    Translate cluster ids into labels through a lookup table.

    ``input_labels`` is an indexable sequence of cluster ids and
    ``reference_labels`` maps each cluster id to a label. Returns a list with
    the mapped label of every entry of ``input_labels``, in the same order.
    """
    return [
        reference_labels[input_labels[position]]
        for position in range(len(input_labels))
    ]

### 1. Gray scaling + segmentation (with sharpening)

In [28]:
# initialize kmeans and fit on images
model_kmeans_gray = kmeans(labels_gray, images_gray)

In [29]:
np.unique(model_kmeans_gray.labels_)

In [32]:
# find mapping for k means labels according to the labels loaded together
# with images_gray (the data this model was fitted on)
reference_labels_dict = retrieve_info(model_kmeans_gray.labels_, truth_labels_gray)
reference_labels_dict

In [33]:
# map the k means labels to the actual labels using reference_labels_dict
pred_labels = map_kmeans_to_actual_labels(model_kmeans_gray.labels_, reference_labels_dict)
# calc accuracy against the labels loaded together with images_gray
# (accuracy_score expects y_true first, then y_pred)
accuracy_score(truth_labels_gray, pred_labels)

### 2. Segmentation (with sharpening)

In [None]:
# initialize kmeans and fit on images
model_kmeans_seg = kmeans(labels, images)

In [None]:
np.unique(model_kmeans_seg.labels_)

In [None]:
# find mapping for k means labels according to truth_labels
reference_labels_dict = retrieve_info(model_kmeans_seg.labels_ , truth_labels)
reference_labels_dict

In [None]:
# map the k means labels to the actual labels using reference_labels_dict
pred_labels = map_kmeans_to_actual_labels(model_kmeans_seg.labels_, reference_labels_dict)
# calc accuracy (accuracy_score expects y_true first, then y_pred)
accuracy_score(truth_labels, pred_labels)

### 3. No pre-processing

In [38]:
# initialize kmeans and fit on images
model_kmeans_basic = kmeans(labels_basic, images_basic)

In [39]:
np.unique(model_kmeans_basic.labels_)

In [40]:
# find mapping for k means labels according to the labels loaded together
# with images_basic (the data this model was fitted on)
reference_labels_dict = retrieve_info(model_kmeans_basic.labels_, truth_labels_basic)
reference_labels_dict

In [41]:
# map the k means labels to the actual labels using reference_labels_dict
pred_labels = map_kmeans_to_actual_labels(model_kmeans_basic.labels_, reference_labels_dict)
# calc accuracy against the labels loaded together with images_basic
# (accuracy_score expects y_true first, then y_pred)
accuracy_score(truth_labels_basic, pred_labels)

## KNN
##### Sources:
https://www.kaggle.com/olaniyan/image-classification-using-knn

In [None]:
# fit a one-vs-rest k-nearest-neighbours classifier (default KNN settings)
def knn(input_labels, input_images):
    """
    Fit ``OneVsRestClassifier(KNeighborsClassifier())`` on the given data.

    ``input_images`` is the feature matrix and ``input_labels`` the matching
    targets. Note the argument order: labels first, images second.
    Returns the fitted classifier.
    """
    one_vs_rest = OneVsRestClassifier(estimator=KNeighborsClassifier())
    one_vs_rest.fit(X=input_images, y=input_labels)
    return one_vs_rest

### 1. Gray scaling + segmentation (with sharpening)

In [46]:
# initialize knn and fit on images
model_knn_gray = knn(labels_gray, images_gray)

In [52]:
predictions = model_knn_gray.predict(images_test_gray)

In [57]:
# save to df
# Take the file names from the same glob pattern that was used to load the
# test images, so names and predictions are enumerated and filtered the same
# way (os.listdir would also return any non-PNG entry in the folder).
test_file_names = [os.path.basename(path) for path in glob(os.path.join(test_dir, "*.png"))]
assert len(test_file_names) == len(predictions), "test file names and predictions are misaligned"
df = pd.DataFrame({'file': test_file_names, 'species': predictions})
df.to_csv('knn_gray_results.csv', index=False)

### 2. Segmentation (with sharpening)

In [None]:
# initialize knn and fit on images
model_knn_seg = knn(labels, images)

In [None]:
predictions = model_knn_seg.predict(images_test)

In [None]:
# save to df
# Take the file names from the same glob pattern that was used to load the
# test images, so names and predictions are enumerated and filtered the same
# way (os.listdir would also return any non-PNG entry in the folder).
test_file_names = [os.path.basename(path) for path in glob(os.path.join(test_dir, "*.png"))]
assert len(test_file_names) == len(predictions), "test file names and predictions are misaligned"
df = pd.DataFrame({'file': test_file_names, 'species': predictions})
df.to_csv('knn_seg_results.csv', index=False)

### 3. No pre-processing

In [48]:
# initialize knn and fit on images
model_knn_basic = knn(labels_basic, images_basic)

In [61]:
predictions = model_knn_basic.predict(images_test_basic)

In [62]:
# save to df
# Take the file names from the same glob pattern that was used to load the
# test images, so names and predictions are enumerated and filtered the same
# way (os.listdir would also return any non-PNG entry in the folder).
test_file_names = [os.path.basename(path) for path in glob(os.path.join(test_dir, "*.png"))]
assert len(test_file_names) == len(predictions), "test file names and predictions are misaligned"
df = pd.DataFrame({'file': test_file_names, 'species': predictions})
df.to_csv('knn_basic_results.csv', index=False)