In [86]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import ast

# Load the dataset
data = pd.read_csv('leafsnap_data.csv')

hv = [d.replace("'", "") for d in data['hist_values']]
hv = [ast.literal_eval(d) for d in data['hist_values']]

X = hv
y = data['plant']

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [109]:
# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [110]:
# Create the k-nearest neighbors model
k = 6 # Set the value of k
knn = KNeighborsClassifier(n_neighbors=k)

# Train the model on the training set
knn.fit(X_train, y_train)

# Make predictions on the test set
y_pred = knn.predict(X_test)

# Evaluate the model performance using accuracy score
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.25


In [111]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,confusion_matrix, precision_score, recall_score
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler


def set_optimal_perimeters():
    # Define the hyperparameter grid for tuning
    # Define the range and step size
    start = 1
    stop = 100
    step = 50

    # Create the array using numpy's arange function
    arr = np.arange(start, stop + step, step)
    gamma = list(np.logspace(-3, 3, 30))
    param_grid = {
      'C':arr,
      'kernel': ['rbf'],
      'degree': [2],
      'gamma': ['scale', 'auto'] + gamma
      #['scale', 'auto'] +gamma
    }

    # param_grid = {
    #     'C': [101],
    #     'kernel': ['rbf'],
    #     'degree': [2],
    #     'gamma': [2.976351441631316]
    #     # ['scale', 'auto'] +gamma
    # }
    return param_grid


def svm_train(param_grid, X_train, X_test, y_train, y_test):
    # Define the SVM classifier
    svm = SVC()
    # Perform Grid Search Cross Validation
    grid_search = GridSearchCV(svm, param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    # Print the best hyperparameters and corresponding accuracy
    print("Best Hyperparameters: ", grid_search.best_params_)
    print("Best Accuracy: ", grid_search.best_score_)
    # Train SVM with the best hyperparameters on the entire training set
    best_svm = grid_search.best_estimator_
    best_svm.fit(X_train, y_train)
    # Evaluate the model on the test set
    accuracy = best_svm.score(X_test, y_test)
    # Predict labels for test data
    y_pred = best_svm.predict(X_test)
    return best_svm


param_grid = set_optimal_perimeters()
svm = svm_train(param_grid, X_train_scaled, X_test, y_train, y_test)



In [61]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,confusion_matrix, precision_score, recall_score
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler


def set_optimal_perimeters():
    # Define the hyperparameter grid for tuning
    # Define the range and step size
    start = 1
    stop = 100
    step = 50

    # Create the array using numpy's arange function
    arr = np.arange(start, stop + step, step)
    gamma = list(np.logspace(-3, 3, 30))
    param_grid = {
      'C':arr,
      'kernel': ['rbf'],
      'degree': [2],
      'gamma': ['scale', 'auto'] + gamma
      #['scale', 'auto'] +gamma
    }

    # param_grid = {
    #     'C': [101],
    #     'kernel': ['rbf'],
    #     'degree': [2],
    #     'gamma': [2.976351441631316]
    #     # ['scale', 'auto'] +gamma
    # }
    return param_grid


# Create a dataframe with the dependent and independent variables
def print_metrics(y_test, y_pred):
    # Evaluate the model on the test set
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')

    print('Accuracy:', accuracy)
    print('Precision:', precision)
    print('Recall:', recall)
    

def rf_train(X_train, X_test, y_train, y_test):
    # Define the NB classifier
    rf = RandomForestClassifier()
    # Define hyperparameter grid to search over
    param_grid = {
        'n_estimators': [10, 50, 100],  # Number of trees in the forest
        'max_depth': [None, 10, 20],  # Maximum depth of the trees
        'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
        'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
        'max_features': ['sqrt', 'log2']  # Number of features to consider for the best split
    }
    # Create GridSearchCV object with Random Forest Classifier and hyperparameter grid
    grid_search = GridSearchCV(rf, param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    # Get the best hyperparameters found by GridSearchCV
    best_params = grid_search.best_params_
    print("Best Parameters: ", best_params)
    print("Best Accuracy: ", grid_search.best_score_)
    # Train Random Forest Classifier with the best hyperparameters on the entire training data
    best_rf = RandomForestClassifier(**best_params)
    best_rf.fit(X_train, y_train)
    # Evaluate the model on the test set
    accuracy = best_rf.score(X_test, y_test)
    # Predict labels for test data
    y_pred = best_rf.predict(X_test)
    print_metrics(y_test, y_pred)
    return best_rf


rf = rf_train(X_train, X_test, y_train, y_test)



Best Parameters:  {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 100}
Best Accuracy:  0.7515151515151515
Accuracy: 0.7037037037037037
Precision: 0.7530864197530863
Recall: 0.7037037037037037


  _warn_prf(average, modifier, msg_start, len(result))


# Prepare folder for 1 image

In [40]:
import os
import random
import shutil

source_dir = "../../leafsnap/leafsnap-dataset/dataset/images/field"
target_dir = "../../leafsnap/leafsnap-dataset/dataset/1img_compiled"

if not os.path.exists(target_dir):
    os.makedirs(target_dir)

file_index = 0
for root, dirs, files in os.walk(source_dir):
    for file in files:
        if file.lower().endswith(".jpg") or file.lower().endswith(".png"): # change file extensions as needed
            file_path = os.path.join(root, file)
            target_file_path = os.path.join(target_dir, str(file_index) + os.path.splitext(file)[-1])
            shutil.copyfile(file_path, target_file_path)
            file_index += 1
            print("Copied file {} to {}".format(file_path, target_file_path))

            # Randomly copy only 1 file from each folder
            break

Copied file ../../leafsnap/leafsnap-dataset/dataset/images/field\abies_concolor\12995307070714.jpg to ../../leafsnap/leafsnap-dataset/dataset/1img_compiled\0.jpg
Copied file ../../leafsnap/leafsnap-dataset/dataset/images/field\abies_nordmanniana\13291651120806.jpg to ../../leafsnap/leafsnap-dataset/dataset/1img_compiled\1.jpg
Copied file ../../leafsnap/leafsnap-dataset/dataset/images/field\acer_campestre\13291732970169.jpg to ../../leafsnap/leafsnap-dataset/dataset/1img_compiled\2.jpg
Copied file ../../leafsnap/leafsnap-dataset/dataset/images/field\acer_ginnala\13291762510376.jpg to ../../leafsnap/leafsnap-dataset/dataset/1img_compiled\3.jpg
Copied file ../../leafsnap/leafsnap-dataset/dataset/images/field\acer_griseum\13001148650053.jpg to ../../leafsnap/leafsnap-dataset/dataset/1img_compiled\4.jpg
Copied file ../../leafsnap/leafsnap-dataset/dataset/images/field\acer_negundo\13001151160340.jpg to ../../leafsnap/leafsnap-dataset/dataset/1img_compiled\5.jpg
Copied file ../../leafsnap/lea

# NEAREST NEIGHBORS

In [1]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

import pandas as pd
import numpy as np
import ast

d:\anaconda3\envs\tf2.4\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
d:\anaconda3\envs\tf2.4\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll


In [33]:
# Load the dataset
data = pd.read_csv('leafsnap_data.csv')

hv = [d.replace("'", "") for d in data['hist_values']]
hv = [ast.literal_eval(d) for d in data['hist_values']]

In [34]:
from sklearn.preprocessing import LabelEncoder

categories = data['plant']

# Initialize the LabelEncoder
le = LabelEncoder()

# Fit the LabelEncoder to the categories
le.fit(categories)

# Transform the categories into integer labels
species_labels = le.transform(categories)

In [73]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

# # Load the features extracted from the database of lab and field images
# database_features = np.load("path/to/database_features.npy")

# Create a nearest neighbors object with n_neighbors set to 25
nbrs = NearestNeighbors(n_neighbors=25, algorithm='auto').fit(hv)

In [74]:
input_hist = np.array([[ 6,  5,  9, 10, 12, 11, 11,  7, 12, 14, 17, 14, 15, 15, 22, 28, 32,
                        39, 37, 34, 22, 17, 19, 14, 24, 19, 16, 13,  2, 11],
                       [ 7,  7, 15, 19, 17, 15, 17, 23, 28, 17, 19, 18, 18, 23, 28, 23, 30,
                        22, 15, 16, 14, 13, 16, 10, 13, 12, 13,  8, 16, 15],
                       [ 5, 12, 14, 15, 21, 31, 30, 17, 20, 23, 29, 30, 30, 24, 23, 27, 22,
                        21, 11, 18, 19, 11,  7,  5,  3,  6,  3,  6,  6, 18],
                       [ 6, 14, 13, 22, 17, 20, 29, 30, 29, 28, 34, 37, 23, 28, 23, 19, 33,
                        21, 18,  8,  5,  1,  7,  5,  3,  6,  2,  4,  8, 14],
                       [ 8,  8, 21, 24, 21, 20, 40, 16, 31, 37, 36, 30, 23, 19, 22, 27, 32,
                        10, 11, 15,  7,  2,  5,  4,  4,  2,  5,  3, 10, 14],
                       [ 4, 12, 20, 16, 24, 23, 38, 25, 20, 20, 37, 28, 27, 23, 20, 27, 25,
                        27, 12,  9, 14,  5,  6,  7,  6,  4,  4,  9,  5, 10],
                       [ 4,  8, 19, 18, 15, 25, 24, 33, 23, 16, 21, 22, 34, 21, 31, 26, 19,
                        18, 18, 19, 19, 18, 11, 11,  3,  5,  5,  7,  3, 11],
                       [ 6, 16, 15, 19, 18, 16, 25, 26, 18, 15, 20, 18, 13, 26, 29, 26, 18,
                        27, 18, 20, 14, 13, 17, 25, 11,  9,  7,  8,  3, 11],
                       [ 3,  7,  7, 17, 20, 18, 16, 18, 28, 16, 10, 17, 15, 18, 17, 29, 19,
                        16, 25, 24, 27, 17, 16, 17, 20, 13, 20, 16,  9, 12], 
                       [ 2,  6,  3,  4,  8, 18, 13, 20, 20, 14, 28, 10,  9, 12, 21, 15, 17,
                        12, 14, 24, 26, 15, 30, 15, 23, 19, 24, 22, 33, 30]])




In [75]:
# Use the HoCS feature vector extracted from the input image as the query
query = input_hist.reshape(1, -1)

In [76]:
# Perform the nearest neighbors search
distances, indices = nbrs.kneighbors(query)

In [77]:
indices

array([[ 145,  149,  146,  105,  148, 2279, 2280, 2025, 2317,  118, 2022,
        2314, 2313, 2056, 2068, 2026, 2308, 2353, 2318,  119,  150, 2063,
        2310,  106, 2281]], dtype=int64)

In [78]:

import matplotlib.pyplot as plt
from PIL import Image

# Show the top 25 matches to the user
for i in range(len(indices[0])):
    index = indices[0][i]
    distance = distances[0][i]
    species = species_labels[index]
    # image_path = "../../leafsnap/leafsnap-dataset/dataset/1img_compiled/" + str(index) + ".jpg" # Assuming the images are named as their corresponding index in the database

    # # Load and show the matched image
    # img = Image.open(image_path)
    # plt.imshow(img)
    # plt.show()

    # Print the match information
    print(f"Match {i+1} Species: {index} Distance: {distance}")
# Assuming that you have already loaded the database of features and the list of species labels
# as numpy arrays called database_features and species_labels, respectively

Match 1 Species: 145 Distance: 0.0
Match 2 Species: 149 Distance: 124.04837765968566
Match 3 Species: 146 Distance: 134.929611279363
Match 4 Species: 105 Distance: 140.19272449025306
Match 5 Species: 148 Distance: 140.5987197665754
Match 6 Species: 2279 Distance: 151.0562809021856
Match 7 Species: 2280 Distance: 151.30763364747992
Match 8 Species: 2025 Distance: 151.4661678395542
Match 9 Species: 2317 Distance: 152.44015219095002
Match 10 Species: 118 Distance: 153.28405005087777
Match 11 Species: 2022 Distance: 153.44705927452634
Match 12 Species: 2314 Distance: 153.6619666670969
Match 13 Species: 2313 Distance: 154.14279094398154
Match 14 Species: 2056 Distance: 155.6277610196844
Match 15 Species: 2068 Distance: 156.4736399525492
Match 16 Species: 2026 Distance: 157.08596372687154
Match 17 Species: 2308 Distance: 157.22595205626837
Match 18 Species: 2353 Distance: 157.30861387730806
Match 19 Species: 2318 Distance: 157.36581585592216
Match 20 Species: 119 Distance: 158.70097668256489

In [79]:
from sklearn.model_selection import LeaveOneOut

# Perform leave-one-image-out species identification using only the field images as queries
loo = LeaveOneOut()
correct_ranks = []

for train_index, test_index in loo.split(hv):
    # Only use field images as queries
    # if "field" not in image_paths[test_index[0]]:
    #     continue

    # Extract the HoCS feature vector from the query image
    query = hv[test_index[0]]
    query = np.array(query)

    # Perform the nearest neighbors search
    distances, indices = nbrs.kneighbors(query.reshape(1, -1))

    # Get the species labels of the matches
    match_species = species_labels[indices[0]]

    # Get the true species label of the query image
    true_species = species_labels[test_index[0]]

    # Calculate the species match rank
    rank = np.where(match_species == true_species)[0][0] + 1

    correct_ranks.append(rank)

# Calculate the recognition rate as a function of the maximum species match rank
recognition_rates = []
for i in range(1, 26):
    recognition_rate = sum([1 for rank in correct_ranks if rank <= i]) / len(correct_ranks)
    recognition_rates.append(recognition_rate)

In [80]:
recognition_rates

[0.9615912208504801,
 0.9931412894375857,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0]