In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import tensorflow as tf
from tensorflow.keras.preprocessing import image

# Define the directory containing the training data
train_dir = '/kaggle/input/vegetable-image-dataset/Vegetable Images/train'

# Initialize lists to store images and their corresponding labels
images = []
labels = []

# Iterate through each class folder. os.listdir() returns entries in
# arbitrary order, so sort them to keep the sample order reproducible.
for class_name in sorted(os.listdir(train_dir)):
    class_dir = os.path.join(train_dir, class_name)

    # Skip anything that is not a class folder (a stray file would make
    # the inner os.listdir() call fail).
    if not os.path.isdir(class_dir):
        continue

    # Iterate through each image file in the class folder
    for img_name in sorted(os.listdir(class_dir)):
        img_path = os.path.join(class_dir, img_name)

        # Load the image resized to 224x224 and convert it to a tensor
        img = image.load_img(img_path, target_size=(224, 224))
        img_array = image.img_to_array(img)
        img_tensor = tf.convert_to_tensor(img_array)

        # The folder name is used as the class label
        images.append(img_tensor)
        labels.append(class_name)

# Convert the lists to TensorFlow tensors
images_tensor = tf.stack(images)
labels_tensor = tf.constant(labels)

# Display the shape of the tensors
print("Images tensor shape:", images_tensor.shape)
print("Labels tensor shape:", labels_tensor.shape)


2024-05-10 20:15:05.714748: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-10 20:15:05.714953: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-10 20:15:05.862420: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Images tensor shape: (15000, 224, 224, 3)
Labels tensor shape: (15000,)


In [None]:
import pickle
import cv2
from sklearn.cluster import KMeans

# Single module-level SIFT object; getSIFT() calls sift.detectAndCompute() on it.
sift = cv2.SIFT_create()

In [None]:
import numpy as np

def getSIFT(img):
    '''
        @description Get the SIFT descriptors of the input image
        @param img -> RGB image of shape (H, W, 3); either an object exposing
                      .numpy() (e.g. an eager tensor) or anything np.asarray
                      accepts. Values are cast to uint8, so they are expected
                      to lie in [0, 255].
        @return descriptor -> np.array n x 128 (n == 0 when no keypoint is found)
    '''
    # Accept tensors as well as plain arrays
    pixels = img.numpy() if hasattr(img, "numpy") else np.asarray(img)

    # Convert the image to unsigned 8-bit integer format
    img_uint8 = pixels.astype(np.uint8)

    # Convert the image to grayscale
    gray_image = cv2.cvtColor(img_uint8, cv2.COLOR_RGB2GRAY)

    _, des = sift.detectAndCompute(gray_image, None)
    if des is None:
        # Keep the empty result in float32: a float64 placeholder would make
        # np.vstack() upcast the whole stacked descriptor matrix.
        return np.empty((0, 128), dtype=np.float32)
    return des
# Extract the SIFT descriptors of every training image (one array per image)
descriptors_list = [getSIFT(train_image) for train_image in images_tensor]

# Stack the per-image descriptors row-wise into a single NumPy array
descriptors_array = np.vstack(descriptors_list)

# Report how many descriptors were collected in total
print("Shape of descriptors array:", descriptors_array.shape)

Shape of descriptors array: (5735598, 128)


In [None]:
from sklearn.cluster import MiniBatchKMeans
def create_codebook(features, num_clusters, batch_size):
    """Fit a MiniBatchKMeans model on the given descriptors.

    Args:
        features: descriptor matrix passed to ``fit``.
        num_clusters: number of clusters (``n_clusters``).
        batch_size: mini-batch size (``batch_size``).

    Returns:
        The fitted MiniBatchKMeans estimator (seeded with ``random_state=0``).
    """
    codebook_model = MiniBatchKMeans(
        n_clusters=num_clusters,
        batch_size=batch_size,
        random_state=0,
    )
    # fit() returns the fitted estimator itself
    return codebook_model.fit(features)
# Number of clusters for the codebook
num_clusters = 50
# Batch size for MiniBatchKMeans
batch_size = 1000

# Fit the codebook on the stacked training descriptors
codebook = create_codebook(
    features=descriptors_array,
    num_clusters=num_clusters,
    batch_size=batch_size,
)



In [None]:
import numpy as np

def compute_bovw_representation(features, codebook):
    """Encode each image as a normalized histogram of cluster assignments.

    Args:
        features: iterable of per-image descriptor arrays.
        codebook: object exposing ``n_clusters`` and ``predict``.

    Returns:
        A list with one array of length ``codebook.n_clusters`` per image.
        Images without descriptors get an all-zero array; all others get
        cluster frequencies that sum to 1.
    """
    vocab_size = codebook.n_clusters

    def _encode(descriptors):
        # No descriptors detected: fall back to an all-zero histogram
        if not len(descriptors) > 0:
            return np.zeros(vocab_size)
        # Count how many descriptors fall into each cluster, then normalize
        words = codebook.predict(descriptors)
        counts = np.bincount(words, minlength=vocab_size)
        return counts / np.sum(counts)

    return [_encode(image_descriptors) for image_descriptors in features]

# Encode every training image with the fitted codebook
bovw_representation = compute_bovw_representation(
    features=descriptors_list,
    codebook=codebook,
)


In [None]:
from sklearn.neighbors import KNeighborsClassifier

def train_knn_classifier(train_bovw_features, train_labels):
    """Fit a brute-force 5-nearest-neighbour classifier.

    Args:
        train_bovw_features: training feature vectors.
        train_labels: labels aligned with ``train_bovw_features``.

    Returns:
        The fitted KNeighborsClassifier.
    """
    knn_model = KNeighborsClassifier(n_neighbors=5, algorithm='brute')
    # fit() returns the fitted estimator itself
    return knn_model.fit(train_bovw_features, train_labels)
# NumPy copy of the label tensor.
# NOTE(review): not referenced by any other visible cell - confirm it is still needed.
label_encoder_np = labels_tensor.numpy()

# Train the k-NN classifier on the BoVW histograms and the Python list of labels
knn_classifier = train_knn_classifier(
    train_bovw_features=bovw_representation,
    train_labels=labels,
)

In [None]:
# Drop the stacked training-image tensor (raises NameError if this cell is run twice).
# NOTE(review): the `images` list built while loading still references the
# per-image tensors - confirm whether it should be released as well.
del images_tensor


In [None]:
# Drop the stacked training descriptor matrix (raises NameError if this cell is run twice).
# NOTE(review): `descriptors_list` still references the per-image descriptor arrays.
del descriptors_array

In [None]:
# Define the directory containing the test data
test_dir = '/kaggle/input/vegetable-image-dataset/Vegetable Images/test'

# Initialize lists to store test images and their corresponding labels
test_images = []
test_labels = []

# Iterate through each class folder in the test directory. os.listdir()
# returns entries in arbitrary order, so sort them to keep the sample order
# reproducible.
for class_name in sorted(os.listdir(test_dir)):
    class_dir = os.path.join(test_dir, class_name)

    # Skip anything that is not a class folder (a stray file would make
    # the inner os.listdir() call fail).
    if not os.path.isdir(class_dir):
        continue

    # Iterate through each image file in the class folder
    for img_name in sorted(os.listdir(class_dir)):
        img_path = os.path.join(class_dir, img_name)

        # Load the test image resized to 224x224 and convert it to a tensor
        test_img = image.load_img(img_path, target_size=(224, 224))
        test_img_array = image.img_to_array(test_img)
        test_img_tensor = tf.convert_to_tensor(test_img_array)

        # The folder name is used as the class label
        test_images.append(test_img_tensor)
        test_labels.append(class_name)

# Convert the lists of test images and labels to TensorFlow tensors
test_images_tensor = tf.stack(test_images)
test_labels_tensor = tf.constant(test_labels)

# Display the shape of the test tensors
print("Test Images tensor shape:", test_images_tensor.shape)
print("Test Labels tensor shape:", test_labels_tensor.shape)

Test Images tensor shape: (3000, 224, 224, 3)
Test Labels tensor shape: (3000,)


In [None]:
# Extract the SIFT descriptors of every test image (one array per image)
test_descriptors_list = [getSIFT(test_image) for test_image in test_images_tensor]

# Stack the per-image descriptors row-wise into a single NumPy array
test_descriptors_array = np.vstack(test_descriptors_list)

# Report how many descriptors were collected in total
print("Shape of descriptors array:", test_descriptors_array.shape)

Shape of descriptors array: (1173792, 128)


In [None]:
from sklearn.metrics import accuracy_score, classification_report

def evaluate_classifier(classifier, test_bovw_features, train_labels, test_labels):
    """Evaluate a fitted nearest-neighbour classifier by majority vote.

    For every test sample the nearest training neighbours are looked up with
    ``classifier.kneighbors`` and the most frequent neighbour label is taken
    as the prediction. Ties go to the smallest label in sorted order.

    Args:
        classifier: fitted estimator exposing ``kneighbors``.
        test_bovw_features: feature vectors of the test samples.
        train_labels: labels of the training samples, in the order used to fit
            ``classifier``. Any sequence works (list, array), and labels may
            be strings or integers.
        test_labels: ground-truth labels aligned with ``test_bovw_features``.

    Returns:
        ``(accuracy, report)`` as produced by ``accuracy_score`` and
        ``classification_report``.
    """
    # Array form is required for fancy indexing with the neighbour indices;
    # a plain Python list would raise a TypeError.
    train_labels = np.asarray(train_labels)

    # Find nearest neighbors for test data
    _, indices = classifier.kneighbors(test_bovw_features)

    kept_test_labels = []
    predictions = []
    for true_label, neighbors in zip(test_labels, indices):
        # Skip samples for which no neighbour was returned
        if len(neighbors) == 0:
            continue
        # np.unique also works for string labels, unlike np.bincount
        values, counts = np.unique(train_labels[neighbors], return_counts=True)
        predictions.append(values[counts.argmax()])
        kept_test_labels.append(true_label)

    # Calculate accuracy and generate a classification report
    accuracy = accuracy_score(kept_test_labels, predictions)
    report = classification_report(kept_test_labels, predictions)

    return accuracy, report


In [None]:
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Encode the test images with the codebook fitted on the training descriptors.
# test_bovw_features must be defined here so the cell runs on a fresh kernel.
test_bovw_features = compute_bovw_representation(test_descriptors_list, codebook)

# Initialize the Naive Bayes classifier
nb_classifier = GaussianNB()

# Train the classifier on the training BoVW histograms
nb_classifier.fit(bovw_representation, labels)

# Make predictions on the test set
predictions = nb_classifier.predict(test_bovw_features)

# Evaluate the classifier
accuracy = accuracy_score(test_labels, predictions)
report = classification_report(test_labels, predictions)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.5103333333333333
Classification Report:
               precision    recall  f1-score   support

        Bean       0.59      0.24      0.34       200
Bitter_Gourd       0.60      0.70      0.65       200
Bottle_Gourd       0.58      0.49      0.53       200
     Brinjal       0.38      0.28      0.32       200
    Broccoli       0.60      0.78      0.68       200
     Cabbage       0.31      0.61      0.41       200
    Capsicum       0.64      0.49      0.56       200
      Carrot       0.34      0.19      0.24       200
 Cauliflower       0.69      0.80      0.74       200
    Cucumber       0.45      0.58      0.51       200
      Papaya       0.44      0.50      0.47       200
      Potato       0.45      0.46      0.46       200
     Pumpkin       0.52      0.81      0.63       200
      Radish       0.64      0.37      0.47       200
      Tomato       0.67      0.35      0.46       200

    accuracy                           0.51      3000
   macro avg       0.53    

In [None]:
from sklearn.linear_model import LogisticRegression

def train_logistic_regression(train_bovw_features, train_labels):
    """Fit a LogisticRegression classifier (``n_jobs=1``, ``max_iter=3000``).

    Args:
        train_bovw_features: training feature vectors.
        train_labels: labels aligned with ``train_bovw_features``.

    Returns:
        The fitted LogisticRegression estimator.
    """
    model = LogisticRegression(n_jobs=1, max_iter=3000)
    # fit() returns the fitted estimator itself
    return model.fit(train_bovw_features, train_labels)

# Train logistic regression on the training BoVW histograms
lr_classifier = train_logistic_regression(
    train_bovw_features=bovw_representation,
    train_labels=labels,
)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

def evaluate_classifier2(classifier, test_bovw_features, test_labels):
    """Score a fitted classifier on the test set.

    Args:
        classifier: fitted estimator exposing ``predict``.
        test_bovw_features: feature vectors of the test samples.
        test_labels: ground-truth labels aligned with ``test_bovw_features``.

    Returns:
        ``(accuracy, report)`` as produced by ``accuracy_score`` and
        ``classification_report``.
    """
    predicted = classifier.predict(test_bovw_features)
    return (
        accuracy_score(test_labels, predicted),
        classification_report(test_labels, predicted),
    )

In [None]:
# Score the logistic regression model on the test BoVW histograms
accuracy, report = evaluate_classifier2(
    classifier=lr_classifier,
    test_bovw_features=test_bovw_features,
    test_labels=test_labels,
)

print("Accuracy:", accuracy)
print("Classification Report:", report, sep="\n")

Accuracy: 0.5346666666666666
Classification Report:
              precision    recall  f1-score   support

        Bean       0.67      0.49      0.57       200
Bitter_Gourd       0.58      0.71      0.64       200
Bottle_Gourd       0.65      0.34      0.45       200
     Brinjal       0.42      0.13      0.20       200
    Broccoli       0.49      0.80      0.61       200
     Cabbage       0.47      0.45      0.46       200
    Capsicum       0.54      0.45      0.49       200
      Carrot       0.50      0.41      0.45       200
 Cauliflower       0.55      0.84      0.67       200
    Cucumber       0.60      0.56      0.58       200
      Papaya       0.54      0.35      0.43       200
      Potato       0.42      0.46      0.44       200
     Pumpkin       0.54      0.71      0.62       200
      Radish       0.52      0.73      0.61       200
      Tomato       0.55      0.59      0.57       200

    accuracy                           0.53      3000
   macro avg       0.54     