## LeafSnap Data Analysis using KNN

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, top_k_accuracy_score

In [None]:
df = pd.read_csv('leafsnap-dataset-images.txt', sep='\\t')

species_counts = df['species'].value_counts()
species_to_keep = species_counts[species_counts >= 250].index
df = df[df['species'].isin(species_to_keep)].reset_index(drop=True)

# --- Encode Labels ---
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['species'])

# --- Parameters ---
IMG_SIZE = (64, 64)  # Small size for KNN feature extraction

# --- Feature Extraction Function ---
def extract_features(file_path):
    img = tf.io.read_file(file_path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, IMG_SIZE)
    img = img / 255.0
    img = tf.image.rgb_to_grayscale(img)  # optional: grayscale to simplify
    feature_vector = tf.reshape(img, [-1])  # Flatten
    return feature_vector.numpy()

# --- Build Feature and Label Arrays ---
features = []
labels = []

for path, label in zip(df['segmented_path'], encoded_labels):
    try:
        feature = extract_features(path)
        features.append(feature)
        labels.append(label)
    except Exception as e:
        print(f"Error processing {path}: {e}")

features = np.array(features)
labels = np.array(labels)

X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, stratify=labels, random_state=42
)


print(f"Species to keep: {species_to_keep.size}")
print(f"Feature shape: {features.shape}")


  df = pd.read_csv('leafsnap-dataset-images.txt', sep='\\t')


Species to keep: 9
Feature shape: (2733, 4096)


In [32]:
# --- Train KNN ---
knn = KNeighborsClassifier(n_neighbors=5, weights='distance')
knn.fit(X_train, y_train)


In [33]:
# --- Predict and Evaluate ---
y_pred = knn.predict(X_test)

acc = accuracy_score(y_test, y_pred)
top5_acc = top_k_accuracy_score(y_test, knn.predict_proba(X_test), k=5)

print(f"KNN Test Accuracy (Top-1): {acc:.4f}")
print(f"KNN Test Accuracy (Top-5): {top5_acc:.4f}")


KNN Test Accuracy (Top-1): 0.6033
KNN Test Accuracy (Top-5): 0.9232


In [36]:
import pandas as pd
import numpy as np
import tensorflow as tf
import cv2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, top_k_accuracy_score

# --- Load and Filter Dataset ---
df = pd.read_csv('leafsnap-dataset-images.txt', sep='\t')
species_counts = df['species'].value_counts()
species_to_keep = species_counts[species_counts >= 250].index
df = df[df['species'].isin(species_to_keep)].reset_index(drop=True)

# --- Encode Labels ---
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['species'])

# --- Parameters ---
IMG_SIZE = (224, 224)

# --- Feature Extraction Functions ---
def extract_features(file_path, bins=30):
    img = tf.io.read_file(file_path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, IMG_SIZE)
    img = tf.image.rgb_to_grayscale(img)
    img_np = img.numpy()
    img_np = (img_np * 255).astype(np.uint8)

    # Contours
    contours, _ = cv2.findContours(img_np, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return np.zeros(bins + 7)

    largest_contour = max(contours, key=cv2.contourArea)
    largest_contour = largest_contour.squeeze()

    if largest_contour.ndim != 2 or largest_contour.shape[0] < 5:
        return np.zeros(bins + 7)

    # Curvature Histogram
    dx = np.gradient(largest_contour[:, 0])
    dy = np.gradient(largest_contour[:, 1])
    angles = np.arctan2(dy, dx)
    curvature = np.gradient(angles)
    hist, _ = np.histogram(curvature, bins=bins, range=(-np.pi, np.pi), density=True)

    # Hu Moments
    hu = cv2.HuMoments(cv2.moments(largest_contour)).flatten()

    # Concatenate features
    full_feature = np.concatenate([hist, hu])
    return full_feature

# --- Build Feature and Label Arrays ---
features = []
labels = []

for path, label in zip(df['segmented_path'], encoded_labels):
    try:
        feature = extract_features(path)
        features.append(feature)
        labels.append(label)
    except Exception as e:
        print(f"Error processing {path}: {e}")

features = np.array(features)
labels = np.array(labels)

print(f"Feature shape before PCA: {features.shape}")

# --- Train/Test Split ---
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, stratify=labels, random_state=42
)

# --- Normalize Features ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Dimensionality Reduction (PCA) ---
n_components = min(X_train_scaled.shape[0], X_train_scaled.shape[1], 50)  # Adjust n_components dynamically
pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

print(f"Feature shape after PCA: {X_train_pca.shape}")

# --- Train KNN (Tuned Parameters) ---
knn = KNeighborsClassifier(n_neighbors=5, metric='cosine', weights='distance')
knn.fit(X_train_pca, y_train)

# --- Evaluate ---
y_pred = knn.predict(X_test_pca)
acc = accuracy_score(y_test, y_pred)
top5_acc = top_k_accuracy_score(y_test, knn.predict_proba(X_test_pca), k=5)

print(f"KNN Test Accuracy (Top-1): {acc:.4f}")
print(f"KNN Test Accuracy (Top-5): {top5_acc:.4f}")


Feature shape before PCA: (2733, 37)
Feature shape after PCA: (2186, 37)
KNN Test Accuracy (Top-1): 0.4662
KNN Test Accuracy (Top-5): 0.8428
