In [None]:
import pandas as pd
import numpy as np
import cv2
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import os

# Read the training and test tables from CSV files on the local disk.
# Each path is kept in its own variable next to the DataFrame parsed from it.
train_csv_path = r'C:\Users\danica\Downloads\plant-pathology-2020-fgvc7\train.csv'
train_data = pd.read_csv(train_csv_path)

test_csv_path = r'C:\Users\danica\Downloads\plant-pathology-2020-fgvc7\test.csv'
test_data = pd.read_csv(test_csv_path)

# Define a function to extract features from images
def extract_features(image_path):
    """Read an image file and turn it into a flat pixel vector.

    The image is loaded with ``cv2.imread``, resized to 128x128 and then
    flattened into a one-dimensional array.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The flattened pixel array of the resized image.

    Raises:
        FileNotFoundError: If ``cv2.imread`` returns ``None`` for the path.
    """
    pixels = cv2.imread(image_path)
    # imread signals failure by returning None instead of raising.
    if pixels is None:
        raise FileNotFoundError(f"Image not found at {image_path}")
    return cv2.resize(pixels, (128, 128)).flatten()

# Define a function to load the dataset
def load_dataset(image_dir, plant_data, is_train=True):
    """Build the feature matrix (and optionally labels) for a table of images.

    Every value of ``plant_data['image_id']`` is resolved to
    ``<image_dir>/<image_id>.jpg`` and converted to a feature vector with
    ``extract_features``.

    Args:
        image_dir: Directory that contains the ``.jpg`` image files.
        plant_data: DataFrame with an ``image_id`` column. When ``is_train``
            is true it must also contain the columns ``healthy``,
            ``multiple_diseases``, ``rust`` and ``scab``.
        is_train: When true, labels are derived and returned together with
            the features; when false only the features are returned.

    Returns:
        ``(X, y)`` if ``is_train`` is true, otherwise just ``X``. ``X`` holds
        one feature vector per row of ``plant_data``. ``y`` holds, per row,
        the position (0-3) of the largest value among the label columns in
        the order healthy, multiple_diseases, rust, scab; ties resolve to
        the first such column.

    Raises:
        FileNotFoundError: If ``image_dir`` is not an existing directory, or
            propagated from ``extract_features`` for an unreadable image.
        KeyError: If a required column is missing from ``plant_data``.
    """
    # isdir rather than exists: a path that points at a regular file must be
    # rejected here, not later with a misleading per-image error.
    if not os.path.isdir(image_dir):
        raise FileNotFoundError(f"The directory {image_dir} does not exist")

    labels = None
    if is_train:
        # One vectorised argmax over the label columns. Doing this before any
        # image is read makes a missing label column fail fast.
        label_columns = ['healthy', 'multiple_diseases', 'rust', 'scab']
        labels = plant_data[label_columns].to_numpy().argmax(axis=1)

    # Iterate the id column directly; iterrows() would build a Series per row
    # only to read a single field from it.
    features = [
        extract_features(os.path.join(image_dir, image_id + '.jpg'))
        for image_id in plant_data['image_id']
    ]
    X = np.array(features)

    if is_train:
        return X, labels
    return X

# Define a function to train the SVM model
def train_svm(X_train, y_train):
    """Fit a linear-kernel support vector classifier on the given data.

    Args:
        X_train: Training feature matrix handed to ``SVC.fit``.
        y_train: Target labels handed to ``SVC.fit``.

    Returns:
        The fitted ``SVC`` instance (``kernel='linear'``, ``C=1.0``,
        ``random_state=42``).
    """
    classifier = SVC(kernel='linear', C=1.0, random_state=42)
    # fit() returns the estimator itself, so it can be returned directly.
    return classifier.fit(X_train, y_train)

# Define a function to evaluate the model
def evaluate_model(model, X_test, y_test):
    """Print a classification report and the accuracy of ``model``.

    Args:
        model: Object exposing ``predict``; it is called once on ``X_test``.
        X_test: Feature matrix to predict on.
        y_test: True labels the predictions are compared against.

    Returns:
        None. The results are only written to standard output.
    """
    predictions = model.predict(X_test)
    sections = (
        ("Classification Report:", classification_report),
        ("Accuracy Score:", accuracy_score),
    )
    # Each heading is printed before its metric is computed, so the output
    # order matches a plain sequence of print calls.
    for heading, metric in sections:
        print(heading)
        print(metric(y_test, predictions))

# Define a function to predict the label of a new image
def predict_image(model, image_path):
    """Classify a single image file with a fitted model.

    Args:
        model: Object exposing ``predict``.
        image_path: Path of the image to classify; it is passed to
            ``extract_features``.

    Returns:
        The first (and only) element of the model's prediction for the image.
    """
    # Shape the feature vector as a single-row matrix and divide by 255.0
    # before handing it to the model.
    sample = extract_features(image_path).reshape(1, -1) / 255.0
    return model.predict(sample)[0]

# Folder that both load_dataset calls below read their images from
image_dir = r'C:\Users\danica\Downloads\plant-pathology-2020-fgvc7\images'

# Build the labelled feature matrix from the training table
X_train, y_train = load_dataset(image_dir, train_data)

# Report which label values occur in the full training set
unique_labels = np.unique(y_train)
print("Unique labels in the training dataset:", unique_labels)

# Hold out 20% of the labelled samples for validation (fixed seed)
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Show which label values ended up in each partition
unique_labels_train = np.unique(y_train_split)
unique_labels_val = np.unique(y_val)
print("Unique labels in the training split:", unique_labels_train)
print("Unique labels in the validation split:", unique_labels_val)

# Divide the raw features by 255.0, the same scaling predict_image applies
X_train_split = X_train_split / 255.0
X_val = X_val / 255.0

# Fit on the training partition, then report metrics on the held-out part
svm_model = train_svm(X_train_split, y_train_split)
evaluate_model(svm_model, X_val, y_val)

# Load the unlabelled test images and scale them like the training features
X_test = load_dataset(image_dir, test_data, is_train=False) / 255.0

# Predict a label for every test image
y_test_pred = svm_model.predict(X_test)

# Attach the predictions to the test table and write it out as CSV
output_path = r'C:\Users\danica\Downloads\plant-pathology-2020-fgvc7\test_with_predictions.csv'
test_data['predicted_label'] = y_test_pred
test_data.to_csv(output_path, index=False)

print(f"Predictions for the test dataset have been saved to '{output_path}'.")