In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from skimage.io import imread
from skimage.transform import resize

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from xgboost import XGBClassifier

import joblib
import os


In [2]:
def load_data_from_directory(base_path, categories, image_size=(64, 64)):
    data, labels = [], []
    for label_index, category in enumerate(categories):
        category_path = os.path.join(base_path, category)
        for img_file in os.listdir(category_path):
            try:
                img_path = os.path.join(category_path, img_file)
                img = imread(img_path)
                img_resized = resize(img, image_size)
                data.append(img_resized.flatten())
                labels.append(label_index)
            except:
                print(f"Failed to load: {img_path}")
    return np.array(data), np.array(labels)


In [3]:
def get_models():
    return {
        "SVM": make_pipeline(StandardScaler(), SVC(probability=True)),
        "Random Forest": RandomForestClassifier(),
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Decision Tree": DecisionTreeClassifier(),
        "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
    }


In [None]:
dataset_path = 'dataset'
categories = os.listdir(os.path.join(dataset_path, 'train'))
print("Categories:", categories)

x_train, y_train = load_data_from_directory(os.path.join(dataset_path, 'train'), categories)
x_valid, y_valid = load_data_from_directory(os.path.join(dataset_path, 'valid'), categories)
x_test, y_test   = load_data_from_directory(os.path.join(dataset_path, 'test'), categories)

print("Train shape:", x_train.shape, "Labels:", y_train.shape)
print("Valid shape:", x_valid.shape, "Labels:", y_valid.shape)
print("Test shape:", x_test.shape, "Labels:", y_test.shape)


Categories: ['Bacterial diseases - Aeromoniasis', 'Bacterial gill disease', 'Bacterial Red disease', 'EUS', 'Fungal diseases Saprolegniasis', 'Healthy Fish', 'Parasitic diseases', 'Viral diseases White tail disease']


In [None]:
models = get_models()
accuracies = {}

for name, model in models.items():
    print(f"\n🧠 Training {name}...")
    model.fit(x_train, y_train)
    y_val_pred = model.predict(x_valid)
    acc = accuracy_score(y_valid, y_val_pred)
    accuracies[name] = acc
    print(f"✅ {name} Validation Accuracy: {acc * 100:.2f}%")
    print(classification_report(y_valid, y_val_pred, target_names=categories))


In [None]:
best_model_name = max(accuracies, key=accuracies.get)
best_model = models[best_model_name]

print(f"\n🏁 Final Evaluation using best model: {best_model_name}")
y_test_pred = best_model.predict(x_test)
test_acc = accuracy_score(y_test, y_test_pred)
print(f"✅ Test Accuracy: {test_acc * 100:.2f}%")
print(classification_report(y_test, y_test_pred, target_names=categories))
joblib.dump(best_model, 'fish_model.pkl')
print(f"Best model saved as fish_model.pkl")


In [None]:
plt.figure(figsize=(8, 5))
plt.bar(accuracies.keys(), [v * 100 for v in accuracies.values()], color='royalblue')
plt.title("Validation Accuracy Comparison")
plt.ylabel("Accuracy (%)")
plt.xticks(rotation=45)
plt.ylim(0, 100)
plt.grid(axis='y')
plt.tight_layout()
plt.show()