# Fish Species Classification using Transfer Learning (MobileNetV2)

## 1. Importing Necessary Libraries
We will use TensorFlow/Keras for the model, pandas/NumPy for data handling, scikit-learn for splitting and metrics, and Matplotlib/Seaborn for visualization.

In [None]:
# Data Handling & Linear Algebra
import numpy as np
import pandas as pd
import os

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Manipulating Data and Model Building
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

# Implementation of Transfer Learning
from tensorflow.keras.applications import MobileNetV2

# Image generation and preprocessing
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Result and Performance Analysis
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Ignoring Warnings
import warnings
warnings.filterwarnings("ignore")


## 2. Data Loading and DataFrame Creation
We will iterate through the dataset directory, excluding the 'GT' (Ground Truth) folders to focus on the image data.

In [None]:
# Root directory that holds one sub-folder per fish class
DIR = '/kaggle/input/a-large-scale-fish-dataset/Fish_Dataset/Fish_Dataset'

# Top-level entries without a '.' in their name (i.e. folders, not files)
classes = [i for i in os.listdir(DIR) if '.' not in i]

# Parallel lists: label[i] is the folder name of the image stored at path[i]
label = []
path = []

for dirname, _, filenames in os.walk(DIR):
    folder = os.path.basename(dirname)
    # Skip Ground Truth folders (named like "Name Name GT"). Only the folder's
    # own name is inspected, so whitespace elsewhere in the path cannot
    # influence the check.
    if folder.split()[-1:] == ['GT']:
        continue
    for filename in filenames:
        # Compare the extension case-insensitively so '.PNG' files are kept
        if os.path.splitext(filename)[-1].lower() == '.png':
            label.append(folder)
            path.append(os.path.join(dirname, filename))

# Create the DataFrame (one row per image)
df = pd.DataFrame({'path': path, 'label': label})

# Inspect the data
print(df.head())
df.info()  # info() prints its own summary and returns None, so it is not wrapped in print()
print(df['label'].value_counts())

## 3. Exploratory Data Analysis (EDA)
Visualizing one sample image from each class to understand the dataset.

In [None]:
# One representative row per class: the first occurrence of each label,
# in the same order as df['label'].unique()
samples = df.drop_duplicates(subset='label')

# Size the grid from the number of classes instead of assuming exactly nine;
# with nine classes this is still a 3x3 grid on a 15x12 figure.
n_cols = 3
n_rows = max(1, -(-len(samples) // n_cols))  # ceiling division

plt.figure(figsize=(15, 4 * n_rows))

for idx, (img_path, unique_label) in enumerate(zip(samples['path'], samples['label'])):
    plt.subplot(n_rows, n_cols, idx + 1)
    plt.imshow(plt.imread(img_path))
    plt.title(unique_label)
    plt.axis('off')

plt.show()

## 4. Preprocessing and Splitting
Splitting the dataframe into Training (80%) and Testing (20%) sets.

In [None]:
# Hold out 20% of the images for testing. Stratifying on the label keeps the
# class proportions identical in both splits, so no class ends up under- or
# over-represented in the test set by chance.
train_df, test_df = train_test_split(
    df,
    train_size=0.8,
    random_state=42,
    stratify=df['label'],
)

print(f"Train Shape: {train_df.shape}")
print(f"Test Shape: {test_df.shape}")

## 5. Image Generators
Setting up `ImageDataGenerator` with `MobileNetV2` preprocessing. We also split the training data further to create a validation set.

In [None]:
# Fixed class list shared by every generator. Without it each
# flow_from_dataframe call infers its classes from its own dataframe, so a
# class missing from one split would silently shift that split's label indices.
# Sorting reproduces the order Keras uses when it infers the classes itself.
class_names = sorted(df['label'].unique())

# Initialize Generators
# The training generator reserves 20% of train_df for validation.
train_generator = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    validation_split=0.2
)

test_generator = ImageDataGenerator(
    preprocessing_function=preprocess_input
)

# Options common to the training, validation and test flows
flow_kwargs = dict(
    x_col='path',
    y_col='label',
    target_size=(224, 224),
    color_mode='rgb',
    class_mode='categorical',
    classes=class_names,
    batch_size=32,
)

# Flow from dataframe
train_images = train_generator.flow_from_dataframe(
    dataframe=train_df,
    shuffle=True,
    seed=42,
    subset='training',
    **flow_kwargs
)

val_images = train_generator.flow_from_dataframe(
    dataframe=train_df,
    shuffle=True,
    seed=42,
    subset='validation',
    **flow_kwargs
)

# The test flow is not shuffled so that predictions line up with test_df rows
test_images = test_generator.flow_from_dataframe(
    dataframe=test_df,
    shuffle=False,
    **flow_kwargs
)

In [None]:
# Check class indices
print("Train Indices:", train_images.class_indices)

# Fetch one batch. The built-in next() is used because the iterator's own
# .next() method is not available in newer Keras releases.
# The labels get their own name so the `label` list built while loading the
# data is not overwritten.
img, batch_labels = next(test_images)
# Rewind so this preview does not leave the test iterator one batch ahead
test_images.reset()

# Reverse mapping (class index -> class name) for the panel titles
index_to_class = {v: k for k, v in test_images.class_indices.items()}

# Visualize a batch of Test Images
fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(15, 8))
ax = ax.flatten()

for j, axis in enumerate(ax):
    if j >= len(img):
        # Fewer images than panels (small final batch): leave the panel empty
        axis.axis('off')
        continue
    # preprocess_input scaled the pixels to [-1, 1]; map them back to [0, 1]
    # so imshow does not clip the negative half of the range.
    axis.imshow((img[j] + 1.0) / 2.0)
    # Labels are one-hot (class_mode='categorical'); argmax recovers the index
    axis.set_title(index_to_class[int(np.argmax(batch_labels[j]))])
plt.show()

## 6. Model Building: MobileNetV2
We use a pre-trained MobileNetV2 model (weights='imagenet') as the base, add dense layers, and freeze the base layers.

In [None]:
# Size the output layer from the data rather than hard-coding the class count
num_classes = len(train_images.class_indices)

# Load Pretrained Model
# include_top=False drops the ImageNet classifier; pooling='avg' makes the base
# emit one feature vector per image.
pretrained_model = tf.keras.applications.MobileNetV2(
    input_shape=(224, 224, 3),
    include_top=False,
    weights='imagenet',
    pooling='avg'
)

# Freeze the base so only the new dense layers are trained
pretrained_model.trainable = False

# Build the Model: two hidden dense layers followed by a softmax over the classes
inputs = pretrained_model.input
x = tf.keras.layers.Dense(128, activation='relu')(pretrained_model.output)
x = tf.keras.layers.Dense(128, activation='relu')(x)
outputs = tf.keras.layers.Dense(num_classes, activation='softmax')(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

# Compile
# categorical_crossentropy matches the one-hot labels produced by
# class_mode='categorical' in the generators.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

## 7. Model Training

In [None]:
# Train for five epochs on the training flow, passing the validation flow so
# validation metrics are recorded in `history` alongside the training ones.
history = model.fit(
    x=train_images,
    epochs=5,
    validation_data=val_images,
)

## 8. Evaluation and Prediction
Calculating loss and accuracy on the test set.

In [None]:
# evaluate() returns [loss, accuracy], matching the loss and the single metric
# the model was compiled with.
results = model.evaluate(test_images, verbose=0)
test_loss, test_accuracy = results[0], results[1]

print(f"Test Loss: {test_loss:.5f}")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

## 9. Performance Analysis
Creating a dataframe of predictions to compare True vs Predicted labels and visualize the Confusion Matrix.

In [None]:
# Make Predictions
# test_images is not shuffled, so row i of the output belongs to row i of test_df.
pred = model.predict(test_images)
pred = np.argmax(pred, axis=1)

# Map class index -> class name. The training generator's mapping is used
# because the model's output units follow the encoding it was trained with.
labels = {v: k for k, v in train_images.class_indices.items()}
class_names = [labels[i] for i in sorted(labels)]

# Create Prediction DataFrame (positional index so it can be addressed by row number)
pred_df = test_df.copy().reset_index(drop=True)
pred_df['pred'] = [labels[i] for i in pred]

# Visualize one sample: the fixed index below when it exists, otherwise the
# first misclassified image, otherwise nothing.
SAMPLE_INDEX = 1604
is_wrong = pred_df['label'] != pred_df['pred']

if SAMPLE_INDEX < len(pred_df):
    idx_to_plot = SAMPLE_INDEX
elif is_wrong.any():
    idx_to_plot = int(is_wrong.idxmax())  # index of the first True
else:
    idx_to_plot = None

if idx_to_plot is None:
    print("No sample to plot: index out of range and no misclassified images.")
else:
    row = pred_df.loc[idx_to_plot]
    try:
        # Only the file read is guarded, so unrelated errors are not hidden
        sample_image = plt.imread(row['path'])
    except (OSError, ValueError) as e:
        print(f"Could not plot specific index: {e}")
    else:
        title1 = row['label']                   # true class
        title2 = os.path.basename(row['path'])  # image file name
        title3 = row['pred']                    # predicted class

        plt.figure(figsize=(15, 8))
        plt.imshow(sample_image)
        # Red marks a wrong prediction, green a correct one
        plt.title(f'Image: {title2}\nTrue Class: {title1}\nPredicted Class: {title3}',
                  color='r' if title1 != title3 else 'g', weight='bold', fontsize=15)
        plt.show()

# Classification Report
print(f"Accuracy Score: {accuracy_score(pred_df['label'], pred_df['pred'])}")
print(classification_report(pred_df['label'], pred_df['pred']))

# Confusion Matrix
# Passing the class list fixes the row/column order so the tick labels are
# guaranteed to match the matrix.
cm = confusion_matrix(pred_df['label'], pred_df['pred'], labels=class_names)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='2d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix")
plt.show()