# Load Data

In [None]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
import cv2

# Kaggle input locations for the Herbarium 2022 (FGVC9) dataset.
DATA_ROOT = '../input/herbarium-2022-fgvc9/'
train_dir = DATA_ROOT + 'train_images/'
test_dir = DATA_ROOT + 'test_images/'

# The image-directory prefix and the parsed metadata need separate names:
# the dataframe cell below reads both `train_dir` and `train_meta`.
with open(DATA_ROOT + "train_metadata.json") as json_file:
    train_meta = json.load(json_file)

# `test_meta` / `test_dir` are also read by the dataframe cell below.
# NOTE(review): the test file and folder names are assumed to mirror the
# train layout -- confirm against the dataset listing.
with open(DATA_ROOT + "test_metadata.json") as json_file:
    test_meta = json.load(json_file)

# JSON to Dataframe

In [None]:
# Per-image fields pulled out of the training metadata.
# NOTE(review): this assumes train_meta["images"] and train_meta["annotations"]
# list the same images in the same order -- confirm, or join on the image id.
image_ids = [image["image_id"] for image in train_meta["images"]]
image_dirs = [train_dir + image['file_name'] for image in train_meta["images"]]
category_ids = [annotation['category_id'] for annotation in train_meta['annotations']]
genus_ids = [annotation['genus_id'] for annotation in train_meta['annotations']]

# Ids and full paths of the test images.
test_ids = [image['image_id'] for image in test_meta]
test_dirs = [test_dir + image['file_name'] for image in test_meta]

# Create the initial training dataframe with the columns defined above.
# The path column is called "image_dir" because that is the name every later
# cell (image preview, train/val/test arrays) reads.
main_df = pd.DataFrame({
    "id": image_ids,
    "image_dir": image_dirs,
    "category": category_ids,
    "genus": genus_ids})

main_df

# Mapping genus and family

In [None]:
# Replace the numeric genus ids with genus names.
# Note: this overwrites the column in place, so re-running the cell without
# rebuilding main_df maps names through an id-keyed dict and yields NaN.
genus_map = {genus['genus_id']: genus['genus'] for genus in train_meta['genera']}
main_df['genus'] = main_df['genus'].map(genus_map)

# Create a family column in the dataframe based on the genus names.
# Step 1: genus -> family lookup built from the category records (the metadata
# key is "categories"; when a genus appears more than once the last record wins).
genus_family_map = {
    category['genus']: category['family']
    for category in train_meta["categories"]
}

# Step 2: vectorised lookup instead of a row-by-row iterrows() loop.
# Genera missing from the lookup end up as NaN.
main_df['family'] = main_df['genus'].map(genus_family_map)

main_df

# Filtering to Poaceae

In [None]:
#Filter only the images of plants that are in the Poaceae family
main_df = main_df.loc[main_df['family'] == 'Poaceae']
#Reset index
main_df = main_df.reset_index(drop=True)

main_df

# Adding species column

In [None]:
# Optional: attach a species column. Disabled by default; set the flag to True
# to enable it. (This replaces a block that was parked inside a string literal,
# which the notebook echoed back as cell output.)
ADD_SPECIES_COLUMN = False

if ADD_SPECIES_COLUMN:
    # category_id -> species name, restricted to categories in the Poaceae family.
    species_map = {
        category["category_id"]: category["species"]
        for category in train_meta["categories"]
        if category["family"] == "Poaceae"
    }
    # Single vectorised lookup instead of a nested rows x species loop.
    main_df["species"] = main_df["category"].map(species_map)

# EDA 🖼 

In [None]:
# Image counts for the 15 most frequent genera, reshaped into a two-column
# frame for seaborn.
top_genera = main_df['genus'].value_counts().head(15)
genus_data = pd.DataFrame({'Genus': top_genera.index,
                           'values': top_genera.values})

# Horizontal bar chart: one bar per genus, bar length = number of images.
fig, ax = plt.subplots(figsize=(20, 10))
sns.barplot(data=genus_data, x='values', y='Genus', palette='summer_r', ax=ax)
plt.show()

#From most to least: Muhlenbergia, Paspalum, Poa, Dichanthelium, Sporobolus, Eragrostis etc.

# Displaying sample images

In [None]:
def show_images(genus, max_images=9, n_cols=3):
    """Plot a grid of sample images for one genus.

    Reads the module-level ``main_df`` and selects the first ``max_images``
    rows whose ``genus`` column equals ``genus``.

    Args:
        genus: Value to match against ``main_df['genus']``.
        max_images: Maximum number of images to draw (default 9).
        n_cols: Number of grid columns (default 3; the defaults give the
            original 3x3 grid on an 18x18 inch figure).

    Returns:
        None. The figure is shown with ``plt.show()``. If no row matches,
        a message is printed and no figure is created.
    """
    paths = main_df.loc[main_df['genus'] == genus, 'image_dir'].head(max_images)
    if paths.empty:
        print(f"No images found for genus {genus!r}")
        return

    # Ceiling division: enough rows to hold max_images cells.
    n_rows = (max_images + n_cols - 1) // n_cols
    fig = plt.figure(figsize=(6 * n_cols, 6 * n_rows))
    plt.suptitle(genus, fontsize=30)
    for position, path in enumerate(paths, start=1):
        ax = fig.add_subplot(n_rows, n_cols, position)
        ax.set_axis_off()
        img = cv2.imread(path)
        if img is None:
            # cv2.imread signals a missing/unreadable file by returning None;
            # leave the cell blank instead of crashing in cvtColor.
            ax.set_title("unreadable image", fontsize=12)
            continue
        # OpenCV loads BGR; matplotlib expects RGB.
        ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    plt.show()

In [None]:
# Preview up to nine specimen images for one genus.
show_images("Muhlenbergia")

In [None]:
# Image count per genus in the filtered frame (all genera, not only the 15 plotted earlier).
main_df["genus"].value_counts()

# Model setup and training ⚡⚡⚡

In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torchvision
import torchvision.models as models
import torchvision.transforms as transforms
from torchvision.models import resnet50, ResNet50_Weights
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from PIL import Image

# %pip installs into the environment of the running kernel; `!pip` runs in a
# subshell and may target a different interpreter.
# NOTE(review): fvcore is not imported anywhere in the visible notebook --
# confirm it is still needed, and pin a version if it is.
%pip install -q fvcore

In [None]:
# Encode genus names as integer class ids (codes are assigned in order of
# first appearance). The downstream cells use this column as the label.
genus_codes, genus_uniques = pd.factorize(main_df['genus'])

# Keep the code -> name lookup (genus_names[code]) so predictions can be
# reported by name. Skip the assignment when the column already holds integer
# codes (i.e. the cell is being re-run), so the lookup is not overwritten.
if genus_uniques.dtype.kind not in 'iu':
    genus_names = genus_uniques

main_df['genus'] = genus_codes
main_df

# Separate data into 3 sets

In [None]:
# Hold out 20% of every genus as the test set, splitting each class separately
# so that every genus with at least two images is represented on both sides.
grouped = main_df.groupby('genus')
train_dfs = []
test_dfs = []

for _, group in grouped:
    if len(group) < 2:
        # train_test_split raises when a split side would be empty, which is
        # the case for a single-row group; keep such genera in training.
        train_dfs.append(group)
        continue
    # Loop names deliberately avoid `train`, which is also the name of the
    # training function defined later in this notebook.
    train_part, test_part = train_test_split(group, test_size=0.2, random_state=42)
    train_dfs.append(train_part)
    test_dfs.append(test_part)

# Concatenate the per-class pieces into single frames with a fresh index.
train_df_initial = pd.concat(train_dfs).reset_index(drop=True)
test_df = pd.concat(test_dfs).reset_index(drop=True)

In [None]:
# Carve a validation set (20% of each genus) out of the remaining training data.
# NOTE(review): this repeats the per-genus split of the previous cell; a shared
# helper would remove the duplication.
grouped2 = train_df_initial.groupby('genus')
train_dfs = []
val_dfs = []

for _, group in grouped2:
    if len(group) < 2:
        # A single-row group cannot be split (train_test_split raises when one
        # side would be empty); keep it in training.
        train_dfs.append(group)
        continue
    # Loop names deliberately avoid `train`, which is also the name of the
    # training function defined later in this notebook.
    train_part, val_part = train_test_split(group, test_size=0.2, random_state=42)
    train_dfs.append(train_part)
    val_dfs.append(val_part)

# Concatenate the per-class pieces into single frames with a fresh index.
train_df = pd.concat(train_dfs).reset_index(drop=True)
val_df = pd.concat(val_dfs).reset_index(drop=True)

In [None]:
# Training hyperparameters.
batch_size, epochs = 32, 3
IM_SIZE = 224
learning_rate = 1e-4

# Image paths (X) and integer genus labels (Y) for each split.
X_train = train_df["image_dir"].values
Y_train = train_df["genus"].values

X_val = val_df["image_dir"].values
Y_val = val_df["genus"].values

X_test = test_df["image_dir"].values
Y_test = test_df["genus"].values

# Preprocessing shared by all splits: resize to a square, convert to a tensor,
# then normalise per channel.
# NOTE(review): the mean/std values are presumably the ImageNet statistics the
# pretrained backbone expects -- confirm.
_channel_mean = [0.485, 0.456, 0.406]
_channel_std = [0.229, 0.224, 0.225]
Transform = transforms.Compose([
    transforms.Resize((IM_SIZE, IM_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(mean=_channel_mean, std=_channel_std),
])

In [None]:
class GetData(Dataset):
    """Dataset pairing image file paths with labels.

    Args:
        FNames: Indexable sequence of image file paths.
        Labels: Indexable sequence of labels, aligned with ``FNames``.
        Transform: Callable applied to each loaded image before it is returned.
    """

    def __init__(self, FNames, Labels, Transform):
        self.fnames = FNames
        self.transform = Transform
        self.labels = Labels

    def __len__(self):
        """Return the number of image paths."""
        return len(self.fnames)

    def __getitem__(self, index):
        """Load image ``index`` as RGB and return ``(transformed_image, label)``."""
        # Force three channels: grayscale or RGBA files would otherwise yield
        # 1- or 4-channel tensors that do not match a 3-channel Normalize/model.
        # The context manager closes the file once the pixel data is converted.
        with Image.open(self.fnames[index]) as raw:
            x = raw.convert("RGB")
        return self.transform(x), self.labels[index]
        
        
# Training data is reshuffled every epoch.
trainset = GetData(X_train, Y_train, Transform)
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True)

# Validation and test data are read in a fixed order: shuffling gives no
# benefit for evaluation and changes which samples land in the final, smaller
# batch, which makes batch-averaged metrics differ from run to run.
valset = GetData(X_val, Y_val, Transform)
valloader = DataLoader(valset, batch_size=batch_size, shuffle=False)

testset = GetData(X_test, Y_test, Transform)
testloader = DataLoader(testset, batch_size=batch_size, shuffle=False)

# Use the first GPU when available, otherwise fall back to the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# Pretrained ResNeSt-101 fetched through torch.hub (requires network access).
model = torch.hub.load('zhanghang1989/ResNeSt', 'resnest101', pretrained=True)

In [None]:
# One output unit per genus present in the training split.
num_classes = train_df['genus'].nunique()
total_layers = len(list(model.parameters()))

# Fine-tune the whole network: every parameter receives gradients.
model.requires_grad_(True)

# Swap the classification head for one sized to our label set.
n_inputs = model.fc.in_features
last_layer = nn.Linear(n_inputs, num_classes)
model.fc = last_layer

if torch.cuda.is_available():
    model.cuda()

print(model.fc.out_features)

# Loss and optimizer (the optimizer is built after the head swap, so it
# includes the new layer's parameters).
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training and evaluation

In [None]:
from tqdm import tqdm
from sklearn.metrics import f1_score

def train(trainloader, model, criterion, optimizer, scaler, device=torch.device("cpu")):
    """Run one training epoch with mixed precision and return epoch metrics.

    Args:
        trainloader: Iterable of ``(images, labels)`` batches.
        model: Network to train; switched to train mode here.
        criterion: Loss callable ``criterion(output, labels)``.
            NOTE(review): the epoch loss below assumes the criterion returns
            the mean over the batch -- confirm if another reduction is used.
        optimizer: Optimizer stepped once per batch through ``scaler``.
        scaler: ``GradScaler`` used to scale the loss and step the optimizer.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(accuracy, loss, per_class_f1, macro_f1)`` where accuracy and
        loss are Python floats averaged over samples (not over batches).
    """
    # Autocast only has an effect on CUDA; skip it when running on CPU.
    use_amp = torch.device(device).type == "cuda"

    model.train()
    total_loss = 0.0
    n_correct = 0
    n_seen = 0
    y_true = []
    y_pred = []
    for images, labels in tqdm(trainloader):
        images = images.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        # Only the forward pass and the loss run under autocast; backward and
        # the optimizer step belong outside the context.
        with torch.cuda.amp.autocast(enabled=use_amp):
            output = model(images)
            loss = criterion(output, labels)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        preds = output.detach().argmax(dim=1)
        batch_n = labels.size(0)
        # .item() turns the running totals into plain numbers, so no autograd
        # graph is kept alive across iterations.
        total_loss += loss.item() * batch_n
        n_correct += (preds == labels).sum().item()
        n_seen += batch_n
        y_true.extend(labels.tolist())
        y_pred.extend(preds.tolist())

    train_f1 = f1_score(y_true, y_pred, average=None)
    train_f1_avg = f1_score(y_true, y_pred, average='macro')
    # Weight by sample count so a smaller final batch is not over-represented.
    return n_correct / n_seen, total_loss / n_seen, train_f1, train_f1_avg

In [None]:
def evaluate(testloader, model, criterion, device=torch.device("cpu")):
    """Evaluate the model on a loader without updating weights.

    Args:
        testloader: Iterable of ``(images, labels)`` batches.
        model: Network to evaluate; switched to eval mode here.
        criterion: Loss callable ``criterion(output, labels)``.
            NOTE(review): the reported loss assumes the criterion returns the
            mean over the batch -- confirm if another reduction is used.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(accuracy, loss, per_class_f1, macro_f1)`` where accuracy and
        loss are Python floats averaged over samples (not over batches).
    """
    model.eval()
    total_loss = 0.0
    n_correct = 0
    n_seen = 0
    y_true = []
    y_pred = []
    with torch.no_grad():
        for images, labels in tqdm(testloader):
            images = images.to(device)
            labels = labels.to(device)
            output = model(images)
            loss = criterion(output, labels)

            preds = output.argmax(dim=1)
            batch_n = labels.size(0)
            total_loss += loss.item() * batch_n
            n_correct += (preds == labels).sum().item()
            n_seen += batch_n
            y_true.extend(labels.tolist())
            y_pred.extend(preds.tolist())

    eval_f1 = f1_score(y_true, y_pred, average=None)
    eval_f1_avg = f1_score(y_true, y_pred, average='macro')
    # Weight by sample count so a smaller final batch is not over-represented.
    return n_correct / n_seen, total_loss / n_seen, eval_f1, eval_f1_avg

In [None]:
# Loss scaling is only meaningful on CUDA; on CPU the scaler is created disabled.
scaler = torch.cuda.amp.GradScaler(enabled=(torch.device(device).type == "cuda"))

train_f1_scores = []  # macro F1 on the training set, one entry per epoch
val_f1_scores = []  # macro F1 on the validation set, one entry per epoch


for epoch in range(epochs):
    train_acc, train_loss, train_f1, train_f1_avg = train(trainloader, model, criterion, optimizer, scaler, device=device)
    # Evaluate on the same device the model was trained on (a hard-coded
    # "cuda" device fails on CPU-only machines).
    eval_acc, eval_loss, eval_f1, eval_f1_avg = evaluate(valloader, model, criterion, device=device)

    train_f1_scores.append(train_f1_avg)
    val_f1_scores.append(eval_f1_avg)

    print(f"Epoch {epoch + 1} | Train Acc: {train_acc*100} | Train Loss: {train_loss} | Train F1 (Avg): {train_f1_avg}")
    print(f"\t Val Acc: {eval_acc*100} | Val Loss: {eval_loss} | Val F1 (Avg): {eval_f1_avg}")

    print("F1 score per class (Train):")
    for i, f1 in enumerate(train_f1):
        print(f"Class {i}: {f1}")

    print("\nF1 score per class (Validation):")
    for i, f1 in enumerate(eval_f1):
        print(f"Class {i}: {f1}")

    print("===="*8)

# Plot the training results 

In [None]:
# Macro-F1 learning curves: one point per epoch for train and validation.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(train_f1_scores, label="Train F1 (Avg)")
ax.plot(val_f1_scores, label="Validation F1 (Avg)")
ax.set_xlabel("Epoch")
ax.set_ylabel("F1 Score")
ax.set_title("F1 Scores Over Epochs")
ax.legend()

# Tick positions are 0-based list indices; label them as 1-based epoch numbers.
n_epochs_run = len(train_f1_scores)
ax.set_xticks(range(n_epochs_run))
ax.set_xticklabels(range(1, n_epochs_run + 1))
plt.show()

# Testing loop

In [None]:
# classification_report is not imported anywhere else in the notebook.
from sklearn.metrics import classification_report

# Assume that the `testloader` DataLoader has been created for the test set
y_true = []
y_pred = []

model = model.to(device)

# Set the model to evaluation mode
model.eval()

# Loop through the test set, collecting true and predicted labels
with torch.no_grad():
    for images, labels in tqdm(testloader, desc="Testing"):
        images = images.to(device)
        labels = labels.to(device)

        # Make predictions
        output = model(images)

        y_true += labels.cpu().numpy().tolist()
        y_pred += output.argmax(dim=1).cpu().numpy().tolist()

# Final test accuracy over all samples. Averaging per-batch accuracies instead
# would over-weight a smaller final batch.
test_acc = sum(t == p for t, p in zip(y_true, y_pred)) / len(y_true)
print(f"Test accuracy: {test_acc:.2%}")

# Calculate the classification report, which includes the F1 score for all classes
report = classification_report(y_true, y_pred, output_dict=True)

# Print the per-class F1 scores. Summary rows such as the averages have
# non-numeric keys and are skipped by the isdigit() check.
for class_label, metrics in report.items():
    if class_label.isdigit():
        print(f"Class {class_label}: F1 score = {metrics['f1-score']:.2f}")