In [5]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
import cv2

# Root folders of the competition images. Both end with '/' because the cells
# below build full paths by plain string concatenation (dir + file_name);
# the test folder previously lacked the slash, producing paths such as
# '.../test_images<file_name>'.
train_dir = '/kaggle/input/herbarium-2022-fgvc9/train_images/'
test_dir = '/kaggle/input/herbarium-2022-fgvc9/test_images/'

# Load the train and test metadata JSON files into plain Python objects.
with open("/kaggle/input/herbarium-2022-fgvc9/train_metadata.json") as json_file:
    train_meta = json.load(json_file)
with open("/kaggle/input/herbarium-2022-fgvc9/test_metadata.json") as json_file:
    test_meta = json.load(json_file)

In [6]:
# Join directory and file name with exactly one '/' regardless of whether the
# directory string already ends with a slash (test_dir is defined without one,
# which otherwise yields '.../test_images<file_name>').
train_root = train_dir.rstrip('/')
test_root = test_dir.rstrip('/')

# NOTE(review): images and annotations are paired purely by list position;
# this assumes both lists in the metadata are in the same order -- confirm.
image_ids = [image["image_id"] for image in train_meta["images"]]
image_dirs = [train_root + '/' + image['file_name'] for image in train_meta["images"]]
category_ids = [annotation['category_id'] for annotation in train_meta['annotations']]
genus_ids = [annotation['genus_id'] for annotation in train_meta['annotations']]

test_ids = [image['image_id'] for image in test_meta]
test_dirs = [test_root + '/' + image['file_name'] for image in test_meta]

#Create the initial training dataframe with the above defined columns
train_df = pd.DataFrame({
    "image_id" : image_ids,
    "image_dir" : image_dirs,
    "category" : category_ids,
    "genus" : genus_ids})

#Create a testing dataframe
test_df = pd.DataFrame({
    "test_id" : test_ids,
    "test_dir" : test_dirs
})

train_df

Unnamed: 0,image_id,image_dir,category,genus
0,00000__001,/kaggle/input/herbarium-2022-fgvc9/train_image...,0,1
1,00000__002,/kaggle/input/herbarium-2022-fgvc9/train_image...,0,1
2,00000__003,/kaggle/input/herbarium-2022-fgvc9/train_image...,0,1
3,00000__004,/kaggle/input/herbarium-2022-fgvc9/train_image...,0,1
4,00000__005,/kaggle/input/herbarium-2022-fgvc9/train_image...,0,1
...,...,...,...,...
839767,15504__032,/kaggle/input/herbarium-2022-fgvc9/train_image...,15504,2584
839768,15504__033,/kaggle/input/herbarium-2022-fgvc9/train_image...,15504,2584
839769,15504__035,/kaggle/input/herbarium-2022-fgvc9/train_image...,15504,2584
839770,15504__036,/kaggle/input/herbarium-2022-fgvc9/train_image...,15504,2584


In [7]:
# Replace the numeric genus id in the 'genus' column with the genus name.
# NOTE: this overwrites 'genus' in place, so re-running this cell on an
# already converted frame would map every name to NaN.
genus_map = {genus['genus_id'] : genus['genus'] for genus in train_meta['genera']}
train_df['genus'] = train_df['genus'].map(genus_map)

# Create a 'family' column in the dataframe based on the genus names.
# Step 1: dictionary of genus name -> family name, taken from the categories
# list (a later category entry for the same genus overwrites an earlier one).
genus_family_map = {
    category['genus']: category['family']
    for category in train_meta["categories"]
}

# Step 2: vectorised lookup instead of a Python-level iterrows() loop over
# every row. Rows whose genus is missing from the mapping get NaN.
train_df['family'] = train_df['genus'].map(genus_family_map)

train_df

Unnamed: 0,image_id,image_dir,category,genus,family
0,00000__001,/kaggle/input/herbarium-2022-fgvc9/train_image...,0,Abies,Pinaceae
1,00000__002,/kaggle/input/herbarium-2022-fgvc9/train_image...,0,Abies,Pinaceae
2,00000__003,/kaggle/input/herbarium-2022-fgvc9/train_image...,0,Abies,Pinaceae
3,00000__004,/kaggle/input/herbarium-2022-fgvc9/train_image...,0,Abies,Pinaceae
4,00000__005,/kaggle/input/herbarium-2022-fgvc9/train_image...,0,Abies,Pinaceae
...,...,...,...,...,...
839767,15504__032,/kaggle/input/herbarium-2022-fgvc9/train_image...,15504,Zygophyllum,Zygophyllaceae
839768,15504__033,/kaggle/input/herbarium-2022-fgvc9/train_image...,15504,Zygophyllum,Zygophyllaceae
839769,15504__035,/kaggle/input/herbarium-2022-fgvc9/train_image...,15504,Zygophyllum,Zygophyllaceae
839770,15504__036,/kaggle/input/herbarium-2022-fgvc9/train_image...,15504,Zygophyllum,Zygophyllaceae


In [8]:
# Keep only the rows whose family is Poaceae and renumber them from zero
# (the previous index is discarded rather than kept as a column).
is_poaceae = train_df['family'] == 'Poaceae'
train_df = train_df.loc[is_poaceae].reset_index(drop=True)

train_df

Unnamed: 0,image_id,image_dir,category,genus,family
0,00333__001,/kaggle/input/herbarium-2022-fgvc9/train_image...,333,Agrostis,Poaceae
1,00333__002,/kaggle/input/herbarium-2022-fgvc9/train_image...,333,Agrostis,Poaceae
2,00333__003,/kaggle/input/herbarium-2022-fgvc9/train_image...,333,Agrostis,Poaceae
3,00333__004,/kaggle/input/herbarium-2022-fgvc9/train_image...,333,Agrostis,Poaceae
4,00333__005,/kaggle/input/herbarium-2022-fgvc9/train_image...,333,Agrostis,Poaceae
...,...,...,...,...,...
53542,15501__101,/kaggle/input/herbarium-2022-fgvc9/train_image...,15501,Zuloagaea,Poaceae
53543,15501__103,/kaggle/input/herbarium-2022-fgvc9/train_image...,15501,Zuloagaea,Poaceae
53544,15501__105,/kaggle/input/herbarium-2022-fgvc9/train_image...,15501,Zuloagaea,Poaceae
53545,15501__106,/kaggle/input/herbarium-2022-fgvc9/train_image...,15501,Zuloagaea,Poaceae


In [9]:
# Number of rows per genus in the filtered frame, most frequent first.
train_df.loc[:, "genus"].value_counts()

Muhlenbergia       4228
Paspalum           3124
Poa                2608
Dichanthelium      2474
Sporobolus         2304
                   ... 
Ptilagrostiella      14
Rhipidocladum        11
Dupontia             10
Kalinia              10
Barkworthia           8
Name: genus, Length: 158, dtype: int64

In [10]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torchvision
import torchvision.models as models
import torchvision.transforms as transforms
from torchvision.models import resnet50, ResNet50_Weights
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from PIL import Image

#!pip install fvcore

# Alias the filtered frame as `df` (no copy is made); the split cell reads
# from `df` and rebinds the name `train_df` to the training subset.
df = train_df

In [11]:
# Encode genus names as integer class ids ONCE, on the full frame, so that a
# given id means the same genus in both the training and the validation set.
# (Factorizing the two subsets separately only agrees when every genus appears
# in both subsets in the same order.)
# `genus_classes[i]` is the genus name for class id `i`.
genus_codes, genus_classes = pd.factorize(df['genus'])
labelled_df = df.assign(genus=genus_codes)

# Stratified 80/20 split: each genus keeps roughly the same proportion in the
# training and validation sets. A fixed seed makes the split reproducible
# across kernel restarts.
SPLIT_SEED = 42
train_df, val_df = train_test_split(
    labelled_df,
    test_size=0.2,
    stratify=labelled_df['genus'],
    random_state=SPLIT_SEED,
)

In [12]:
# Training hyperparameters.
batch_size = 128
epochs = 10
learning_rate = 1e-4
# Side length every image is resized to before the centre crop.
IM_SIZE = 256

# Image paths and integer genus labels as NumPy arrays, one pair per split.
X_train = train_df["image_dir"].values
Y_train = train_df["genus"].values

X_val = val_df["image_dir"].values
Y_val = val_df["genus"].values

# Preprocessing applied to every image: resize to a square, take the central
# 224x224 crop, convert to a tensor and normalise each channel with the
# mean/std values below.
_preprocessing_steps = [
    transforms.Resize((IM_SIZE, IM_SIZE)),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
Transform = transforms.Compose(_preprocessing_steps)

In [14]:
class GetData(Dataset):
    """Dataset that loads images from file paths and applies a transform.

    Args:
        FNames: indexable collection of image file paths.
        Labels: indexable collection of labels aligned with ``FNames``
            (only read for paths that contain ``"train"``).
        Transform: callable applied to the loaded PIL image.

    Items are ``(transformed_image, label)`` when the path contains
    ``"train"`` and ``(transformed_image, path)`` when it contains ``"test"``.
    """

    def __init__(self, FNames, Labels, Transform):
        self.fnames = FNames
        self.transform = Transform
        self.labels = Labels

    def __len__(self):
        return len(self.fnames)

    def __getitem__(self, index):
        fname = self.fnames[index]

        # Decide what accompanies the image before touching the disk, so an
        # unusable path fails fast with a clear message instead of the method
        # silently returning None.
        if "train" in fname:
            target = self.labels[index]
        elif "test" in fname:
            target = fname
        else:
            raise ValueError(
                f"Cannot tell whether {fname!r} is a train or test image: "
                "the path contains neither 'train' nor 'test'."
            )

        # Force three channels so the 3-channel normalisation in the transform
        # also works for grayscale/CMYK/RGBA files, and close the file handle
        # as soon as the pixel data has been read.
        with Image.open(fname) as img:
            x = img.convert("RGB")

        return self.transform(x), target

                
# Training data: reshuffled every epoch.
trainset = GetData(X_train, Y_train, Transform)
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True)

# Validation data: order does not affect the metrics, so keep it fixed.
valset = GetData(X_val, Y_val, Transform)
valloader = DataLoader(valset, batch_size=batch_size, shuffle=False)

# Number of distinct genus labels in the training split.
num_classes = train_df['genus'].nunique()

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# `pretrained=True` is deprecated since torchvision 0.13; the explicit weights
# enum below selects the same ImageNet checkpoint.
# NOTE: the checkpoint is downloaded on first use, so the notebook needs
# internet access (or a pre-populated torch hub cache) for this line.
model = models.resnet152(weights=models.ResNet152_Weights.IMAGENET1K_V1)

  f"The parameter '{pretrained_param}' is deprecated since 0.13 and may be removed in the future, "
Downloading: "https://download.pytorch.org/models/resnet152-394f9c45.pth" to /root/.cache/torch/hub/checkpoints/resnet152-394f9c45.pth


URLError: <urlopen error [Errno -3] Temporary failure in name resolution>

In [None]:
# Show the size of the original classification head.
print(model.fc.in_features)
print(model.fc.out_features)

# Freeze the whole pretrained network first ...
for param in model.parameters():
    param.requires_grad = False

# ... then swap in a fresh head. A new nn.Linear has requires_grad=True on
# both weight and bias, so the complete head is trainable. (Freezing "all but
# the last parameter" after the swap left only fc.bias trainable.)
n_inputs = model.fc.in_features
last_layer = nn.Linear(n_inputs, num_classes)
model.fc = last_layer

print(model.fc.out_features)

# Move the model to the same device the training loop sends batches to.
model = model.to(device)

criterion = nn.CrossEntropyLoss()
# Only the new head is optimised.
optimizer = torch.optim.Adam(model.fc.parameters(), learning_rate)

In [None]:
from tqdm import tqdm
from sklearn.metrics import f1_score

def train(trainloader, model, criterion, optimizer, scaler, device=torch.device("cpu")):
    """Run one training epoch.

    Args:
        trainloader: iterable of ``(images, labels)`` batches supporting ``len()``.
        model: network to train; it is switched to training mode.
        criterion: loss function called as ``criterion(output, labels)``.
        optimizer: optimizer stepped once per batch through ``scaler``.
        scaler: gradient scaler used for the backward pass and optimizer step.
        device: device (string or ``torch.device``) the batches are moved to.

    Returns:
        Tuple ``(mean_accuracy, mean_loss, f1_per_class)`` where the first two
        are batch averages (0-dim tensors without autograd history) and the
        last is the result of ``f1_score(..., average=None)``.
    """
    # Mixed precision only makes sense on CUDA; on CPU the CUDA autocast
    # context is disabled instead of being requested and ignored with a warning.
    use_amp = torch.device(device).type == "cuda"

    model.train()
    train_acc = 0.0
    train_loss = 0.0
    y_true = []
    y_pred = []
    for images, labels in tqdm(trainloader):
        images = images.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()

        # Only the forward pass and the loss run under autocast.
        with torch.cuda.amp.autocast(enabled=use_amp):
            output = model(images)
            loss = criterion(output, labels)

        # Backward pass and optimizer step happen outside the autocast context.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        preds = output.detach().argmax(dim=1)
        train_acc += (preds == labels).float().mean()
        # Detach so the running sum does not carry autograd history from
        # every batch of the epoch.
        train_loss += loss.detach()
        y_true += labels.cpu().numpy().tolist()
        y_pred += preds.cpu().numpy().tolist()

    train_f1 = f1_score(y_true, y_pred, average=None)
    return train_acc/len(trainloader), train_loss/len(trainloader), train_f1

In [None]:
def evaluate(testloader, model, criterion, device=torch.device("cpu")):
    """Evaluate the model on every batch of ``testloader`` without gradients.

    Args:
        testloader: iterable of ``(images, labels)`` batches supporting ``len()``.
        model: network to evaluate; run in eval mode, then restored to the
            mode it was in when the function was called.
        criterion: loss function called as ``criterion(output, labels)``.
        device: device (string or ``torch.device``) the batches are moved to.

    Returns:
        Tuple ``(mean_accuracy, mean_loss, f1_per_class)`` where the first two
        are batch averages and the last is the result of
        ``f1_score(..., average=None)``.
    """
    eval_acc = 0.0
    eval_loss = 0.0
    y_true = []
    y_pred = []

    # Evaluation must run in eval mode (e.g. BatchNorm uses its running
    # statistics and does not update them). The previous mode is restored
    # afterwards so the caller's training loop is unaffected.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for images, labels in tqdm(testloader):
                images = images.to(device)
                labels = labels.to(device)
                output = model(images)
                loss = criterion(output, labels)
                preds = output.argmax(dim=1)
                eval_acc += (preds == labels).float().mean()
                eval_loss += loss
                y_true += labels.cpu().numpy().tolist()
                y_pred += preds.cpu().numpy().tolist()
    finally:
        model.train(was_training)

    eval_f1 = f1_score(y_true, y_pred, average=None)
    return eval_acc/len(testloader), eval_loss/len(testloader), eval_f1

In [None]:
# Gradient scaling is only useful for CUDA mixed precision; disable it on CPU.
scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())
for epoch in range(epochs):
    # Train and validate on the SAME device. A hard-coded "cuda" for the
    # validation pass fails on CPU-only machines and can disagree with the
    # device the model was moved to.
    train_acc, train_loss, train_f1 = train(trainloader, model, criterion, optimizer, scaler, device=device)
    eval_acc, eval_loss, eval_f1 = evaluate(valloader, model, criterion, device=device)

    print(f"Epoch {epoch + 1} | Train Acc: {train_acc*100} | Train Loss: {train_loss} | Train F1: {train_f1}")
    print(f"\t Val Acc: {eval_acc*100} | Val Loss: {eval_loss} | Val F1: {eval_f1}")
    
    # Per-class F1, one line per entry of the arrays returned above.
    print("F1 score per class (Train):")
    for i, f1 in enumerate(train_f1):
        print(f"Class {i}: {f1}")

    print("\nF1 score per class (Validation):")
    for i, f1 in enumerate(eval_f1):
        print(f"Class {i}: {f1}")

    print("===="*8)