# Load Data

In [2]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
import cv2

train_dir = '../input/herbarium-2022-fgvc9/train_images/'
test_dir = '../input/herbarium-2022-fgvc9/test_images/'

with open("../input/herbarium-2022-fgvc9/train_metadata.json") as json_file:
    train_meta = json.load(json_file)
with open("../input/herbarium-2022-fgvc9/test_metadata.json") as json_file:
    test_meta = json.load(json_file)

# JSON -> Dataframe

In [3]:
image_ids = [image["image_id"] for image in train_meta["images"]]
image_dirs = [train_dir + image['file_name'] for image in train_meta["images"]]
category_ids = [annotation['category_id'] for annotation in train_meta['annotations']]
genus_ids = [annotation['genus_id'] for annotation in train_meta['annotations']]

test_ids = [image['image_id'] for image in test_meta]
test_dirs = [test_dir + image['file_name'] for image in test_meta]

#Create the initial training dataframe with the above defined columns
train_df = pd.DataFrame({
    "image_id" : image_ids,
    "image_dir" : image_dirs,
    "category" : category_ids,
    "genus" : genus_ids})

#Create a testing dataframe
test_df = pd.DataFrame({
    "test_id" : test_ids,
    "test_dir" : test_dirs
})

train_df

Unnamed: 0,image_id,image_dir,category,genus
0,00000__001,../input/herbarium-2022-fgvc9/train_images/000...,0,1
1,00000__002,../input/herbarium-2022-fgvc9/train_images/000...,0,1
2,00000__003,../input/herbarium-2022-fgvc9/train_images/000...,0,1
3,00000__004,../input/herbarium-2022-fgvc9/train_images/000...,0,1
4,00000__005,../input/herbarium-2022-fgvc9/train_images/000...,0,1
...,...,...,...,...
839767,15504__032,../input/herbarium-2022-fgvc9/train_images/155...,15504,2584
839768,15504__033,../input/herbarium-2022-fgvc9/train_images/155...,15504,2584
839769,15504__035,../input/herbarium-2022-fgvc9/train_images/155...,15504,2584
839770,15504__036,../input/herbarium-2022-fgvc9/train_images/155...,15504,2584


# Mapping genus and family

In [4]:
#Add a genus column to the dataframe
genus_map = {genus['genus_id'] : genus['genus'] for genus in train_meta['genera']}
train_df['genus'] = train_df['genus'].map(genus_map)

##Create a family column in the datagframe based on the genus names
    # Step 1: Create dictionary of genus -> family mapping
genus_family_map = {}
for category in train_meta["categories"]:
    genus = category['genus']
    family = category['family']
    genus_family_map[genus] = family

    # Step 2: Create new column with default value of None™
train_df['family'] = None

    # Step 3: Update values in new column based on genus -> family mapping
for i, row in train_df.iterrows():
    genus = row['genus']
    if genus in genus_family_map:
        family = genus_family_map[genus]
        train_df.at[i, 'family'] = family

train_df

Unnamed: 0,image_id,image_dir,category,genus,family
0,00000__001,../input/herbarium-2022-fgvc9/train_images/000...,0,Abies,Pinaceae
1,00000__002,../input/herbarium-2022-fgvc9/train_images/000...,0,Abies,Pinaceae
2,00000__003,../input/herbarium-2022-fgvc9/train_images/000...,0,Abies,Pinaceae
3,00000__004,../input/herbarium-2022-fgvc9/train_images/000...,0,Abies,Pinaceae
4,00000__005,../input/herbarium-2022-fgvc9/train_images/000...,0,Abies,Pinaceae
...,...,...,...,...,...
839767,15504__032,../input/herbarium-2022-fgvc9/train_images/155...,15504,Zygophyllum,Zygophyllaceae
839768,15504__033,../input/herbarium-2022-fgvc9/train_images/155...,15504,Zygophyllum,Zygophyllaceae
839769,15504__035,../input/herbarium-2022-fgvc9/train_images/155...,15504,Zygophyllum,Zygophyllaceae
839770,15504__036,../input/herbarium-2022-fgvc9/train_images/155...,15504,Zygophyllum,Zygophyllaceae


# Filtering to Poaceae

In [5]:
#Filter only the images of plants that are in the Poaceae family
train_df = train_df.loc[train_df['family'] == 'Poaceae']
#Reset index
train_df = train_df.reset_index(drop=True)
train_df

Unnamed: 0,image_id,image_dir,category,genus,family
0,00333__001,../input/herbarium-2022-fgvc9/train_images/003...,333,Agrostis,Poaceae
1,00333__002,../input/herbarium-2022-fgvc9/train_images/003...,333,Agrostis,Poaceae
2,00333__003,../input/herbarium-2022-fgvc9/train_images/003...,333,Agrostis,Poaceae
3,00333__004,../input/herbarium-2022-fgvc9/train_images/003...,333,Agrostis,Poaceae
4,00333__005,../input/herbarium-2022-fgvc9/train_images/003...,333,Agrostis,Poaceae
...,...,...,...,...,...
53542,15501__101,../input/herbarium-2022-fgvc9/train_images/155...,15501,Zuloagaea,Poaceae
53543,15501__103,../input/herbarium-2022-fgvc9/train_images/155...,15501,Zuloagaea,Poaceae
53544,15501__105,../input/herbarium-2022-fgvc9/train_images/155...,15501,Zuloagaea,Poaceae
53545,15501__106,../input/herbarium-2022-fgvc9/train_images/155...,15501,Zuloagaea,Poaceae


# Adding species column

In [6]:
"""#Add category_id and species column
train_df["species"] = None

# Extract category_id and species values from categories where the family is Poaceae
species_list = []
for category in train_meta["categories"]:
    if category["family"] == "Poaceae":
        species_list.append({
            "category": category["category_id"],
            "species": category["species"]
        })

# loop through data frame and species list to update species column
for i, row in train_df.iterrows():
    for species in species_list:
        if row['category'] == species['category']:
            train_df.at[i, 'species'] = species['species']"""

'#Add category_id and species column\ntrain_df["species"] = None\n\n# Extract category_id and species values from categories where the family is Poaceae\nspecies_list = []\nfor category in train_meta["categories"]:\n    if category["family"] == "Poaceae":\n        species_list.append({\n            "category": category["category_id"],\n            "species": category["species"]\n        })\n\n# loop through data frame and species list to update species column\nfor i, row in train_df.iterrows():\n    for species in species_list:\n        if row[\'category\'] == species[\'category\']:\n            train_df.at[i, \'species\'] = species[\'species\']'

# Data visualization

In [7]:
"""
genus_data = train_df['genus'].value_counts().head(15)
genus_data = pd.DataFrame({'Genus' : genus_data.index,
                     'values' : genus_data.values})
                     
plt.figure(figsize = (20, 10))
sns.barplot(x='values', y = 'Genus', data = genus_data , palette='summer_r')
plt.show()

From most to least: Muhlenbergia, Paspalum, Poa, Dichanthelium, Sporobolus, Eragrostis etc.
"""

"\ngenus_data = train_df['genus'].value_counts().head(15)\ngenus_data = pd.DataFrame({'Genus' : genus_data.index,\n                     'values' : genus_data.values})\n                     \nplt.figure(figsize = (20, 10))\nsns.barplot(x='values', y = 'Genus', data = genus_data , palette='summer_r')\nplt.show()\n\nFrom most to least: Muhlenbergia, Paspalum, Poa, Dichanthelium, Sporobolus, Eragrostis etc.\n"

# Filter to two genus

In [8]:
#Muhlenbergia data
muh_pas_df = train_df[(train_df['genus'] == 'Paspalum') | (train_df['genus'] == 'Muhlenbergia')]
muh_pas_df = muh_pas_df.reset_index(drop=True)
muh_pas_df

Unnamed: 0,image_id,image_dir,category,genus,family
0,09492__001,../input/herbarium-2022-fgvc9/train_images/094...,9492,Muhlenbergia,Poaceae
1,09492__003,../input/herbarium-2022-fgvc9/train_images/094...,9492,Muhlenbergia,Poaceae
2,09492__004,../input/herbarium-2022-fgvc9/train_images/094...,9492,Muhlenbergia,Poaceae
3,09492__005,../input/herbarium-2022-fgvc9/train_images/094...,9492,Muhlenbergia,Poaceae
4,09492__006,../input/herbarium-2022-fgvc9/train_images/094...,9492,Muhlenbergia,Poaceae
...,...,...,...,...,...
7347,10398__026,../input/herbarium-2022-fgvc9/train_images/103...,10398,Paspalum,Poaceae
7348,10398__029,../input/herbarium-2022-fgvc9/train_images/103...,10398,Paspalum,Poaceae
7349,10398__030,../input/herbarium-2022-fgvc9/train_images/103...,10398,Paspalum,Poaceae
7350,10398__031,../input/herbarium-2022-fgvc9/train_images/103...,10398,Paspalum,Poaceae


# Image displaying

In [9]:
def show_images(genus):
    images = train_df.loc[train_df['genus'] == genus]['image_dir'][:9]
    i = 1
    fig = plt.figure(figsize = (18, 18))
    plt.suptitle(genus, fontsize = '30')
    for image in images:
        img = cv2.imread(image)
        ax = fig.add_subplot(3, 3, i)
        ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        ax.set_axis_off()
        i += 1
    plt.show()

In [10]:
#show_images("Muhlenbergia")

# Splitting the dataset into training and validation

**2 Genuses**

In [11]:
muh_df = muh_pas_df[muh_pas_df["genus"] == "Muhlenbergia"] 
pas_df = muh_pas_df[muh_pas_df["genus"] == "Paspalum"]

#15 percent of images will be used for validation
# Muh total: 4228 --> 15% = 634
# Pas total: 3124 --> 15% = 467
muh_valid = muh_df.sample(n=634, random_state=42)
muh_train = muh_df.drop(muh_valid.index)
muh_valid = muh_valid.reset_index(drop=True)
muh_train = muh_train.reset_index(drop=True)

pas_valid = pas_df.sample(n=467, random_state=42)
pas_train = pas_df.drop(pas_valid.index)
pas_valid = pas_valid.reset_index(drop=True)
pas_train = pas_train.reset_index(drop=True)

# Merging the Muhlanbergia and Paspalum databases
muh_pas_train = pd.concat([muh_train, pas_train])
muh_pas_train = muh_pas_train.reset_index(drop=True)

muh_pas_valid = pd.concat([muh_valid, pas_valid])
muh_pas_valid = muh_pas_valid.reset_index(drop=True)

muh_pas_train["genus"].nunique()

2

In [12]:
train_df["genus"].value_counts().tail(20)

Otatea             33
Chaetium           30
Trichoneura        30
Coleanthus         28
Tetrapogon         27
Arctopoa           21
Willkommia         20
Setariopsis        20
Tuctoria           19
Neostapfia         18
Chusquea           18
Amelichloa         16
Allolepis          14
Ptilagrostis       14
Swallenia          14
Ptilagrostiella    14
Rhipidocladum      11
Dupontia           10
Kalinia            10
Barkworthia         8
Name: genus, dtype: int64

# Creating the model

In [13]:
import os
import torch
import torchvision
import numpy as np
from PIL import Image
import torch.nn as nn
import torch.optim as optim
from sklearn import preprocessing
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from torchvision.models import resnet50, ResNet50_Weights

df = train_df

In [14]:
# Split the dataset for each class separately
train_dfs = []
val_dfs = []
for label in df['genus'].unique():
    # Filter the dataset to only include images with the current label
    label_df = df[df['genus'] == label]
    
    # Split the dataset into training and evaluation sets
    train_df, val_df = train_test_split(label_df, test_size=0.2)
    
    # Append the training and evaluation sets to their respective lists
    train_dfs.append(train_df)
    val_dfs.append(val_df)

# Concatenate the training and evaluation sets for all classes into single DataFrames
train_df = pd.concat(train_dfs)
val_df = pd.concat(val_dfs)

train_df['genus'] = pd.factorize(train_df['genus'])[0]
val_df['genus'] = pd.factorize(val_df['genus'])[0]

In [15]:
batch_size = 256
epochs = 10
IM_SIZE = 224

X_train, Y_train = train_df["image_dir"].values, train_df["genus"].values

X_val, Y_val = val_df["image_dir"].values, val_df["genus"].values

Transform = transforms.Compose(
    [transforms.ToTensor(),
    transforms.Resize((IM_SIZE, IM_SIZE)),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

In [16]:
class GetData(Dataset):
    def __init__(self, FNames, Labels, Transform):
        self.fnames = FNames
        self.transform = Transform
        self.labels = Labels         
        
    def __len__(self):
        return len(self.fnames)

    def __getitem__(self, index):       
        x = Image.open(self.fnames[index])
    
        if "train" in self.fnames[index]:             
            return self.transform(x), self.labels[index]
        elif "test" in self.fnames[index]:            
            return self.transform(x), self.fnames[index]

                
trainset = GetData(X_train, Y_train, Transform)
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True)

valset = GetData(X_val, Y_val, Transform)
valloader = DataLoader(valset, batch_size=batch_size, shuffle=True)

N_Classes = train_df['genus'].nunique()

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True)

Downloading: "https://github.com/pytorch/vision/zipball/v0.10.0" to /root/.cache/torch/hub/v0.10.0.zip
  f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "
Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


  0%|          | 0.00/44.7M [00:00<?, ?B/s]

In [17]:
print(model.fc.in_features) 
print(model.fc.out_features)

for param in model.parameters():
    param.requires_grad= False
    
n_inputs = model.fc.in_features
last_layer = nn.Linear(n_inputs, N_Classes)
model.fc = last_layer
if torch.cuda.is_available():
    model.cuda()
print(model.fc.out_features)    

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.fc.parameters()) 

512
1000
158


In [18]:
from tqdm import tqdm
from sklearn.metrics import f1_score

def train(trainloader, model, criterion, optimizer, scaler, device=torch.device("cpu")):
    train_acc = 0.0
    train_loss = 0.0
    y_true = []
    y_pred = []
    for images, labels in tqdm(trainloader):
        images = images.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        with torch.cuda.amp.autocast(enabled=True):
            output = model(images)
            loss = criterion(output, labels)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            acc = ((output.argmax(dim=1) == labels).float().mean())
            train_acc += acc
            train_loss += loss
            y_true += labels.cpu().numpy().tolist()
            y_pred += output.argmax(dim=1).cpu().numpy().tolist()
            
    train_f1 = f1_score(y_true, y_pred, average='weighted')
    return train_acc/len(trainloader), train_loss/len(trainloader), train_f1               

In [19]:
def evaluate(testloader, model, criterion, device=torch.device("cpu")):
    eval_acc = 0.0
    eval_loss = 0.0
    y_true = []
    y_pred = []
    for images, labels in tqdm(testloader):
        images = images.to(device)
        labels = labels.to(device)
        with torch.no_grad():
            output = model(images)
            loss = criterion(output, labels)
        acc = ((output.argmax(dim=1) == labels).float().mean())
        eval_acc += acc
        eval_loss += loss
        y_true += labels.cpu().numpy().tolist()
        y_pred += output.argmax(dim=1).cpu().numpy().tolist()
  
    eval_f1 = f1_score(y_true, y_pred, average='weighted')
    return eval_acc/len(testloader), eval_loss/len(testloader), eval_f1    

In [20]:
%%time
scaler = torch.cuda.amp.GradScaler(enabled=True)
for epoch in range(epochs):
    train_acc, train_loss, train_f1 = train(trainloader, model, criterion, optimizer, scaler, device=device)
    eval_acc, eval_loss, eval_f1 = evaluate(valloader, model, criterion, device=torch.device("cuda"))
    print("")
    print(f"Epoch {epoch + 1} | Train Acc: {train_acc*100} | Train Loss: {train_loss} | Train F1: {train_f1}")
    print(f"\t Val Acc: {eval_acc*100} | Val Loss: {eval_loss} | Val F1: {eval_f1}")
    print("===="*8)

100%|██████████| 168/168 [19:48<00:00,  7.08s/it]
100%|██████████| 43/43 [04:52<00:00,  6.80s/it]



Epoch 1 | Train Acc: 16.403459548950195 | Train Loss: 3.766195774078369 | Train F1: 0.1081795200092149
	 Val Acc: 20.890735626220703 | Val Loss: 3.4356048107147217 | Val F1: 0.14404519553513295


100%|██████████| 168/168 [15:02<00:00,  5.37s/it]
100%|██████████| 43/43 [03:58<00:00,  5.54s/it]



Epoch 2 | Train Acc: 23.735118865966797 | Train Loss: 3.2197251319885254 | Train F1: 0.1842372453144336
	 Val Acc: 23.96478271484375 | Val Loss: 3.1856908798217773 | Val F1: 0.18375572551832997


100%|██████████| 168/168 [14:36<00:00,  5.22s/it]
100%|██████████| 43/43 [03:32<00:00,  4.95s/it]



Epoch 3 | Train Acc: 26.941499710083008 | Train Loss: 2.9927594661712646 | Train F1: 0.22568527529171944
	 Val Acc: 26.015466690063477 | Val Loss: 3.077845811843872 | Val F1: 0.21916630729173295


100%|██████████| 168/168 [13:10<00:00,  4.70s/it]
100%|██████████| 43/43 [03:20<00:00,  4.67s/it]



Epoch 4 | Train Acc: 29.2457218170166 | Train Loss: 2.8497979640960693 | Train F1: 0.25682104844703013
	 Val Acc: 27.85286521911621 | Val Loss: 2.97890305519104 | Val F1: 0.24097532851771206


100%|██████████| 168/168 [12:48<00:00,  4.57s/it]
100%|██████████| 43/43 [03:21<00:00,  4.68s/it]



Epoch 5 | Train Acc: 30.99283790588379 | Train Loss: 2.746217966079712 | Train F1: 0.27914291676956554
	 Val Acc: 27.681053161621094 | Val Loss: 2.9542810916900635 | Val F1: 0.24437800825862702


100%|██████████| 168/168 [12:54<00:00,  4.61s/it]
100%|██████████| 43/43 [03:25<00:00,  4.77s/it]



Epoch 6 | Train Acc: 32.61439895629883 | Train Loss: 2.666487693786621 | Train F1: 0.2996139803447096
	 Val Acc: 28.751028060913086 | Val Loss: 2.9061787128448486 | Val F1: 0.25171983461367353


100%|██████████| 168/168 [13:09<00:00,  4.70s/it]
100%|██████████| 43/43 [03:25<00:00,  4.79s/it]



Epoch 7 | Train Acc: 33.918338775634766 | Train Loss: 2.6004745960235596 | Train F1: 0.31682473872517714
	 Val Acc: 29.7763671875 | Val Loss: 2.8679893016815186 | Val F1: 0.26158706184782987


100%|██████████| 168/168 [13:06<00:00,  4.68s/it]
100%|██████████| 43/43 [03:24<00:00,  4.77s/it]



Epoch 8 | Train Acc: 34.896297454833984 | Train Loss: 2.5505127906799316 | Train F1: 0.3288081024949046
	 Val Acc: 29.585599899291992 | Val Loss: 2.877291202545166 | Val F1: 0.2670416053455258


100%|██████████| 168/168 [12:55<00:00,  4.62s/it]
100%|██████████| 43/43 [03:23<00:00,  4.74s/it]



Epoch 9 | Train Acc: 35.63662338256836 | Train Loss: 2.50864577293396 | Train F1: 0.33934545005867484
	 Val Acc: 30.6180477142334 | Val Loss: 2.817570686340332 | Val F1: 0.264591122449837


100%|██████████| 168/168 [13:10<00:00,  4.71s/it]
100%|██████████| 43/43 [03:21<00:00,  4.68s/it]


Epoch 10 | Train Acc: 36.24860763549805 | Train Loss: 2.4710896015167236 | Train F1: 0.34727328791951523
	 Val Acc: 30.47704315185547 | Val Loss: 2.812758445739746 | Val F1: 0.28332747449124623
CPU times: user 2h 28min 10s, sys: 4min 8s, total: 2h 32min 18s
Wall time: 2h 56min 51s



