In [358]:
from torchvision.transforms import RandAugment
from PIL import Image
import torch
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import shutil

In [257]:
# Seed NumPy's legacy global RNG; the np.random.choice draws further down read from it.
# NOTE(review): this only makes sampling repeatable if the cells are run in order on a fresh kernel.
np.random.seed(seed=42)

In [245]:
# Dataset root; metadata.csv in this folder lists each image with its label (`y`),
# split and background (`place`).
root_dir = "../datasets/waterbird/"
df = pd.read_csv(filepath_or_buffer=os.path.join(root_dir, "metadata.csv"))
df.head()

Unnamed: 0,img_id,img_filename,y,split,place,place_filename
0,1,001.Black_footed_Albatross/Black_Footed_Albatr...,1,2,1,/o/ocean/00002178.jpg
1,2,001.Black_footed_Albatross/Black_Footed_Albatr...,1,0,1,/l/lake/natural/00000065.jpg
2,3,001.Black_footed_Albatross/Black_Footed_Albatr...,1,2,0,/b/bamboo_forest/00000131.jpg
3,4,001.Black_footed_Albatross/Black_Footed_Albatr...,1,0,1,/o/ocean/00001268.jpg
4,5,001.Black_footed_Albatross/Black_Footed_Albatr...,1,0,1,/o/ocean/00003147.jpg


In [368]:
# Keep only the rows whose `split` value is 0 (used as the training split in this notebook).
train_df = df.loc[df['split'].eq(0)]
print(train_df.shape[0])
train_df.head()

4795


Unnamed: 0,img_id,img_filename,y,split,place,place_filename
1,2,001.Black_footed_Albatross/Black_Footed_Albatr...,1,0,1,/l/lake/natural/00000065.jpg
3,4,001.Black_footed_Albatross/Black_Footed_Albatr...,1,0,1,/o/ocean/00001268.jpg
4,5,001.Black_footed_Albatross/Black_Footed_Albatr...,1,0,1,/o/ocean/00003147.jpg
6,7,001.Black_footed_Albatross/Black_Footed_Albatr...,1,0,1,/o/ocean/00003917.jpg
8,9,001.Black_footed_Albatross/Black_Footed_Albatr...,1,0,1,/l/lake/natural/00000560.jpg


### Copying train data to new folder

In [387]:
# Column-wise records for the copied training images:
# new relative path, bird label (`y`) and background (`place`).
metadata = {
    'img_filename': [],
    'y': [],
    'place': []
}

dest = "../datasets/waterbird_augmented/training"
# shutil.copy does not create missing parent folders, so make sure the
# destination exists before the first copy (safe to re-run).
os.makedirs(dest, exist_ok=True)

for idx, row in tqdm(train_df.iterrows(), total=len(train_df)):
    label = row['y']
    place = row['place']
    # Resolve against the same dataset root the metadata was loaded from.
    source_path = os.path.join(root_dir, row['img_filename'])
    # Images are flattened into `dest`: only the file name is kept,
    # the per-class sub-folder from the original path is dropped.
    img_filename = os.path.basename(source_path)

    shutil.copy(src=source_path, dst=os.path.join(dest, img_filename))

    metadata['img_filename'].append(f'training/{img_filename}')
    metadata['y'].append(label)
    metadata['place'].append(place)

  0%|          | 0/4795 [00:00<?, ?it/s]

In [388]:
# Turn the collected columns into a frame and store it next to the copied images.
# index=False keeps the row index out of the spreadsheet.
train_metadata_df = pd.DataFrame(data=metadata)
train_metadata_df.to_excel(
    '../datasets/waterbird_augmented/metadata.xlsx',
    index=False,
)

### Random Augmentation

In [373]:
# Integer codes used by the `y` (bird type) and `place` (background) columns.
LANDBIRD, WATERBIRD = 0, 1
LAND, WATER = 0, 1

# One frame per (bird type, background) combination of the training rows.
# reset_index() renumbers the rows from 0 and keeps the previous row label
# in an extra `index` column.
land_bird_on_land_df = train_df[(train_df['y'] == LANDBIRD) & (train_df['place'] == LAND)].reset_index()
waterbird_on_water_df = train_df[(train_df['y'] == WATERBIRD) & (train_df['place'] == WATER)].reset_index()
land_bird_on_water_df = train_df[(train_df['y'] == LANDBIRD) & (train_df['place'] == WATER)].reset_index()
water_bird_on_land_df = train_df[(train_df['y'] == WATERBIRD) & (train_df['place'] == LAND)].reset_index()

# Group sizes, shown as the cell output.
tuple(
    len(group_df)
    for group_df in (
        land_bird_on_land_df,
        waterbird_on_water_df,
        water_bird_on_land_df,
        land_bird_on_water_df,
    )
)

(3498, 1057, 56, 184)

In [457]:
def take_sample(num_sample: int, seed=None):
    """Draw ``num_sample`` rows from each of the four (bird, background) groups.

    The two large groups (land bird on land, water bird on water) are sampled
    without replacement; the two small groups (water bird on land, land bird
    on water) are sampled with replacement, so rows may repeat there.

    Args:
        num_sample: Number of rows to draw from every group.
        seed: Optional integer seed. When given, the draws come from a private
            ``np.random.RandomState(seed)``, so the result is the same on every
            call and NumPy's global RNG is left untouched. When ``None``
            (default) the global ``np.random`` state is used, as before.

    Returns:
        Tuple of four DataFrames in this order: land bird on land,
        water bird on water, water bird on land, land bird on water.

    Raises:
        ValueError: Raised by NumPy when ``num_sample`` exceeds the size of a
            group that is sampled without replacement, or when a group is
            empty and ``num_sample`` is positive.
    """
    # Both objects expose the same `choice` API; only the state they use differs.
    rng = np.random if seed is None else np.random.RandomState(seed)

    def _draw(group_df, with_replacement):
        """Pick ``num_sample`` random row positions and return those rows."""
        positions = rng.choice(
            a=len(group_df), size=num_sample, replace=with_replacement)
        return group_df.iloc[positions]

    # Keep this draw order: it decides which random numbers each group consumes.
    land_bird_on_land_sampled = _draw(land_bird_on_land_df, with_replacement=False)
    waterbird_on_water_sampled = _draw(waterbird_on_water_df, with_replacement=False)
    water_bird_on_land_sampled = _draw(water_bird_on_land_df, with_replacement=True)
    land_bird_on_water_sampled = _draw(land_bird_on_water_df, with_replacement=True)

    return land_bird_on_land_sampled, waterbird_on_water_sampled, water_bird_on_land_sampled, land_bird_on_water_sampled


# Number of rows drawn from each of the four (bird, background) groups.
sample_size = 800
(
    land_bird_on_land_sampled,
    waterbird_on_water_sampled,
    water_bird_on_land_sampled,
    land_bird_on_water_sampled,
) = take_sample(num_sample=sample_size)

In [458]:
# RandAugment transform configured with num_ops=5; it is applied once per sampled image.
rand_augment = RandAugment(num_ops=5)

# Output layout: <sample size>_samples/<sample turn>/augmented_<counter>_<original file name>
sample_size_folder_name = f'{sample_size}_samples'
sample_turn = 'sample_3'
saving_dir = f'../datasets/waterbird_augmented/{sample_size_folder_name}/{sample_turn}'
# Column-wise records for the augmented images (path relative to the augmented
# dataset folder, bird label, background).
metadata = {
            'img_filename': [],
            'y': [],
            'place': []
            }

os.makedirs(saving_dir, exist_ok=True)
# Running counter across all four groups. Groups sampled with replacement can
# contain the same source image more than once, so the counter keeps every
# output file name unique.
i = 1
for sampled_df in [land_bird_on_land_sampled, land_bird_on_water_sampled, waterbird_on_water_sampled, water_bird_on_land_sampled]:
    for idx, row in tqdm(sampled_df.iterrows(), total=len(sampled_df)):
        label = row['y']
        place = row['place']
        img_fullpath = os.path.join(root_dir, row['img_filename'])
        # Open inside a context manager so the source file is closed as soon as
        # the pixels have been decoded; convert() returns an independent image.
        with Image.open(img_fullpath) as source_img:
            img = source_img.convert('RGB')
        augmented_img = rand_augment(img)

        # Only the file name is kept; the class sub-folder of the source is dropped.
        img_filename = f'augmented_{i}_{os.path.basename(img_fullpath)}'
        augmented_img.save(os.path.join(saving_dir, img_filename))

        metadata['img_filename'].append(f'{sample_size_folder_name}/{sample_turn}/{img_filename}')
        metadata['place'].append(place)
        metadata['y'].append(label)
        i += 1


metadata_df = pd.DataFrame(metadata)
metadata_df.head(5)

  0%|          | 0/800 [00:00<?, ?it/s]

  0%|          | 0/800 [00:00<?, ?it/s]

  0%|          | 0/800 [00:00<?, ?it/s]

  0%|          | 0/800 [00:00<?, ?it/s]

Unnamed: 0,img_filename,y,place
0,800_samples/sample_3/augmented_1_Yellow_Bellie...,0,0
1,800_samples/sample_3/augmented_2_Red_Bellied_W...,0,0
2,800_samples/sample_3/augmented_3_Grasshopper_S...,0,0
3,800_samples/sample_3/augmented_4_Cape_Glossy_S...,0,0
4,800_samples/sample_3/augmented_5_Clark_Nutcrac...,0,0


In [459]:
# Stack the original training rows on top of the augmented rows and write one
# spreadsheet per (sample size, sample turn). index=False leaves the row index
# out of the file.
metadata = pd.concat((train_metadata_df, metadata_df))
metadata.to_excel(
    f'../datasets/waterbird_augmented/metadata_{sample_size}_{sample_turn}.xlsx',
    index=False,
)