# Generate Waterbirds Original, FG-Only and BG-Only data

## Imports

Based on this notebook: https://github.com/PolinaKirichenko/deep_feature_reweighting/blob/main/notebooks/data_generation/generate_waterbirds_fg_bg.ipynb

In [21]:
%reload_ext autoreload
%autoreload 2

import os
import numpy as np
import random
import pandas as pd
from PIL import Image
from tqdm import tqdm
import shutil
# from pycocotools.mask import encode, decode

import cv2

# from detectron2.structures import BoxMode

import sys
sys.path.insert(0, "../group_DRO/")

# code from https://github.com/kohpangwei/group_DRO
from dataset_scripts.dataset_utils import crop_and_resize, combine_and_mask
sys.path.pop(0)

# Change the paths below.
cache_path = "/mnt/lustre/work/oh/arubinstein17/cache"
cub_dir = os.path.join(cache_path, "CUB_200_2011", "CUB_200_2011")
places_dir = os.path.join(cache_path, "Places", "train", "data_large")
output_dir = os.path.join(cache_path, "Waterbirds", "FG-Only")
dataset_name = 'waterbirds_birds_places'

target_places = [
    ['bamboo_forest', 'forest/broadleaf'],  # Land backgrounds
    ['ocean', 'lake/natural']]              # Water backgrounds

val_frac = 0.2             # What fraction of the training data to use as validation
confounder_strength = 0.95 # Determines relative size of majority vs. minority groups

## Functions

In [19]:
def get_segmentation(mask):
    contours, hierarchy = cv2.findContours((mask[:, :, 0]).astype(np.uint8), cv2.RETR_TREE,
                                                        cv2.CHAIN_APPROX_SIMPLE)
    segmentation = []

    for contour in contours:
        contour = contour.flatten().tolist()
        # segmentation.append(contour)
        if len(contour) > 4:
            segmentation.append(contour)
    return segmentation

def make_symlink_cmd(src, dst):
    symlink_cmd = (
        f"ln -s "
        f"'{src}' "
        f"{dst}"
    )
    return symlink_cmd

## Birds

In [4]:
images_path = os.path.join(cub_dir, 'images.txt')

df = pd.read_csv(
    images_path,
    sep=" ",
    header=None,
    names=['img_id', 'img_filename'],
    index_col='img_id')

In [5]:
### Set up labels of waterbirds vs. landbirds
# We consider water birds = seabirds and waterfowl.
species = np.unique([img_filename.split('/')[0].split('.')[1].lower() for img_filename in df['img_filename']])
water_birds_list = [
    'Albatross', # Seabirds
    'Auklet',
    'Cormorant',
    'Frigatebird',
    'Fulmar',
    'Gull',
    'Jaeger',
    'Kittiwake',
    'Pelican',
    'Puffin',
    'Tern',
    'Gadwall', # Waterfowl
    'Grebe',
    'Mallard',
    'Merganser',
    'Guillemot',
    'Pacific_Loon'
]


water_birds = {}
for species_name in species:
    water_birds[species_name] = 0
    for water_bird in water_birds_list:
        if water_bird.lower() in species_name:
            water_birds[species_name] = 1
species_list = [img_filename.split('/')[0].split('.')[1].lower() for img_filename in df['img_filename']]
df['y'] = [water_birds[species] for species in species_list]

In [6]:
train_test_df =  pd.read_csv(
    os.path.join(cub_dir, 'train_test_split.txt'),
    sep=" ",
    header=None,
    names=['img_id', 'split'],
    index_col='img_id')

In [7]:
df = df.join(train_test_df, on='img_id')
test_ids = df.loc[df['split'] == 0].index
train_ids = np.array(df.loc[df['split'] == 1].index)
val_ids = np.random.choice(
    train_ids,
    size=int(np.round(val_frac * len(train_ids))),
    replace=False)


In [8]:
df.loc[train_ids, 'split'] = 0
df.loc[val_ids, 'split'] = 1
df.loc[test_ids, 'split'] = 2

## Places

In [9]:
df['place'] = 0
train_ids = np.array(df.loc[df['split'] == 0].index)
val_ids = np.array(df.loc[df['split'] == 1].index)
test_ids = np.array(df.loc[df['split'] == 2].index)
for split_idx, ids in enumerate([train_ids, val_ids, test_ids]):
    for y in (0, 1):
        if split_idx == 0: # train
            if y == 0:
                pos_fraction = 1 - confounder_strength
            else:
                pos_fraction = confounder_strength
        else:
            pos_fraction = 0.5
        subset_df = df.loc[ids, :]
        y_ids = np.array((subset_df.loc[subset_df['y'] == y]).index)
        pos_place_ids = np.random.choice(
            y_ids,
            size=int(np.round(pos_fraction * len(y_ids))),
            replace=False)
        df.loc[pos_place_ids, 'place'] = 1


In [10]:
for split, split_label in [(0, 'train'), (1, 'val'), (2, 'test')]:
    print(f"{split_label}:")
    split_df = df.loc[df['split'] == split, :]
    print(f"waterbirds are {np.mean(split_df['y']):.3f} of the examples")
    print(f"y = 0, c = 0: {np.mean(split_df.loc[split_df['y'] == 0, 'place'] == 0):.3f}, n = {np.sum((split_df['y'] == 0) & (split_df['place'] == 0))}")
    print(f"y = 0, c = 1: {np.mean(split_df.loc[split_df['y'] == 0, 'place'] == 1):.3f}, n = {np.sum((split_df['y'] == 0) & (split_df['place'] == 1))}")
    print(f"y = 1, c = 0: {np.mean(split_df.loc[split_df['y'] == 1, 'place'] == 0):.3f}, n = {np.sum((split_df['y'] == 1) & (split_df['place'] == 0))}")
    print(f"y = 1, c = 1: {np.mean(split_df.loc[split_df['y'] == 1, 'place'] == 1):.3f}, n = {np.sum((split_df['y'] == 1) & (split_df['place'] == 1))}")

train:
waterbirds are 0.229 of the examples
y = 0, c = 0: 0.950, n = 3513
y = 0, c = 1: 0.050, n = 185
y = 1, c = 0: 0.050, n = 55
y = 1, c = 1: 0.950, n = 1042
val:
waterbirds are 0.235 of the examples
y = 0, c = 0: 0.501, n = 459
y = 0, c = 1: 0.499, n = 458
y = 1, c = 0: 0.500, n = 141
y = 1, c = 1: 0.500, n = 141
test:
waterbirds are 0.222 of the examples
y = 0, c = 0: 0.500, n = 2255
y = 0, c = 1: 0.500, n = 2255
y = 1, c = 0: 0.500, n = 642
y = 1, c = 1: 0.500, n = 642


In [11]:
place_ids_df = pd.read_csv(
    os.path.join(places_dir, 'categories_places365.txt'),
    sep=" ",
    header=None,
    names=['place_name', 'place_id'],
    index_col='place_id')

In [12]:
target_place_ids = []

for idx, target_places in enumerate(target_places):
    place_filenames = []

    for target_place in target_places:
        target_place_full = f'/{target_place[0]}/{target_place}'
        print(target_place, target_place_full)
        assert (np.sum(place_ids_df['place_name'] == target_place_full) == 1)
        target_place_ids.append(place_ids_df.index[place_ids_df['place_name'] == target_place_full][0])
        print(f'train category {idx} {target_place_full} has id {target_place_ids[idx]}')

        place_filenames += [
            f'/{target_place[0]}/{target_place}/{filename}' for filename in os.listdir(
                os.path.join(places_dir, target_place[0], target_place))
            if filename.endswith('.jpg')]

    random.shuffle(place_filenames)

    # Assign each filename to an image
    indices = (df.loc[:, 'place'] == idx)
    assert len(place_filenames) >= np.sum(indices),\
        f"Not enough places ({len(place_filenames)}) to fit the dataset ({np.sum(df.loc[:, 'place'] == idx)})"
    df.loc[indices, 'place_filename'] = place_filenames[:np.sum(indices)]

bamboo_forest /b/bamboo_forest
train category 0 /b/bamboo_forest has id 36
forest/broadleaf /f/forest/broadleaf
train category 0 /f/forest/broadleaf has id 36
ocean /o/ocean
train category 1 /o/ocean has id 150
lake/natural /l/lake/natural
train category 1 /l/lake/natural has id 150


## Construct Waterbirds

In [13]:
output_subfolder = os.path.join(output_dir, dataset_name)
os.makedirs(output_subfolder, exist_ok=True)

In [14]:
df.to_csv(os.path.join(output_subfolder, 'metadata.csv'))

In [16]:
train_instances = []
test_instances = []
val_instances = []

for i in tqdm(df.index):
    # Load bird image and segmentation
    img_path = os.path.join(cub_dir, 'images', df.loc[i, 'img_filename'])
    seg_path = os.path.join(cub_dir, 'segmentations', df.loc[i, 'img_filename'].replace('.jpg','.png'))
    img_np = np.asarray(Image.open(img_path).convert('RGB'))
    seg_np = np.asarray(Image.open(seg_path).convert('RGB')) / 255


    place_path = os.path.join(places_dir, df.loc[i, 'place_filename'][1:])
    place = Image.open(place_path).convert('RGB')

    img_black = Image.fromarray(np.around(img_np * seg_np).astype(np.uint8))
    combined_img = combine_and_mask(place, seg_np, img_black)
    bird_img = combine_and_mask(Image.fromarray(np.ones_like(place) * 150), seg_np, img_black)

    seg_np *= 0.
    img_black = Image.fromarray(np.around(img_np * seg_np).astype(np.uint8))
    place_img = combine_and_mask(place, seg_np * 0, img_black)

    combined_path = os.path.join(output_subfolder, "combined", df.loc[i, 'img_filename'])
    bird_path = os.path.join(output_subfolder, "birds", df.loc[i, 'img_filename'])
    place_path = os.path.join(output_subfolder, "places", df.loc[i, 'img_filename'])

    os.makedirs('/'.join(combined_path.split('/')[:-1]), exist_ok=True)
    os.makedirs('/'.join(bird_path.split('/')[:-1]), exist_ok=True)
    os.makedirs('/'.join(place_path.split('/')[:-1]), exist_ok=True)

    combined_img.save(combined_path)
    bird_img.save(bird_path)
    place_img.save(place_path)

100%|██████████| 11788/11788 [09:03<00:00, 21.70it/s]


In [None]:
combined_img

In [None]:
bird_img

In [None]:
place_img

# Split into groups

In [26]:
WATERBIRDS_BASE_PATH = "/mnt/lustre/work/oh/arubinstein17/cache/Waterbirds"
# /mnt/lustre/work/oh/arubinstein17/cache/Waterbirds/FG-Only/waterbirds_birds_places/birds
FG_ONLY_PATH = os.path.join(WATERBIRDS_BASE_PATH, "FG-Only")
ONLY_BIRDS_WB_PATH = os.path.join(FG_ONLY_PATH, "waterbirds_birds_places", "birds")
WATERBIRDS_FG_BASE_PATH = os.path.join(FG_ONLY_PATH, "test_split")
CLASSIC_WB_PATH = os.path.join(WATERBIRDS_BASE_PATH, "waterbird_complete95_forest2water2")

wb_df = pd.read_csv(os.path.join(CLASSIC_WB_PATH, "metadata.csv"))
wb_df["groups"] = wb_df["y"] * len(wb_df["place"].unique()) + wb_df["place"]
test_df = wb_df[wb_df["split"] == 2]

In [27]:
df = test_df
num_groups = len(df["groups"].unique())
num_classes = len(df["y"].unique())

base_path = WATERBIRDS_FG_BASE_PATH

for group_id in range(num_groups):
    dataset_path = os.path.join(base_path, "group_" + str(group_id))
    os.makedirs(dataset_path, exist_ok=True)

    folders = []
    for label in range(num_classes):
        cur_folder = os.path.join(dataset_path, str(label))
        os.makedirs(cur_folder, exist_ok=True)
        folders.append(cur_folder)

    filtered_values = [
        (row["img_filename"], row["y"])
        for _, row in df.iterrows()
        if row["groups"] == group_id
    ]

    # make symlinks
    for img_filename, y in tqdm(filtered_values):

        parent_dir = os.path.dirname(img_filename)
        base_name = os.path.basename(img_filename)
        final_name = f"{parent_dir}_{base_name}.jpg"
        symlink_cmd = make_symlink_cmd(
            # os.path.join(CLASSIC_WB_PATH, img_filename),
            os.path.join(ONLY_BIRDS_WB_PATH, img_filename),
            os.path.join(folders[y], final_name)
        )
        os.system(symlink_cmd)

    # clean empty class folders
    for folder in folders:
        if len(os.listdir(folder)) == 0:
            shutil.rmtree(folder)

100%|██████████| 2255/2255 [00:07<00:00, 286.38it/s]
100%|██████████| 2255/2255 [00:07<00:00, 302.93it/s]
100%|██████████| 642/642 [00:02<00:00, 304.86it/s]
100%|██████████| 642/642 [00:02<00:00, 283.08it/s]


In [None]:
Image.open("/mnt/lustre/work/oh/arubinstein17/cache/Waterbirds/FG-Only/test_split/group_0/0/200.Common_Yellowthroat_Common_Yellowthroat_0125_190902.jpg.jpg").convert('RGB')

In [None]:
Image.open("/mnt/lustre/work/oh/arubinstein17/cache/Waterbirds/test_split/group_0/0/200.Common_Yellowthroat_Common_Yellowthroat_0125_190902.jpg.jpg").convert('RGB')