# Overview of Singapore Outputs

## Hidden Code

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from PIL import Image
import os
import subprocess
import math

In [2]:
def initialise_session(credentials_file="credentials.json"):
    """
    Build an S3 client from the AWS settings stored in a JSON credentials file.

    The file is expected to contain the keys ``AWS_ACCESS_KEY_ID``,
    ``AWS_SECRET_ACCESS_KEY``, ``AWS_REGION`` and ``AWS_URL_ENDPOINT``.

    Args:
        credentials_file (str): Path to the credentials JSON file.

    Returns:
        The S3 client created from the session, bound to the configured
        endpoint URL.
    """
    with open(credentials_file, encoding="utf-8") as handle:
        settings = json.load(handle)

    # Session first, then an S3 client pointed at the custom endpoint.
    return boto3.Session(
        aws_access_key_id=settings["AWS_ACCESS_KEY_ID"],
        aws_secret_access_key=settings["AWS_SECRET_ACCESS_KEY"],
        region_name=settings["AWS_REGION"],
    ).client("s3", endpoint_url=settings["AWS_URL_ENDPOINT"])

import boto3
import json
from boto3.s3.transfer import TransferConfig

# Shared S3 client, built from the local credentials file; used by download_images
client = initialise_session('./credentials.json')

# Transfer configuration for optimised S3 download
# (passed as Config= to client.download_file in download_images)
transfer_config = TransferConfig(
    max_concurrency=20,  # Increase the number of concurrent transfers
    multipart_threshold=8 * 1024 * 1024,  # 8MB
    max_io_queue=1000,
    io_chunksize=262144,  # 256KB
)

In [3]:
def download_images(key, download_dir, bucket_name):
    """
    Download one S3 object into ``download_dir``.

    The local file is named after the base name of ``key``; any prefix in the
    key is dropped. Uses the module-level ``client`` and ``transfer_config``.

    Args:
        key (str): Object key inside the bucket.
        download_dir (str): Existing local directory to write into.
        bucket_name (str): Name of the bucket holding the object.
    """
    file_name = os.path.basename(key)
    destination = os.path.join(download_dir, file_name)
    client.download_file(bucket_name, key, destination, Config=transfer_config)

# Local folder that receives downloaded images; created up front if missing.
# NOTE: download_dir is re-assigned further down before the section downloads run.
download_dir = './data/singapore/combined/downloaded_images'
os.makedirs(download_dir, exist_ok=True)

In [4]:
def annotate_image(image_path, dir, df, ax, scaling_required=False, crop_to_highlight=None, buffer=5,
                   scaled_size=300):
    """
    Show an image on ``ax`` and overlay the bounding boxes recorded for it in ``df``.

    Box colour compares the binary prediction (``class_name``) with the order
    prediction (``order_name``):

        green  - binary says moth and order contains 'Lepidoptera'
        orange - binary says moth, order does not contain 'Lepidoptera'
        purple - binary says non-moth, order contains 'Lepidoptera'
        red    - neither

    A missing (non-string) ``order_name`` is treated as "not Lepidoptera"
    instead of raising a TypeError.

    Args:
        image_path: Value of the ``image_path`` column identifying the image.
            Only its base name is used to locate the file inside ``dir``.
        dir (str): Directory holding the image file. (The name shadows the
            builtin but is kept so existing callers keep working.)
        df (pd.DataFrame): Predictions with the columns ``image_path``,
            ``x_min``, ``y_min``, ``x_max``, ``y_max``, ``class_name``,
            ``order_name``, ``crop_status``, ``top_1_species`` and
            ``top_1_confidence``.
        ax: Matplotlib axes to draw on.
        scaling_required (bool): If True, box coordinates are multiplied by
            ``scaled_size / image width`` (x) and ``scaled_size / image height`` (y).
        crop_to_highlight: If given, boxes whose ``crop_status`` differs from
            this value are drawn faded (alpha 0.2).
        buffer (int): Padding added on every side of each box, applied before
            any scaling.
        scaled_size (int): Target size used when ``scaling_required`` is True.
            Defaults to 300, the value that was previously hard-coded.

    Returns:
        The image artist returned by ``ax.imshow``.
    """
    file_path = f"{dir}/{os.path.basename(image_path)}"
    df_image = df.loc[df['image_path'] == image_path]

    img = plt.imread(file_path)
    subp = ax.imshow(img, origin='lower')

    if scaling_required:
        # Only the dimensions are needed: read them once, outside the loop, and
        # close the file straight away rather than keeping a second decoded copy.
        with Image.open(file_path) as image:
            original_width, original_height = image.size

    # (binary prediction is moth, order prediction is Lepidoptera) -> box colour
    colours = {
        (True, True): 'green',
        (True, False): 'orange',
        (False, True): 'purple',
        (False, False): 'red',
    }

    for _, row in df_image.iterrows():
        x_min = row['x_min'] - buffer
        y_min = row['y_min'] - buffer
        x_max = row['x_max'] + buffer
        y_max = row['y_max'] + buffer

        if scaling_required:
            x_min = x_min * scaled_size / original_width
            y_min = y_min * scaled_size / original_height
            x_max = x_max * scaled_size / original_width
            y_max = y_max * scaled_size / original_height

        x = int(x_min)
        y = int(y_min)
        w = int(x_max - x_min)
        h = int(y_max - y_min)

        order_name = row['order_name']
        is_moth = bool(row['class_name'] == "moth")
        # NaN order names are floats; `in` on them would raise TypeError.
        is_lepidoptera = isinstance(order_name, str) and 'Lepidoptera' in order_name
        col = colours[(is_moth, is_lepidoptera)]

        # Fade every box except the crop the caller asked to highlight.
        faded = crop_to_highlight is not None and row['crop_status'] != crop_to_highlight
        alph = 0.2 if faded else 1

        # Species labels are only drawn for boxes either model considers a moth.
        if is_moth or is_lepidoptera:
            ax.text(x_min, y_max,
                    f"{row['top_1_species']}: {row['top_1_confidence']:.2f}",
                    color=col,
                    fontsize=5, alpha=alph,
                    verticalalignment="bottom")

        rect = plt.Rectangle((x, y), w, h, fill=False, edgecolor=col, linewidth=1, alpha=alph)
        ax.add_patch(rect)

    ax.set_title(f"{os.path.basename(image_path)}")
    ax.axis('off')
    return subp

## Combine the outputs for each deployment

In [None]:
# Collect the deployment folders under ./data/singapore (directories whose name
# starts with 'dep'). Each one is an input for 05_combine_outputs.py, whose
# output is saved in ./data/singapore/combined
dirs = [
    os.path.join('./data/singapore', name)
    for name in os.listdir('./data/singapore')
    if os.path.isdir(os.path.join('./data/singapore', name)) and name.startswith('dep')
]

print(dirs)

In [6]:
# Only need to run once, commenting out
# for d in dirs[1:len(dirs)]:
#     print(d)
#     # run the script using subprocess:
#     subprocess.run(['python3', '05_combine_outputs.py',
#                     '--csv_file_pattern', f'{d}/dep*.csv',
#                     '--main_csv_file', f'./data/singapore/combined/{os.path.basename(d)}.csv'
#                     ])

## Create a moth prediction df

In [None]:
# Build one table of predictions from the per-deployment CSVs in
# ./data/singapore/combined2 (files starting with 'combined' are earlier outputs
# and are skipped). Rows whose crop_status is 'NO DETECTIONS FOR IMAGE' or that
# have no top_1_species are dropped, then everything is written to one CSV.
csvs = sorted(  # sorted: os.listdir order is arbitrary, so fix the row order
    c for c in os.listdir('./data/singapore/combined2')
    if c.endswith('.csv') and not c.startswith('combined')
)

frames = []
for c in csvs:
    input_df = pd.read_csv(f'./data/singapore/combined2/{c}')
    keep = (input_df['crop_status'] != 'NO DETECTIONS FOR IMAGE') & input_df['top_1_species'].notna()
    # .copy() so the column assignments below act on an independent frame
    # rather than on a filtered view of the raw CSV (avoids SettingWithCopyWarning)
    input_df = input_df.loc[keep].copy()

    # deployment name = CSV file name up to the first '.'
    dep = os.path.basename(c).split('.')[0]
    input_df['dep'] = dep
    input_df['crop_area'] = (input_df['x_max'] - input_df['x_min']) * (input_df['y_max'] - input_df['y_min'])

    # S3 key of the snapshot image: '<dep>/snapshot_images/<image file name>'
    input_df['keys'] = [f"{dep}/snapshot_images/{os.path.basename(p)}" for p in input_df['image_path']]
    frames.append(input_df)

# Concatenate once at the end instead of growing df inside the loop
df = pd.concat(frames) if frames else pd.DataFrame()

df.to_csv('./data/singapore/combined2/combined_moths.csv', index=False)

In [8]:
# Drop placeholder rows whose box_score marks a corrupt image or an image without detections
df = df.loc[~df['box_score'].isin(['IMAGE CORRUPT', 'NO DETECTIONS FOR IMAGE'])]

In [None]:
# (rows, columns) remaining after the box_score filtering
df.shape

In [None]:
# Number of rows for each predicted order_name
df['order_name'].value_counts()

In [None]:
# Number of rows for each binary class_name prediction
df['class_name'].value_counts()

In [12]:
# Rebuild crop_status from the crop file name: base name of cropped_image_path
# with '.jpg' removed (a missing path becomes the string 'nan')
df['crop_status'] = [
    os.path.basename(str(path)).replace('.jpg', '')
    for path in df['cropped_image_path']
]

# Keep only the second '_'-separated token of the name; 'nan' is left untouched
df['crop_status'] = [
    stem if stem == 'nan' else stem.split('_')[1]
    for stem in df['crop_status']
]

In [None]:
# Keep the first row for each distinct bounding box (coordinates + area), then renumber the rows
box_columns = ['x_min', 'x_max', 'y_min', 'y_max', 'crop_area']
df = df.drop_duplicates(subset=box_columns).reset_index(drop=True)
df.head()

In [14]:
# define order moth as moth if Lepidoptera in order_name, else nonmoth.
# na=False: without it a missing order_name yields NaN, which np.where treats
# as truthy, so rows with no order prediction would be labelled 'moth'.
df['order_moth'] = np.where(df['order_name'].str.contains('Lepidoptera', na=False), 'moth', 'nonmoth')

# printed explicitly: a bare expression in the middle of a cell is never displayed
print(df[['order_name', 'order_moth']].value_counts())


# keep only the rows whose order prediction is Lepidoptera
df = df.loc[df['order_moth'] == 'moth', ]

# Data Exploration

In [None]:
# Cross-tabulate the binary prediction (rows: class_name) against the
# order-derived label (columns: order_moth), then express each row as percentages
df_grouped = df.groupby(['class_name', 'order_moth']).size().unstack()
df_percent = 100 * df_grouped.div(df_grouped.sum(axis=1), axis=0)

fig, ax = plt.subplots(figsize=(4, 4))
df_percent.plot(kind='bar', stacked=True, ax=ax)

# Write the percentage in the middle of every non-empty bar segment
for bar in ax.patches:
    bar_height = bar.get_height()
    if not bar_height > 0:  # skips empty (and NaN) segments
        continue
    left, bottom = bar.get_xy()
    centre = (left + bar.get_width() / 2, bottom + bar_height / 2)
    ax.annotate(f'{bar_height:.1f}%', centre,
                ha='center', va='center', fontsize=10, color='black')

# Titles, axis labels and legend
plt.title('Consistency between Order and Binary Prediction')
ax.set_ylabel('Percentage (%)')
ax.set_xlabel('Binary Prediction')
plt.legend(title='Order Prediction', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


In [None]:
# Bar chart of the 30 most frequently predicted species
species_counts = df['top_1_species'].value_counts()
species_counts.head(30).plot(kind='bar', figsize=(7, 5))

# slant the x labels so long species names stay readable
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.title('Most Popular Predicted Moth Species in Singapore')
plt.show()

In [None]:
# Plot the crop area distribution as a 100-bin histogram
crop_areas = df['crop_area']
crop_areas.plot(kind='hist', bins=100, figsize=(5, 3))

plt.xlabel('Crop Area (pixels squared)')
plt.title('Crop Area Distribution')
plt.show()

## Crop Confidence

In [None]:
# Overlay the confidence histograms of the five ranked predictions on one set of axes.
# The first call creates the figure; ranks 2-5 are drawn semi-transparent on top.
df['top_1_confidence'].plot(kind='hist', bins=20, figsize=(7, 5))
for rank, colour in zip(range(2, 6), ('orange', 'yellow', 'green', 'purple')):
    df[f'top_{rank}_confidence'].plot(kind='hist', bins=20, color=colour, alpha=0.5)

plt.legend([f'Top {rank} Confidence' for rank in range(1, 6)])
plt.title('Confidence Distribution of Top 5 Predictions')
plt.xlabel('Confidence')

## Crop Area

In [None]:
# 100-bin histogram of the bounding-box areas
crop_areas = df['crop_area']
crop_areas.plot(kind='hist', bins=100, figsize=(5, 3))
plt.xlabel('Crop Area (pixels squared)')
plt.title('Crop Area Distribution')
plt.show()

## Number of Crops per Image

In [None]:
# Histogram of how many rows (crops) each image contributes
crops_per_image = df['image_path'].value_counts()
crops_per_image.plot(kind='hist', bins=100, figsize=(5, 3))
plt.xlabel('Number of crops per image')
plt.title('Crops per image')
plt.show()

In [None]:
# Number of rows (crops) recorded for each image_path
df['image_path'].value_counts()

# Explore Instances of Moth Predictions

- Most confident predictions
- Most popular predictions
- Largest Moths
- Most populated images

## Most Confident Predictions

In [22]:
# Take the top_n rows with the highest top_1_confidence, counting each
# (image, bounding box) pair only once
top_n = 10

df_confident = (
    df.sort_values(by='top_1_confidence', ascending=False)
    .reset_index(drop=True)
    .drop_duplicates(subset=['image_path', 'x_min', 'y_min', 'x_max', 'y_max'])
    .head(top_n)
)

In [None]:
# Order the selection by image_path and renumber the rows from 0,
# so the row labels can be used as axes positions when plotting
df_confident = df_confident.sort_values(by='image_path').reset_index(drop=True)
df_confident

In [24]:
# Re-point download_dir at the combined2 folder; the cells below create one subfolder per section beneath it
download_dir='./data/singapore/combined2/'

In [27]:
# Fetch the snapshot image for each of the most confident predictions
# into <download_dir>/confident
confident_dir = os.path.join(download_dir, 'confident')
os.makedirs(confident_dir, exist_ok=True)

for key in df_confident.head(top_n)['keys']:
    download_images(key, confident_dir, 'sgp')

In [None]:
# Draw every selected image with its bounding boxes on a two-column grid.
# The grid is sized from the number of rows actually plotted, and axes are
# picked by position rather than by DataFrame index label, so the cell keeps
# working if top_n changes or df_confident has not been re-indexed.
examples = df_confident.head(top_n)
n_rows = max(1, math.ceil(len(examples) / 2))

fig, ax = plt.subplots(n_rows, 2, figsize=(10, 2 * n_rows))
ax = ax.ravel()

# for each row in df_confident, annotate its image and highlight that row's crop
for position, (_, row) in enumerate(examples.iterrows()):
    annotate_image(row['image_path'],
                os.path.join(download_dir, 'confident'),
                df,
                ax[position],
                crop_to_highlight=row['crop_status'],
                buffer=20, scaling_required=True)

# hide any panel left over (odd number of images)
for unused_ax in ax[len(examples):]:
    unused_ax.axis('off')

plt.tight_layout()
plt.show()

## Most Popular Predictions


In [None]:
# Create the folder for the most-popular-species images. exist_ok=True makes
# re-running the cell silent, and genuine failures (e.g. permissions) raise here
# instead of being printed and resurfacing later as download errors.
os.makedirs(os.path.join(download_dir, 'most_popular'), exist_ok=True)

In [36]:
def species_plot(df, species_name, n_examples):
    """
    Download and plot the most confident examples of one predicted species.

    Rows of ``df`` whose ``top_1_species`` equals ``species_name`` are ranked by
    ``top_1_confidence`` (highest first). The images of the first
    ``n_examples`` rows are downloaded into ``<download_dir>/most_popular``
    (module-level ``download_dir``, bucket ``'sgp'``) and drawn on a
    two-column grid with their bounding boxes.

    The same ``n_examples`` rows are used for downloading and for plotting, and
    the grid is sized from them, so the function no longer depends on the
    notebook-global ``top_n``.

    Args:
        df (pd.DataFrame): Predictions; also passed to ``annotate_image`` so
            every box on each image is drawn.
        species_name (str): Value of ``top_1_species`` to select.
        n_examples (int): Maximum number of examples to show.
    """
    df_species = (
        df.loc[df['top_1_species'] == species_name]
        .sort_values(by='top_1_confidence', ascending=False)
        .reset_index(drop=True)
    )
    examples = df_species.head(n_examples)
    image_dir = os.path.join(download_dir, 'most_popular')

    # several crops can come from the same image: fetch each key once
    for key in examples['keys'].unique():
        download_images(key, image_dir, 'sgp')

    n_rows = max(1, math.ceil(len(examples) / 2))
    fig, ax = plt.subplots(n_rows, 2, figsize=(10, 2 * n_rows))
    ax = ax.ravel()

    # one panel per example, highlighting that example's crop
    for position, (_, row) in enumerate(examples.iterrows()):
        annotate_image(row['image_path'],
                    image_dir,
                    df,
                    ax[position],
                    crop_to_highlight=row['crop_status'],
                    buffer=20, scaling_required=True)

    # hide any panel left over (fewer examples than grid cells)
    for unused_ax in ax[len(examples):]:
        unused_ax.axis('off')

    plt.tight_layout()
    plt.suptitle(species_name)
    plt.show()

In [None]:
# Names of the top_n_moths most frequently predicted species, most frequent first
top_n_moths = 5
species_counts = df['top_1_species'].value_counts()
species = species_counts.head(top_n_moths).index
for name in species:
    print(name)

In [None]:
# Examples for the most frequently predicted species
species_plot(df, species[0], 10)

In [None]:
# Examples for the second most frequently predicted species
species_plot(df, species[1], 10)

In [None]:
# Examples for the third most frequently predicted species
species_plot(df, species[2], 10)

## Largest Moths

In [None]:
# order by moth size
df_area = df.sort_values(by='crop_area', ascending=False)

# drop rows where bounding box and area identical
df_area = df_area.drop_duplicates(subset=['x_min', 'x_max', 'y_min', 'y_max', 'crop_area'])

# remove duplicated species
df_area = df_area.drop_duplicates(subset=['top_1_species'])
df_area.reset_index(drop=True, inplace=True)
top_n=50

df_area = df_area.head(top_n)
df_area

In [44]:
# Fetch the snapshot image for every row of df_area into <download_dir>/largest
largest_dir = os.path.join(download_dir, 'largest')
os.makedirs(largest_dir, exist_ok=True)

for key in df_area['keys']:
    download_images(key, largest_dir, 'sgp')

In [None]:
# Two-column grid with one panel per row of df_area; the row label (0..n-1
# after the earlier reset_index) selects the panel
n_rows = math.ceil(top_n / 2)
fig, ax = plt.subplots(n_rows, 2, figsize=(20, top_n * 2))
ax = ax.ravel()

largest_dir = os.path.join(download_dir, 'largest')

# for each row in df_area, draw its image and highlight that row's crop
for i, row in df_area.iterrows():
    annotate_image(
        row['image_path'],
        largest_dir,
        df,
        ax[i],
        scaling_required=True,
        crop_to_highlight=row['crop_status'],
        buffer=20,
    )

plt.tight_layout()
plt.show()