# Overview of Singapore Outputs

## Hidden Code

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from PIL import Image
import os
import subprocess
import math

In [2]:
def initialise_session(credentials_file="credentials.json"):
    """
    Build an S3 client from the settings stored in a JSON credentials file.

    The file is expected to provide the keys AWS_ACCESS_KEY_ID,
    AWS_SECRET_ACCESS_KEY, AWS_REGION and AWS_URL_ENDPOINT.

    Args:
        credentials_file (str): Path to the credentials JSON file.

    Returns:
        S3 client created from a boto3 session and bound to the configured
        endpoint URL.
    """
    with open(credentials_file, encoding="utf-8") as handle:
        settings = json.load(handle)

    # The session carries the key pair and region; the endpoint is set per client
    s3_session = boto3.Session(
        aws_access_key_id=settings["AWS_ACCESS_KEY_ID"],
        aws_secret_access_key=settings["AWS_SECRET_ACCESS_KEY"],
        region_name=settings["AWS_REGION"],
    )
    return s3_session.client("s3", endpoint_url=settings["AWS_URL_ENDPOINT"])

import boto3
import json
from boto3.s3.transfer import TransferConfig

# S3 client built from the local credentials file
client = initialise_session('./credentials.json')

# Tuning for S3 downloads: more parallel transfers and explicit buffer sizes
transfer_config = TransferConfig(
    multipart_threshold=8 * 1024 * 1024,  # 8 MB
    max_concurrency=20,  # raised number of concurrent transfers
    io_chunksize=256 * 1024,  # 256 KB
    max_io_queue=1000,
)

In [3]:
def download_images(key, download_dir, bucket_name):
    """
    Download one S3 object into ``download_dir``, keeping only its base file name.

    Args:
        key (str): Object key inside the bucket.
        download_dir (str): Local directory that receives the file.
        bucket_name (str): Name of the bucket that holds the object.
    """
    file_name = os.path.basename(key)
    target_path = os.path.join(download_dir, file_name)
    client.download_file(bucket_name, key, target_path, Config=transfer_config)

# Local folder that receives the downloaded snapshot images;
# created up front (including parents) and left untouched if it already exists
download_dir = './data/singapore/combined/downloaded_images'
os.makedirs(name=download_dir, exist_ok=True)

## Combine the outputs for each deployment

In [None]:
# Collect the deployment folders (directories whose name starts with 'dep')
# under ./data/singapore. Each one is an input for 05_combine_outputs.py,
# whose output is saved in ./data/singapore/combined
dirs = [
    os.path.join('./data/singapore', entry)
    for entry in os.listdir('./data/singapore')
    if os.path.isdir(os.path.join('./data/singapore', entry)) and entry.startswith('dep')
]

print(dirs)

In [5]:
# Only need to run once, commenting out
# for d in dirs[1:len(dirs)]:
#     print(d)
#     # run the script using subprocess:
#     subprocess.run(['python3', '05_combine_outputs.py',
#                     '--csv_file_pattern', f'{d}/dep*.csv',
#                     '--main_csv_file', f'./data/singapore/combined/{os.path.basename(d)}.csv'
#                     ])

## Create a moth prediction df

In [6]:
# For each per-deployment csv in ./data/singapore/combined (skipping earlier
# 'combined*' outputs), drop the rows where crop_status is
# 'NO DETECTIONS FOR IMAGE', keep the moth rows, and combine everything into one csv
csvs = os.listdir('./data/singapore/combined')
csvs = [c for c in csvs if c.endswith('.csv') and not c.startswith('combined')]

# Gather the per-deployment frames and concatenate once at the end: growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows each time
frames = []
for c in csvs:
    input_df = pd.read_csv(f'./data/singapore/combined/{c}')
    input_df = input_df.loc[input_df['crop_status'] != 'NO DETECTIONS FOR IMAGE']

    # moths only: rows without a species confidence are dropped.
    # The explicit copy gives an independent frame for the new columns below
    input_df = input_df.dropna(subset=['top_1_confidence']).copy()

    # Deployment name = csv file name up to the first dot
    dep_name = os.path.basename(c).split('.')[0]
    input_df['dep'] = dep_name
    input_df['crop_area'] = (input_df['x_max'] - input_df['x_min']) * (input_df['y_max'] - input_df['y_min'])

    # 'keys' combines the deployment name with the image file name; dep_name is
    # resolved once per file instead of being looked up again for every row
    input_df['keys'] = input_df['image_path'].apply(
        lambda x: f"{dep_name}/snapshot_images/{os.path.basename(x)}"
    )
    frames.append(input_df)

# pd.concat raises on an empty list, so fall back to an empty frame when no csv was found
df = pd.concat(frames) if frames else pd.DataFrame()

df.to_csv('./data/singapore/combined/combined_moths.csv', index=False)

In [7]:
# Define order_moth as 'moth' if 'Lepidoptera' is in order_name, else 'nonmoth'.
# na=False makes a missing order_name evaluate to False; without it the mask
# holds NaN for those rows, and NaN counts as true in np.where, which would
# label rows with no order at all as 'moth'
is_lepidoptera = df['order_name'].str.contains('Lepidoptera', na=False)
df['order_moth'] = np.where(is_lepidoptera, 'moth', 'nonmoth')

# Data Exploration

In [None]:
# Count rows per (class_name, order_moth) pair, then express each class_name
# row as percentages of its own total
df_grouped = df.groupby(['class_name', 'order_moth']).size().unstack()
row_totals = df_grouped.sum(axis=1)
df_percent = df_grouped.div(row_totals, axis=0) * 100

fig, ax = plt.subplots(figsize=(4, 4))
df_percent.plot(kind='bar', stacked=True, ax=ax)

# Write the percentage in the middle of every bar segment with a positive height
for p in ax.patches:
    height = p.get_height()
    if not height > 0:
        continue
    x, y = p.get_xy()
    width = p.get_width()
    ax.annotate(f'{height:.1f}%', (x + width / 2, y + height / 2),
                ha='center', va='center', fontsize=10, color='black')

# Title, axis labels and a legend placed outside the plot area
ax.set_title('Consistency between Order and Binary Prediction')
ax.set_ylabel('Percentage (%)')
ax.set_xlabel('Binary Prediction')
ax.legend(title='Order Prediction', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


In [None]:
# Bar chart of the 30 most frequent top-1 species predictions
df['top_1_species'].value_counts().head(30).plot.bar(figsize=(7, 5))

# Slant the species names so they stay readable
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.title('Most Popular Predicted Moth Species in Singapore')
plt.show()

In [None]:
# Plot the crop area distribution as a 100-bin histogram
df['crop_area'].plot.hist(bins=100, figsize=(5, 3))
plt.title('Crop Area Distribution')
plt.xlabel('Crop Area (pixels squared)')
plt.show()

## Crop Confidence

In [None]:
# Overlay the confidence histograms of the five ranked predictions:
# top 1 sets up the figure, ranks 2-5 are drawn semi-transparent on top of it
df['top_1_confidence'].plot(kind='hist', bins=20, figsize=(7, 5))
for rank, colour in zip(range(2, 6), ['orange', 'yellow', 'green', 'purple']):
    df[f'top_{rank}_confidence'].plot(kind='hist', bins=20, color=colour, alpha=0.5)

plt.legend([f'Top {rank} Confidence' for rank in range(1, 6)])
plt.title('Confidence Distribution of Top 5 Predictions')
plt.xlabel('Confidence')

# Explore Instances of Moth Predictions

- Most confident predictions
- Most popular predictions
- Largest Moths

## Most Confident Predictions

In [15]:
# Rank all detections by top_1_confidence, highest first, and renumber the
# rows 0..n-1 so that position and index label coincide
df_confident = df.sort_values(by='top_1_confidence', ascending=False).reset_index(drop=True)

In [16]:
# Fetch the snapshot images behind the 10 most confident detections
for image_key in df_confident['keys'].head(10):
    download_images(image_key, download_dir, 'sgp')

In [None]:
# Plot the images for the 10 most confident detections in a 5 x 2 grid
fig, ax = plt.subplots(5, 2, figsize=(20, 20))
ax = ax.ravel()

# df_confident was re-indexed 0..n-1, so .loc[i] is the i-th most confident row.
# Stop at the number of rows available instead of raising KeyError on a short table
n_shown = min(10, len(df_confident))
for i in range(n_shown):
    row = df_confident.loc[i]
    image_file = f"{download_dir}/{os.path.basename(row['keys'])}"

    img = plt.imread(image_file)
    ax[i].imshow(img, origin='lower')

    # Only the dimensions are needed, so read them without decoding the image
    # a second time; the context manager closes the file again
    with Image.open(image_file) as image:
        original_width, original_height = image.size

    # Bounding box corners rescaled by 300 / image dimension.
    # NOTE(review): scaling kept exactly as before - confirm the coordinate
    # convention of x_min..y_max against the pipeline that writes the csv
    x_min = row['x_min'] * 300 / original_width
    y_min = row['y_min'] * 300 / original_height
    x_max = row['x_max'] * 300 / original_width
    y_max = row['y_max'] * 300 / original_height

    x = int(x_min)
    y = int(y_min)
    w = int(x_max - x_min)
    h = int(y_max - y_min)

    rect = plt.Rectangle((x, y), w, h, fill=False, edgecolor='red', linewidth=2)
    ax[i].add_patch(rect)

    ax[i].set_title(row['top_1_species'])
    ax[i].axis('off')
    ax[i].invert_yaxis()

# Panels without a detection are blanked rather than shown as empty axes
for unused in ax[n_shown:]:
    unused.axis('off')

plt.show()

## Most Popular Predictions


In [18]:
def species_plot(df, species_name, n_examples):
    """
    Plot the most confident detections of one species with their bounding boxes.

    Rows of ``df`` whose ``top_1_species`` equals ``species_name`` are ordered by
    ``top_1_confidence`` (highest first). The snapshot of each of the first
    ``n_examples`` rows is downloaded from the 'sgp' bucket into ``download_dir``
    and drawn in a two-column grid, titled with its confidence.

    Args:
        df (pd.DataFrame): Detections with the columns top_1_species,
            top_1_confidence, keys, x_min, y_min, x_max and y_max.
        species_name (str): Value of top_1_species to plot.
        n_examples (int): Maximum number of examples to show. When fewer rows
            match, only those are drawn and the remaining panels stay blank.

    Returns:
        None. The figure is displayed with plt.show().
    """
    df_species = df.loc[df['top_1_species'] == species_name]
    df_species = df_species.sort_values(by='top_1_confidence', ascending=False)
    df_species = df_species.reset_index(drop=True)

    # Never ask for more rows than exist: .loc[i] below would raise KeyError
    n_shown = min(n_examples, len(df_species))

    for key in df_species['keys'].head(n_shown):
        download_images(key, download_dir, 'sgp')

    # Grid size still follows n_examples so the layout matches between species
    fig, ax = plt.subplots(math.ceil(n_examples/2), 2, figsize=(20, n_examples*2))
    ax = ax.ravel()

    # The index was reset above, so .loc[i] is the i-th most confident row
    for i in range(n_shown):
        row = df_species.loc[i]
        image_file = f"{download_dir}/{os.path.basename(row['keys'])}"

        img = plt.imread(image_file)
        ax[i].imshow(img, origin='lower')

        # Only the dimensions are needed, so read them without decoding the
        # image a second time; the context manager closes the file again
        with Image.open(image_file) as image:
            original_width, original_height = image.size

        # Bounding box corners rescaled by 300 / image dimension.
        # NOTE(review): scaling kept exactly as before - confirm the coordinate
        # convention of x_min..y_max against the pipeline that writes the csv
        x_min = row['x_min'] * 300 / original_width
        y_min = row['y_min'] * 300 / original_height
        x_max = row['x_max'] * 300 / original_width
        y_max = row['y_max'] * 300 / original_height

        x = int(x_min)
        y = int(y_min)
        w = int(x_max - x_min)
        h = int(y_max - y_min)

        rect = plt.Rectangle((x, y), w, h, fill=False, edgecolor='red', linewidth=2)
        ax[i].add_patch(rect)

        ax[i].axis('off')
        ax[i].set_title(f"{row['top_1_confidence']:.2f}")
        ax[i].invert_yaxis()

    # Blank the panels that received no image (odd n_examples or too few rows)
    for unused in ax[max(n_shown, 0):]:
        unused.axis('off')

    plt.suptitle(species_name)
    plt.show()

In [None]:
# Names of the most frequently predicted species, most common first
top_n_moths = 5
species_counts = df['top_1_species'].value_counts()
species = species_counts.head(top_n_moths).index
print(species)

In [None]:
# Up to 10 highest-confidence examples of the most frequently predicted species
species_plot(df, species[0], 10)

In [None]:
# Up to 10 highest-confidence examples of the second most frequently predicted species
species_plot(df, species[1], 10)

In [None]:
# Up to 10 highest-confidence examples of the third most frequently predicted species
species_plot(df, species[2], 10)

## Largest Moths

In [21]:
# Number of largest crops to download and display
top_n = 20

# Rank detections by crop area, largest first, and renumber the rows 0..n-1
df_area = df.sort_values(by='crop_area', ascending=False).reset_index(drop=True)

In [22]:
# Fetch the snapshot images behind the top_n largest crops
for image_key in df_area['keys'].head(top_n):
    download_images(image_key, download_dir, 'sgp')

In [None]:
# Plot the images for the top_n largest crops in a two-column grid
fig, ax = plt.subplots(math.ceil(top_n/2), 2, figsize=(20, top_n*2))
ax = ax.ravel()

# df_area was re-indexed 0..n-1, so .loc[i] is the i-th largest crop.
# Stop at the number of rows available instead of raising KeyError on a short table
n_shown = min(top_n, len(df_area))
for i in range(n_shown):
    row = df_area.loc[i]
    image_file = f"{download_dir}/{os.path.basename(row['keys'])}"

    img = plt.imread(image_file)
    ax[i].imshow(img, origin='lower')

    # Only the dimensions are needed, so read them without decoding the image
    # a second time; the context manager closes the file again
    with Image.open(image_file) as image:
        original_width, original_height = image.size

    # Bounding box corners rescaled by 300 / image dimension.
    # NOTE(review): scaling kept exactly as before - confirm the coordinate
    # convention of x_min..y_max against the pipeline that writes the csv
    x_min = row['x_min'] * 300 / original_width
    y_min = row['y_min'] * 300 / original_height
    x_max = row['x_max'] * 300 / original_width
    y_max = row['y_max'] * 300 / original_height

    x = int(x_min)
    y = int(y_min)
    w = int(x_max - x_min)
    h = int(y_max - y_min)

    rect = plt.Rectangle((x, y), w, h, fill=False, edgecolor='red', linewidth=2)
    ax[i].add_patch(rect)

    ax[i].set_title(row['top_1_species'])
    ax[i].axis('off')
    ax[i].invert_yaxis()

# Blank the panels that received no image (odd top_n or too few rows)
for unused in ax[n_shown:]:
    unused.axis('off')

plt.show()