# Overview of UK Outputs

## Hidden Code

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from PIL import Image
import os
import subprocess
import math

In [2]:
def initialise_session(credentials_file="credentials.json"):
    """
    Build an S3 client from the settings stored in a JSON credentials file.

    Args:
        credentials_file (str): Location of the JSON file. The keys read from it
            are AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_REGION and
            AWS_URL_ENDPOINT.

    Returns:
        The "s3" client created from a boto3 session configured with those
        settings and pointed at the configured endpoint URL.
    """
    with open(credentials_file, encoding="utf-8") as handle:
        settings = json.load(handle)

    # Session carries the key pair and region; the endpoint is set on the client.
    return boto3.Session(
        aws_access_key_id=settings["AWS_ACCESS_KEY_ID"],
        aws_secret_access_key=settings["AWS_SECRET_ACCESS_KEY"],
        region_name=settings["AWS_REGION"],
    ).client("s3", endpoint_url=settings["AWS_URL_ENDPOINT"])

import boto3
import json
from boto3.s3.transfer import TransferConfig

# S3 client shared by every download in this notebook.
client = initialise_session('./credentials.json')

# Transfer settings passed to each S3 download.
transfer_config = TransferConfig(
    max_concurrency=20,                  # number of concurrent transfer threads
    multipart_threshold=8 * 1024 ** 2,   # 8MB
    max_io_queue=1000,
    io_chunksize=256 * 1024,             # 256KB
)

In [3]:
def download_images(key, download_dir, bucket_name):
    """
    Download a single S3 object into a local directory.

    The local file keeps only the base name of ``key``; any prefix in the key
    is dropped. Uses the notebook-level ``client`` and ``transfer_config``.
    """
    file_name = os.path.basename(key)
    client.download_file(
        bucket_name,
        key,
        os.path.join(download_dir, file_name),
        Config=transfer_config,
    )

# Local root folder for images fetched from S3; created here if it does not
# already exist. Later cells create one sub-folder per analysis beneath it.
download_dir = './data/uk/downloaded_images'
os.makedirs(download_dir, exist_ok=True)

In [38]:
def annotate_image(image_path, dir, df, ax, scaling_required=False, crop_to_highlight=None, buffer=5):
    """
    Draw an image on ``ax`` and overlay the bounding boxes recorded for it in ``df``.

    Box colour encodes agreement between the binary prediction (``class_name``)
    and the order prediction (``order_name``):

        orange - class is "moth", order does not contain "Lepidoptera"
        purple - class is not "moth", order contains "Lepidoptera"
        green  - class is "moth" and order contains "Lepidoptera"
        red    - neither

    Boxes that are "moth" or Lepidoptera are also labelled with the top-1
    species and its confidence.

    Args:
        image_path: Value matched against ``df['image_path']``. Only its base
            name is used to locate the file inside ``dir``.
        dir: Directory containing the downloaded image file.
        df: DataFrame with columns image_path, x_min, y_min, x_max, y_max,
            class_name, order_name, top_1_species, top_1_confidence, and
            crop_status (read only when ``crop_to_highlight`` is given).
        ax: Matplotlib axes to draw on.
        scaling_required: If True, the (buffered) box coordinates are multiplied
            by 300 / image width (x) or 300 / image height (y).
            NOTE(review): the meaning of 300 is not visible here - presumably a
            model input size; confirm.
        crop_to_highlight: If given, boxes whose ``crop_status`` differs from
            this value are drawn with alpha 0.2 instead of 1.
        buffer: Padding added on every side of each box, applied before any scaling.

    Returns:
        The image artist returned by ``ax.imshow``.
    """
    local_file = f"{dir}/{os.path.basename(image_path)}"
    df_image = df.loc[df['image_path'] == image_path]

    # origin='lower' places the first image row at the bottom of the axes.
    subp = ax.imshow(plt.imread(local_file), origin='lower')

    # The pixel dimensions are only needed for rescaling. Read them once, without
    # decoding the whole image a second time, and release the file handle at once.
    original_width = original_height = None
    if scaling_required:
        with Image.open(local_file) as pil_image:
            original_width, original_height = pil_image.size

    colour_by_prediction = {
        # (class is moth, order is Lepidoptera): colour
        (True, False): 'orange',
        (False, True): 'purple',
        (True, True): 'green',
        (False, False): 'red',
    }

    for _, row in df_image.iterrows():
        x_min = row['x_min'] - buffer
        y_min = row['y_min'] - buffer
        x_max = row['x_max'] + buffer
        y_max = row['y_max'] + buffer

        if scaling_required:
            x_min = x_min * 300 / original_width
            y_min = y_min * 300 / original_height
            x_max = x_max * 300 / original_width
            y_max = y_max * 300 / original_height

        x = int(x_min)
        y = int(y_min)
        w = int(x_max - x_min)
        h = int(y_max - y_min)

        # A missing order_name (NaN) is not a string; treat it as "not Lepidoptera"
        # rather than letting the `in` test raise TypeError.
        order_name = row['order_name']
        is_lepidoptera = isinstance(order_name, str) and 'Lepidoptera' in order_name
        is_moth = bool(row['class_name'] == "moth")
        col = colour_by_prediction[(is_moth, is_lepidoptera)]

        # Fade every box except the crop being highlighted (when one is requested).
        alph = 1
        if crop_to_highlight is not None and row['crop_status'] != crop_to_highlight:
            alph = 0.2

        if is_moth or is_lepidoptera:
            ax.text(x_min, y_max,
                    f"{row['top_1_species']}: {row['top_1_confidence']:.2f}",
                    color=col,
                    fontsize=5, alpha=alph,
                    verticalalignment="bottom")

        rect = plt.Rectangle((x, y), w, h, fill=False, edgecolor=col, linewidth=1, alpha=alph)
        ax.add_patch(rect)

    ax.set_title(f"{os.path.basename(image_path)}")
    ax.axis('off')
    return subp

## Combine the outputs for each deployment

In [None]:
# For each UK data directory listed in `all_dirs` below, collect the paths of the
# per-deployment CSVs (files named dep*...csv, presumably written by 05_combine_outputs.py).

def dir_set(root_dir):
    """
    List the deployment CSV files directly inside ``root_dir``.

    An entry is kept when its name starts with 'dep' and ends with 'csv'.

    Args:
        root_dir (str): Directory to scan (not recursive).

    Returns:
        list[str]: ``root_dir`` joined with each matching name, sorted by name.
        os.listdir() returns entries in arbitrary order, so sorting keeps the
        order (and anything derived from it downstream) reproducible.
    """
    return [
        os.path.join(root_dir, name)
        for name in sorted(os.listdir(root_dir))
        if name.startswith('dep') and name.endswith('csv')
    ]

# Gather the deployment CSV paths from each UK data folder, in this order.
# ('./data/farralia/dep000080' is currently left out.)
uk_data_roots = ['./data/nettlebed', './data/solar/gbr']
all_dirs = [csv_path for root in uk_data_roots for csv_path in dir_set(root)]

print(all_dirs)

In [11]:
# Read every deployment CSV and tag its rows with the deployment id, taken from
# the file name (the text before the first '.', then before the first '_').
frames = []
for csv_path in all_dirs:
    frame = pd.read_csv(csv_path)
    frame['dep'] = os.path.basename(csv_path).split('.')[0].split('_')[0]
    frames.append(frame)

# Concatenate once instead of growing the frame inside the loop, which re-copies
# all previously read rows on every iteration. pd.concat() rejects an empty list,
# so fall back to an empty frame when no CSVs were found.
combined = pd.concat(frames) if frames else pd.DataFrame()

# Keep only unique rows.
combined = combined.drop_duplicates()


In [None]:
# Remove rows whose box_score holds one of these status strings instead of a
# score (presumably placeholders for images without usable detections).
status_messages = ['IMAGE CORRUPT', 'NO DETECTIONS FOR IMAGE']
combined = combined.loc[~combined['box_score'].isin(status_messages)]

In [22]:
# Derive crop_status for every row from the crop's file name: the second
# '_'-separated token of the base name, with '.jpg' removed.
# Note: this overwrites any crop_status values already present.
crop_file_names = (os.path.basename(path) for path in combined['cropped_image_path'])
combined['crop_status'] = [name.split('_')[1].replace('.jpg', '') for name in crop_file_names]

In [None]:
# Collapse rows that share the same bounding box and crop area, then renumber
# the rows from 0 and preview the result.
# NOTE(review): image_path is not part of the subset, so identical boxes from
# different images are also treated as duplicates - confirm this is intended.
duplicate_keys = ['x_min', 'x_max', 'y_min', 'y_max', 'crop_area']
combined = combined.drop_duplicates(subset=duplicate_keys).reset_index(drop=True)
combined.head()

In [None]:
# Number of rows per deployment.
combined['dep'].value_counts()

## Create a moth prediction df

In [None]:
# Keep only rows that have a species prediction (non-null top_1_confidence).
# .copy() makes df an independent frame, so the column assignments below do not
# raise pandas' SettingWithCopyWarning or risk writing to a temporary object.
df = combined.dropna(subset=['top_1_confidence']).copy()

# Bounding-box area (width * height of the box).
df['crop_area'] = (df['x_max'] - df['x_min']) * (df['y_max'] - df['y_min'])

# S3 object key of each source image: '<dep>/snapshot_images/<image file name>'.
df['keys'] = df['dep'] + '/snapshot_images/' + df['image_path'].map(os.path.basename)

In [None]:
# Label each row from its order prediction: 'moth' when order_name contains
# 'Lepidoptera', otherwise 'nonmoth'.
# na=False makes rows with a missing order_name evaluate to False. Without it,
# str.contains returns NaN for them, which np.where treats as truthy and so
# labels them 'moth'.
df['order_moth'] = np.where(df['order_name'].str.contains('Lepidoptera', na=False), 'moth', 'nonmoth')

In [None]:
# Sanity check: count the rows for each (order_name, order_moth) pair to confirm
# the order-to-label mapping behaves as expected.
df[['order_name', 'order_moth']].value_counts()

# Data Exploration

In [None]:
# Cross-tabulate the binary prediction (rows) against the order-derived label
# (columns), then express each row as a percentage of that row's total.
df_grouped = df.groupby(['class_name', 'order_moth']).size().unstack()
row_totals = df_grouped.sum(axis=1)
df_percent = df_grouped.div(row_totals, axis=0) * 100

fig, ax = plt.subplots(figsize=(4, 4))
df_percent.plot(kind='bar', stacked=True, ax=ax)

# Write the percentage in the middle of every bar segment that has some height.
for patch in ax.patches:
    height = patch.get_height()
    if not height > 0:  # skips empty (and NaN) segments
        continue
    left, bottom = patch.get_xy()
    ax.annotate(f'{height:.1f}%',
                (left + patch.get_width() / 2, bottom + height / 2),
                ha='center', va='center', fontsize=10, color='black')

# Titles, axis labels and a legend placed outside the plotting area.
ax.set_title('Consistency between Order and Binary Prediction')
ax.set_ylabel('Percentage (%)')
ax.set_xlabel('Binary Prediction')
ax.legend(title='Order Prediction', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


In [None]:
# Bar chart of the 30 most frequently predicted top-1 species.
species_counts = df['top_1_species'].value_counts()
species_counts.head(30).plot.bar(figsize=(7, 5))

# Slant the species names so they remain readable.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.title('Most Popular Predicted Moth Species in the UK')
plt.show()

## Crop Confidence

In [None]:
# Overlay the confidence histograms of the five ranked predictions on one axes.
# The top-1 histogram sets the figure size and uses the default colour; the
# remaining ranks are drawn semi-transparent on top, in rank order.
df['top_1_confidence'].plot(kind='hist', bins=20, figsize=(7, 5))

overlay_colours = {
    'top_2_confidence': 'orange',
    'top_3_confidence': 'yellow',
    'top_4_confidence': 'green',
    'top_5_confidence': 'purple',
}
for column, colour in overlay_colours.items():
    df[column].plot(kind='hist', bins=20, color=colour, alpha=0.5)

plt.legend(['Top 1 Confidence', 'Top 2 Confidence', 'Top 3 Confidence', 'Top 4 Confidence', 'Top 5 Confidence'])
plt.title('Confidence Distribution of Top 5 Predictions')
plt.xlabel('Confidence')

## Crop Area

In [None]:
# Distribution of bounding-box areas across all predictions.
df['crop_area'].plot.hist(bins=100, figsize=(5, 3))
plt.title('Crop Area Distribution')
plt.xlabel('Crop Area (pixels squared)')
plt.show()

## Number of Crops per Image

In [None]:
# How many rows (crops) each image contributes, shown as a histogram.
crops_per_image = df['image_path'].value_counts()
crops_per_image.plot.hist(bins=50, figsize=(5, 3))
plt.title('Crops per image')
plt.xlabel('Number of crops per image')
plt.show()

# Explore Instances of Moth Predictions

- Most confident predictions
- Most popular predictions
- Largest Moths
- Most populated images

## Most Confident Predictions

In [33]:
# Rank all predictions from most to least confident, renumber the rows, then
# keep the first (most confident) row for each distinct image + bounding box.
df_confident = (
    df.sort_values(by='top_1_confidence', ascending=False)
    .reset_index(drop=True)
    .drop_duplicates(subset=['image_path', 'x_min', 'y_min', 'x_max', 'y_max'])
)

top_n = 10

In [34]:
# Keep only the top_n most confident rows.
df_confident = df_confident.head(top_n)

In [35]:
# Download the source image behind each of the top predictions.
# Several crops can come from the same image, so de-duplicate the keys first
# to avoid fetching the same S3 object more than once.
confident_dir = os.path.join(download_dir, 'confident')
os.makedirs(confident_dir, exist_ok=True)

for key in df_confident.head(top_n)['keys'].unique():
    download_images(key, confident_dir, 'gbr')

In [None]:
# Order the selected rows by image_path, renumber them from 0 and display the table.
df_confident = df_confident.sort_values(by='image_path').reset_index(drop=True)
df_confident

In [None]:
# Plot the most confident predictions on a 5 x 2 grid of panels.
fig, ax = plt.subplots(5, 2, figsize=(10, 10))
ax = ax.ravel()

confident_dir = os.path.join(download_dir, 'confident')

# Each row's index label selects its panel. The row's own crop is drawn opaque
# and the other boxes recorded for the same image are faded.
for panel, record in df_confident.head(top_n).iterrows():
    annotate_image(
        record['image_path'],
        confident_dir,
        df,
        ax[panel],
        crop_to_highlight=record['crop_status'],
        buffer=20,
    )

plt.tight_layout()
plt.show()

## Most Popular Predictions


In [None]:
# Create the folder for the 'most popular species' images.
# os.mkdir() has no exist_ok parameter (passing it raises TypeError);
# os.makedirs() accepts it and is a no-op when the folder already exists.
os.makedirs(os.path.join(download_dir, 'most_popular'), exist_ok=True)

In [47]:
def species_plot(df, species_name, n_examples):
    """
    Download and plot the most confident examples predicted as one species.

    Args:
        df: Predictions with columns top_1_species, top_1_confidence, keys,
            image_path and crop_status, plus the columns annotate_image() reads.
        species_name: Value of ``top_1_species`` to show.
        n_examples: Maximum number of examples to download and plot.

    The images are saved under ``<download_dir>/most_popular`` and drawn on a
    two-column grid sized to the number of examples, titled with the species name.
    """
    target_dir = os.path.join(download_dir, 'most_popular')
    os.makedirs(target_dir, exist_ok=True)

    # Most confident predictions of this species first, limited to n_examples
    # and renumbered so the row label doubles as the panel position.
    df_species = df.loc[df['top_1_species'] == species_name]
    df_species = df_species.sort_values(by='top_1_confidence', ascending=False)
    df_species = df_species.head(n_examples).reset_index(drop=True)

    # Several crops may share a source image; fetch each image only once.
    for key in df_species['keys'].unique():
        download_images(key, target_dir, 'gbr')

    # Size the grid from the rows actually plotted (previously a fixed 5 x 2 grid
    # indexed via the global top_n, which ignored n_examples).
    n_rows = max(1, math.ceil(len(df_species) / 2))
    fig, ax = plt.subplots(n_rows, 2, figsize=(10, 2 * n_rows), squeeze=False)
    ax = ax.ravel()

    for i, row in df_species.iterrows():
        annotate_image(row['image_path'],
                       target_dir,
                       df,
                       ax[i],
                       crop_to_highlight=row['crop_status'],
                       buffer=20)

    # Hide any panels left without an image.
    for unused in ax[len(df_species):]:
        unused.axis('off')

    # Set the title before showing; adding it after plt.show() would put it on
    # a new, empty figure.
    fig.suptitle(species_name)
    plt.tight_layout()
    plt.show()

In [None]:
# Names of the top_n_moths most frequently predicted species, most common first.
top_n_moths = 5
species = df['top_1_species'].value_counts().head(top_n_moths).index
for species_name in species:
    print(species_name)

In [None]:
# Examples for the most frequently predicted species (n_examples=10).
species_plot(df, species[0], 10)

In [None]:
# Examples for the second most frequently predicted species (n_examples=10).
species_plot(df, species[1], 10)

In [None]:
# Examples for the third most frequently predicted species (n_examples=10).
species_plot(df, species[2], 10)

## Largest Moths

In [69]:
# Rank predictions from largest to smallest crop area, keep one row per distinct
# bounding box + area, and renumber the rows from 0.
df_area = (
    df.sort_values(by='crop_area', ascending=False)
    .drop_duplicates(subset=['x_min', 'x_max', 'y_min', 'y_max', 'crop_area'])
    .reset_index(drop=True)
)

top_n = 20

In [None]:
# Keep the top_n largest crops and display them.
df_area = df_area.head(top_n)
df_area

In [None]:
# Folder for the images containing the largest crops.
largest_dir = os.path.join(download_dir, 'largest')
os.makedirs(largest_dir, exist_ok=True)

# Download each source image once; several of the largest crops can come from
# the same image, so de-duplicate the keys before fetching.
for key in df_area['keys'].unique():
    print(key)
    download_images(key, largest_dir, 'gbr')

In [None]:
# Two-column grid with enough rows for top_n panels.
n_rows = math.ceil(top_n / 2)
fig, ax = plt.subplots(n_rows, 2, figsize=(20, top_n * 2))
ax = ax.ravel()

largest_dir = os.path.join(download_dir, 'largest')

# One panel per row of df_area, selected by the row's index label; that row's
# crop is drawn opaque and the image's other boxes are faded.
for panel, record in df_area.iterrows():
    annotate_image(
        record['image_path'],
        largest_dir,
        df,
        ax[panel],
        crop_to_highlight=record['crop_status'],
        buffer=20,
    )

plt.tight_layout()
plt.show()

## Largest moths with confidence

In [73]:
# Largest crops first, one row per distinct bounding box + area, restricted to
# predictions whose top-1 confidence exceeds 0.7; rows renumbered from 0.
df_area = (
    df.sort_values(by='crop_area', ascending=False)
    .drop_duplicates(subset=['x_min', 'x_max', 'y_min', 'y_max', 'crop_area'])
    .loc[lambda frame: frame['top_1_confidence'] > 0.7]
    .reset_index(drop=True)
)

top_n = 20

In [74]:
# Keep the top_n largest crops among those that passed the confidence filter.
df_area = df_area.head(top_n)

In [None]:
# Folder for the images containing the largest crops (shared with the previous section).
largest_dir = os.path.join(download_dir, 'largest')
os.makedirs(largest_dir, exist_ok=True)

# Download each source image once; de-duplicate the keys so an image holding
# several of the selected crops is not fetched repeatedly.
for key in df_area['keys'].unique():
    download_images(key, largest_dir, 'gbr')

In [None]:
# Two-column grid with enough rows for top_n panels.
n_rows = math.ceil(top_n / 2)
fig, ax = plt.subplots(n_rows, 2, figsize=(20, top_n * 2))
ax = ax.ravel()

largest_dir = os.path.join(download_dir, 'largest')

# One panel per row of df_area, selected by the row's index label; that row's
# crop is drawn opaque and the image's other boxes are faded.
for panel, record in df_area.iterrows():
    annotate_image(
        record['image_path'],
        largest_dir,
        df,
        ax[panel],
        crop_to_highlight=record['crop_status'],
        buffer=20,
    )

plt.tight_layout()
plt.show()

## Most Populated Images

In [None]:
top_n = 5

# The top_n image paths with the most rows in df, followed by every row that
# belongs to one of those images.
most_pop = df['image_path'].value_counts().index[:top_n]
df_pop = df[df['image_path'].isin(most_pop)]
df_pop.head()

In [None]:
# Make the target folder, then fetch each distinct image key once.
populated_dir = os.path.join(download_dir, 'most_populated')
os.makedirs(populated_dir, exist_ok=True)
for key in set(df_pop['keys']):
    print(key)
    download_images(key, populated_dir, 'gbr')

In [None]:
# 3 x 2 grid: one panel per most-populated image, with the last panel (ax[5])
# reserved for the legend.
fig, ax = plt.subplots(3, 2, figsize=(10, 10))
ax = ax.ravel()

# For each most_pop image, draw it with all of its boxes.
populated_dir = os.path.join(download_dir, 'most_populated')
for i, image_path in enumerate(most_pop):
    annotate_image(image_path, populated_dir, df_pop, ax[i])

# Legend entries read "<binary prediction>, <order prediction>" and must match
# the colour rules in annotate_image():
#   orange = class "moth" but order not Lepidoptera
#   purple = class not "moth" but order Lepidoptera
# (the orange and purple labels were previously swapped).
legend_entries = [
    ('orange', 'Moth, Non-Lepidoptera'),
    ('purple', 'Non-Moth, Lepidoptera'),
    ('green', 'Moth, Lepidoptera'),
    ('red', 'Non-Moth, Non-Lepidoptera'),
]
ax[5].legend(handles=[plt.Line2D([0], [0], color=colour, lw=4, label=label)
                      for colour, label in legend_entries],
             loc='upper left')
ax[5].axis('off')
plt.tight_layout()
plt.show()