# Regional QC

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import boto3
import json
from boto3.s3.transfer import TransferConfig
from PIL import Image
import seaborn as sns
import numpy as np
from datetime import datetime, timedelta, time
import matplotlib.dates as mdates
import numpy as np
from matplotlib.colors import LogNorm, ListedColormap
from tqdm import tqdm

os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

In [None]:
# set the working directory
os.chdir(os.path.expanduser('~/amber-inferences'))

In [None]:
region='cri'
country='costarica'
download_dir=f'./data/qc_plots/{country}'
os.makedirs(download_dir, exist_ok=True)

inference_dir = os.path.abspath(f'./data/{country}_inferences/')

#listdir recursively
def listdir_recursive(path):
    """Lazily yield the path of every file found beneath ``path``.

    The tree is traversed with ``os.walk``; each yielded value is the
    directory reported by the walk joined with the file name.
    """
    for parent, _subdirs, filenames in os.walk(path):
        yield from (os.path.join(parent, filename) for filename in filenames)

# Get all csv files in the inference directory
inference_csvs = list(listdir_recursive(inference_dir))
inference_csvs = [c for c in inference_csvs if c.endswith('.csv')]

In [None]:
len(inference_csvs)

## Plotting and Data Wrangling Functions

In [None]:
def annotate_image(image_path, dir, df, ax, scaling_required=False, crop_to_highlight=None, buffer=5, subtitle=None):
    """Show an image on ``ax`` and overlay the bounding boxes recorded for it.

    Args:
        image_path: Value matched against ``df['image_path']``. Its basename
            is the file name read from ``dir``.
        dir: Directory containing the (downloaded) image file.
        df: Crop table. Rows whose ``image_path`` equals ``image_path`` are
            drawn; the columns read are ``x_min``, ``y_min``, ``x_max``,
            ``y_max``, ``class_name``, ``order_name``, ``top_1_species``,
            ``top_1_confidence`` and, when ``crop_to_highlight`` is given,
            ``crop_status``.
        ax: Matplotlib axes to draw on.
        scaling_required: When true, box coordinates are rescaled by
            ``300 / width`` and ``300 / height`` of the image file.
        crop_to_highlight: If not None, boxes whose ``crop_status`` differs
            from this value are drawn at alpha 0.2.
        buffer: Padding added to every side of each box.
        subtitle: Axes title; defaults to the image's basename.

    Returns:
        The image artist returned by ``ax.imshow``.
    """
    source_path = f"{dir}/{os.path.basename(image_path)}"
    df_image = df.loc[df['image_path'] == image_path]

    subp = ax.imshow(plt.imread(source_path), origin='lower')

    if scaling_required:
        # Only the pixel dimensions are needed; close the file straight away.
        with Image.open(source_path) as source_image:
            original_width, original_height = source_image.size

    # Box colour keyed by (detector says moth, order classifier says Lepidoptera).
    colours = {
        (True, False): 'orange',
        (False, True): 'purple',
        (True, True): 'green',
        (False, False): 'red',
    }

    for _, row in df_image.iterrows():
        x_min = row['x_min'] - buffer
        y_min = row['y_min'] - buffer
        x_max = row['x_max'] + buffer
        y_max = row['y_max'] + buffer

        if scaling_required:
            x_min = x_min * 300 / original_width
            y_min = y_min * 300 / original_height
            x_max = x_max * 300 / original_width
            y_max = y_max * 300 / original_height

        x = int(x_min)
        y = int(y_min)
        w = int(x_max - x_min)
        h = int(y_max - y_min)

        is_moth = bool(row['class_name'] == "moth")
        # A missing order is read as NaN (a float), which would break ``in``.
        order_name = row['order_name']
        is_lepidoptera = isinstance(order_name, str) and 'Lepidoptera' in order_name
        col = colours[(is_moth, is_lepidoptera)]

        alph = 1
        if crop_to_highlight is not None and row['crop_status'] != crop_to_highlight:
            alph = 0.2

        if is_moth or is_lepidoptera:
            ax.text(x_min, y_max,
                    f"{row['top_1_species']}: {row['top_1_confidence']:.2f}",
                    color=col,
                    fontsize=5, alpha=alph,
                    verticalalignment="bottom")

        rect = plt.Rectangle((x, y), w, h, fill=False, edgecolor=col, linewidth=1, alpha=alph)
        ax.add_patch(rect)

    if not subtitle:
        subtitle = f"{os.path.basename(image_path)}"
    ax.set_title(subtitle)
    ax.axis('off')
    return subp

In [None]:
def download_images(s3_client, config, key, download_dir, bucket_name):
    """Download one object from ``bucket_name`` into ``download_dir``.

    The local file is named after the basename of ``key``; ``config`` is
    forwarded to ``download_file`` as its ``Config`` argument.
    """
    filename = os.path.basename(key)
    target = os.path.join(download_dir, filename)
    s3_client.download_file(bucket_name, key, target, Config=config)

In [None]:
def initialise_session(credentials_file="credentials.json"):
    """
    Build an S3 client from the credentials stored in a JSON file.

    The file must provide the keys ``AWS_ACCESS_KEY_ID``,
    ``AWS_SECRET_ACCESS_KEY``, ``AWS_REGION`` and ``AWS_URL_ENDPOINT``.

    Args:
        credentials_file (str): Path to the credentials JSON file.

    Returns:
        boto3.Client: Initialised S3 client.
    """
    with open(credentials_file, encoding="utf-8") as handle:
        creds = json.load(handle)

    return boto3.Session(
        aws_access_key_id=creds["AWS_ACCESS_KEY_ID"],
        aws_secret_access_key=creds["AWS_SECRET_ACCESS_KEY"],
        region_name=creds["AWS_REGION"],
    ).client("s3", endpoint_url=creds["AWS_URL_ENDPOINT"])

client = initialise_session('./credentials.json')

In [None]:
# Transfer configuration for optimised S3 download.
# This object is handed to download_images() as ``config`` by the download
# cells further down, which forwards it to ``download_file(..., Config=...)``.
transfer_config = TransferConfig(
    max_concurrency=20,  # Increase the number of concurrent transfers
    multipart_threshold=8 * 1024 * 1024,  # 8MB
    max_io_queue=1000,
    io_chunksize=262144,  # 256KB
)

In [None]:
def moth_only_df(inference_csvs):
    """Load every inference CSV and keep the crops that have a species prediction.

    For each file, rows flagged ``'NO DETECTIONS FOR IMAGE'`` are removed,
    rows with an identical bounding box are de-duplicated, and only rows with
    a non-null ``top_1_species`` are kept. Three columns are added:

    * ``dep`` - the part of the CSV file name before the first ``_`` / ``.``
    * ``crop_area`` - bounding-box width times height
    * ``keys`` - ``"<dep>/snapshot_images/<image file name>"``

    NOTE(review): de-duplication uses the box coordinates only (not
    ``image_path``), so equal boxes in different images of one file collapse
    to a single row - confirm this is intended.

    Args:
        inference_csvs: Iterable of CSV file paths.

    Returns:
        pandas.DataFrame: All kept rows, with their per-file index values.
        An empty DataFrame if ``inference_csvs`` is empty.
    """
    frames = []
    for csv_path in tqdm(inference_csvs, desc='Reading in the csvs'):
        crops = pd.read_csv(csv_path, low_memory=False)
        crops = crops.loc[crops['crop_status'] != 'NO DETECTIONS FOR IMAGE']
        crops = crops.drop_duplicates(subset=['x_min', 'x_max', 'y_min', 'y_max'])
        # Copy so the column assignments below never write into a view.
        crops = crops.loc[crops['top_1_species'].notna()].copy()

        dep = os.path.basename(csv_path).split('.')[0].split('_')[0]
        crops['dep'] = dep
        crops['crop_area'] = (crops['x_max'] - crops['x_min']) * (crops['y_max'] - crops['y_min'])

        # object key of the source image: deployment folder + image file name
        crops['keys'] = crops['image_path'].map(
            lambda path: f"{dep}/snapshot_images/{os.path.basename(path)}"
        )
        frames.append(crops)

    if not frames:
        return pd.DataFrame()
    # One concat at the end avoids re-copying the growing frame for every file.
    return pd.concat(frames)

In [None]:
def cat_summary(inference_csvs, category='order_name'):
    """Count crops per deployment and per value of ``category``.

    For each CSV, rows flagged ``'NO DETECTIONS FOR IMAGE'`` are removed and
    rows with an identical bounding box are de-duplicated before counting.
    The deployment is the part of the CSV file name before the first ``_`` /
    ``.``. Counts from files of the same deployment are summed. Rows where
    ``category`` is missing are not counted.

    Args:
        inference_csvs: Iterable of CSV file paths.
        category: Name of the column to summarise.

    Returns:
        pandas.DataFrame: Columns ``deployment``, ``category`` and ``count``,
        sorted by deployment (ascending) then count (descending). Empty, with
        those columns, when ``inference_csvs`` is empty.
    """
    columns = ['deployment', category, 'count']
    summaries = []
    for csv_path in inference_csvs:
        crops = pd.read_csv(csv_path, low_memory=False)
        crops = crops.loc[crops['crop_status'] != 'NO DETECTIONS FOR IMAGE']
        crops = crops.drop_duplicates(subset=['x_min', 'x_max', 'y_min', 'y_max'])

        deployment = os.path.basename(csv_path).split('.')[0].split('_')[0]

        counts = crops[category].value_counts().rename_axis(category).reset_index(name='count')
        counts.insert(0, 'deployment', deployment)
        summaries.append(counts)

    if not summaries:
        # Without this guard the column selection below raised a KeyError.
        return pd.DataFrame(columns=columns)

    df = pd.concat(summaries, ignore_index=True)
    df = df.groupby(['deployment', category], as_index=False)['count'].sum()
    return df.sort_values(by=['deployment', 'count'], ascending=[True, False])

In [None]:
def get_date(file_name, format="%Y-%m-%d"):
    """
    Extract the timestamp embedded in a file name and re-format it.

    The basename (without extension) is split on ``-`` and ``_``; the first
    part starting with ``"202"`` is taken as the timestamp. If that part is
    shorter than 12 characters it is treated as a date only and joined with
    the following part (the time). The resulting digits are read as
    ``YYYYMMDDHHMM`` (12 digits) or ``YYYYMMDDHHMMSS`` optionally followed by
    up to six fractional-second digits.

    Args:
        file_name: File name or path containing the timestamp.
        format: ``strftime`` format of the returned string.

    Returns:
        str: The formatted timestamp, or ``numpy.nan`` (after printing a
        message) when no timestamp can be parsed.
    """
    try:
        stem = os.path.splitext(file_name)[0]
        parts = os.path.basename(stem).replace("_", "-").split("-")
        first = [idx for idx, part in enumerate(parts) if part.startswith("202")][0]
        stamp = parts[first]

        # catch for delim between date and time in file name
        if len(stamp) < 12:
            stamp = "".join(parts[first:first + 2])

        if not stamp.isdigit():
            raise ValueError(f"unrecognised timestamp {stamp!r}")

        if len(stamp) == 12:
            image_dt = datetime.strptime(stamp, "%Y%m%d%H%M")
        else:
            # Parse the fixed-width part on its own: a combined "%S%f" lets
            # strptime split a 14-digit stamp as one seconds digit plus a
            # fraction, silently returning the wrong seconds.
            seconds_part, fraction = stamp[:14], stamp[14:]
            if len(seconds_part) != 14 or len(fraction) > 6:
                raise ValueError(f"unrecognised timestamp {stamp!r}")
            image_dt = datetime.strptime(seconds_part, "%Y%m%d%H%M%S")
            if fraction:
                image_dt = image_dt.replace(microsecond=int(fraction.ljust(6, "0")))

        return image_dt.strftime(format)

    except (IndexError, TypeError, ValueError) as e:
        print(f"  - Error parsing date from file name {file_name}: {e}")
        return np.nan

def assign_night(ts, night_endpoint=12, night_startpoint=12):
    """
    Return the date on which the recording night containing ``ts`` started.

    Times before ``night_endpoint`` o'clock belong to the night that started
    on the previous day; times at or after ``night_startpoint`` o'clock belong
    to the night starting on the same day. With the default 12/12 cut-offs
    every time of day falls into one of the two cases.

    Args:
        ts: Timestamp providing ``.time()`` and ``.date()``.
        night_endpoint: Hour (0-23) at which a recording night ends.
        night_startpoint: Hour (0-23) at which a recording night starts.

    Returns:
        The start date of the night, ``None`` when ``ts`` lies in the gap
        between ``night_endpoint`` and ``night_startpoint``, or the string
        ``'No known date'`` when ``ts`` cannot be interpreted (e.g. ``None``
        or ``NaT``).
    """
    try:
        clock = ts.time()
        if clock < time(night_endpoint, 0):  # before night_endpoint o'clock
            return ts.date() - timedelta(days=1)
        if clock >= time(night_startpoint, 0):  # night_startpoint or later
            return ts.date()
        # times not included in this overnight window
        return None
    except (AttributeError, TypeError, ValueError, OverflowError):
        return 'No known date'

In [None]:
# Default arguments used when stepping through activity_plot_data() by hand.
# The input frame (dep_session_df_all) is only built in the "Activity Plots"
# section further down, so it is not bound here: assigning it at this point
# raised a NameError when the notebook was run top to bottom.
end_date = None
min_date = pd.to_datetime('2024-01-01')
drop_empty_days = True


In [None]:
pd.to_datetime('2024-01-01').strftime('%b')

In [None]:
def activity_plot_data(df, drop_empty_days = True, min_date=pd.to_datetime('2024-01-01'), end_date=None):
    """Reshape per-session counts into a weekday x week matrix for a heatmap.

    The sessions are expanded to one row per calendar day between
    ``min_date`` and ``end_date`` (missing days get a count of 0), and each
    day is assigned to a 7-day block counted from ``min_date``
    (``week_number`` 0, 1, ...).

    Args:
        df: Frame with a ``session`` column (anything ``pd.to_datetime``
            accepts, one row per session) and a ``count`` column. The frame
            passed in is not modified.
        drop_empty_days: When true, missing cells are filled with 0 and
            weekday rows that contain only zeros are removed.
        min_date: First day of the calendar; also the start of week 0.
        end_date: Last day of the calendar; defaults to the latest session.

    Returns:
        list: ``[heatmap_data, month_labels, week_labels, year_labels]`` where
        ``heatmap_data`` has weekday names (Sunday first) as index and week
        numbers as columns, and the three label collections hold one entry
        per week, taken from the first day of that week.
    """
    min_date = pd.to_datetime(min_date)
    # assign() returns a new frame, leaving the caller's ``session`` column untouched
    daily = df.assign(session=pd.to_datetime(df['session']))

    if end_date is None:
        end_date = daily['session'].max()

    all_dates = pd.date_range(start=min_date, end=end_date)
    daily = (
        daily.set_index('session')
        .reindex(all_dates)
        .fillna(0)
        .rename_axis('session')
        .reset_index()
    )

    # Day of week, year and position in the 7-day grid anchored at min_date
    daily['weekday'] = daily['session'].dt.weekday  # Monday=0
    daily['year'] = daily['session'].dt.year
    daily['week_number'] = ((daily['session'] - min_date).dt.days // 7).astype(int)
    daily['week_label'] = 'W' + daily['week_number'].astype(str)

    # Pivot for heatmap
    heatmap_data = daily.pivot_table(index='weekday', columns='week_number', values='count', aggfunc='sum')

    # Reorder to GitHub style (Sunday first)
    day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    heatmap_data.index = [day_names[i] for i in heatmap_data.index]
    heatmap_data = heatmap_data.reindex(['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'])

    # One label per week, taken from the first day of each week
    week_starts = daily.drop_duplicates('week_number')
    year_labels = week_starts.set_index('year').index
    month_labels = week_starts.set_index('session').index.strftime('%b')
    week_labels = week_starts.set_index('week_number').loc[heatmap_data.columns, 'week_label']

    if drop_empty_days:
        heatmap_data = heatmap_data.fillna(0)
        heatmap_data = heatmap_data.loc[(heatmap_data != 0).any(axis=1)]

    return [heatmap_data, month_labels, week_labels, year_labels]

In [None]:
def activity_plot(df, ax, dep, min_date=pd.to_datetime('2024-01-01'), vmin=0, vmax=1e6, custom_cmap='Greens', include_month_labels=True, end_date=None, label_buffer=1, show_colourbar=True):
    """Draw a weekday x week activity heatmap of crop counts on ``ax``.

    Args:
        df: Per-session counts, passed to ``activity_plot_data``.
        ax: Matplotlib axes to draw on.
        dep: Title of the axes (deployment name).
        min_date: First day of the calendar (start of week 0).
        vmin: Lower colour limit. The colour scale is logarithmic, so
            non-positive values are replaced by 1.
        vmax: Upper colour limit (1 is added before it is used).
        custom_cmap: Colormap for the cells.
        include_month_labels: Write month and year labels under the first
            week and under every week where the month / year changes.
        end_date: Last day of the calendar; defaults to the latest session.
        label_buffer: Scale of the vertical offset of the month (x2) and
            year (x3) labels below the axis.
        show_colourbar: Add a vertical colour bar next to ``ax``.

    Returns:
        The artist returned by ``ax.pcolor`` (usable for a shared colour bar).
    """
    heatmap_data, month_labels, week_labels, year_labels = activity_plot_data(df, drop_empty_days=True, min_date=min_date, end_date=end_date)

    # LogNorm requires a strictly positive lower bound.
    norm = LogNorm(vmin=vmin if vmin > 0 else 1, vmax=vmax + 1)

    c = ax.pcolor(heatmap_data.values, cmap=custom_cmap,
                  edgecolors='grey', linewidths=1, norm=norm)

    ax.set_yticks(np.arange(0.5, len(heatmap_data.index), 1))
    ax.set_yticklabels(heatmap_data.index)

    ax.set_xticks(np.arange(0.5, len(week_labels), 1))
    ax.set_xticklabels(week_labels, rotation=90)

    if include_month_labels:
        # (labels, vertical offset factor): months sit above years
        for labels, offset in ((list(month_labels), 2), (list(year_labels), 3)):
            for i, label in enumerate(labels):
                # label the first week and every week where the value changes
                if i == 0 or label != labels[i - 1]:
                    ax.text(i + 0.5, -offset * label_buffer, label, ha='center', va='center')

    if show_colourbar:
        # Use the figure that owns ``ax`` rather than a global ``fig``.
        ax.figure.colorbar(c, ax=ax, orientation='vertical', label='Number of crops')
    ax.set_title(dep)

    return c

## Activity Plots

In [None]:
# plot showing the correlation between order confidence and crop area

# this plot takes a while to compile, uncomment if you want to run
# sns.regplot(x=df['order_confidence'], y=df['crop_area'], logx=True, line_kws=dict(color="r"))
# plt.yscale('log')
# plt.show()

In [None]:
# Count crops per deployment and recording night (session): once for all
# crops and once for crops whose predicted order contains "Lepidoptera".
summary_columns = ['dep', 'session', 'count', 'file']
session_counts = []
session_counts_moth = []
for c in tqdm(inference_csvs, desc='reading in the csvs'):
    input_df = pd.read_csv(c, low_memory=False)
    input_df = input_df.loc[input_df['crop_status'] != 'NO DETECTIONS FOR IMAGE']
    # copy so the column assignments below never write into a view
    input_df = input_df.drop_duplicates(subset=['x_min', 'x_max', 'y_min', 'y_max']).copy()

    input_df['dep'] = os.path.basename(c).split('.')[0].split('_')[0]
    input_df['crop_area'] = (input_df['x_max'] - input_df['x_min']) * (input_df['y_max'] - input_df['y_min'])
    input_df['datetime'] = input_df['image_path'].apply(lambda x: get_date(x, format="%Y-%m-%d %H:%M:%S"))
    input_df['datetime'] = pd.to_datetime(input_df['datetime'])

    # get the session night
    input_df['session'] = input_df['datetime'].apply(assign_night)

    # all crops per deployment and session; the count column is named
    # explicitly so it does not depend on the pandas version
    summary = input_df[['dep', 'session']].value_counts().reset_index(name='count')
    summary['file'] = os.path.basename(c)
    session_counts.append(summary)

    # Lepidoptera crops per deployment and session; a missing order is
    # treated as "not Lepidoptera" instead of breaking the boolean mask
    is_lepidoptera = input_df['order_name'].str.contains("Lepidoptera", na=False)
    summary_moth = input_df.loc[is_lepidoptera, ['dep', 'session']].value_counts().reset_index(name='count')
    summary_moth['file'] = os.path.basename(c)
    session_counts_moth.append(summary_moth)

    del input_df

# a single concat per table avoids re-copying the growing frame for every file
dep_session_df = (
    pd.concat(session_counts, ignore_index=True)
    if session_counts else pd.DataFrame(columns=summary_columns)
)
dep_session_df_moth = (
    pd.concat(session_counts_moth, ignore_index=True)
    if session_counts_moth else pd.DataFrame(columns=summary_columns)
)


In [None]:
# Drop the rows whose night could not be determined, then total the counts
# of every (deployment, session) pair across files.
has_known_date = dep_session_df['session'] != 'No known date'
dep_session_df = dep_session_df.loc[has_known_date]
dep_session_df = (
    dep_session_df
    .groupby(['dep', 'session'])[['count']]
    .sum()
    .reset_index()
)
dep_session_df.head()

In [None]:
dep_df = dep_session_df[['dep', 'count']].groupby(['dep']).sum().reset_index()
dep_df = dep_df.sort_values(by=['dep', 'count'], ascending=[True, False])
dep_df.head()

In [None]:
dep_session_df_all = dep_session_df[['session', 'count']].groupby(['session']).sum().reset_index()
dep_session_df_all.head()

In [None]:
cmap = plt.cm.Greens
newcolors = cmap(np.linspace(0, 1, 256))
newcolors[0] = [1, 1, 1, 1]  # RGBA for white
custom_cmap = ListedColormap(newcolors)

In [None]:
fig, ax = plt.subplots(figsize=(20, 4))

activity_plot(dep_session_df_all, ax, dep='All', min_date=pd.to_datetime('2024-01-01'),
              vmin=1, vmax=dep_session_df_all['count'].max(), custom_cmap=custom_cmap)

ax.set_title("Crop Activity Heatmap")
plt.tight_layout()
plt.show()

In [None]:
# Labels for the week columns of the per-deployment heatmaps.
# activity_plot_data() groups days into 7-day blocks counted from min_date and
# numbers them from 0, so the labels are built on the same grid: one entry per
# block, taken from the first day of the block.
min_date = pd.to_datetime('2024-01-01')
last_session = pd.to_datetime(dep_session_df['session']).max()
max_labels = (last_session - min_date).days // 7
date_range = pd.date_range(start=min_date, end=last_session, freq='7D')

week_labels = ['W' + str(i) for i in range(len(date_range))]
year_labels = [x.strftime('%Y') for x in date_range]
month_labels = [x.strftime('%b') for x in date_range]

In [None]:
# subplots with activity plots for each deployment
ncols = 1
deployments = dep_df['dep'].unique()
# one panel per deployment (ceiling division), at least one row
nrows = max(1, -(-len(deployments) // ncols))
fig, axs = plt.subplots(nrows, ncols, figsize=(20, 10 * nrows / 6), sharex='all', squeeze=False)
axs = axs.flatten()

last_session = dep_session_df['session'].max()
c = None
for i, dep in enumerate(deployments):
    c = activity_plot(dep_session_df.loc[dep_session_df['dep'] == dep, ['dep', 'session', 'count']],
                      axs[i], dep=dep, min_date=pd.to_datetime('2024-01-01'),
                      vmin=1, vmax=dep_session_df['count'].max(), label_buffer=2,
                      custom_cmap=custom_cmap, include_month_labels=False, end_date=last_session,
                      show_colourbar=False)
    axs[i].set_aspect('equal')

# panels without a deployment stay blank
for unused_ax in axs[len(deployments):]:
    unused_ax.axis('off')

# the last ``ncols`` used panels carry the week, month and year labels
bottom_axes = axs[max(len(deployments) - ncols, 0):len(deployments)]
for bottom_ax in bottom_axes:
    bottom_ax.set_xticks(np.arange(0.5, len(week_labels), 1))
    bottom_ax.set_xticklabels(week_labels, rotation=90)

if c is not None:
    cbar_ax = fig.add_axes([0.25, -0.1*ncols, 0.5, 0.03])  # [left, bottom, width, height]
    fig.colorbar(c, cax=cbar_ax, orientation='horizontal', label='Crop count (log scale)')

# Month labels (offset 2) and year labels (offset 3): written under the first
# week and under every week where the value changes
for labels, offset in ((month_labels, 2), (year_labels, 3)):
    for i, label in enumerate(labels):
        if i == 0 or label != labels[i - 1]:
            for bottom_ax in bottom_axes:
                bottom_ax.text(i + 0.5, -offset*ncols, label, ha='center', va='center')

plt.suptitle("Crop Activity Heatmap")

plt.show()

In [None]:
plt.figure(figsize=(7, 5))
sns.barplot(data=dep_df, hue='count', y='count', x='dep')
plt.title('All crops per deployment')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.yscale('log')
plt.legend().set_visible(False)
plt.ylabel('Number of crops')
plt.xlabel('Deployment')
plt.show()

### Moth Activity

In [None]:
df = moth_only_df(inference_csvs)

In [None]:
df['image_path'].value_counts().plot(kind='hist', bins=100, figsize=(5, 3))
plt.title('Moth crops per image')
plt.xlabel('Number of moth crops per image (n > 0)')
plt.yscale('log')
plt.show()

In [None]:
# Distribution of the number of moth crops per image, one panel per deployment
deployments = df['dep'].unique()
ncols = 3
nrows = max(2, -(-len(deployments) // ncols))  # grow the grid if needed
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(10, 2.5 * nrows), sharex=True)
axes = axes.flatten()

for ax, dep in zip(axes, deployments):
    # local name chosen so the per-deployment totals in ``dep_df`` are kept
    dep_crops = df.loc[df['dep'] == dep]
    dep_crops['image_path'].value_counts().plot(kind='hist', bins=100, ax=ax)
    ax.set_title(dep)
    ax.set_xlabel('Number of moth crops per image (n > 0)')
    ax.set_yscale('log')

plt.suptitle('Moth crops per image by deployment')
plt.tight_layout()
plt.show()

In [None]:
# Number of moth crops per deployment
dep_counts = df['dep'].value_counts()
dep_counts = dep_counts.reset_index()
dep_counts.columns = ['deployment', 'count']

plt.figure(figsize=(7, 5))
sns.barplot(data=dep_counts, hue='count', y='count', x='deployment')
plt.title('Moth crops per deployment')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.yscale('log')
# ``visible`` is not a legend() keyword; hide the legend the same way as the
# "All crops per deployment" plot does
plt.legend().set_visible(False)
plt.show()

## Order Prediction Plots

In [None]:
order_counts = cat_summary(inference_csvs)

In [None]:
order_counts.head()

In [None]:
# Crop counts per predicted order, one bar colour per deployment
plt.figure(figsize=(7, 5))
sns.barplot(data=order_counts, hue='deployment', y='count', x='order_name')
plt.title('Counts by Deployment and Order')
plt.xticks(rotation=45, ha='right')
plt.xlabel('Order')
plt.ylabel('Number of Crops')
# the hue (and therefore the legend) encodes the deployment, not the order
plt.legend(title='Deployment', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

## Species Prediction Plots

In [None]:
species_counts = cat_summary(inference_csvs, category='top_1_species')

# subset to only the top 10 species
top_species = species_counts[['top_1_species', 'count']].groupby('top_1_species').sum().reset_index()
top_species = top_species.sort_values(by='count', ascending=False)
top_species = top_species.head(10)

In [None]:
plt.figure(figsize=(7, 5))
sns.barplot(data=top_species, y='count', x='top_1_species')
plt.title('Top Species Counts')
plt.xticks(rotation=45, ha='right')
plt.xlabel('Most Likely Species')
plt.ylabel('Number of Crops')
plt.tight_layout()
plt.show()

In [None]:
top_species.head()

In [None]:
species_counts.head()

In [None]:
species_counts['deployment'].unique()

In [None]:
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(10, 7), sharey=True)
axes = axes.flatten()

for i, dep in enumerate(species_counts['deployment'].unique()):
    ax = axes[i]
    dep_df = species_counts.loc[species_counts['deployment'] == dep, ]
    dep_df = dep_df.sort_values(by='count', ascending=False).head(10)
    sns.barplot(data=dep_df, y='count', x='top_1_species', ax=ax)
    ax.set_title(dep)
    ax.set_xticks(range(len(dep_df)))
    ax.set_xticklabels(dep_df['top_1_species'], rotation=45, ha='right')
    ax.set_xlabel('')
    ax.set_ylabel('Number of Crops')

plt.suptitle('Top Species Observations by Deployment')
plt.tight_layout()
plt.show()


In [None]:
# Counts of the overall top species, one bar colour per deployment
is_top_species = species_counts['top_1_species'].isin(top_species['top_1_species'])

plt.figure(figsize=(7, 5))
sns.barplot(data=species_counts.loc[is_top_species], hue='deployment', y='count', x='top_1_species')
plt.title('Top Species Counts by Deployment')
plt.xticks(rotation=45, ha='right')
# the hue (and therefore the legend) encodes the deployment, not the species
plt.legend(title='Deployment', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.xlabel('Most Likely Species')
plt.ylabel('Number of Crops')
plt.show()

In [None]:
df['top_1_confidence'].plot(kind='hist', bins=50, figsize=(6, 3))
df['top_2_confidence'].plot(kind='hist', bins=50, color='orange', alpha=0.5)
df['top_3_confidence'].plot(kind='hist', bins=50, color='yellow', alpha=0.5)
df['top_4_confidence'].plot(kind='hist', bins=50, color='green', alpha=0.5)
df['top_5_confidence'].plot(kind='hist', bins=50, color='purple', alpha=0.5)

plt.legend(['Top 1 Confidence', 'Top 2 Confidence', 'Top 3 Confidence', 'Top 4 Confidence', 'Top 5 Confidence'])
plt.title('Confidence Distribution of 5 Most Likely Species Predictions')
plt.xlabel('Confidence')
plt.show()

In [None]:
# Show the confidence distribution for each deployment in subplots
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(10, 5), sharex=True)
axes = axes.flatten()

for i, dep in enumerate(df['dep'].unique()):
    ax = axes[i]
    dep_df = df.loc[df['dep'] == dep, ]
    dep_df['top_1_confidence'].plot(kind='hist', bins=50, ax=ax, alpha=0.5)
    dep_df['top_2_confidence'].plot(kind='hist', bins=50, ax=ax, color='orange', alpha=0.5)
    dep_df['top_3_confidence'].plot(kind='hist', bins=50, ax=ax, color='yellow', alpha=0.5)
    dep_df['top_4_confidence'].plot(kind='hist', bins=50, ax=ax, color='green', alpha=0.5)
    dep_df['top_5_confidence'].plot(kind='hist', bins=50, ax=ax, color='purple', alpha=0.5)
    ax.legend().set_visible(False)
    ax.set_title(dep)
    ax.set_xlabel('Confidence')
    ax.set_ylabel('Count')


handles, _ = axes[0].get_legend_handles_labels()
labels = ['Top 1 Confidence', 'Top 2 Confidence', 'Top 3 Confidence', 'Top 4 Confidence', 'Top 5 Confidence']
fig.legend(handles, labels, loc='upper right', bbox_to_anchor=(1.2, 0.9))
plt.suptitle(f'Confidence Distribution of 5 Most Likely Species Predictions')
plt.tight_layout()
plt.show()

## Most Confident Cases for Each Order

In [None]:
# number of images required per order
n_images = 1

# collect every crop, then keep the most confident order predictions
order_frames = []
for c in inference_csvs:
    input_df = pd.read_csv(c, low_memory=False)
    input_df = input_df.loc[input_df['crop_status'] != 'NO DETECTIONS FOR IMAGE']
    # copy so the column assignments below never write into a view
    input_df = input_df.drop_duplicates(subset=['x_min', 'x_max', 'y_min', 'y_max']).copy()

    dep = os.path.basename(c).split('.')[0].split('_')[0]
    input_df['dep'] = dep
    input_df['crop_area'] = (input_df['x_max'] - input_df['x_min']) * (input_df['y_max'] - input_df['y_min'])

    # object key of the source image: deployment folder + image file name
    input_df['keys'] = input_df['image_path'].map(lambda x: f"{dep}/snapshot_images/{os.path.basename(x)}")
    order_frames.append(input_df)
    del input_df

if not order_frames:
    raise ValueError("No inference CSVs were found; there are no order predictions to rank.")

# a single concat avoids re-copying the growing frame for every file
order_df = pd.concat(order_frames)

# most confident rows first within each deployment, then the first
# ``n_images`` rows of every (order, deployment) pair
order_df = order_df.sort_values(by=['dep', 'order_confidence'], ascending=[True, False])
order_df = order_df.groupby(['order_name', 'dep']).head(n_images).reset_index(drop=True)

In [None]:
# order by confidence and subset
df_order = order_df.sort_values(by='order_confidence', ascending=False)
df_order = df_order.loc[df_order['order_confidence'] > 0.95, ]
df_order.reset_index(drop=True, inplace=True)

print(df_order.shape)

In [None]:
# download the relevant images
os.makedirs(os.path.join(download_dir, 'order'), exist_ok=True)

for i, row in df_order.iterrows():
    download_images(client, transfer_config, row['keys'], os.path.join(download_dir, 'order'), region)

In [None]:
# for each selected crop, show its image with the bounding boxes annotated
# (index reset so that row order, not the old index label, decides the panel)
df_order = df_order.sort_values(by='order_name', ascending=False).reset_index(drop=True)

ncols = 6
nrows = max(5, -(-len(df_order) // ncols))  # grow beyond 5 rows if needed
fig, ax = plt.subplots(nrows, ncols, figsize=(15, 1.5 * nrows))
ax = ax.ravel()

# one panel per row of df_order, in sorted order
for panel, (_, row) in zip(ax, df_order.iterrows()):
    annotate_image(
        row['image_path'],
        os.path.join(download_dir, 'order'),
        df_order,
        panel,
        crop_to_highlight=row['crop_status'],
        buffer=20,
        scaling_required=False,
        subtitle=f"{row['dep']}\n{row['order_name']}, {row['order_confidence']:.2f}"
    )

# blank out the panels that were not used
for panel in ax[len(df_order):]:
    panel.axis('off')

plt.tight_layout()
plt.show()

## Most Confident Species Predictions

In [None]:
# order by top_1_confidence
df_confident = df.sort_values(by='top_1_confidence', ascending=False)
df_confident.reset_index(drop=True, inplace=True)

# remove duplicated rows by image path, and bounding box
df_confident = df_confident.drop_duplicates(subset=['image_path', 'x_min', 'y_min', 'x_max', 'y_max'])
df_confident = df_confident.drop_duplicates(subset=['top_1_species'])

top_n=20

df_confident = df_confident.head(top_n)

In [None]:
# sort by image_path
df_confident.sort_values(by='image_path', inplace=True)
df_confident.reset_index(drop=True, inplace=True)
df_confident

In [None]:
# download the relevant images
os.makedirs(os.path.join(download_dir, 'confident'), exist_ok=True)

for i, row in df_confident.head(top_n).iterrows():
    download_images(client, transfer_config, row['keys'], os.path.join(download_dir, 'confident'), region)

In [None]:
df_confident.shape

In [None]:
# for each selected crop, show its image with the bounding boxes annotated
fig, ax = plt.subplots(4, 5, figsize=(13.33, 7.5))
ax = ax.ravel()

# most confident first; index reset so that row order, not the old index
# label, decides which panel a crop is drawn in
df_confident = df_confident.sort_values(by='top_1_confidence', ascending=False).reset_index(drop=True)

# one panel per row of df_confident, in sorted order
for panel, (_, row) in zip(ax, df_confident.iterrows()):
    annotate_image(
        row['image_path'],
        os.path.join(download_dir, 'confident'),
        df_confident,
        panel,
        crop_to_highlight=row['crop_status'],
        buffer=20,
        scaling_required=False,
        subtitle=f"{row['top_1_species']}, ({row['top_1_confidence']:.2f})"
    )

# blank out the panels that were not used
for panel in ax[len(df_confident):]:
    panel.axis('off')

plt.tight_layout()
plt.show()

## Largest Moths

In [None]:
# order by moth size
df_area = df.sort_values(by='crop_area', ascending=False)
df_area.reset_index(drop=True, inplace=True)
df_area = df_area.drop_duplicates(subset=['top_1_species'])
df_area = df_area.loc[df_area['top_1_confidence'] > 0.85, ]

top_n=20

df_area = df_area.head(top_n)

df_area.reset_index(drop=True, inplace=True)

In [None]:
# download the relevant images
os.makedirs(os.path.join(download_dir, 'largest'), exist_ok=True)

for i, row in df_area.iterrows():
    download_images(client, transfer_config, row['keys'], os.path.join(download_dir, 'largest'), region)

In [None]:
df_area.head()

In [None]:
# for each selected crop, show its image with the bounding boxes annotated
fig, ax = plt.subplots(4, 5, figsize=(13.33, 7.5))
ax = ax.ravel()

# largest first; index reset so that row order, not the old index label,
# decides which panel a crop is drawn in
df_area = df_area.sort_values(by='crop_area', ascending=False).reset_index(drop=True)

# one panel per row of df_area, in sorted order
for panel, (_, row) in zip(ax, df_area.iterrows()):
    annotate_image(
        row['image_path'],
        os.path.join(download_dir, 'largest'),
        df_area,
        panel,
        crop_to_highlight=row['crop_status'],
        buffer=20,
        scaling_required=False,
        subtitle=f"{row['top_1_species']}, ({row['top_1_confidence']:.2f})"
    )

# blank out the panels that were not used
for panel in ax[len(df_area):]:
    panel.axis('off')

plt.tight_layout()
plt.show()

## Blurriest Crops

In [None]:
# Order the crops by their crop_bluriness score, highest first, and keep the
# first row of each predicted species.
# NOTE(review): assumes a higher crop_bluriness means a blurrier crop and that
# the column is numeric (the plotting cell casts it with float()) - confirm.
df_blur = df.sort_values(by='crop_bluriness', ascending=False)
df_blur.reset_index(drop=True, inplace=True)

df_blur = df_blur.drop_duplicates(subset=['top_1_species'])
top_n=20

df_blur = df_blur.head(top_n)

df_blur.reset_index(drop=True, inplace=True)

In [None]:
# download the relevant images
os.makedirs(os.path.join(download_dir, 'blur'), exist_ok=True)

for i, row in df_blur.iterrows():
    download_images(client, transfer_config, row['keys'], os.path.join(download_dir, 'blur'), region)

In [None]:
# for each selected crop, show its image with the bounding boxes annotated
fig, ax = plt.subplots(4, 5, figsize=(13.33, 7.5))
ax = ax.ravel()

# highest crop blur score first; index reset so that row order, not the old
# index label, decides which panel a crop is drawn in
df_blur = df_blur.sort_values(by='crop_bluriness', ascending=False).reset_index(drop=True)

# one panel per row of df_blur, in sorted order
for panel, (_, row) in zip(ax, df_blur.iterrows()):
    annotate_image(
        row['image_path'],
        os.path.join(download_dir, 'blur'),
        df_blur,
        panel,
        crop_to_highlight=row['crop_status'],
        buffer=20,
        scaling_required=False,
        subtitle=f"Crop blur: {float(row['crop_bluriness']):.3f}\nImage blur: {float(row['image_bluriness']):.3f}"
    )

# blank out the panels that were not used
for panel in ax[len(df_blur):]:
    panel.axis('off')

plt.tight_layout()
plt.show()

## Blurriest Images

In [None]:
df.head()

In [None]:
# Order the crops by the image_bluriness score of their source image, highest
# first, after casting that column to float.
# NOTE(review): assumes a higher image_bluriness means a blurrier image - confirm.
df = df.astype({'image_bluriness': 'float'})
df_blur = df.sort_values(by='image_bluriness', ascending=False)

df_blur.reset_index(drop=True, inplace=True)

# keep the first row per species, then the first row per distinct
# image_bluriness value
df_blur = df_blur.drop_duplicates(subset=['top_1_species'])
df_blur = df_blur.drop_duplicates(subset=['image_bluriness'])
top_n=20

df_blur = df_blur.head(top_n)

df_blur.reset_index(drop=True, inplace=True)

In [None]:
# download the relevant images
os.makedirs(os.path.join(download_dir, 'blur'), exist_ok=True)

for i, row in df_blur.iterrows():
    download_images(client, transfer_config, row['keys'], os.path.join(download_dir, 'blur'), region)

In [None]:
# for each selected crop, show its image with the bounding boxes annotated
fig, ax = plt.subplots(4, 5, figsize=(13.33, 7.5))
ax = ax.ravel()

# This section ranks whole images, so order the panels by image_bluriness
# (highest first); index reset so that row order, not the old index label,
# decides which panel a crop is drawn in
df_blur = df_blur.sort_values(by='image_bluriness', ascending=False).reset_index(drop=True)

# one panel per row of df_blur, in sorted order
for panel, (_, row) in zip(ax, df_blur.iterrows()):
    annotate_image(
        row['image_path'],
        os.path.join(download_dir, 'blur'),
        df_blur,
        panel,
        crop_to_highlight=row['crop_status'],
        buffer=20,
        scaling_required=False,
        subtitle=f"Crop blur: {float(row['crop_bluriness']):.3f}\nImage blur: {float(row['image_bluriness']):.3f}"
    )

# blank out the panels that were not used
for panel in ax[len(df_blur):]:
    panel.axis('off')

plt.tight_layout()
plt.show()