# Regional QC

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import boto3
import json
from boto3.s3.transfer import TransferConfig
from PIL import Image
import seaborn as sns

# NOTE(review): presumably works around the "duplicate OpenMP runtime" abort seen with
# some library combinations — confirm it is still needed.
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

In [None]:
# Country being checked; `region` is passed as the S3 bucket name when images are downloaded.
country = 'thailand'
region = 'tha'

# Local folder holding the inference CSVs for this country.
inference_dir = f'./data/{country}_inferences/'

# Local folder that receives the downloaded QC images (created if missing).
download_dir = f'./data/qc_plots/{country}'
os.makedirs(download_dir, exist_ok=True)

#listdir recursively
def listdir_recursive(path):
    """Yield the path of every file found anywhere beneath ``path``.

    Directories themselves are not yielded; each file is reported as the
    directory it was found in joined with its file name.
    """
    for folder, _subfolders, filenames in os.walk(path):
        yield from (os.path.join(folder, name) for name in filenames)

# Collect every CSV found anywhere under the inference directory
inference_csvs = [
    candidate
    for candidate in listdir_recursive(inference_dir)
    if candidate.endswith('.csv')
]

In [None]:
# Show the CSV paths that were discovered, as a quick sanity check
inference_csvs

In [None]:
def annotate_image(image_path, dir, df, ax, scaling_required=False, crop_to_highlight=None, buffer=5, subtitle=None):
    """
    Draw one downloaded image on ``ax`` and overlay its detection boxes.

    Every row of ``df`` whose ``image_path`` equals ``image_path`` is drawn as a
    rectangle, coloured by how the ``class_name`` and ``order_name`` columns agree:

    * green  - class is "moth" and the order contains "Lepidoptera"
    * orange - class is "moth" but the order does not contain "Lepidoptera"
    * purple - class is not "moth" but the order contains "Lepidoptera"
    * red    - neither

    Boxes flagged by either column are also labelled with ``top_1_species`` and
    ``top_1_confidence``.

    Args:
        image_path (str): Value of the ``image_path`` column identifying the image;
            only its base name is used to locate the file inside ``dir``.
        dir (str): Local directory containing the downloaded image.
        df (pd.DataFrame): Detections. Must provide ``image_path``, ``x_min``,
            ``y_min``, ``x_max``, ``y_max``, ``class_name``, ``order_name``,
            ``top_1_species`` and ``top_1_confidence`` (plus ``crop_status`` when
            ``crop_to_highlight`` is given).
        ax (matplotlib.axes.Axes): Axes to draw on.
        scaling_required (bool): If True, box coordinates are multiplied by
            ``300 / width`` and ``300 / height`` of the image file.
        crop_to_highlight: ``crop_status`` value to emphasise; all other boxes
            are drawn with alpha 0.2. ``None`` draws every box fully opaque.
        buffer (int): Padding added on every side of each box before drawing.
        subtitle (str): Axes title; defaults to the image's base name.

    Returns:
        The image artist returned by ``ax.imshow``.
    """
    local_path = f"{dir}/{os.path.basename(image_path)}"
    df_image = df.loc[df['image_path'] == image_path]

    img = plt.imread(local_path)
    subp = ax.imshow(img, origin='lower')

    # The image size is only needed for rescaling, so the file is re-opened only
    # in that case, and the handle is released as soon as the size is read.
    if scaling_required:
        with Image.open(local_path) as image:
            original_width, original_height = image.size

    for _, row in df_image.iterrows():
        x_min = row['x_min'] - buffer
        y_min = row['y_min'] - buffer
        x_max = row['x_max'] + buffer
        y_max = row['y_max'] + buffer

        if scaling_required:
            x_min = x_min * 300 / original_width
            y_min = y_min * 300 / original_height
            x_max = x_max * 300 / original_width
            y_max = y_max * 300 / original_height

        x, y = int(x_min), int(y_min)
        w, h = int(x_max - x_min), int(y_max - y_min)

        is_moth = row['class_name'] == "moth"
        # A missing order (NaN) counts as "not Lepidoptera" instead of raising TypeError
        order_name = row['order_name']
        is_lepidoptera = isinstance(order_name, str) and 'Lepidoptera' in order_name

        if is_moth and is_lepidoptera:
            col = 'green'
        elif is_moth:
            col = 'orange'
        elif is_lepidoptera:
            col = 'purple'
        else:
            col = 'red'

        # Fade every box except the highlighted crop
        faded = crop_to_highlight is not None and row['crop_status'] != crop_to_highlight
        alph = 0.2 if faded else 1

        if is_moth or is_lepidoptera:
            ax.text(x_min, y_max,
                    f"{row['top_1_species']}: {row['top_1_confidence']:.2f}",
                    color=col,
                    fontsize=5, alpha=alph,
                    verticalalignment="bottom")

        rect = plt.Rectangle((x, y), w, h, fill=False, edgecolor=col, linewidth=1, alpha=alph)
        ax.add_patch(rect)

    if not subtitle:
        subtitle = os.path.basename(image_path)
    ax.set_title(subtitle)
    ax.axis('off')
    return subp

In [None]:
def download_images(s3_client, config, key, download_dir, bucket_name):
    """
    Download a single S3 object into ``download_dir``.

    The object is saved under its base name, i.e. any prefix in ``key`` is dropped.

    Args:
        s3_client: Client exposing ``download_file``.
        config: Transfer configuration forwarded as ``Config``.
        key (str): Object key inside the bucket.
        download_dir (str): Local directory to write into.
        bucket_name (str): Bucket holding the object.
    """
    filename = os.path.basename(key)
    destination = os.path.join(download_dir, filename)
    s3_client.download_file(bucket_name, key, destination, Config=config)

In [None]:
def initialise_session(credentials_file="credentials.json"):
    """
    Build an S3 client from credentials stored in a JSON file.

    The file must contain the keys ``AWS_ACCESS_KEY_ID``,
    ``AWS_SECRET_ACCESS_KEY``, ``AWS_REGION`` and ``AWS_URL_ENDPOINT``.

    Args:
        credentials_file (str): Path to the credentials JSON file.

    Returns:
        boto3.Client: S3 client bound to the configured endpoint.
    """
    with open(credentials_file, encoding="utf-8") as handle:
        creds = json.load(handle)

    session_kwargs = {
        "aws_access_key_id": creds["AWS_ACCESS_KEY_ID"],
        "aws_secret_access_key": creds["AWS_SECRET_ACCESS_KEY"],
        "region_name": creds["AWS_REGION"],
    }
    return boto3.Session(**session_kwargs).client("s3", endpoint_url=creds["AWS_URL_ENDPOINT"])

# Create the S3 client that the download cells pass to download_images
client = initialise_session('./credentials.json')

In [None]:
# Tuning for the S3 downloads: many parallel transfers, multipart from 8 MiB upwards
transfer_config = TransferConfig(
    multipart_threshold=8 * 1024 * 1024,  # 8 MiB
    max_concurrency=20,  # number of concurrent transfers
    io_chunksize=256 * 1024,  # 256 KiB
    max_io_queue=1000,
)

In [None]:
def moth_only_df(inference_csvs):
    """
    Load every inference CSV and keep only the crops that have a species prediction.

    Per file, rows whose ``crop_status`` is ``'NO DETECTIONS FOR IMAGE'`` are
    dropped, rows sharing the same bounding box are collapsed to the first one,
    and rows with a missing ``top_1_species`` are removed. Three columns are added:

    * ``dep``       - file base name up to the first ``.`` and then the first ``_``
    * ``crop_area`` - ``(x_max - x_min) * (y_max - y_min)``
    * ``keys``      - ``<dep>/snapshot_images/<base name of image_path>``

    Args:
        inference_csvs (list[str]): Paths of the CSV files to load.

    Returns:
        pd.DataFrame: The per-file frames concatenated in input order, each keeping
        its own row index. An empty DataFrame when no files are given.
    """
    frames = []
    for csv_path in inference_csvs:
        input_df = pd.read_csv(csv_path, low_memory=False)
        input_df = input_df.loc[input_df['crop_status'] != 'NO DETECTIONS FOR IMAGE']
        # NOTE(review): duplicates are judged on the box coordinates alone, so identical
        # boxes in two different images collapse into one row — confirm this is intended.
        input_df = input_df.drop_duplicates(subset=['x_min', 'x_max', 'y_min', 'y_max'])
        # Copy so the column assignments below act on an independent frame, not a slice
        input_df = input_df.loc[input_df['top_1_species'].notna()].copy()

        # The deployment is constant for the whole file, so derive it once
        dep = os.path.basename(csv_path).split('.')[0].split('_')[0]
        input_df['dep'] = dep
        input_df['crop_area'] = (input_df['x_max'] - input_df['x_min']) * (input_df['y_max'] - input_df['y_min'])
        input_df['keys'] = input_df['image_path'].apply(
            lambda x: f"{dep}/snapshot_images/{os.path.basename(x)}"
        )
        frames.append(input_df)

    if not frames:
        return pd.DataFrame()
    # A single concat avoids re-copying the accumulated frame on every iteration
    return pd.concat(frames)

In [None]:
def cat_summary(inference_csvs, category='order_name'):
    """
    Count detections per deployment and per value of ``category``.

    Per file, rows whose ``crop_status`` is ``'NO DETECTIONS FOR IMAGE'`` are
    dropped and rows sharing the same bounding box are collapsed to the first one.
    The deployment is the file base name up to the first ``.`` and then the first
    ``_``; counts from several files of the same deployment are added together.

    Args:
        inference_csvs (list[str]): Paths of the CSV files to summarise.
        category (str): Column whose values are counted.

    Returns:
        pd.DataFrame: Columns ``deployment``, ``<category>`` and ``count``, sorted by
        deployment (ascending) then count (descending). An empty frame with those
        columns when no files are given.
    """
    columns = ['deployment', category, 'count']
    summaries = []
    for csv_path in inference_csvs:
        input_df = pd.read_csv(csv_path, low_memory=False)
        input_df = input_df.loc[input_df['crop_status'] != 'NO DETECTIONS FOR IMAGE']
        # Copy so adding the deployment column acts on an independent frame, not a slice
        input_df = input_df.drop_duplicates(subset=['x_min', 'x_max', 'y_min', 'y_max']).copy()
        input_df['dep'] = os.path.basename(csv_path).split('.')[0].split('_')[0]

        # Number of rows per (deployment, category value) in this file
        summary = input_df[['dep', category]].value_counts().reset_index()
        summary.columns = columns
        summaries.append(summary)

    if not summaries:
        return pd.DataFrame(columns=columns)

    # A single concat avoids re-copying the accumulated frame on every iteration
    df = pd.concat(summaries, ignore_index=True)
    df = df[columns].groupby(['deployment', category]).sum().reset_index()
    return df.sort_values(by=['deployment', 'count'], ascending=[True, False])

In [None]:
# Detections per deployment and per order_name (the default category)
order_counts = cat_summary(inference_csvs)

In [None]:
# Preview the first rows of the per-order counts
order_counts.head()

In [None]:
# Grouped bar chart: one group per order, one bar per deployment
plt.figure(figsize=(7, 5))
sns.barplot(data=order_counts, hue='deployment', y='count', x='order_name')
plt.title('Counts by Deployment and Order')
plt.xticks(rotation=45, ha='right')
# The hue (and therefore the legend) encodes the deployment, so title it accordingly
plt.legend(title='Deployment', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
# Per-deployment counts of each predicted species
species_counts = cat_summary(inference_csvs, category='top_1_species')

# Total each species across deployments and keep the ten most frequent
top_species = (
    species_counts[['top_1_species', 'count']]
    .groupby('top_1_species')
    .sum()
    .reset_index()
    .sort_values(by='count', ascending=False)
    .head(10)
)

In [None]:
# Bar chart of the ten most frequently predicted species
fig, axis = plt.subplots(figsize=(7, 5))
sns.barplot(data=top_species, x='top_1_species', y='count', ax=axis)
axis.set_title('Top Species Counts')
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# Same ten species as above, with one bar per deployment
top_species_by_deployment = species_counts.loc[
    species_counts['top_1_species'].isin(top_species['top_1_species'])
]

plt.figure(figsize=(7, 5))
sns.barplot(data=top_species_by_deployment, hue='deployment', y='count', x='top_1_species')
plt.title('Top Species Counts by Deployment')
plt.xticks(rotation=45, ha='right')
# The legend entries are deployments (the hue variable), so title it accordingly
plt.legend(title='Deployment', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
# Load every crop that has a species prediction, across all inference CSVs
df = moth_only_df(inference_csvs)

In [None]:
# Overlay the confidence histograms of the five ranked predictions.
# Only the first call sets the figure size; the rest are drawn semi-transparent.
histogram_styles = {
    'top_1_confidence': {'figsize': (6, 3)},
    'top_2_confidence': {'color': 'orange', 'alpha': 0.5},
    'top_3_confidence': {'color': 'yellow', 'alpha': 0.5},
    'top_4_confidence': {'color': 'green', 'alpha': 0.5},
    'top_5_confidence': {'color': 'purple', 'alpha': 0.5},
}
for column, style in histogram_styles.items():
    df[column].plot(kind='hist', bins=50, **style)

plt.legend([f'Top {rank} Confidence' for rank in range(1, 6)])
plt.title('Confidence Distribution of Top 5 Predictions')
plt.xlabel('Confidence')

In [None]:
# Distribution of how many crops each image contributed (log-scaled frequency axis)
crops_per_image = df['image_path'].value_counts()
crops_per_image.plot.hist(bins=100, figsize=(5, 3))
plt.title('Moth crops per image')
plt.xlabel('Number of moth crops per image (n > 0)')
plt.yscale('log')
plt.show()

In [None]:
# Number of species-level crops contributed by each deployment
dep_counts = df['dep'].value_counts().rename_axis('deployment').reset_index(name='count')

fig, axis = plt.subplots(figsize=(7, 5))
sns.barplot(data=dep_counts, x='deployment', y='count', hue='count', ax=axis)
axis.set_title('Moth crops per deployment')
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
axis.set_yscale('log')
plt.show()

In [None]:
# NOTE: the division by the image area is commented out, so despite its name this
# column currently holds the raw bounding-box area, not a fraction of the image.
df['crop_ratio_of_field'] = df['crop_area']#/(image_width*image_height)


In [None]:
# Order-level confidence against crop size, with a regression fitted on log(x) drawn in red
order_confidence_values = df['order_confidence']
crop_size_values = df['crop_ratio_of_field']
sns.regplot(
    x=order_confidence_values,
    y=crop_size_values,
    logx=True,
    line_kws={"color": "r"},
)
plt.yscale('log')
plt.show()

In [None]:
# Count every detected crop per deployment (not only those with a species prediction)
dep_rows = []
for c in inference_csvs:
    input_df = pd.read_csv(c, low_memory=False)
    input_df = input_df.loc[input_df['crop_status'] != 'NO DETECTIONS FOR IMAGE']
    input_df = input_df.drop_duplicates(subset=['x_min', 'x_max', 'y_min', 'y_max'])

    # Every row of a file belongs to the same deployment, so the per-file count is
    # simply the number of remaining rows. Files with no crops contribute nothing.
    if len(input_df):
        dep_rows.append({
            'dep': os.path.basename(c).split('.')[0].split('_')[0],
            'count': len(input_df),
        })
    del input_df

# Naming the columns explicitly avoids relying on what value_counts().reset_index()
# calls its count column, which differs between pandas versions.
dep_df = pd.DataFrame(dep_rows, columns=['dep', 'count'])

# Add up the files belonging to the same deployment
dep_df = dep_df.groupby('dep', as_index=False)['count'].sum()

dep_df = dep_df.sort_values(by=['dep', 'count'], ascending=[True, False])


In [None]:
# Bar chart of the total number of crops found in each deployment (log scale)
fig, axis = plt.subplots(figsize=(7, 5))
sns.barplot(data=dep_df, x='dep', y='count', hue='count', ax=axis)
axis.set_title('All crops per deployment')
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
axis.set_yscale('log')
# The hue only colours the bars, so its legend is hidden
axis.legend().set_visible(False)
axis.set_ylabel('Number of crops')
axis.set_xlabel('Deployment')
plt.show()

## Most Confident cases for each order

In [None]:
# Gather every detected crop, then keep the most confident one per (order, deployment)
order_frames = []
for c in inference_csvs:
    input_df = pd.read_csv(c, low_memory=False)
    input_df = input_df.loc[input_df['crop_status'] != 'NO DETECTIONS FOR IMAGE']
    input_df = input_df.drop_duplicates(subset=['x_min', 'x_max', 'y_min', 'y_max'])
    input_df = input_df.sort_values(by='order_confidence', ascending=False)

    # The deployment is constant for the whole file, so derive it once
    dep = os.path.basename(c).split('.')[0].split('_')[0]
    input_df['dep'] = dep
    input_df['crop_area'] = (input_df['x_max'] - input_df['x_min']) * (input_df['y_max'] - input_df['y_min'])

    # Key of the source image: <deployment>/snapshot_images/<file name>
    input_df['keys'] = input_df['image_path'].apply(lambda x: f"{dep}/snapshot_images/{os.path.basename(x)}")
    order_frames.append(input_df)

# A single concat avoids re-copying the accumulated frame on every iteration
order_df = pd.concat(order_frames) if order_frames else pd.DataFrame()
del order_frames

order_df = order_df.sort_values(by=['dep', 'order_confidence'], ascending=[True, False])
# Rows are in descending confidence within each deployment, so the first row of
# every (order, deployment) group is its most confident detection
order_df = order_df.groupby(['order_name', 'dep']).head(1).reset_index(drop=True)

In [None]:
# Rank the per-order picks by order_confidence and keep only those above 0.8
ranked_orders = order_df.sort_values(by='order_confidence', ascending=False)
df_order = ranked_orders[ranked_orders['order_confidence'] > 0.8].reset_index(drop=True)

print(df_order.shape)

In [None]:
# Fetch the image behind every selected row into the 'order' sub-folder
order_image_dir = os.path.join(download_dir, 'order')
os.makedirs(order_image_dir, exist_ok=True)

for s3_key in df_order['keys']:
    download_images(client, transfer_config, s3_key, order_image_dir, region)

In [None]:
# Number of rows to plot; the grid in the next cell has 4 x 6 = 24 panels
df_order.shape

In [None]:
# Show the most confident detection of each (order, deployment) pair on a 4 x 6 grid
fig, ax = plt.subplots(4, 6, figsize=(15, 7.5))
ax = ax.ravel()

df_order = df_order.sort_values(by='order_name', ascending=False)

# Pair each row with the next free panel. Indexing `ax` by the DataFrame label would
# ignore the sort above and raise IndexError once there are more rows than panels;
# rows beyond the grid are simply not drawn.
for panel, (_, row) in zip(ax, df_order.iterrows()):
    annotate_image(
        row['image_path'],
        os.path.join(download_dir, 'order'),
        df_order,
        panel,
        crop_to_highlight=row['crop_status'],
        buffer=20,
        scaling_required=False,
        subtitle=f"{row['order_name']}, {row['order_confidence']:.2f}"
    )

# Blank any panels that were not used
for panel in ax[len(df_order):]:
    panel.axis('off')


plt.tight_layout()
plt.show()

## Most Confident Species Predictions

In [None]:
top_n = 20

# Rank crops by top_1_confidence, drop repeated boxes, keep the first (most confident)
# row of each species, then take the top_n best
df_confident = (
    df.sort_values(by='top_1_confidence', ascending=False)
    .reset_index(drop=True)
    .drop_duplicates(subset=['image_path', 'x_min', 'y_min', 'x_max', 'y_max'])
    .drop_duplicates(subset=['top_1_species'])
    .head(top_n)
)

In [None]:
# Re-order the selection by image path and renumber the rows from zero
df_confident = df_confident.sort_values(by='image_path').reset_index(drop=True)
df_confident

In [None]:
# Fetch the image behind every selected row into the 'confident' sub-folder
confident_image_dir = os.path.join(download_dir, 'confident')
os.makedirs(confident_image_dir, exist_ok=True)

for s3_key in df_confident.head(top_n)['keys']:
    download_images(client, transfer_config, s3_key, confident_image_dir, region)

In [None]:
# Number of rows to plot; the grid in the next cell has 4 x 5 = 20 panels
df_confident.shape

In [None]:
# Show the most confident species predictions on a 4 x 5 grid, best first
fig, ax = plt.subplots(4, 5, figsize=(13.33, 7.5))
ax = ax.ravel()

df_confident = df_confident.sort_values(by='top_1_confidence', ascending=False)

# Pair each row with the next free panel. Indexing `ax` by the DataFrame label would
# lay the images out in the earlier image_path order instead of the confidence order
# set above, and would raise IndexError with more rows than panels.
for panel, (_, row) in zip(ax, df_confident.iterrows()):
    annotate_image(
        row['image_path'],
        os.path.join(download_dir, 'confident'),
        df_confident,
        panel,
        crop_to_highlight=row['crop_status'],
        buffer=20,
        scaling_required=False,
        subtitle=f"{row['top_1_species']}, ({row['top_1_confidence']:.2f})"
    )

# Blank any panels that were not used
for panel in ax[len(df_confident):]:
    panel.axis('off')

plt.tight_layout()
plt.show()

## Largest Moths

In [None]:
top_n = 20

# Rank crops by bounding-box area, keep the first (largest) row of each species,
# then take the top_n largest and renumber them from zero
df_area = (
    df.sort_values(by='crop_area', ascending=False)
    .reset_index(drop=True)
    .drop_duplicates(subset=['top_1_species'])
    .head(top_n)
    .reset_index(drop=True)
)

In [None]:
# Fetch the image behind every selected row into the 'largest' sub-folder
largest_image_dir = os.path.join(download_dir, 'largest')
os.makedirs(largest_image_dir, exist_ok=True)

for s3_key in df_area['keys']:
    download_images(client, transfer_config, s3_key, largest_image_dir, region)

In [None]:
# Inspect the rows selected as the largest crops
df_area

In [None]:
# Show the largest crops on a 4 x 5 grid, largest first
fig, ax = plt.subplots(4, 5, figsize=(13.33, 7.5))
ax = ax.ravel()

df_area = df_area.sort_values(by='crop_area', ascending=False)

# Pair each row of df_area with the next free panel, so the layout follows the sort
# above and can never index past the end of the grid
for panel, (_, row) in zip(ax, df_area.iterrows()):
    annotate_image(
        row['image_path'],
        os.path.join(download_dir, 'largest'),
        df_area,
        panel,
        crop_to_highlight=row['crop_status'],
        buffer=20,
        scaling_required=False,
        subtitle=f"{row['top_1_species']}, ({row['top_1_confidence']:.2f})"
    )

# Blank any panels that were not used (fewer rows than panels)
for panel in ax[len(df_area):]:
    panel.axis('off')

plt.tight_layout()
plt.show()

## Blurriest Crops

In [None]:
top_n = 20

# Rank crops by crop_bluriness (highest first), keep the first row of each species,
# then take the top_n and renumber them from zero.
# NOTE(review): assumes a larger crop_bluriness means a blurrier crop — confirm.
df_blur = (
    df.sort_values(by='crop_bluriness', ascending=False)
    .reset_index(drop=True)
    .drop_duplicates(subset=['top_1_species'])
    .head(top_n)
    .reset_index(drop=True)
)

In [None]:
# Fetch the image behind every selected row into the 'blur' sub-folder
blur_image_dir = os.path.join(download_dir, 'blur')
os.makedirs(blur_image_dir, exist_ok=True)

for s3_key in df_blur['keys']:
    download_images(client, transfer_config, s3_key, blur_image_dir, region)

In [None]:
# Show the crops with the highest crop_bluriness on a 4 x 5 grid, highest first
fig, ax = plt.subplots(4, 5, figsize=(13.33, 7.5))
ax = ax.ravel()

df_blur = df_blur.sort_values(by='crop_bluriness', ascending=False)

# Pair each row of df_blur with the next free panel, so the layout follows the sort
# above and can never index past the end of the grid
for panel, (_, row) in zip(ax, df_blur.iterrows()):
    annotate_image(
        row['image_path'],
        os.path.join(download_dir, 'blur'),
        df_blur,
        panel,
        crop_to_highlight=row['crop_status'],
        buffer=20,
        scaling_required=False,
        subtitle=f"Crop blur: {float(row['crop_bluriness']):.3f}\nImage blur: {float(row['image_bluriness']):.3f}"

    )

# Blank any panels that were not used (fewer rows than panels)
for panel in ax[len(df_blur):]:
    panel.axis('off')

plt.tight_layout()
plt.show()

## Blurriest Images

In [None]:
# Preview the species-level crops before ranking them by image blur
df.head()

In [None]:
top_n = 20

# Rank crops by the blur score of their parent image (highest first).
# NOTE(review): the cast presumably guards against the column having been read as
# text, and a larger image_bluriness is assumed to mean a blurrier image — confirm both.
df = df.astype({'image_bluriness': 'float'})

# Keep the first row of each species and of each distinct image score, then take
# the top_n and renumber them from zero
df_blur = (
    df.sort_values(by='image_bluriness', ascending=False)
    .reset_index(drop=True)
    .drop_duplicates(subset=['top_1_species'])
    .drop_duplicates(subset=['image_bluriness'])
    .head(top_n)
    .reset_index(drop=True)
)

In [None]:
# Fetch the image behind every selected row into the 'blur' sub-folder
blur_image_dir = os.path.join(download_dir, 'blur')
os.makedirs(blur_image_dir, exist_ok=True)

for s3_key in df_blur['keys']:
    download_images(client, transfer_config, s3_key, blur_image_dir, region)

In [None]:
# Show the images with the highest image_bluriness on a 4 x 5 grid, highest first
fig, ax = plt.subplots(4, 5, figsize=(13.33, 7.5))
ax = ax.ravel()

# This section ranks whole images, so order by the image-level score (the crop-level
# score belongs to the crop-blur section)
df_blur = df_blur.sort_values(by='image_bluriness', ascending=False)

# Pair each row of df_blur with the next free panel, so the layout follows the sort
# above and can never index past the end of the grid
for panel, (_, row) in zip(ax, df_blur.iterrows()):
    annotate_image(
        row['image_path'],
        os.path.join(download_dir, 'blur'),
        df_blur,
        panel,
        crop_to_highlight=row['crop_status'],
        buffer=20,
        scaling_required=False,
        subtitle=f"Crop blur: {float(row['crop_bluriness']):.3f}\nImage blur: {float(row['image_bluriness']):.3f}"

    )

# Blank any panels that were not used (fewer rows than panels)
for panel in ax[len(df_blur):]:
    panel.axis('off')

plt.tight_layout()
plt.show()