# Regional QC

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import boto3
import json
from boto3.s3.transfer import TransferConfig


# Set the KMP_DUPLICATE_LIB_OK flag for this process.
# NOTE(review): presumably this works around the "duplicate OpenMP runtime"
# abort raised by some numeric libraries - confirm it is still needed.
os.environ.update({"KMP_DUPLICATE_LIB_OK": "TRUE"})

In [None]:
# Retrospectively get the scaled crop area
# Read one already-downloaded snapshot to obtain the frame size in pixels.
# plt.imread is used because pyplot is imported by this notebook; the decoded
# array is shaped (rows, columns[, channels]), i.e. height comes first.
img = plt.imread('./data/qc_plots/singapore/order/20240430002900-snapshot.jpg')
image_height, image_width = img.shape[:2]

print(image_width, image_height)

# in cm (0.0264583333 cm per pixel, i.e. 2.54 cm / 96 pixels)
print('in cm:', image_width*0.0264583333, image_height*0.0264583333)

In [None]:
# Run configuration.
region = 'sgp'  # passed as the bucket name to the download calls below
country = 'singapore'

# Local folder that receives the images downloaded for the QC plots.
download_dir = f'./data/qc_plots/{country}'
os.makedirs(download_dir, exist_ok=True)

# Absolute paths of every '*_cleaned.csv' inference file for this country.
# os.listdir returns entries in arbitrary order, so the list is sorted to make
# the concatenation order (and everything derived from it) reproducible.
inference_dir = f'./data/{country}/'
inference_csvs = sorted(
    os.path.abspath(os.path.join(inference_dir, name))
    for name in os.listdir(inference_dir)
    if name.endswith('_cleaned.csv')
)

In [None]:
def annotate_image(image_path, dir, df, ax, scaling_required=False, crop_to_highlight=None, buffer=5, subtitle=None):
    """
    Draw an image on ``ax`` and overlay the bounding boxes recorded for it in ``df``.

    Box colours:
        orange - class is "moth" but the order is not Lepidoptera
        purple - class is not "moth" but the order is Lepidoptera
        green  - class is "moth" and the order is Lepidoptera
        red    - neither

    Boxes that are a moth by class or by order are also labelled with their
    top-1 species and confidence.

    Args:
        image_path (str): Value matched against ``df['image_path']``; only its
            base name is used to locate the file inside ``dir``.
        dir (str): Directory holding the local copy of the image.
        df (pandas.DataFrame): Crop table. Columns read here: ``image_path``,
            ``x_min``, ``y_min``, ``x_max``, ``y_max``, ``class_name``,
            ``order_name``, ``top_1_species``, ``top_1_confidence`` and, when
            ``crop_to_highlight`` is given, ``crop_status``.
        ax (matplotlib.axes.Axes): Axes to draw on.
        scaling_required (bool): If true, x coordinates are multiplied by
            ``300 / image_width`` and y coordinates by ``300 / image_height``.
        crop_to_highlight: ``crop_status`` value to emphasise; every other box
            and label is drawn with alpha 0.2. ``None`` disables fading.
        buffer (int): Padding added to every side of each box before scaling.
        subtitle (str): Axes title; defaults to the image file name.

    Returns:
        The object returned by ``ax.imshow``.
    """
    local_path = f"{dir}/{os.path.basename(image_path)}"
    img = plt.imread(local_path)
    subp = ax.imshow(img, origin='lower')

    # The pixel size is taken from the decoded array (rows = height,
    # columns = width) so the file is read once and no extra image handle is
    # left open.
    original_height, original_width = img.shape[:2]

    # Keyed by (is_moth, is_lepidoptera).
    box_colours = {
        (True, False): 'orange',
        (False, True): 'purple',
        (True, True): 'green',
        (False, False): 'red',
    }

    df_image = df.loc[df['image_path'] == image_path]
    for _, row in df_image.iterrows():
        x_min = row['x_min'] - buffer
        y_min = row['y_min'] - buffer
        x_max = row['x_max'] + buffer
        y_max = row['y_max'] + buffer

        if scaling_required:
            # NOTE(review): this rescales by 300 / image size; confirm that is
            # the intended direction for the coordinate space of the stored
            # boxes relative to the image being displayed.
            x_min = x_min * 300 / original_width
            y_min = y_min * 300 / original_height
            x_max = x_max * 300 / original_width
            y_max = y_max * 300 / original_height

        x = int(x_min)
        y = int(y_min)
        w = int(x_max - x_min)
        h = int(y_max - y_min)

        is_moth = row['class_name'] == "moth"
        # str() keeps the membership test valid when order_name is missing
        # (a NaN float is not iterable).
        is_lepidoptera = 'Lepidoptera' in str(row['order_name'])
        col = box_colours[(bool(is_moth), is_lepidoptera)]

        alph = 1
        if crop_to_highlight is not None and row['crop_status'] != crop_to_highlight:
            alph = 0.2

        if is_moth or is_lepidoptera:
            ax.text(x_min, y_max,
                    f"{row['top_1_species']}: {row['top_1_confidence']:.2f}",
                    color=col,
                    fontsize=5, alpha=alph,
                    verticalalignment="bottom")

        rect = plt.Rectangle((x, y), w, h, fill=False, edgecolor=col, linewidth=1, alpha=alph)
        ax.add_patch(rect)

    if not subtitle:
        subtitle = f"{os.path.basename(image_path)}"
    ax.set_title(subtitle)
    ax.axis('off')
    return subp

In [None]:
def download_images(s3_client, config, key, download_dir, bucket_name):
    """
    Download a single S3 object into ``download_dir``.

    The local file is named after the last path component of ``key``.

    Args:
        s3_client: Client exposing ``download_file``.
        config: Transfer configuration forwarded as ``Config``.
        key (str): Object key inside the bucket.
        download_dir (str): Local destination directory.
        bucket_name (str): Bucket to read from.
    """
    filename = os.path.basename(key)
    s3_client.download_file(
        bucket_name,
        key,
        os.path.join(download_dir, filename),
        Config=config,
    )

In [None]:
def initialise_session(credentials_file="credentials.json"):
    """
    Build an S3 client from the settings stored in a JSON credentials file.

    Args:
        credentials_file (str): Path to the JSON file. It must provide the keys
            ``AWS_ACCESS_KEY_ID``, ``AWS_SECRET_ACCESS_KEY``, ``AWS_REGION``
            and ``AWS_URL_ENDPOINT``.

    Returns:
        boto3.Client: S3 client bound to the configured endpoint.
    """
    with open(credentials_file, encoding="utf-8") as handle:
        settings = json.load(handle)

    session_kwargs = {
        "aws_access_key_id": settings["AWS_ACCESS_KEY_ID"],
        "aws_secret_access_key": settings["AWS_SECRET_ACCESS_KEY"],
        "region_name": settings["AWS_REGION"],
    }
    return boto3.Session(**session_kwargs).client(
        "s3", endpoint_url=settings["AWS_URL_ENDPOINT"]
    )


# Shared S3 client used by the download cells below.
client = initialise_session('./credentials.json')

In [None]:
# S3 transfer settings shared by every download in this notebook.
transfer_config = TransferConfig(
    multipart_threshold=8 * 1024 ** 2,  # 8MB
    max_concurrency=20,  # number of concurrent transfers
    io_chunksize=256 * 1024,  # 256KB
    max_io_queue=1000,
)

In [None]:
# combine into one csv
# Load every deployment CSV, keep the rows that carry a detection with a
# species prediction, and stack them into one table. The frames are collected
# in a list and concatenated once instead of growing `df` inside the loop.
frames = []
for c in inference_csvs:
    input_df = pd.read_csv(c, low_memory=False)
    input_df = input_df.loc[input_df['crop_status'] != 'NO DETECTIONS FOR IMAGE']
    # NOTE(review): duplicates are judged on the box coordinates alone, without
    # image_path (the later cells include it) - confirm this is intended.
    input_df = input_df.drop_duplicates(subset=['x_min', 'x_max', 'y_min', 'y_max'])
    input_df = input_df.loc[input_df['top_1_species'].notna()].copy()

    # Deployment name: the file name up to its first '.', without '_cleaned'.
    dep = os.path.basename(c).split('.')[0].replace('_cleaned', '')
    input_df['dep'] = dep
    input_df['crop_area'] = (input_df['x_max'] - input_df['x_min']) * (input_df['y_max'] - input_df['y_min'])

    # set new keys column as 'dep' and 'image_path' combined
    input_df['keys'] = input_df['image_path'].apply(
        lambda x: f"{dep}/snapshot_images/{os.path.basename(x)}"
    )
    frames.append(input_df)

# pd.concat rejects an empty list, so fall back to an empty frame.
df = pd.concat(frames) if frames else pd.DataFrame()

In [None]:

# Replace the row labels carried over from the individual CSVs with 0..n-1.
df.reset_index(drop=True, inplace=True)

In [None]:
# Preview the first five rows of the combined table.
df.head(n=5)

In [None]:
# Bar chart of the number of crops assigned to each predicted order.
df['order_name'].value_counts().plot(kind='bar', figsize=(6, 4))

plt.title('Predicted Order')
plt.xlabel('Order')
plt.ylabel('Number of crops')

# rotate x labels
plt.xticks(rotation=45, ha='right')

# Fit the layout last so the title and axis labels are taken into account.
plt.tight_layout()
plt.show()

In [None]:
# Number of crops per value of class_name, most frequent first.
class_counts = df['class_name'].value_counts()
class_counts

In [None]:
# Bar chart of the 30 most frequently predicted top-1 species.
df['top_1_species'].value_counts().head(30).plot(kind='bar', figsize=(6, 4))

plt.title('Most Popular Predicted Moth Species in Singapore')

# rotate x labels
plt.xticks(rotation=45, ha='right')

# Fit the layout last so the title and rotated labels are taken into account.
plt.tight_layout()
plt.show()

In [None]:
# Overlaid histograms of the confidence of the five best species predictions.
# The first entry creates the figure; the others are drawn semi-transparent on
# the same axes.
confidence_styles = [
    ('top_1_confidence', {'figsize': (6, 3)}),
    ('top_2_confidence', {'color': 'orange', 'alpha': 0.5}),
    ('top_3_confidence', {'color': 'yellow', 'alpha': 0.5}),
    ('top_4_confidence', {'color': 'green', 'alpha': 0.5}),
    ('top_5_confidence', {'color': 'purple', 'alpha': 0.5}),
]
for column, style in confidence_styles:
    df[column].plot(kind='hist', bins=20, **style)

plt.legend([f'Top {rank} Confidence' for rank in range(1, 6)])
plt.title('Confidence Distribution of Top 5 Predictions')
plt.xlabel('Confidence')

In [None]:
# Histogram of how many crops each image contributes to the table.
crops_per_image = df['image_path'].value_counts()
crops_per_image.plot(kind='hist', bins=100, figsize=(5, 3))
plt.title('Crops per image')
plt.xlabel('Number of crops per image (n > 0)')
plt.show()

In [None]:
# Bar chart with one bar per deployment; bar height is its number of crops.
df['dep'].value_counts().plot(kind='bar', figsize=(5, 3))
plt.title('Potential Moth crops per deployment')
# The x axis lists deployments; the crop count is on the y axis.
plt.xlabel('Deployment')
plt.ylabel('Number of crops')
plt.show()

In [None]:
# Histogram of the per-image crop counts.
per_image_counts = df['image_path'].value_counts()
per_image_counts.plot.hist(bins=100, figsize=(5, 3))
plt.title('Potential Moth crops per image')
plt.xlabel('Number of crops per image')
plt.show()

In [None]:
# Crop area as a fraction of the frame area measured from the sample snapshot.
frame_area = image_width * image_height
df['crop_ratio_of_field'] = df['crop_area'] / frame_area
df['crop_ratio_of_field']

In [None]:
import seaborn as sns

# Scatter of crop size (fraction of the frame) against order confidence, with
# the regression line (fitted with logx=True) drawn in red.
sns.regplot(
    data=df,
    x='order_confidence',
    y='crop_ratio_of_field',
    logx=True,
    line_kws={"color": "r"},
)
plt.show()

## Most Confident cases for each order

In [None]:
# For every deployment CSV keep the single most confident crop of each
# predicted order, then stack the per-deployment results. The frames are
# collected in a list and concatenated once instead of growing `order_df`
# inside the loop.
order_frames = []
for c in inference_csvs:
    # low_memory=False matches how the same files are read for `df`.
    input_df = pd.read_csv(c, low_memory=False)
    input_df = input_df.loc[input_df['crop_status'] != 'NO DETECTIONS FOR IMAGE']
    input_df = input_df.drop_duplicates(subset=['x_min', 'x_max', 'y_min', 'y_max'])
    input_df = input_df.sort_values(by='order_confidence', ascending=False)

    # After the descending sort, the first row of each group is its best one.
    input_df = input_df.groupby('order_name').head(1).reset_index(drop=True)

    # Deployment name: the file name up to its first '.', without '_cleaned'.
    dep = os.path.basename(c).split('.')[0].replace('_cleaned', '')
    input_df['dep'] = dep
    input_df['crop_area'] = (input_df['x_max'] - input_df['x_min']) * (input_df['y_max'] - input_df['y_min'])

    # set new keys column as 'dep' and 'image_path' combined
    input_df['keys'] = input_df['image_path'].apply(
        lambda x: f"{dep}/snapshot_images/{os.path.basename(x)}"
    )
    order_frames.append(input_df)

# pd.concat rejects an empty list, so fall back to an empty frame.
order_df = pd.concat(order_frames) if order_frames else pd.DataFrame()

In [None]:
# order by order_confidence and keep only predictions above 0.8
df_order = order_df.sort_values(by='order_confidence', ascending=False)
df_order = df_order.loc[df_order['order_confidence'] > 0.8]

# remove duplicated rows by image path, and bounding box
df_order = df_order.drop_duplicates(subset=['image_path', 'x_min', 'y_min', 'x_max', 'y_max'])

# Renumber only after all filtering so the labels are contiguous 0..n-1; the
# plotting cells use them as positions into a flat array of axes.
df_order = df_order.reset_index(drop=True)

print(df_order.shape)
df_order

In [None]:
# Fetch the snapshot behind every row of df_order into '<download_dir>/order'.
order_image_dir = os.path.join(download_dir, 'order')
os.makedirs(order_image_dir, exist_ok=True)

for key in df_order['keys']:
    print(key)
    download_images(client, transfer_config, key, order_image_dir, region)

In [None]:
# Row and column count of the confident-order table.
order_rows, order_columns = df_order.shape
(order_rows, order_columns)

In [None]:
# Annotate the image behind every row of df_order, one panel per row.
n_cols = 4
# Ceiling division gives enough rows for every entry; at least one row is kept
# so the figure can still be created when df_order is empty.
n_rows = max(1, -(-len(df_order) // n_cols))
# The height keeps the original proportion of 25 inches for 11 rows.
fig, ax = plt.subplots(n_rows, n_cols, figsize=(10, 25 * n_rows / 11), squeeze=False)
ax = ax.ravel()

# Pair rows with panels by position, so the result does not depend on the
# index labels of df_order.
for axis, (_, row) in zip(ax, df_order.iterrows()):
    annotate_image(row['image_path'],
                   os.path.join(download_dir, 'order'),
                   df,
                   axis,
                   crop_to_highlight=row['crop_status'],
                   buffer=20, scaling_required=True,
                   subtitle=row['order_name'])

# Hide the panels that received no image.
for axis in ax[len(df_order):]:
    axis.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Larger two-panel view of the first rows of df_order.
fig, ax = plt.subplots(2, 1, figsize=(10, 10))
ax = ax.ravel()

# zip stops at the shorter input, so at most one row per available panel is
# drawn and df_order may hold more (or fewer) rows than there are panels.
for axis, (_, row) in zip(ax, df_order.iterrows()):
    annotate_image(row['image_path'],
                   os.path.join(download_dir, 'order'),
                   df,
                   axis,
                   crop_to_highlight=row['crop_status'],
                   buffer=20, scaling_required=True,
                   subtitle=row['order_name'])

# Hide the panels that received no image.
for axis in ax[len(df_order):]:
    axis.axis('off')

plt.tight_layout()
plt.show()

# Most Confident Species Predictions

In [None]:
# Rank all crops by their top-1 species confidence, drop rows that repeat the
# same image path and bounding box, and keep the best `top_n`.
top_n = 10

df_confident = (
    df.sort_values(by='top_1_confidence', ascending=False)
    .reset_index(drop=True)
    .drop_duplicates(subset=['image_path', 'x_min', 'y_min', 'x_max', 'y_max'])
    .head(top_n)
)

In [None]:
# Order the selected crops by image path and renumber them from 0.
df_confident = df_confident.sort_values(by='image_path').reset_index(drop=True)
df_confident

In [None]:
# download the relevant images
confident_dir = os.path.join(download_dir, 'confident')
os.makedirs(confident_dir, exist_ok=True)

# download_images expects the S3 client and the transfer configuration as its
# first two arguments, followed by key, destination directory and bucket.
for _, row in df_confident.head(top_n).iterrows():
    download_images(client, transfer_config, row['keys'], confident_dir, region)