# Searching for Specific Moths

In this notebook, we search a dataset of inference results for specific moths. Specifically, those which are:
- large (by crop area)
- on a provided list of interesting species (e.g. sloth moths)
- from species with the highest test accuracy

In [None]:
import os
import pandas as pd
import boto3
import json
from boto3.s3.transfer import TransferConfig
from PIL import Image
import numpy as np
from tqdm import tqdm

os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

In [None]:
# Set the working directory so the relative paths used below
# (./data, ./sandbox, ./credentials.json) resolve against ~/amber-inferences
os.chdir(os.path.expanduser('~/amber-inferences'))

In [None]:
# `region` is later passed to output_crops, which uses it as the S3 bucket name;
# `country` is used to build the data paths below.
region='aia'
country='anguilla'
# Local output directory (relative to the working directory)
download_dir=f'./data/qc_plots/{country}'
os.makedirs(download_dir, exist_ok=True)

# Root directory that is searched recursively for inference CSVs
inference_dir = os.path.abspath(f'/gws/nopw/j04/ceh_generic/kgoldmann/{country}_inferences_tracking/')

#listdir recursively
def listdir_recursive(path):
    """Lazily yield the full path of every file found anywhere beneath ``path``."""
    for parent, _subdirs, filenames in os.walk(path):
        yield from (os.path.join(parent, name) for name in filenames)

# Collect every CSV under the inference directory, ignoring any path that mentions 'compute'
inference_csvs = [
    csv_path
    for csv_path in listdir_recursive(inference_dir)
    if csv_path.endswith('.csv') and 'compute' not in csv_path
]

In [None]:
# Peek at the first CSV path to sanity-check the listing
inference_csvs[0]

In [None]:
# Label every CSV with the name of the folder that contains it
deps = [os.path.basename(parent) for parent in map(os.path.dirname, inference_csvs)]

# print how many CSVs carry each label
print(pd.Series(deps).value_counts())

In [None]:
# Total number of inference CSVs found
len(inference_csvs)

## Plotting and Data Wrangling Functions

In [None]:
def download_images(s3_client, config, key, download_dir, bucket_name):
    """
    Download one object from a bucket into ``download_dir``.

    The local file is named after the final path component of ``key``.

    Args:
        s3_client: Client object exposing ``download_file``.
        config: Transfer configuration, forwarded as ``Config``.
        key (str): Object key within the bucket.
        download_dir (str): Local directory the file is written into.
        bucket_name (str): Name of the bucket holding ``key``.
    """
    local_name = os.path.basename(key)
    s3_client.download_file(
        bucket_name,
        key,
        os.path.join(download_dir, local_name),
        Config=config,
    )

In [None]:
def initialise_session(credentials_file="credentials.json"):
    """
    Build an S3 client from the AWS settings stored in a JSON credentials file.

    Args:
        credentials_file (str): Path to the credentials JSON file. It must
            provide the keys ``AWS_ACCESS_KEY_ID``, ``AWS_SECRET_ACCESS_KEY``,
            ``AWS_REGION`` and ``AWS_URL_ENDPOINT``.

    Returns:
        boto3.Client: Initialised S3 client bound to the configured endpoint.
    """
    with open(credentials_file, encoding="utf-8") as handle:
        settings = json.load(handle)

    aws_session = boto3.Session(
        aws_access_key_id=settings["AWS_ACCESS_KEY_ID"],
        aws_secret_access_key=settings["AWS_SECRET_ACCESS_KEY"],
        region_name=settings["AWS_REGION"],
    )
    return aws_session.client("s3", endpoint_url=settings["AWS_URL_ENDPOINT"])

# Module-level S3 client; output_crops() reads this global when downloading images
client = initialise_session('./credentials.json')

In [None]:
# Transfer configuration for optimised S3 download
# (output_crops() reads this global and forwards it to download_images)
transfer_config = TransferConfig(
    max_concurrency=20,  # Increase the number of concurrent transfers
    multipart_threshold=8 * 1024 * 1024,  # 8MB
    max_io_queue=1000,
    io_chunksize=262144,  # 256KB
)

In [None]:
def subset_by_species(inference_csvs, species_names, top_n=1, confidence_threshold=0):
    """
    Collect the inference rows whose predicted species is in ``species_names``.

    For each CSV, a row is kept when, for any rank ``i`` in ``1..top_n``, its
    ``top_{i}_species`` is one of ``species_names`` and the matching
    ``top_{i}_confidence`` is strictly greater than ``confidence_threshold``.
    Each row is returned at most once, even if it matches at several ranks.

    Args:
        inference_csvs (iterable of str): Paths of the inference CSV files.
        species_names (iterable of str): Species names to search for.
        top_n (int): Number of prediction ranks to check.
        confidence_threshold (float): Exclusive lower bound on the confidence.

    Returns:
        pandas.DataFrame: Matching rows from every readable file, with an extra
        ``key`` column of the form
        ``<csv parent folder>/snapshot_images/<image file name>``.
        A column-less empty DataFrame if nothing matched.
    """
    dataframes = []

    # Set membership keeps the isin() lookups cheap for long species lists
    species_set = set(species_names)

    for c in tqdm(inference_csvs, desc='reading in the csvs'):
        try:
            input_df = pd.read_csv(c, low_memory=False)
        except Exception as e:
            # Best effort: report the unreadable file and carry on with the rest
            print(f" - Error reading {c}: {e}")
            continue

        # One boolean mask accumulated over all ranks. OR-ing the per-rank masks
        # (instead of concatenating per-rank subsets) guarantees a row that
        # matches at several ranks appears only once in the output.
        keep_mask = pd.Series(False, index=input_df.index)

        for i in range(top_n):
            col_name = f'top_{i+1}_species'
            confidence_col = f'top_{i+1}_confidence'

            if col_name not in input_df.columns:
                # Matches found at earlier ranks are still kept
                print(f" - Column {col_name} not found in {c}. Not checking further ranks for this file.")
                break

            if confidence_col not in input_df.columns:
                print(f" - Column {confidence_col} not found in {c}. Skipping this column.")
                continue

            species_mask = input_df[col_name].isin(species_set)
            confidence_mask = input_df[confidence_col] > confidence_threshold
            keep_mask |= species_mask & confidence_mask

        if not keep_mask.any():
            continue

        # Copy so that adding the key column never touches the frame read from disk
        subset_df = input_df[keep_mask].copy()

        # The key is built from the CSV's parent folder name and the image file name
        prefix = os.path.basename(os.path.dirname(c)) + "/snapshot_images/"
        subset_df['key'] = prefix + subset_df['image_path'].apply(os.path.basename)

        dataframes.append(subset_df)

    # Single concatenation at the end
    if dataframes:
        df_moths = pd.concat(dataframes, ignore_index=True)
    else:
        df_moths = pd.DataFrame()

    return df_moths

In [None]:
def subset_by_size(inference_csvs, keep_n=20, moth_only=True, size_cutoff=None, drop_duplicates=True):
    """
    Collect the largest crops from each inference CSV.

    Per file: keep rows whose ``crop_status`` contains ``'crop_'``, optionally
    restrict to moths and to crops above ``size_cutoff``, sort by ``crop_area``
    (largest first), optionally keep one row per ``track_id``, and finally take
    the first ``keep_n`` rows.

    Args:
        inference_csvs (iterable of str): Paths of the inference CSV files.
        keep_n (int): Maximum number of rows kept per file.
        moth_only (bool): Keep only rows whose ``order_name`` contains
            'Lepidoptera' or whose ``class_name`` equals 'moth'.
        size_cutoff (float or None): If given, keep only rows whose
            ``crop_area`` is strictly greater than this value.
        drop_duplicates (bool): If True and a ``track_id`` column exists, keep
            only the largest crop of each track. Rows with a missing
            ``track_id`` are never treated as duplicates of one another.

    Returns:
        pandas.DataFrame: Selected rows from every readable file, with an extra
        ``key`` column of the form
        ``<csv parent folder>/snapshot_images/<image file name>``.
        A column-less empty DataFrame if nothing qualified.
    """
    # Columns the filters below cannot work without
    required_cols = {'crop_status', 'crop_area', 'image_path'}
    if moth_only:
        required_cols |= {'order_name', 'class_name'}

    dataframes = []

    for c in tqdm(inference_csvs, desc='reading in the csvs'):
        try:
            input_df = pd.read_csv(c, low_memory=False)
        except Exception as e:
            # Best effort: report the unreadable file and carry on with the rest
            print(f" - Error reading {c}: {e}")
            continue

        # Report and skip a malformed file instead of aborting the whole run
        missing_cols = sorted(required_cols.difference(input_df.columns))
        if missing_cols:
            print(f" - Columns {missing_cols} not found in {c}. Skipping this file.")
            continue

        # na=False: a missing crop_status counts as "no crop"
        crop_mask = input_df['crop_status'].str.contains('crop_', na=False)
        input_df = input_df[crop_mask]

        if input_df.empty:
            continue

        # Filter before sorting so the sort works on as few rows as possible
        if moth_only:
            lepidoptera_mask = input_df['order_name'].str.contains('Lepidoptera', na=False)
            moth_mask = input_df['class_name'] == 'moth'
            input_df = input_df[lepidoptera_mask | moth_mask]

            if input_df.empty:
                continue

        if size_cutoff is not None:
            input_df = input_df[input_df['crop_area'] > size_cutoff]

            if input_df.empty:
                continue

        input_df = input_df.sort_values('crop_area', ascending=False)

        # After the sort, the first row of each track is its largest crop.
        # duplicated() treats all NaN values as equal, so missing track_ids are
        # excluded from the duplicate test explicitly to keep every such row.
        if 'track_id' in input_df.columns and drop_duplicates:
            track_ids = input_df['track_id']
            repeated_track = track_ids.duplicated() & track_ids.notna()
            input_df = input_df[~repeated_track]

        # Copy so that adding the key column does not write into a slice
        input_df = input_df.head(keep_n).copy()

        # The key is built from the CSV's parent folder name and the image file name
        prefix = os.path.basename(os.path.dirname(c)) + "/snapshot_images/"
        input_df['key'] = prefix + input_df['image_path'].apply(os.path.basename)

        dataframes.append(input_df)

    # Single concatenation at the end
    if dataframes:
        df_moths = pd.concat(dataframes, ignore_index=True)
    else:
        df_moths = pd.DataFrame()

    return df_moths

In [None]:
def output_crops(df, output_dir, region, buffer=5, group_by_species=True):
    """
    Download each row's source image, crop it to the row's bounding box and save the crop.

    The download uses the module-level ``client`` and ``transfer_config``. The
    full image is deleted again once the crop has been attempted. Failures are
    reported per row and do not stop the remaining rows from being processed.

    Args:
        df (pandas.DataFrame): Rows to export. Reads the columns ``key``,
            ``deployment_id``, ``crop_status``, ``x_min``, ``y_min``, ``x_max``,
            ``y_max`` and, when ``group_by_species`` is True, ``top_1_species``.
        output_dir (str): Directory the crops are written to.
        region (str): Passed to ``download_images`` as the bucket name.
        buffer (float): Margin added on every side of the bounding box. The
            box is clamped to the image bounds so the crop never extends past
            the picture.
        group_by_species (bool): If True, save each crop in a sub-folder named
            after its ``top_1_species`` (spaces replaced by underscores); rows
            without a species name are reported and skipped.

    Returns:
        None
    """
    # The subset_* helpers return a column-less DataFrame when nothing matched
    if df.empty:
        return

    if group_by_species:
        for sp in df['top_1_species'].dropna().unique():
            if isinstance(sp, str):
                os.makedirs(os.path.join(output_dir, sp.replace(' ', '_')), exist_ok=True)

    for _, row in tqdm(df.iterrows(), desc='downloading images', total=df.shape[0]):
        image_name = str(os.path.basename(row['key']))
        crop_name = f"{row['deployment_id']}_{row['crop_status']}_{image_name}"

        if group_by_species:
            species_name = row['top_1_species']
            # Decide this before downloading so no transfer is wasted on the row
            if not isinstance(species_name, str):
                print(f" - No species name for {row['key']}. Skipping this crop.")
                continue
            cropped_image_path = os.path.join(output_dir, species_name.replace(' ', '_'), crop_name)
        else:
            cropped_image_path = os.path.join(output_dir, crop_name)

        try:
            download_images(client, transfer_config, row['key'], output_dir, region)
        except Exception as e:
            print(f" - Error downloading {row['key']}: {e}")
            continue  # nothing was downloaded, so there is nothing to crop

        image_path = os.path.join(output_dir, image_name)

        try:
            try:
                with Image.open(image_path) as img:
                    # Grow the box by `buffer`, then clamp it to the image
                    left = max(0, float(row['x_min']) - buffer)
                    upper = max(0, float(row['y_min']) - buffer)
                    right = min(img.width, float(row['x_max']) + buffer)
                    lower = min(img.height, float(row['y_max']) + buffer)

                    img.crop((left, upper, right, lower)).save(cropped_image_path)
            finally:
                # Remove the full image whether or not the crop succeeded
                os.remove(image_path)
        except Exception as e:
            print(f" - Error cropping {image_path}: {e}")

# From Lists Provided 

In [None]:
# read in the txt file
def read_species_list(file_path):
    """
    Read a text file holding one species name per line.

    Args:
        file_path (str): Path to the text file (read as UTF-8).

    Returns:
        list of str: The non-blank lines, stripped of surrounding whitespace,
        in file order.
    """
    # Explicit encoding: the platform default may not decode accented names
    with open(file_path, 'r', encoding='utf-8') as file:
        species_list = [line.strip() for line in file if line.strip()]
    return species_list


# Load the list of interesting species for this region, if one has been provided
species_list_path = f'./sandbox/lists/{region}_interesting_moths.txt'  # or './sandbox/lists/sloth_moths.txt'

if os.path.exists(species_list_path):
    moths = read_species_list(species_list_path)

    # Strip the 'nr. ' marker from the names
    moths = [x.replace('nr. ', '') for x in moths]
else:
    # Always define `moths` so the following cells do not fail with a NameError
    print(f" - Species list {species_list_path} not found; continuing with an empty list.")
    moths = []

In [None]:
# Species checklist for this country; the path is relative to the working directory
all_moths = pd.read_csv(f'../gbif_download_standalone/species_checklists/{country}-moths-keys-nodup.csv')

In [None]:
# Split the requested names into those present in all_moths['species_name_provided'] and those absent
provided_match = all_moths['species_name_provided'].isin(moths)
moths_in_all = all_moths[provided_match]
found_names = set(moths_in_all['species_name_provided'].values)
moths_missing = [x for x in moths if x not in found_names]

In [None]:
# Requested names with no match in the checklist's species_name_provided column
moths_missing

In [None]:
# Match the requested names against any of the three name columns, then keep the GBIF name
name_columns = ['gbif_species_name', 'search_species_name', 'species_name_provided']
matches_any = all_moths[name_columns].isin(moths).any(axis=1)

notable_moths = all_moths.loc[matches_any, 'gbif_species_name'].tolist()
notable_moths

In [None]:
# Search the top-1 prediction only (top_n=1), keeping rows with confidence > 0
inferenced_moths = subset_by_species(inference_csvs, notable_moths, 1, 0)

In [None]:
# Preview the first matching rows
inferenced_moths.head()

In [None]:
# Number of matching rows per predicted species
inferenced_moths['top_1_species'].value_counts()

In [None]:
# Bar chart of the number of matching rows per deployment
inferenced_moths['deployment_name'].value_counts().plot(kind='bar',
                            title='Deployment Name Counts',
                            xlabel='Deployment Name',
                            ylabel='Count')


In [None]:
# plot a histogram of the top-1 confidence over all matching rows
inferenced_moths['top_1_confidence'].plot(kind='hist',
                            title='Confidence Distribution for Interesting Moths',
                            xlabel='Confidence',
                            bins=50)


In [None]:
# Imported here because this cell calls plt.show() before any other cell has imported pyplot
import matplotlib.pyplot as plt

# Keep the first row of every (deployment, track) pair; rows without a track_id are all kept
is_repeat_of_track = inferenced_moths[['deployment_name', 'track_id']].duplicated()
has_no_track = inferenced_moths['track_id'].isnull()
plot_df = inferenced_moths[(~is_repeat_of_track) | has_no_track]

plot_df['top_1_confidence'].plot(kind='hist',
                            title='Confidence Distribution (accounting for tracking)',
                            xlabel='Confidence',
                            color='orange',
                            bins=50)

plt.show()

In [None]:
# Snapshot the unfiltered results so they can be restored later
inferenced_backup = inferenced_moths.copy()

In [None]:
# Restore from the snapshot. Copy it, otherwise both names refer to the same
# DataFrame and columns added afterwards would also be written into the backup.
inferenced_moths = inferenced_backup.copy()

In [None]:
# Option to remove certain dates

# Reduce each image timestamp to its calendar date
inferenced_moths['date'] = pd.to_datetime(inferenced_moths['image_datetime']).dt.date

# get the number of occurrences per date, split by deployment and overall
occurrences_per_date_by_deployment = (
    inferenced_moths[['date', 'deployment_name']].value_counts().sort_index()
)

occurrences_per_date = inferenced_moths['date'].value_counts().sort_index()
occurrences_per_date

In [None]:
# plot date vs count
import matplotlib.pyplot as plt

# Line plot of the number of matching rows recorded on each date
plt.figure(figsize=(12, 6))
plt.plot(occurrences_per_date.index, occurrences_per_date.values, marker='o')
plt.title('Occurrences Over Time')
plt.xlabel('Date')
plt.ylabel('Number of Occurrences')
plt.xticks(rotation=45)
plt.grid()
plt.show()

In [None]:
plt.figure(figsize=(12, 6))

# One scatter series per deployment, so matplotlib colours each deployment differently
for dep in inferenced_moths['deployment_name'].astype('category').cat.categories:
    dep_mask = inferenced_moths['deployment_name'] == dep
    plt.scatter(inferenced_moths.loc[dep_mask, 'date'], inferenced_moths.loc[dep_mask, 'top_1_confidence'], label=dep)
plt.legend()
# The y-axis is the prediction confidence, not a count, so the title says so
plt.title('Prediction Confidence Over Time')
plt.xlabel('Date')
plt.ylabel('Prediction Confidence')
plt.xticks(rotation=45)
plt.grid()
plt.show()

In [None]:
plt.figure(figsize=(12, 6))
# One line per deployment: level 1 of the (date, deployment_name) index is the
# deployment name, level 0 the date
for deployment_name, group in occurrences_per_date_by_deployment.groupby(level=1):
    plt.plot(group.index.get_level_values(0), group.values, marker='o', label=deployment_name)
plt.title('Occurrences Over Time by Deployment')
plt.xlabel('Date')
plt.ylabel('Number of Occurrences')
plt.xticks(rotation=45)
plt.grid()
plt.legend(title='Deployment Name')
plt.show()

In [None]:
# Destination for the crops of the listed species.
# NOTE(review): the folder name says 'thai' although `country` is set to
# 'anguilla' earlier in the notebook — confirm this is the intended destination.
download_dir = '/gws/nopw/j04/ceh_generic/kgoldmann/thai_interesting_moths/'

os.makedirs(download_dir, exist_ok=True)

In [None]:
import datetime

# Rank every match by confidence (highest first), then keep those above 0.8
confident_moths = (
    inferenced_moths
    .sort_values('top_1_confidence', ascending=False)
    .loc[lambda ranked: ranked['top_1_confidence'] > 0.8]
)
# confident_moths = confident_moths.groupby('top_1_species').head(1000)
confident_moths.head()

In [None]:
# NOTE: this value is not displayed, because it is not the last expression of the cell
confident_moths['top_1_species'].value_counts()

# subset to 100 of each top_1_species; confident_moths is sorted by confidence
# (highest first), so these are the 100 most confident rows per species
subset_confident_moths = confident_moths.groupby('top_1_species').head(100)

In [None]:
# Download and crop the selected rows, one sub-folder per species
output_crops(subset_confident_moths, download_dir, region, buffer=5, group_by_species=True)

# High Test Accuracy Moths

In [None]:
# load in the json files
def load_json(filename):
    """
    Parse a JSON file and return its content.

    Args:
        filename (str): Path to the JSON file (read as UTF-8).

    Returns:
        The decoded JSON document.
    """
    # Explicit encoding: the platform default may not be UTF-8
    with open(filename, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

# Version string used in the accuracy file name, per country
region_list = {
    'costarica':'03',
    'singapore':'02',
    'thailand':'02',
}

# Fail with an actionable message when the current country has no entry
if country not in region_list:
    raise KeyError(f"No accuracy-file version registered for country '{country}'; add it to region_list")

accuracy = load_json(f'/home/users/katriona/amber-inferences/sandbox/turing-{country}_v{region_list[country]}_taxon-accuracy.json')

# Remove the metadata entry by name (not by position) so that only the
# per-taxonomic-level accuracy tables remain in `accuracy`
info = accuracy.pop('About')

In [None]:
# Number of entries in the species-level accuracy table
len(accuracy['species'])

In [None]:
tax_df_list = {}

for tax, tax_acc in accuracy.items():
    print(tax)

    # One row per taxon: its name (in a column named after the level),
    # its top-1 accuracy and its number of test points
    df = (
        pd.DataFrame.from_dict(tax_acc, orient='index', columns=['Top1 Accuracy', 'Total Test Points'])
        .reset_index()
        .rename(columns={'index': tax})
    )

    # Estimate the training-set size from the test-set size
    # (assumes a 15% test / 75% train split — TODO confirm)
    df['Total Train Points'] = df['Total Test Points']/0.15 * 0.75

    tax_df_list[tax] = df

In [None]:
# Aside: check which species in the accuracy table belong to one genus
all_species = tax_df_list['species']['species']

# subset to the names in all_species that contain 'Timocratica'
species_names = [s for s in all_species if 'Timocratica' in s]
species_names

In [None]:
# NOTE: this value is not displayed, because it is not the last expression of the cell
tax_df_list['species']

# Keep species with high test accuracy and enough (estimated) training data
# (assumes 'Top1 Accuracy' is expressed as a percentage — TODO confirm)
no_sig = tax_df_list['species']
no_sig = no_sig.loc[(no_sig['Top1 Accuracy'] > 90) & (no_sig['Total Train Points'] > 200)]

print(f'There are {no_sig.shape[0]} species with >90% accuracy and >200 training points')

In [None]:
# filter the inferences for these species (top-1 prediction only, confidence > 0)
df_moths = subset_by_species(inference_csvs, no_sig['species'])
df_moths.head()

In [None]:
# Number of matching rows per predicted species, as a one-column table
crops = pd.DataFrame(df_moths['top_1_species'].value_counts())
crops

In [None]:
df_moths.head()

# Take up to 20 random rows per top_1_species. Shuffling the whole table once
# (fixed seed) and keeping the first 20 rows of each species caps every species
# at its own size, so a rare species keeps all of its rows and does not shrink
# the sample taken from the others.
df_moths_subset = (
    df_moths.sample(frac=1, random_state=42)
    .groupby('top_1_species')
    .head(20)
    .sort_values('top_1_species', kind='stable')  # keep each species' rows together
    .reset_index(drop=True)
)

In [None]:
download_dir = f'/gws/nopw/j04/ceh_generic/kgoldmann/{region}_confident_species'
# to_csv does not create missing directories
os.makedirs(download_dir, exist_ok=True)

# save the csv (f-string so the region is substituted into the file name)
df_moths_subset.to_csv(os.path.join(download_dir, f'{region}_confident_species.csv'), index=False)

In [None]:
# Download and crop the sampled rows, one sub-folder per species (the default)
output_crops(df_moths_subset, download_dir, region, buffer=5)

# Large Crops

In [None]:
# Up to 10 largest crops per CSV with crop_area > 20000, moths and non-moths
# alike, without de-duplicating by track
large_moths_backup = subset_by_size(inference_csvs, keep_n=10, moth_only=False, size_cutoff=20000, drop_duplicates=False)

In [None]:
# Work on a copy so the backup stays untouched
large_moths = large_moths_backup.copy()

In [None]:
# Largest crops first
large_moths = large_moths.sort_values('crop_area', ascending=False)

In [None]:
# Split the large crops into moths and non-moths.
# na=False treats a missing order_name as "not Lepidoptera" (as subset_by_size
# does) instead of putting NaN into the boolean mask.
is_moth = large_moths['order_name'].str.contains('Lepidoptera', na=False) | (large_moths['class_name'] == 'moth')

# Copies, so that columns can be added to either table later without writing into a slice
large_nonmoths = large_moths.loc[~is_moth].copy()
large_moths = large_moths.loc[is_moth].copy()

In [None]:
# When the CSVs carry no deployment_id column, derive one from the first
# path component of the key (the name of the CSV's parent folder)
if 'deployment_id' not in large_moths.columns:
    large_moths['deployment_id'] = [x.split('/')[0] for x in large_moths['key']]
    large_nonmoths['deployment_id'] = [x.split('/')[0] for x in large_nonmoths['key']]

In [None]:
# large_moths = large_moths[large_moths['crop_area'] > 20000]

# (rows, columns) of the moth and non-moth tables
print(large_moths.shape)
print(large_nonmoths.shape)

In [None]:
# Export the large moth crops, one sub-folder per species (the default)
download_dir = f'/gws/nopw/j04/ceh_generic/kgoldmann/interesting_crops/{region}/large_moths/'
os.makedirs(download_dir, exist_ok=True)
output_crops(large_moths, download_dir, region, buffer=5)

In [None]:
# Export the large non-moth crops into a single flat folder
download_dir = f'/gws/nopw/j04/ceh_generic/kgoldmann/interesting_crops/{region}/large_nonmoths/'
os.makedirs(download_dir, exist_ok=True)
output_crops(large_nonmoths, download_dir, region, buffer=5, group_by_species=False)


In [None]:
# Look up the predicted species for the row(s) whose image path contains this timestamp-like string
large_moths.loc[large_moths['image_path'].str.contains('20240914015359'), 'top_1_species']

# High enough species confidence

In [None]:
# Large moth crops whose top-1 species prediction has confidence above 0.8
species = large_moths.loc[large_moths['top_1_confidence'] > 0.8, ]

print(species.shape)

In [None]:
# Export the confidently identified large moths, one sub-folder per species.
# NOTE(review): this is the same directory the large_moths export writes to — confirm that is intended.
download_dir = f'/gws/nopw/j04/ceh_generic/kgoldmann/interesting_crops/{region}/large_moths/'
os.makedirs(download_dir, exist_ok=True)
output_crops(species, download_dir, region, buffer=5)