In [1]:
import os
import shutil
import numpy as np
import pandas as pd
from multiprocessing.pool import ThreadPool

In [2]:
def copy_file(src_path, dst_path):
    """Copy one file from ``src_path`` to ``dst_path``.

    Thin wrapper around ``shutil.copy`` so the copy can be dispatched as a
    two-argument task (e.g. through ``ThreadPool.starmap``).

    Parameters
    ----------
    src_path : str or path-like
        File to copy.
    dst_path : str or path-like
        Target path handed to ``shutil.copy``.

    Returns
    -------
    None
        The destination path returned by ``shutil.copy`` is discarded.
    """
    shutil.copy(src=src_path, dst=dst_path)

def copy_files(files_list, src_dir, dst_dir, num_threads=8):
    """Copy the named files from ``src_dir`` into ``dst_dir`` using a thread pool.

    Parameters
    ----------
    files_list : iterable of str
        File names (relative to ``src_dir``) to copy. Any iterable is
        accepted, including one-shot generators.
    src_dir : str
        Folder the files are read from.
    dst_dir : str
        Folder the files are written to. It is created if it does not exist.
    num_threads : int, default 8
        Number of worker threads in the pool.

    Returns
    -------
    None

    Raises
    ------
    Exception
        Whatever ``copy_file`` raises for a file (for example when a source
        file is missing) is re-raised by ``starmap`` after the pool has been
        shut down.
    """
    # Materialise once: the names are needed for both the source and the
    # destination paths, and a generator could only be consumed a single time.
    file_names = list(files_list)

    # The copy itself never creates folders, so make sure the target exists.
    os.makedirs(dst_dir, exist_ok=True)

    # Nothing to copy: skip starting worker threads altogether.
    if not file_names:
        return

    copy_jobs = [
        (os.path.join(src_dir, name), os.path.join(dst_dir, name))
        for name in file_names
    ]

    pool = ThreadPool(num_threads)
    try:
        pool.starmap(copy_file, copy_jobs)
    finally:
        # Always release the worker threads, even when a copy failed.
        pool.close()
        pool.join()

In [3]:
# Folder holding the segmented beetle images and their metadata CSV.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
data_dir=r"C:\Users\gcmar\Desktop\DATA\Beetle_classification_deep_data_segmented"

# Build the CSV path with os.path.join instead of hand-concatenating a separator.
metadata_csv_path = os.path.join(data_dir, "segmented_images_metadata.csv")

# "Unnamed: 0" is used as the row index (presumably the index column written by
# an earlier DataFrame.to_csv -- confirm). low_memory=False makes pandas parse
# the file in one pass instead of chunk-wise, avoiding mixed-dtype columns.
metadata_df = pd.read_csv(metadata_csv_path, index_col="Unnamed: 0", low_memory=False)

In [4]:
# Preview the loaded metadata table; the displayed output elides the middle
# rows and columns ("...") because the frame is too large to show in full.
metadata_df

Unnamed: 0,centroid-0,centroid-1,bbox-0,bbox-1,bbox-2,bbox-3,orientation,axis_major_length,axis_minor_length,area,...,real_area,kmeans_label,circle_class,pixel_count,composite_image_path,species,vial,subset,composite_image_number,segmented_image_name
0,902.268968,2434.645990,669,2159,1176,2680,0.851181,551.446345,208.663148,77526,...,2.186159,1,non_circle,77554,Z:\lab records\Christopher_Marais\Beetle_class...,Coccotypes_dactyliperda,16296,1,52,Coccotypes_dactyliperda_16296_1_0052_0
1,1126.571843,617.557308,951,321,1310,897,1.244527,533.552869,184.634309,71691,...,2.021593,1,non_circle,71716,Z:\lab records\Christopher_Marais\Beetle_class...,Coccotypes_dactyliperda,16296,1,52,Coccotypes_dactyliperda_16296_1_0052_1
2,1278.681092,1165.795350,1053,916,1552,1452,-0.894788,567.996227,196.588315,76511,...,2.157012,1,non_circle,76520,Z:\lab records\Christopher_Marais\Beetle_class...,Coccotypes_dactyliperda,16296,1,52,Coccotypes_dactyliperda_16296_1_0052_2
3,1605.594882,2032.423819,1344,1766,1884,2305,0.817268,584.804556,204.174837,88073,...,2.483354,1,non_circle,88097,Z:\lab records\Christopher_Marais\Beetle_class...,Coccotypes_dactyliperda,16296,1,52,Coccotypes_dactyliperda_16296_1_0052_3
4,1697.276058,749.737463,1543,486,1879,1032,-1.357185,488.424417,172.302348,61679,...,1.739225,1,non_circle,61699,Z:\lab records\Christopher_Marais\Beetle_class...,Coccotypes_dactyliperda,16296,1,52,Coccotypes_dactyliperda_16296_1_0052_4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38299,4387.395919,2062.724121,4145,1892,4616,2251,0.471133,430.302729,159.668073,50283,...,1.496886,1,non_circle,50338,Z:\lab records\Christopher_Marais\Beetle_class...,Xyleborus_affinis,22828,1-1,6555,Xyleborus_affinis_22828_1-1_6555_34
38300,4444.338758,3110.142140,4249,2878,4697,3359,-0.971264,463.192137,172.039486,56958,...,1.709563,1,non_circle,57490,Z:\lab records\Christopher_Marais\Beetle_class...,Xyleborus_affinis,22828,1-1,6555,Xyleborus_affinis_22828_1-1_6555_35
38301,4581.270791,1120.434139,4336,926,4823,1333,0.556566,458.425670,175.599140,60423,...,1.796780,1,non_circle,60423,Z:\lab records\Christopher_Marais\Beetle_class...,Xyleborus_affinis,22828,1-1,6555,Xyleborus_affinis_22828_1-1_6555_36
38302,4542.008141,706.795962,4385,443,4717,978,-1.339780,466.504137,176.460841,61660,...,1.833565,1,non_circle,61660,Z:\lab records\Christopher_Marais\Beetle_class...,Xyleborus_affinis,22828,1-1,6555,Xyleborus_affinis_22828_1-1_6555_37


In [5]:
# Relabel circle_class as 'unknown' for every composite image in which no
# circle was detected.
# Unique composite image paths seen under each circle_class label.
df_unique_composite_per_circle_class = metadata_df.groupby('circle_class')['composite_image_path'].unique()
# Composite images that have 'non_circle' segments but no 'circle' segment.
unkown_circle_arr = np.setdiff1d(df_unique_composite_per_circle_class['non_circle'], df_unique_composite_per_circle_class['circle'])
# Change the circle class value to 'unknown' for all segments of those composites.
metadata_df.loc[metadata_df['composite_image_path'].isin(unkown_circle_arr), 'circle_class'] = 'unknown'

# Create the file-name lists of known circle, known non-circle and unknown
# images; the image files are named <segmented_image_name>.JPG.
circle_df = metadata_df[metadata_df['circle_class']=='circle']
circles_lst = (circle_df['segmented_image_name']+'.JPG').tolist()
# Select 'non_circle' explicitly: a `!= 'circle'` filter would also match the
# rows relabelled 'unknown' above and put them in the known non-circle list.
non_circle_df = metadata_df[metadata_df['circle_class']=='non_circle']
non_circles_lst = (non_circle_df['segmented_image_name']+'.JPG').tolist()
unknown_df = metadata_df[metadata_df['circle_class']=='unknown']
unknown_lst = (unknown_df['segmented_image_name']+'.JPG').tolist()

In [6]:
# Destination root for the re-labelled metadata and the per-class image folders.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
output_dir = r"C:\Users\gcmar\Desktop\DATA\segmented_circle_images"

# DataFrame.to_csv does not create missing folders, so make sure the root exists.
os.makedirs(output_dir, exist_ok=True)

# Save the newly edited dataframe to disk.
metadata_df.to_csv(os.path.join(output_dir, "segmented_images_metadata.csv"))

# Copy each group of images to its own sub-folder. os.path.join replaces the
# previous r"\\<name>" suffixes, which (being raw strings) inserted a doubled
# backslash into the destination path.
for class_dir_name, class_files in (
    ("circle", circles_lst),
    ("non_circle", non_circles_lst),
    ("unknown", unknown_lst),
):
    class_dir = os.path.join(output_dir, class_dir_name)
    os.makedirs(class_dir, exist_ok=True)
    copy_files(files_list=class_files, src_dir=data_dir, dst_dir=class_dir, num_threads=8)

In [7]:
# TRAIN
# get list of all circle images and copy to new folder
# get list of all non-circle images
# train classifier with fastai

In [8]:
# APPLY
# import metadata file
# get list of filenames of segmented images
# create new column of circle classification
# open images in loop
# classify images and add value to new column

In [9]:
# Explainable AI
# LIME
# SHAP