In [29]:
import os
import shutil
import numpy as np
import pandas as pd
from multiprocessing.pool import ThreadPool
from sklearn.model_selection import train_test_split

In [57]:
def copy_file(src_path, dst_path):
    shutil.copy(src_path, dst_path)

def copy_files(files_list, src_dir, dst_dir, num_threads=8):
    src_paths = [os.path.join(src_dir, f) for f in files_list]
    dst_paths = [os.path.join(dst_dir, f) for f in files_list]
    pool = ThreadPool(num_threads)
    pool.starmap(copy_file, zip(src_paths, dst_paths))
    pool.close()
    pool.join()
    
# move files in list from source directory to target directory
def move_files(source, destination, file_list):
    unmoved_files = 0
    moved_files = 0
    for file_name in file_list:
        try:
            file_name = file_name.rstrip()
            shutil.move(f'{source}\\{file_name}', destination)
            moved_files+=1
        except:
            unmoved_files+=1
    print("Number of files moved: ", moved_files)
    print("Number of files not moved: ", unmoved_files)

In [3]:
# data_dir=r"C:\Users\gcmar\Desktop\DATA\Beetle_classification_deep_data_segmented"
data_dir=r"F:\Beetle_classification_deep_data_segmented"
metadata_df = pd.read_csv(data_dir+"\\segmented_images_metadata.csv", index_col="Unnamed: 0",low_memory=False)

In [4]:
metadata_df

Unnamed: 0,centroid-0,centroid-1,bbox-0,bbox-1,bbox-2,bbox-3,orientation,axis_major_length,axis_minor_length,area,...,real_area,kmeans_label,circle_class,pixel_count,composite_image_path,species,vial,subset,composite_image_number,segmented_image_name
0,902.268968,2434.645990,669,2159,1176,2680,0.851181,551.446345,208.663148,77526,...,2.186159,1,non_circle,77554,Z:\lab records\Christopher_Marais\Beetle_class...,Coccotypes_dactyliperda,16296,1,52,Coccotypes_dactyliperda_16296_1_0052_0
1,1126.571843,617.557308,951,321,1310,897,1.244527,533.552869,184.634309,71691,...,2.021593,1,non_circle,71716,Z:\lab records\Christopher_Marais\Beetle_class...,Coccotypes_dactyliperda,16296,1,52,Coccotypes_dactyliperda_16296_1_0052_1
2,1278.681092,1165.795350,1053,916,1552,1452,-0.894788,567.996227,196.588315,76511,...,2.157012,1,non_circle,76520,Z:\lab records\Christopher_Marais\Beetle_class...,Coccotypes_dactyliperda,16296,1,52,Coccotypes_dactyliperda_16296_1_0052_2
3,1605.594882,2032.423819,1344,1766,1884,2305,0.817268,584.804556,204.174837,88073,...,2.483354,1,non_circle,88097,Z:\lab records\Christopher_Marais\Beetle_class...,Coccotypes_dactyliperda,16296,1,52,Coccotypes_dactyliperda_16296_1_0052_3
4,1697.276058,749.737463,1543,486,1879,1032,-1.357185,488.424417,172.302348,61679,...,1.739225,1,non_circle,61699,Z:\lab records\Christopher_Marais\Beetle_class...,Coccotypes_dactyliperda,16296,1,52,Coccotypes_dactyliperda_16296_1_0052_4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32459,1204.977287,2932.880730,677,2582,1754,3262,0.209518,1060.008301,363.079292,269539,...,0.000000,1,non_circle,269628,Z:\lab records\Christopher_Marais\Beetle_class...,Platypus_cylindrus,22849,5,0065,Platypus_cylindrus_22849_5_0065_3
32460,2177.058473,1305.929270,1928,788,2526,1820,1.363932,1015.882306,357.259946,258306,...,0.000000,1,non_circle,259200,Z:\lab records\Christopher_Marais\Beetle_class...,Platypus_cylindrus,22849,5,0065,Platypus_cylindrus_22849_5_0065_4
32461,2594.744857,2537.921139,2133,2010,2979,3023,-0.988549,1082.147025,374.123220,286138,...,0.000000,1,non_circle,286247,Z:\lab records\Christopher_Marais\Beetle_class...,Platypus_cylindrus,22849,5,0065,Platypus_cylindrus_22849_5_0065_5
32462,3848.317215,2570.917765,3314,2274,4412,3003,-0.173802,1095.605815,395.401014,292272,...,0.000000,1,non_circle,292323,Z:\lab records\Christopher_Marais\Beetle_class...,Platypus_cylindrus,22849,5,0065,Platypus_cylindrus_22849_5_0065_6


In [5]:
# rename circle class to unkown when no circle was detected
df_unique_composite_per_circle_class = metadata_df.groupby('circle_class')['composite_image_path'].unique()
# find the composite images that do not have a circle detected
unkown_circle_arr = np.setdiff1d(df_unique_composite_per_circle_class['non_circle'], df_unique_composite_per_circle_class['circle'])
# change circle class value ot unkown
metadata_df.loc[metadata_df['composite_image_path'].isin(unkown_circle_arr), 'circle_class'] = 'unknown'

# create list of known circle images and non-circle images
circle_df = metadata_df[metadata_df['circle_class']=='circle']
circles_lst = (circle_df['segmented_image_name']+'.JPG').tolist()
non_circle_df = metadata_df[metadata_df['circle_class']=='non_circle']
non_circles_lst = (non_circle_df['segmented_image_name']+'.JPG').tolist()
unknown_df = metadata_df[metadata_df['circle_class']=='unknown']
unknown_lst = (unknown_df['segmented_image_name']+'.JPG').tolist()

In [6]:
# output_dir = r"C:\Users\gcmar\Desktop\DATA\segmented_circle_images"
output_dir = r"F:\segmented_circle_images"

# save the newly edited dataframe to disk
metadata_df.to_csv(r"F:\ball_classifier\segmented_images_metadata.csv")

# copy known circle images ot new locations
copy_files(files_list=circles_lst, src_dir=data_dir, dst_dir=output_dir+r"\\circle", num_threads=8)
copy_files(files_list=non_circles_lst, src_dir=data_dir, dst_dir=output_dir+r"\\non_circle", num_threads=8)
copy_files(files_list=unknown_lst, src_dir=data_dir, dst_dir=output_dir+r"\\unknown", num_threads=8)

In [25]:
# sort out circle and non-circle images manually and add true labels
circle_lst = os.listdir(output_dir+r"\\circle")
circle_lst = [os.path.splitext(filename)[0] for filename in circle_lst]
metadata_df['target'] = 'non_ball'
metadata_df.loc[metadata_df['segmented_image_name'].isin(circle_lst), 'target'] = 'ball'

# save the newly edited dataframe to disk
metadata_df.to_csv(r"F:\ball_classifier\segmented_images_metadata.csv")

In [64]:
# split data into training and validation sets
y = metadata_df['target']
X = metadata_df['segmented_image_name']

# split data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

file_lst = (X_val+".JPG").tolist()
# move circles data
move_files(source=r"F:\segmented_circle_images\train\circle", destination=r"F:\segmented_circle_images\valid\circle", file_list=file_lst)
# move non_circles data
move_files(source=r"F:\segmented_circle_images\train\non_circle", destination=r"F:\segmented_circle_images\valid\non_circle", file_list=file_lst)

Number of files moved:  442
Number of files not moved:  10272
Number of files moved:  10272
Number of files not moved:  442
