In [1]:
import pandas as pd
# import plotly.offline as pyo
# pyo.init_notebook_mode()

import plotly.io as pio
pio.renderers.default = 'iframe'
pd.options.plotting.backend = "plotly"
import plotly.graph_objects as go
import plotly.express as px
import os
import re
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.stats import iqr
import shutil
from multiprocessing.pool import ThreadPool
from random import choice, seed

In [2]:
# parameters
# meta_data_file_path = r"F:\Beetle_classification_deep_data_segmented_clean\segmented_images_metadata.csv"
# meta_data_file_path = r"C:\Users\gcmar\Dropbox (UFL)\Beetle_classification_deep_data_segmented_clean\segmented_images_metadata.csv"
meta_data_file_path = r"/blue/hulcr/gmarais/Beetle_data/Beetle_classification_deep_data_segmented_clean/segmented_images_metadata.csv"
random_state=42

In [3]:
# have script to remove dirty images after they are deleted by hand
df = pd.read_csv(meta_data_file_path, index_col='Unnamed: 0')
# add unique column for vial and subset
df["vial_subset"] = df['vial'].astype(str)+"_"+df['subset'].astype(str)
df["vial_subset_img"] = df["vial_subset"].astype(str) +"_"+ df["composite_image_number"].astype(str)
# add file name with extension
df["file_name"] = df["segmented_image_name"] + ".JPG"

# remove all detected ball bearing images
df_nz = df[df['circle_class']=='non_circle']

# select only area, iamge name and species columns
df_spcs = df_nz[['species', 'real_area', 'area', 'segmented_image_name', 'vial', 'vial_subset', 'vial_subset_img']].copy()
df_spcs_counts = df_spcs['species'].value_counts().sort_index()
df_spcs

Unnamed: 0,species,real_area,area,segmented_image_name,vial,vial_subset,vial_subset_img
0,Coccotypes_dactyliperda,2.186159,77526,Coccotypes_dactyliperda_16296_1_0052_0,16296,16296_1,16296_1_52
1,Coccotypes_dactyliperda,2.021593,71691,Coccotypes_dactyliperda_16296_1_0052_1,16296,16296_1,16296_1_52
2,Coccotypes_dactyliperda,2.157012,76511,Coccotypes_dactyliperda_16296_1_0052_2,16296,16296_1,16296_1_52
3,Coccotypes_dactyliperda,2.483354,88073,Coccotypes_dactyliperda_16296_1_0052_3,16296,16296_1,16296_1_52
4,Coccotypes_dactyliperda,1.739225,61679,Coccotypes_dactyliperda_16296_1_0052_4,16296,16296_1,16296_1_52
...,...,...,...,...,...,...,...
32459,Platypus_cylindrus,0.000000,269539,Platypus_cylindrus_22849_5_0065_3,22849,22849_5,22849_5_0065
32460,Platypus_cylindrus,0.000000,258306,Platypus_cylindrus_22849_5_0065_4,22849,22849_5,22849_5_0065
32461,Platypus_cylindrus,0.000000,286138,Platypus_cylindrus_22849_5_0065_5,22849,22849_5,22849_5_0065
32462,Platypus_cylindrus,0.000000,292272,Platypus_cylindrus_22849_5_0065_6,22849,22849_5,22849_5_0065


## Visualize data BEFORE cleaning

In [4]:
# Visualize data before cleaning
####################
# vial counts
vial_counts = []
for i in df['species'].unique():
    vials = len(df[df['species']==i]["vial"].unique())
    vial_counts.append(vials)
df_vial = pd.DataFrame.from_dict({"species":df['species'].unique(),
                                   "vial_counts":vial_counts})
df_vial = df_vial.set_index("species")
fig = df_vial.plot.bar(color=df_vial.index, 
                              title="Vial counts per species before cleaning")
fig.update_layout(autosize=True, 
                  # width=2000, 
                  height=500)
fig.show()

In [5]:
####################
# subset counts
subset_counts = []
for i in df['species'].unique():
    subsets = len(df[df['species']==i]["vial_subset"].unique())
    subset_counts.append(subsets)
df_subset = pd.DataFrame.from_dict({"species":df['species'].unique(),
                                   "subset_counts":subset_counts})
df_subset = df_subset.set_index("species")
fig = df_subset.plot.bar(color=df_subset.index, 
                              title="Subset counts per species before cleaning")
fig.update_layout(autosize=True, 
                  # width=2000, 
                  # height=500
                 )
fig.show()

In [6]:
####################
# counts of images 
fig = df_spcs_counts.plot.bar(color=df_spcs_counts.index, 
                              title="Image counts per species before cleaning")
fig.update_layout(autosize=True, 
                  # width=2000, 
                  # height=500
                 )
fig.show()

In [7]:
####################
# boxplot of area per species 
fig = px.box(df_spcs, x="species", y="area", color="species",
             notched=True, # used notched shape
             title="Species area values before cleaning",
             hover_data=["real_area", "area", "segmented_image_name"])
fig.update_traces(boxmean=True)
fig.update_traces(boxmean="sd")
fig.update_layout(autosize=True, 
                  # width=2000, 
                  # height=1000
                 )
fig.show()

# Clean data

## Strict data cleaning

In [8]:
# copy data to clean
df_cln = df_spcs.copy()
df_cln = df_cln[df_cln["real_area"]!=0.0] # remove all values that do not have a detected area
filer_column = "area" # use area becasue real_area is skewed to 0
df_cum_temp = pd.DataFrame(columns=['species', 'real_area', 'area', 'segmented_image_name', 'vial', 'vial_subset', 'vial_subset_img'])
species_ar = df_cln['species'].unique() # species names
for i in species_ar:
    df_temp = df_cln[df_cln['species']==i]
    # remove all outliers from boxplots
    percentile25 = df_temp[filer_column].quantile(0.25)
    percentile75 = df_temp[filer_column].quantile(0.75)
    IQR = iqr(df_temp[filer_column])
    upper_limit = percentile75 + 1.5 * IQR
    lower_limit = percentile25 - 1.5 * IQR
    df_temp = df_temp[(df_temp[filer_column] <= upper_limit) & (df_temp[filer_column] >= lower_limit)]
    df_cum_temp = pd.concat([df_cum_temp, df_temp])
# df_cum_temp = df_cum_temp.reset_index()
df_cln = df_cum_temp.copy()
df_cln

Unnamed: 0,species,real_area,area,segmented_image_name,vial,vial_subset,vial_subset_img
0,Coccotypes_dactyliperda,2.186159,77526,Coccotypes_dactyliperda_16296_1_0052_0,16296,16296_1,16296_1_52
1,Coccotypes_dactyliperda,2.021593,71691,Coccotypes_dactyliperda_16296_1_0052_1,16296,16296_1,16296_1_52
2,Coccotypes_dactyliperda,2.157012,76511,Coccotypes_dactyliperda_16296_1_0052_2,16296,16296_1,16296_1_52
3,Coccotypes_dactyliperda,2.483354,88073,Coccotypes_dactyliperda_16296_1_0052_3,16296,16296_1,16296_1_52
5,Coccotypes_dactyliperda,2.092544,74206,Coccotypes_dactyliperda_16296_1_0052_5,16296,16296_1,16296_1_52
...,...,...,...,...,...,...,...
31984,Xylosandrus_crassiusculus,2.113579,71302,Xylosandrus_crassiusculus_8773_5_0140_21,8773,8773_5,8773_5_0140
31986,Xylosandrus_crassiusculus,2.508262,86870,Xylosandrus_crassiusculus_8773_5_0140_23,8773,8773_5,8773_5_0140
31987,Xylosandrus_crassiusculus,2.578575,89493,Xylosandrus_crassiusculus_8773_5_0140_24,8773,8773_5,8773_5_0140
31988,Xylosandrus_crassiusculus,2.025379,69824,Xylosandrus_crassiusculus_8773_5_0140_25,8773,8773_5,8773_5_0140


In [9]:
# flag images that are clean in dataframe and save as csv
df["clean"] = False
df["clean"].loc[df_cln.index] = True
print(df["clean"].value_counts())

# save to csv
df.to_csv(meta_data_file_path)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



True     28126
False     4338
Name: clean, dtype: int64


In [10]:
df

Unnamed: 0,centroid-0,centroid-1,bbox-0,bbox-1,bbox-2,bbox-3,orientation,axis_major_length,axis_minor_length,area,...,composite_image_path,species,vial,subset,composite_image_number,segmented_image_name,vial_subset,vial_subset_img,file_name,clean
0,902.268968,2434.645990,669,2159,1176,2680,0.851181,551.446345,208.663148,77526,...,Z:\lab records\Christopher_Marais\Beetle_class...,Coccotypes_dactyliperda,16296,1,52,Coccotypes_dactyliperda_16296_1_0052_0,16296_1,16296_1_52,Coccotypes_dactyliperda_16296_1_0052_0.JPG,True
1,1126.571843,617.557308,951,321,1310,897,1.244527,533.552869,184.634309,71691,...,Z:\lab records\Christopher_Marais\Beetle_class...,Coccotypes_dactyliperda,16296,1,52,Coccotypes_dactyliperda_16296_1_0052_1,16296_1,16296_1_52,Coccotypes_dactyliperda_16296_1_0052_1.JPG,True
2,1278.681092,1165.795350,1053,916,1552,1452,-0.894788,567.996227,196.588315,76511,...,Z:\lab records\Christopher_Marais\Beetle_class...,Coccotypes_dactyliperda,16296,1,52,Coccotypes_dactyliperda_16296_1_0052_2,16296_1,16296_1_52,Coccotypes_dactyliperda_16296_1_0052_2.JPG,True
3,1605.594882,2032.423819,1344,1766,1884,2305,0.817268,584.804556,204.174837,88073,...,Z:\lab records\Christopher_Marais\Beetle_class...,Coccotypes_dactyliperda,16296,1,52,Coccotypes_dactyliperda_16296_1_0052_3,16296_1,16296_1_52,Coccotypes_dactyliperda_16296_1_0052_3.JPG,True
4,1697.276058,749.737463,1543,486,1879,1032,-1.357185,488.424417,172.302348,61679,...,Z:\lab records\Christopher_Marais\Beetle_class...,Coccotypes_dactyliperda,16296,1,52,Coccotypes_dactyliperda_16296_1_0052_4,16296_1,16296_1_52,Coccotypes_dactyliperda_16296_1_0052_4.JPG,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32459,1204.977287,2932.880730,677,2582,1754,3262,0.209518,1060.008301,363.079292,269539,...,Z:\lab records\Christopher_Marais\Beetle_class...,Platypus_cylindrus,22849,5,0065,Platypus_cylindrus_22849_5_0065_3,22849_5,22849_5_0065,Platypus_cylindrus_22849_5_0065_3.JPG,False
32460,2177.058473,1305.929270,1928,788,2526,1820,1.363932,1015.882306,357.259946,258306,...,Z:\lab records\Christopher_Marais\Beetle_class...,Platypus_cylindrus,22849,5,0065,Platypus_cylindrus_22849_5_0065_4,22849_5,22849_5_0065,Platypus_cylindrus_22849_5_0065_4.JPG,False
32461,2594.744857,2537.921139,2133,2010,2979,3023,-0.988549,1082.147025,374.123220,286138,...,Z:\lab records\Christopher_Marais\Beetle_class...,Platypus_cylindrus,22849,5,0065,Platypus_cylindrus_22849_5_0065_5,22849_5,22849_5_0065,Platypus_cylindrus_22849_5_0065_5.JPG,False
32462,3848.317215,2570.917765,3314,2274,4412,3003,-0.173802,1095.605815,395.401014,292272,...,Z:\lab records\Christopher_Marais\Beetle_class...,Platypus_cylindrus,22849,5,0065,Platypus_cylindrus_22849_5_0065_6,22849_5,22849_5_0065,Platypus_cylindrus_22849_5_0065_6.JPG,False


## Visualize data AFTER cleaning

In [11]:
# counts of images
####################
# vial counts
vial_counts = []
for i in df_cln['species'].unique():
    vials = len(df_cln[df_cln['species']==i]["vial"].unique())
    vial_counts.append(vials)
df_vial = pd.DataFrame.from_dict({"species":df_cln['species'].unique(),
                                   "vial_counts":vial_counts})
df_vial = df_vial.set_index("species")
fig = df_vial.plot.bar(color=df_vial.index, 
                              title="Vial counts per species after cleaning")
fig.update_layout(autosize=True, 
                  # width=2000, 
                  # height=500
                 )
fig.show()

In [12]:
####################
# subset counts
subset_counts = []
for i in df_cln['species'].unique():
    subsets = len(df_cln[df_cln['species']==i]["vial_subset"].unique())
    subset_counts.append(subsets)
df_subset = pd.DataFrame.from_dict({"species":df_cln['species'].unique(),
                                   "subset_counts":subset_counts})
df_subset = df_subset.set_index("species")
fig = df_subset.plot.bar(color=df_subset.index, 
                              title="Subset counts per species before cleaning")
fig.update_layout(autosize=True, 
                  # width=2000, 
                  # height=1000
                 )
fig.show()

In [13]:
####################
# total image counts
df_cln_counts = df_cln['species'].value_counts().sort_index()
fig = df_cln_counts.plot.bar(color=df_cln_counts.index, 
                              title="Image counts per species after cleaning")
fig.update_layout(autosize=True, 
                  # width=2000, 
                  # height=500
                 )
fig.show()

In [14]:
####################
# boxplot of area per species 
fig = px.box(df_cln, x="species", y="area", color="species",
             notched=True, # used notched shape
             title="Species area values before cleaning",
             hover_data=["real_area", "area", "segmented_image_name"])
fig.update_traces(boxmean=True)
fig.update_traces(boxmean="sd")
fig.update_layout(autosize=True, 
                  # width=2000, 
                  # height=1000
                 )
fig.show()

# Move data into testing and training folders

## Split data into testing and training sets

In [15]:
# split data function
def test_train_split_images(meta_data_file_path, random_state, test_size=0.1):
    # get image names from csv
    df = pd.read_csv(meta_data_file_path, index_col="Unnamed: 0")
    df = df[df['clean'] == True]
    # split data by each species. % of each species is added to test data
    train_df = pd.DataFrame(columns=['segmented_image_name', 'species'])
    test_df = pd.DataFrame(columns=['segmented_image_name', 'species'])
    for i in df['species'].unique():
        spcs_df = df[df['species']==i]
        train_i, test_i = train_test_split(spcs_df, test_size=test_size, random_state=random_state)
        train_df = pd.concat([train_df, train_i])
        test_df = pd.concat([test_df, test_i])
    train_df = train_df.reset_index()
    test_df = test_df.reset_index()
    return(train_df, test_df)

def test_train_split_unique_values(meta_data, random_state, unique_column='vial_subset', test_size=0.1, from_file=True): # can be used to also split on vials
    if from_file:
        # get image names from csv
        df = pd.read_csv(meta_data, index_col="Unnamed: 0")
    else:
        df = meta_data
    df = df[df['clean'] == True] # select only clean data
    # get number of values per species
    # value counts
    value_counts_lst = []
    species_values_dict = {}
    test_values_lst = []
    train_values_lst = []
    for i in df['species'].unique():
        # add value counts to list
        values = df[df['species']==i][unique_column].unique()
        values_num = len(values)
        value_counts_lst.append(values_num)
        # add value numbers to dict
        species_values_dict[i] = values
        # select random number of values from list and add to selection list
        train_lst,test_lst = train_test_split(values, test_size=test_size, random_state=random_state)
        test_values_lst = test_values_lst + test_lst.tolist()
        train_values_lst = train_values_lst + train_lst.tolist()
    # save dictionary to dataframe
    df_value_count = pd.DataFrame.from_dict({"species":df['species'].unique(),
                                       "value_counts":value_counts_lst})
    # only include records with species that have multiple values
    spcs_lst = df_value_count[df_value_count['value_counts']>1]['species'].tolist()
    # only select data from species where more than one value exists
    df_slctd_spcs = df[df['species'].isin(spcs_lst)]
    # select a value at random for each species
    df_test = df_slctd_spcs[df_slctd_spcs[unique_column].isin(test_values_lst)]
    df_train = df_slctd_spcs[df_slctd_spcs[unique_column].isin(train_values_lst)]
    return(df_train, df_test)

def copy_file(src_path, dst_path):
    shutil.copy(src_path, dst_path)

def copy_files(files_list, src_dir, dst_dir, num_threads=4):
    src_paths = [os.path.join(src_dir, f) for f in files_list]
    dst_paths = [os.path.join(dst_dir, f) for f in files_list]
    pool = ThreadPool(num_threads)
    pool.starmap(copy_file, zip(src_paths, dst_paths))
    pool.close()
    pool.join()

In [16]:
# apply split and move of data
# parameters
source_path = r"/blue/hulcr/gmarais/Beetle_data/Beetle_classification_deep_data_segmented_clean"
target_path_train = r"/blue/hulcr/gmarais/Beetle_data/selected_images/train_data"
target_path_train_t = r"/blue/hulcr/gmarais/Beetle_data/selected_images/train_data/train"
target_path_train_v = r"/blue/hulcr/gmarais/Beetle_data/selected_images/train_data/valid"
target_path_test = r"/blue/hulcr/gmarais/Beetle_data/selected_images/test_data"
# source_path = r"C:\Users\gcmar\Dropbox (UFL)\Beetle_classification_deep_data_segmented_clean"
# target_path_train = r"C:\Users\gcmar\Dropbox (UFL)\selected_images\train"
# target_path_test = r"C:\Users\gcmar\Dropbox (UFL)\selected_images\test"

# split data
# train_df, test_df = test_train_split_images(meta_data_file_path=meta_data_file_path, random_state=random_state)
train_df, test_df = test_train_split_unique_values(meta_data=meta_data_file_path, random_state=random_state, unique_column='vial_subset', test_size=0.1, from_file=True)
train_t_df, valid_df = test_train_split_unique_values(meta_data=train_df, random_state=random_state, unique_column='vial_subset', test_size=0.2, from_file=False)

In [17]:
# only use one image per subset to train.
train_t_df

Unnamed: 0,centroid-0,centroid-1,bbox-0,bbox-1,bbox-2,bbox-3,orientation,axis_major_length,axis_minor_length,area,...,composite_image_path,species,vial,subset,composite_image_number,segmented_image_name,vial_subset,vial_subset_img,file_name,clean
489,504.653701,861.414095,256,626,786,1141,-0.737493,568.609043,194.103571,77615,...,Z:\lab records\Christopher_Marais\Beetle_class...,Coccotypes_dactyliperda,16296,3,73,Coccotypes_dactyliperda_16296_3_0073_0,16296_3,16296_3_73,Coccotypes_dactyliperda_16296_3_0073_0.JPG,True
490,555.766425,2240.167547,357,1931,777,2532,-1.195650,561.423128,212.460699,88053,...,Z:\lab records\Christopher_Marais\Beetle_class...,Coccotypes_dactyliperda,16296,3,73,Coccotypes_dactyliperda_16296_3_0073_1,16296_3,16296_3_73,Coccotypes_dactyliperda_16296_3_0073_1.JPG,True
492,715.948487,1035.356500,452,744,948,1317,-0.947464,598.699972,201.905594,85066,...,Z:\lab records\Christopher_Marais\Beetle_class...,Coccotypes_dactyliperda,16296,3,73,Coccotypes_dactyliperda_16296_3_0073_3,16296_3,16296_3_73,Coccotypes_dactyliperda_16296_3_0073_3.JPG,True
493,764.196617,525.687379,456,377,1082,693,0.032099,570.730014,196.174708,80756,...,Z:\lab records\Christopher_Marais\Beetle_class...,Coccotypes_dactyliperda,16296,3,73,Coccotypes_dactyliperda_16296_3_0073_4,16296_3,16296_3_73,Coccotypes_dactyliperda_16296_3_0073_4.JPG,True
494,933.089981,1559.443668,645,1348,1206,1815,0.514009,541.703306,222.084966,82395,...,Z:\lab records\Christopher_Marais\Beetle_class...,Coccotypes_dactyliperda,16296,3,73,Coccotypes_dactyliperda_16296_3_0073_5,16296_3,16296_3_73,Coccotypes_dactyliperda_16296_3_0073_5.JPG,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32451,1196.640040,2941.167818,667,2599,1748,3263,0.178246,1059.842775,362.819849,269327,...,Z:\lab records\Christopher_Marais\Beetle_class...,Platypus_cylindrus,22849,5,0064,Platypus_cylindrus_22849_5_0064_3,22849_5,22849_5_0064,Platypus_cylindrus_22849_5_0064_3.JPG,True
32452,2213.763823,1533.711719,1933,1017,2600,2037,1.241913,1016.208680,357.451767,258425,...,Z:\lab records\Christopher_Marais\Beetle_class...,Platypus_cylindrus,22849,5,0064,Platypus_cylindrus_22849_5_0064_4,22849_5,22849_5_0064,Platypus_cylindrus_22849_5_0064_4.JPG,True
32453,2615.899500,2569.278247,2231,2014,2896,3110,-1.332653,1080.932822,373.799077,285462,...,Z:\lab records\Christopher_Marais\Beetle_class...,Platypus_cylindrus,22849,5,0064,Platypus_cylindrus_22849_5_0064_5,22849_5,22849_5_0064,Platypus_cylindrus_22849_5_0064_5.JPG,True
32454,3848.540923,2570.888257,3315,2273,4412,3003,-0.173257,1095.541443,395.278219,292197,...,Z:\lab records\Christopher_Marais\Beetle_class...,Platypus_cylindrus,22849,5,0064,Platypus_cylindrus_22849_5_0064_6,22849_5,22849_5_0064,Platypus_cylindrus_22849_5_0064_6.JPG,True


In [18]:
# keep only select amount of images per subset
species_values_dict = {}
vial_subset_img_lst = []
temp_df = train_t_df.drop_duplicates(['vial','subset'], keep='first').copy() # drop all duplicate images and only use first
for i in temp_df['species'].unique():
    # add value counts to list
    values = temp_df[temp_df['species']==i]["vial_subset_img"].unique()
    # add value numbers to dict
    species_values_dict[i] = values
    vial_subset_img_lst = vial_subset_img_lst + species_values_dict[i].tolist()

    
# keep only rows with one image per subset
train_t_df_limited = train_t_df[train_t_df['vial_subset_img'].isin(vial_subset_img_lst)]


# # limit to very few images
# train_t_df_limited = train_t_df_limited.drop_duplicates(['vial_subset'])
# train_t_df_limited

In [19]:
####################
# total image counts
df_cln_counts = train_t_df_limited['species'].value_counts().sort_index()
fig = df_cln_counts.plot.bar(color=df_cln_counts.index, 
                              title="Image counts per species after limiting")
fig.update_layout(autosize=True, 
                  # width=2000, 
                  height=500)
fig.show()

In [20]:
# move data
# copy_files(files_list=train_t_df["file_name"].tolist(), src_dir=source_path, dst_dir=target_path_train_t, num_threads=4)
# copy_files(files_list=valid_df["file_name"].tolist(), src_dir=source_path, dst_dir=target_path_train_v, num_threads=4)
# copy_files(files_list=test_df["file_name"].tolist(), src_dir=source_path, dst_dir=target_path_test, num_threads=4)

# move images to their respective folders in train and valid
for i in train_t_df['species'].unique(): 
#     # copy limited train data
#     os.mkdir(target_path_train_t+"\\"+i)
#     copy_files(files_list=train_t_df_limited[train_t_df_limited['species']==i]["file_name"].tolist(), src_dir=source_path, dst_dir=target_path_train_t+"\\"+i, num_threads=4)
    #####################################
    os.mkdir(target_path_train_t+"/"+i)
    copy_files(files_list=train_t_df[train_t_df['species']==i]["file_name"].tolist(), src_dir=source_path, dst_dir=target_path_train_t+"/"+i, num_threads=4)
    os.mkdir(target_path_train_v+"/"+i)
    copy_files(files_list=valid_df[valid_df['species']==i]["file_name"].tolist(), src_dir=source_path, dst_dir=target_path_train_v+"/"+i, num_threads=4)
    os.mkdir(target_path_test+"/"+i)
    copy_files(files_list=test_df[test_df['species']==i]["file_name"].tolist(), src_dir=source_path, dst_dir=target_path_test+"/"+i, num_threads=4)

# Atkinson data

In [21]:
# # import data
# atkinson_df = pd.read_csv(r"F:\Atkinson_data\metadata.csv")
# atkinson_df

# # get species list
# spcs_lst = train_t_df['species'].unique().tolist()
# atk_spelling_lst = ["Coccotrypes_dactyliperda",
#                     "Pycnarthrum_hispidum",
#                     "Scolytodes_schwarzi"]

# spcs_lst = spcs_lst + atk_spelling_lst
# spcs_lst = list(set(spcs_lst))

# # select only files with these species
# atkinson_selected_df = atkinson_df[atkinson_df['genus_species'].isin(spcs_lst)]
# atkinson_selected_df

# # move images to their respective folders in train and valid
# target_path_atk = r"/blue/hulcr/gmarais/Beetle_data/selected_images/train_data/train"
# source_path_atk = r"/blue/hulcr/gmarais/Beetle_data/selected_images/train_data/images_copy"
# for i in spcs_lst: 
#     # copy limited train data
#     os.mkdir(target_path_atk+"\\"+i)
#     copy_files(files_list=atkinson_selected_df[atkinson_selected_df['genus_species']==i]["full_file_name"].tolist(), src_dir=source_path_atk, dst_dir=target_path_atk+"\\"+i, num_threads=4)