This script describes the structure of the data processing for the SnakeCLEF 2021 data.

In [None]:
import shutil
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Load Metadata
Load the image metadata (n=1,800) from SnakeCLEF 2021.
At this point, the data for SnakeCLEF 2021 is needed.

In [None]:
# Complete SnakeCLEF 2021 Metadata
# Read the full training metadata CSV into `df`; replace OWN_PATH with the
# local location of the SnakeCLEF 2021 data.
path_to_meta = 'OWN_PATH/SnakeCLEF2021_train_metadata_PROD.csv'
df = pd.read_csv(path_to_meta)

In [None]:
# Preview the first rows of the full metadata table
df.head()

In [None]:
# Load a previously saved short metadata table.
# NOTE: `df_short` is reassigned in a later cell, where it is rebuilt from
# `df` and the annotation file names.
df_short = pd.read_csv('OWN_PATH/meta_df_short.csv')

In [None]:
# Preview the first rows of the short metadata table
df_short.head()

Since the labels are provided only as .txt files, the corresponding image files must be found. This is done via the UUID, which is the file name of each label file. The label files are stored under annotation > labels > labels.

In [None]:
# Collect the file names of all label files in the annotation folder
label_dir_entries = os.listdir('OWN_PATH/annotation/labels/labels')
annotations = pd.DataFrame(label_dir_entries,columns=['name'])

# classes.txt lives in the same folder but is not a per-image label file
is_label_file = annotations['name']!='classes.txt'
annotations = annotations[is_label_file]
print('Len annotations ',len(annotations))

# The UUID is the part of the file name in front of the first dot
name_parts = annotations['name'].str.split('.',expand=True)
annotations['UUID']=name_parts[0]
print(annotations)

In [None]:
# Keep only those rows of the full metadata table whose UUID has a label file
has_label = df['UUID'].isin(annotations['UUID'])
df_short = df.loc[has_label]

In [None]:
# Number of metadata rows in df_short
len(df_short)

In [None]:
# Preview the first rows of df_short
df_short.head()

# Preparation for the 1st training run

In [None]:
# Shuffle df_short and split it 80/20 into training and validation rows;
# random_state is fixed so the split is reproducible.
train, val = train_test_split(df_short, shuffle=True, test_size=0.20, random_state=42)

In [None]:
# Report the number of rows in each split
print('Len TRAIN: ',len(train))
print('Len VALIDATION: ',len(val))

# Preprocessing
## Images
The images belonging to the respective split (training or validation) must now be moved from 'images/all' to 'images/train/images' or 'images/val/images'.
## Labels
The labels are identified by the UUID of the image and copied accordingly to 'images/train/labels' or 'images/val/labels'.
## Classes.txt
Create a file 'classes.txt' and write 'snake' at the first position.

In [None]:
# Distribute label files and images into the train/val folder layout:
#   OWN_PATH/images/<split>/labels/<UUID>.txt   (copied from path_to_labels)
#   OWN_PATH/images/<split>/images/<UUID>.jpg   (moved from path_to_images)
# and place a copy of classes.txt next to the labels of each split.
path_to_images = 'OWN_PATH/images/all'
path_to_labels = 'OWN_PATH/annotation/labels/labels'
c = 0  # running count of training rows processed so far (shown in error output)

# Create the target folders up front; shutil.copy / shutil.move do not create
# missing parent directories and would fail on the very first row otherwise.
for split_name in ('train', 'val'):
    for sub_dir in ('images', 'labels'):
        os.makedirs('OWN_PATH/images/{}/{}'.format(split_name, sub_dir), exist_ok=True)

# TRAINING

# total= gives tqdm a real progress bar (iterrows() itself has no length)
for index, row in tqdm(train.iterrows(), total=len(train)):

    # label
    src_label = '{}/{}.txt'.format(path_to_labels,row['UUID'])
    dst_label = 'OWN_PATH/images/train/labels/{}.txt'.format(row['UUID'])

    try:
        shutil.copy(src_label,dst_label)
    except OSError as e:
        print('ERROR: Label could not be moved! STOP')
        print('UUID: ',row['UUID'])
        print('Reason: ',e)
        # Stop immediately, otherwise only errors occur when we try to start YOLOv5!
        break

    # image
    src_image = '{}/{}'.format(path_to_images,row['image_path'])
    dst_image = 'OWN_PATH/images/train/images/{}.jpg'.format(row['UUID'])

    try:
        shutil.move(src_image,dst_image)
    except OSError as e:
        # Best effort: report the failing image and continue with the next row.
        print('TRAIN ERROR: Image could not be moved!!')
        # c is an int, so it is formatted into the string; concatenating
        # ' #' + c raised a TypeError inside this handler.
        print('UUID: ',row['UUID'],' #{}'.format(c))
        print('Reason: ',e)

    c+=1

# copy classes.txt
shutil.copy('{}/classes.txt'.format(path_to_labels),'OWN_PATH/images/train/labels/classes.txt')

# VALIDATION

for index, row in tqdm(val.iterrows(), total=len(val)):

    # label
    src_label = '{}/{}.txt'.format(path_to_labels,row['UUID'])
    dst_label = 'OWN_PATH/images/val/labels/{}.txt'.format(row['UUID'])

    try:
        shutil.copy(src_label,dst_label)
    except OSError as e:
        print('VAL ERROR: Label could not be moved! STOP')
        print('UUID: ',row['UUID'])
        print('Reason: ',e)
        break # Stop -> to avoid later errors when starting the YOLOv5 training

    # image
    src_image = '{}/{}'.format(path_to_images,row['image_path'])
    dst_image = 'OWN_PATH/images/val/images/{}.jpg'.format(row['UUID'])

    try:
        shutil.move(src_image,dst_image)
    except OSError as e:
        # Best effort: report the failing image and continue with the next row.
        print('ERROR: Image could not be moved!')
        print('UUID: ',row['UUID'])
        print('Reason: ',e)


# copy classes.txt
shutil.copy('{}/classes.txt'.format(path_to_labels),'OWN_PATH/images/val/labels/classes.txt')

In [None]:
# Relocate .jpg files lying directly in the train folder into train/images
for file_name in tqdm(os.listdir('OWN_PATH/images/train')):
    if not file_name.endswith('.jpg'):
        continue
    source = 'OWN_PATH/images/train/{}'.format(file_name)
    target = 'OWN_PATH/images/train/images/{}'.format(file_name)
    shutil.move(source,target)

In [None]:
# Relocate .jpg files lying directly in the val folder into val/images
for file_name in tqdm(os.listdir('OWN_PATH/images/val')):
    if not file_name.endswith('.jpg'):
        continue
    source = 'OWN_PATH/images/val/{}'.format(file_name)
    target = 'OWN_PATH/images/val/images/{}'.format(file_name)
    shutil.move(source,target)

In [None]:
# List the entries of the train folder
os.listdir('OWN_PATH/images/train')

In [None]:
# List the entries of the val folder
os.listdir('OWN_PATH/images/val')