This script describes an iterative approach with self-training to successively extend the existing image dataset with labels as well as to train better YOLOv5 models. 

# Setup

In [None]:
import torch
import statistics
from psutil import virtual_memory
import shutil
import os
import gc
import pandas as pd
from sklearn.model_selection import train_test_split
from PIL import Image,ImageFile, ImageDraw,ImageFont
from tqdm import tqdm
import numpy as np
import tarfile

In [None]:
# check GPU
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
# Check RAM: report the total memory reported by psutil (in units of 1e9
# bytes) and classify the runtime against a 20 GB threshold.
ram_gb = virtual_memory().total / 1e9
print(f'Your runtime has {ram_gb:.1f} gigabytes of available RAM\n')

is_high_ram = not ram_gb < 20
print('You are using a high-RAM runtime!' if is_high_ram else 'Not using a high-RAM runtime')

# RAID
A RAID storage system was available for this project. The following cells therefore set up and use the folder structure on that RAID volume.

In [None]:
# Change to raid
%cd /raid/USER

## Load YOLOv5

In [None]:
!git clone https://github.com/ultralytics/yolov5  # clone
%cd yolov5
%pip install -qr requirements.txt  # install

In [None]:
# Model settings & Model meta infos

In [None]:
# Copy the manually written YOLOv5 model definition and dataset YAML into the
# cloned YOLOv5 installation on the RAID volume.
# `%cd` without an argument switches to the home directory, so the relative
# OWN_PATH placeholders below are resolved from there.
%cd 
# Model settings
path_to_model = 'OWN_PATH/yolov5_yaml/model_settings/model_settings' # Manually created model definition.
target_model = 'yolov5l_snake.yaml' # select your model 
shutil.copy('{}/{}'.format(path_to_model,target_model),'/raid/USER/yolov5/models') # Paste to YOLOv5 installation

# Model meta infos
path_to_meta = 'OWN_PATH/yolov5_yaml/snake_detection.yaml' # Manually created YAML

shutil.copy(path_to_meta,'/raid/USER/yolov5/data') # Paste to YOLOv5 installation

# Back to the RAID working directory used by the rest of the notebook.
%cd /raid/USER

## Prepare data structure

In [None]:
# Create the working directory tree used by the rest of the notebook.
# os.makedirs(..., exist_ok=True) creates missing parent folders and does
# nothing for folders that already exist. A partially created tree (for
# example ./images/train without its images/ and labels/ sub-folders) is
# therefore completed instead of being skipped.
for directory in (
    './images/all',
    './images/raw_labels',
    './images/train/images',
    './images/train/labels',
    './images/val/images',
    './images/val/labels',
    './runs',
    './saved_models',
):
    os.makedirs(directory, exist_ok=True)

# Transfer of meta files    
    
# Copy the class list and the SnakeCLEF2022 metadata CSV files to the RAID
# working tree. `%cd` without an argument switches to the home directory,
# from which the relative OWN_PATH placeholders are resolved.
%cd 
# The same classes.txt is placed next to the train and the val labels.
shutil.copy('OWN_PATH/annotation/labels/labels/classes.txt','/raid/USER/images/train/labels/classes.txt')
shutil.copy('OWN_PATH/annotation/labels/labels/classes.txt','/raid/USER/images/val/labels/classes.txt')

# Dataset metadata tables.
shutil.copy('OWN_PATH/images/all/SnakeCLEF2022-ISOxSpeciesMapping.csv','/raid/USER/images/all/SnakeCLEF2022-ISOxSpeciesMapping.csv')
shutil.copy('OWN_PATH/images/all/SnakeCLEF2022-TestMetadata.csv','/raid/USER/images/all/SnakeCLEF2022-TestMetadata.csv')
shutil.copy('OWN_PATH/images/all/SnakeCLEF2022-SampleSubmission.csv','/raid/USER/images/all/SnakeCLEF2022-SampleSubmission.csv')
shutil.copy('OWN_PATH/images/all/SnakeCLEF2022-TrainMetadata.csv','/raid/USER/images/all/SnakeCLEF2022-TrainMetadata.csv')
# Back to the RAID working directory.
%cd /raid/USER

In [None]:
# Switch to the home directory; the relative source path below is resolved
# from there.
%cd
# TRAIN IMAGE DATA
# Copy the compressed training image archive to the RAID working tree.
TRAIN_DATA_DIR = './images/all/SnakeCLEF2022-large_size.tar.gz'
shutil.copy(TRAIN_DATA_DIR,'/raid/USER/images/all/') 

In [None]:
# VAL IMAGE DATA
# Copy the whole test image folder to the RAID working tree.
# NOTE(review): the relative source path presumably relies on the working
# directory set by the preceding cell - confirm before running out of order.
TEST_DATA_DIR = './images/all/SnakeCLEF2022-test_images'
# shutil.copytree raises if the destination folder already exists.
shutil.copytree(TEST_DATA_DIR,'/raid/USER/images/all/SnakeCLEF2022-test_images')
# Back to the RAID working directory.
%cd /raid/USER

In [None]:
# Unpack the training image archive into ./images/all/.
# The context manager closes the archive even if extraction fails part-way
# (the previous open()/close() pair leaked the handle in that case).
# NOTE(review): extractall() trusts the member paths stored in the archive;
# only use it on archives from a trusted source.
with tarfile.open('./images/all/SnakeCLEF2022-large_size.tar.gz') as archive:
    archive.extractall('./images/all/')

In [None]:
# Delete tar.gz file
!rm images/all/SnakeCLEF2022-large_size.tar.gz

## Prepare sorting the data
Here we lay the foundation for the later sorting of the bounding boxes newly found by our YOLOv5 model. This is an important step in order to maintain the SnakeCLEF 2022 snake class distribution.

In [None]:
# Load the train and test metadata tables from the dataset folder.
path_to_meta = './images/all'
train_meta_csv = path_to_meta + '/' + 'SnakeCLEF2022-TrainMetadata.csv'
test_meta_csv = path_to_meta + '/' + 'SnakeCLEF2022-TestMetadata.csv'
df_meta_train = pd.read_csv(train_meta_csv)
df_meta_test = pd.read_csv(test_meta_csv)

In [None]:
# update file_path
df_meta_train['file_path'] = './images/all/SnakeCLEF2022-large_size/' +df_meta_train['file_path']


In [None]:
# Create a column with the bare image name (file name without directory and
# without everything after the first dot) for every row.
# One vectorised assignment replaces the row-wise
# `df_meta_train.name.iloc[index] = ...` loop. That loop used chained
# assignment, which pandas may apply to a temporary copy
# (SettingWithCopyWarning) and which has no effect under copy-on-write. It
# also used the index *label* as a position, which is only correct for a
# default RangeIndex.
train_base_names = df_meta_train['file_path'].str.rsplit('/', n=1).str[-1]
df_meta_train['name'] = train_base_names.str.split('.', n=1).str[0]

In [None]:
df_meta_train.head()

In [None]:
df_meta_test['file_path']  = './images/all/SnakeCLEF2022-test_images/SnakeCLEF2022-large_size/'+df_meta_test['file_path']

In [None]:
# Create a column with the bare image name (file name without directory and
# without everything after the first dot) for every test row.
# The vectorised form avoids the chained assignment
# `df_meta_test.name.iloc[index] = ...`, which pandas may apply to a temporary
# copy and which has no effect under copy-on-write.
test_base_names = df_meta_test['file_path'].str.rsplit('/', n=1).str[-1]
df_meta_test['name'] = test_base_names.str.split('.', n=1).str[0]

In [None]:
df_meta_test.head()

In [None]:
# Create a table with the necessary data, including observation_id, file_path
# and name, holding the train rows followed by the test rows.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. Row labels are kept unchanged, as append() did.
id_path_name = ['observation_id', 'file_path', 'name']
df_all = pd.concat([df_meta_train[id_path_name], df_meta_test[id_path_name]])

In [None]:
df_all.head()

In [None]:
df_all.head()

In [None]:
# Decision criterion for sorting is the observation_id.
train_observation_id = df_meta_train.observation_id.unique()
val_observation_id = df_meta_test.observation_id.unique()

## Create bounding boxes on all images

In [None]:
%cd yolov5
#from yolov5 
import utils
display = utils.notebook_init()

In [None]:
# Load the best weight for from script 02 (baseline model)
shutil.copy('OWN_PATH/yolov5_yaml/saved_runs/exp_OUR_NAME_FOR_THIS_MODEL/weights/best.pt', '/raid/USER/yolov5/best.pt')



In [None]:
# Load the custom YOLOv5 checkpoint through torch.hub.
path_to_base_model = './yolov5/best.pt'

model = torch.hub.load(
    'ultralytics/yolov5',
    'custom',
    path=path_to_base_model,
    force_reload=True,
)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)


In [None]:
# Table that collects all predictions kept by the following cells
all_predictions = pd.DataFrame(columns=['observation_id','image_path','name','label_txt','probability'])
# Selection criterion: minimum confidence a bounding box must reach to be kept
p = 0.7 

In [None]:
!pwd # => MUSS: /raid/USER

In [None]:
import imagesize

# Run the model on every image in df_all, write the accepted bounding boxes
# as YOLO label lines to ./images/raw_labels/<image name>.txt, and record one
# row per accepted box in all_predictions.

# Temporary saving of the found labels
path_to_raw_labelsTXT = './images/raw_labels'
error = False
counter_error = 0
# Accepted boxes are collected in a plain list and converted to a DataFrame
# once at the end: DataFrame.append was removed in pandas 2.0 and copied the
# whole frame on every call.
accepted_rows = []

for _, image_row in tqdm(df_all.iterrows()):
    observation_id = image_row['observation_id']
    file_path = image_row['file_path']
    name = image_row['name']

    error = False
    try:
        # Generate prediction for each image
        results = model(file_path)
    except Exception as e:
        # Best effort: report the failure, count it and go on with the next image.
        error = True
        print(e)
        counter_error += 1

    if error:
        continue

    # One row per detected box (xmin, ymin, xmax, ymax, confidence, ...).
    detections = results.pandas().xyxy[0]
    # Image dimensions, used to normalise the box coordinates.
    x, y = imagesize.get(file_path)
    label_stem = file_path.split('/')[-1].split('.')[0]

    # Only keep bounding boxes that are at or above the specified value p.
    # (The former extra check "mean confidence of the kept boxes >= p" was
    # always true, because every kept box already satisfies it.)
    confident = detections[detections['confidence'] >= p]

    # The label file is written for every image, even if it stays empty; the
    # later sorting step moves it based on the observation_id. The context
    # manager closes it even if writing fails.
    with open('{}/{}.txt'.format(path_to_raw_labelsTXT, label_stem), 'w') as label_file:
        for _, box in confident.iterrows():
            # True division keeps the exact box centre; the previous floor
            # division (`// 2`) snapped it to a whole pixel.
            X0 = (box['xmax'] + box['xmin']) / 2  # BBox X Center
            Y0 = (box['ymax'] + box['ymin']) / 2  # BBox Y Center
            W = box['xmax'] - box['xmin']  # Width
            H = box['ymax'] - box['ymin']  # Height

            # Class id 0 followed by the normalised centre and size.
            label_file.write('0 {} {} {} {}\n'.format(X0/x, Y0/y, W/x, H/y))
            accepted_rows.append({
                'observation_id': observation_id,
                'image_path': file_path,
                'name': name,
                'label_txt': label_stem,
                'probability': box['confidence'],
            })

if accepted_rows:
    new_predictions = pd.DataFrame(accepted_rows, columns=all_predictions.columns)
    if all_predictions.empty:
        all_predictions = new_predictions
    else:
        # Rows are renumbered 0..n-1; nothing in this notebook relies on the
        # duplicated row labels that repeated append() calls produced.
        all_predictions = pd.concat([all_predictions, new_predictions], ignore_index=True)

print('Errors: ', counter_error)

In [None]:
all_predictions.head()

In [None]:
len(all_predictions)

In [None]:
# Save df
all_predictions.to_csv('./images/all_predictions_run_{}.csv'.format(0))


## Sort images and bounding boxes
Later iterations use a slightly different method for selecting the bounding box. For the sake of completeness, the old method is still listed here.

In [None]:
# Move every accepted image and its label file into the train or val folder,
# depending on whether its observation_id belongs to train_observation_id or
# val_observation_id.
# The cell can be run although all_predictions contains duplicates (one row
# per bounding box): an image that has already been moved is skipped.
counter_train = 0
counter_val = 0

# Sets give constant-time membership tests instead of scanning the id arrays
# once per row.
train_ids = set(train_observation_id)
val_ids = set(val_observation_id)

for _, prediction in tqdm(all_predictions.iterrows()):
    observation_id = prediction['observation_id']
    image_path = prediction['image_path']
    label_txt_path = './images/raw_labels/' + str(prediction['label_txt']) + '.txt'
    probability = prediction['probability']

    # Written as "not >=" so that a missing (NaN) probability is skipped too.
    if not probability >= p:
        continue

    if observation_id in train_ids:
        split = 'train'
    elif observation_id in val_ids:
        split = 'val'
    else:
        print('Error in observation_id ',observation_id,image_path)
        continue

    if not os.path.exists(image_path):
        # Source image is gone, e.g. it was moved by an earlier row of the
        # same image. This is the case the former bare `except: pass` was
        # meant to absorb.
        continue

    try:
        shutil.move(image_path, './images/{}/images/'.format(split))
        shutil.move(label_txt_path, './images/{}/labels/'.format(split))
    except OSError as e:
        # shutil.Error is a subclass of OSError. Report real failures
        # (permissions, existing destination, missing label) instead of
        # hiding them, and keep going.
        print('Could not move', image_path, 'to', split, ':', e)
        continue

    if split == 'train':
        counter_train += 1
    else:
        counter_val += 1

print('Train counter ', counter_train)
print('Val counter ', counter_val)
print('Total: ',counter_train+counter_val)

## Train new YOLOv5 model
It is important to adjust the model metadata, i.e. where YOLOv5 finds the corresponding data.




In [None]:
# Import YOLOv5's `utils` package from inside the cloned repository, call its
# notebook_init() helper, then return to the parent directory.
%cd yolov5
import utils
display = utils.notebook_init()
%cd ..

In [None]:
# check GPU
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
!python yolov5/train.py --img 640 --batch 26 --cfg yolov5/models/yolov5l_snake.yaml --epochs 100 --data yolov5/data/snake_detection.yaml --weights yolov5l.pt #--cache


In [None]:
# Save the training run by copying its folder out of the YOLOv5 checkout.
run = 'exp' # name of the run folder below ./yolov5/runs/train/
model_name = 'yolov5l_640' # name of the destination folder below ./saved_runs/

# shutil.copytree raises if the destination folder already exists, so pick a
# new model_name for every saved run.
shutil.copytree('./yolov5/runs/train/{}'.format(run),'./saved_runs/{}'.format(model_name))

## Next iteration of predicting bounding boxes
We want to process all images that have not yet been sorted (their bounding box confidence was too low) with the new model. To do this, we first need to find out which images have already been sorted. We can do this with the file ./images/all_predictions_run_0.csv, which contains all images with a BBox confidence >= p.

## < Entry point for any following iteration >

In [None]:
# Check how much data is available. This number must be slightly larger in each iteration
def count_entries(folder):
    """Return the number of directory entries in *folder*."""
    return len(os.listdir(folder))

print('Images TRAIN: ', count_entries('./images/train/images'))
print('Labels TRAIN: ', count_entries('./images/train/labels'))
print('')
print('Images VAL: ', count_entries('./images/val/images'))
print('Labels VAL: ', count_entries('./images/val/labels'))
print('')
total_images = count_entries('./images/train/images') + count_entries('./images/val/images')
print('Total images ', total_images)

In [4]:
### Find all images that already have a matching bounding box.

In [None]:
train_file_names = os.listdir('./images/train/images')

# One row per already-sorted training image, holding the file name up to its
# first dot. The frame is built in a single step: DataFrame.append was removed
# in pandas 2.0 and copied the whole frame on every loop iteration.
train_file_names_df = pd.DataFrame(
    [file_name.split('/')[-1].split('.')[0] for file_name in train_file_names]
)

In [None]:
len(train_file_names_df)

In [None]:
test_file_names = os.listdir('./images/val/images')

# One row per already-sorted validation image, holding the file name up to its
# first dot. The frame is built in a single step: DataFrame.append was removed
# in pandas 2.0 and copied the whole frame on every loop iteration.
test_file_names_df = pd.DataFrame(
    [file_name.split('.')[0] for file_name in test_file_names]
)

In [None]:
len(test_file_names_df)

In [None]:
# Make a list with the names of all images that are already sorted (train
# names followed by val names); each list entry is one row of the table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_file_name_all = pd.concat([train_file_names_df, test_file_names_df])
name_already_processed = df_file_name_all.values.tolist()

In [None]:
len(name_already_processed)

In [None]:
# Flatten the list of single-value rows into the plain image names.
# A set is used because the following cells only test membership
# (`row['name'] not in name_already_processed`), once per metadata row; a set
# answers that in constant time instead of scanning a list each time.
name_already_processed = {
    processed_name
    for row_values in name_already_processed
    for processed_name in row_values
}

In [None]:
# Reload the original train and test metadata tables for this iteration.
path_to_meta = './images/all'
train_meta_csv = path_to_meta + '/' + 'SnakeCLEF2022-TrainMetadata.csv'
test_meta_csv = path_to_meta + '/' + 'SnakeCLEF2022-TestMetadata.csv'
df_meta_train = pd.read_csv(train_meta_csv)
df_meta_test = pd.read_csv(test_meta_csv)

In [None]:
len(df_meta_train)

In [None]:
# Add the bare image name (file name without directory and without everything
# after the first dot) for every train row.
# The vectorised form avoids the chained assignment
# `df_meta_train.name.iloc[index] = ...`, which pandas may apply to a
# temporary copy and which has no effect under copy-on-write.
train_base_names = df_meta_train['file_path'].str.rsplit('/', n=1).str[-1]
df_meta_train['name'] = train_base_names.str.split('.', n=1).str[0]

In [None]:
# Select all train rows whose image name is not yet in name_already_processed.
# A boolean mask replaces the row-by-row DataFrame.append (removed in pandas
# 2.0, quadratic in the number of rows). It also keeps the column layout and
# dtypes, so the result still has a 'file_path' column when no row is left.
train_already_sorted = df_meta_train['name'].isin(name_already_processed)
df_meta_train_2 = df_meta_train[~train_already_sorted].copy()

counter_in = int(train_already_sorted.sum())
counter_not_in = len(df_meta_train) - counter_in

In [None]:
print('NOT in: ',counter_not_in)
print('IN: ',counter_in)
print('total: ',counter_not_in+counter_in)

In [None]:
len(df_meta_train_2)

In [None]:
# update file_path
df_meta_train_2['file_path'] = './images/all/SnakeCLEF2022-large_size/' +df_meta_train_2['file_path']

In [None]:
df_meta_train_2.head()

In [None]:
# Save 
df_meta_train_2.to_csv('./images/all/df_meta_train_2.csv')

In [None]:
df_meta_train = pd.read_csv('./images/all/df_meta_train_2.csv')

In [None]:
# Add the bare image name (file name without directory and without everything
# after the first dot) for every test row.
# The vectorised form avoids the chained assignment
# `df_meta_test.name.iloc[index] = ...`, which pandas may apply to a temporary
# copy and which has no effect under copy-on-write.
test_base_names = df_meta_test['file_path'].str.rsplit('/', n=1).str[-1]
df_meta_test['name'] = test_base_names.str.split('.', n=1).str[0]

In [None]:
# Select all val/test rows whose image name is not yet in
# name_already_processed.
# A boolean mask replaces the row-by-row DataFrame.append (removed in pandas
# 2.0, quadratic in the number of rows). It also keeps the column layout and
# dtypes, so the result still has a 'file_path' column when no row is left.
test_already_sorted = df_meta_test['name'].isin(name_already_processed)
df_meta_test_2 = df_meta_test[~test_already_sorted].copy()

counter_in = int(test_already_sorted.sum())
counter_not_in = len(df_meta_test) - counter_in

In [None]:
print('NOT in: ',counter_not_in)
print('IN: ',counter_in)
print('total: ',counter_not_in+counter_in)

In [None]:
# update file_path
df_meta_test_2['file_path']  = './images/all/SnakeCLEF2022-test_images/SnakeCLEF2022-large_size/'+df_meta_test_2['file_path']

In [None]:
df_meta_test_2.to_csv('./images/all/df_meta_test.csv')

In [None]:
df_meta_test = pd.read_csv('./images/all/df_meta_test.csv')

In [None]:
train_observation_id = df_meta_train.observation_id.unique()
val_observation_id = df_meta_test.observation_id.unique()

In [None]:
# Table of all images still to be processed: train rows followed by test rows,
# reduced to observation_id, file_path and name.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. Row labels are kept unchanged, as append() did.
id_path_name = ['observation_id', 'file_path', 'name']
df_all = pd.concat([df_meta_train[id_path_name], df_meta_test[id_path_name]])

In [None]:
df_all.observation_id = df_all.observation_id.astype(int)

### Load a custom YOLOv5 with torch

In [None]:
# Load the checkpoint trained in the previous iteration through torch.hub and
# move it to the GPU when CUDA is available (CPU otherwise).
path_to_base_model = './saved_runs/yolov5l_OUR_NAME/weights/best.pt' # SET NEW MODEL from previous iteration
model = torch.hub.load('ultralytics/yolov5', 'custom', path=path_to_base_model,force_reload=True)
device = ("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# check GPU: show the nvidia-smi output, or a notice when that output
# contains the word 'failed'
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
all_predictions = pd.DataFrame(columns=['observation_id','image_path','name','label_txt','probability'])
# sort criteria
p = 0.7 # 

In [None]:
import imagesize

# Run the current model on every image in df_all. For each image only the
# single most confident bounding box is considered; it is written as a YOLO
# label line and recorded in all_predictions if its confidence reaches p.

path_to_raw_labelsTXT = './images/raw_labels'
error = False
counter_error = 0
# Accepted images are collected in a plain list and converted to a DataFrame
# once at the end (DataFrame.append was removed in pandas 2.0 and copied the
# whole frame on every call).
accepted_rows = []

for _, image_row in tqdm(df_all.iterrows()):
    observation_id = image_row['observation_id']
    file_path = image_row['file_path']
    name = image_row['name']

    error = False
    try:
        results = model(file_path)
    except Exception as e:
        # Best effort: report the failure, count it and go on with the next image.
        error = True
        print(e)
        counter_error += 1

    if error:
        continue

    # One row per detected box (xmin, ymin, xmax, ymax, confidence, ...).
    detections = results.pandas().xyxy[0]
    # Image dimensions, used to normalise the box coordinates.
    x, y = imagesize.get(file_path)
    label_stem = file_path.split('/')[-1].split('.')[0]

    # Changed selection method: unlike the first run, only the bounding box
    # with the highest confidence is stored. This works because it seems that
    # each image in the given dataset represents exactly 1 snake.
    best_box = None
    if len(detections) > 0:
        best_box = detections.loc[detections['confidence'].idxmax()]
    accepted = best_box is not None and best_box['confidence'] >= p

    # The label file is (re)written for every image, even if it stays empty.
    with open('{}/{}.txt'.format(path_to_raw_labelsTXT, label_stem), 'w') as label_file:
        if accepted:
            # True division keeps the exact box centre; the previous floor
            # division (`// 2`) snapped it to a whole pixel.
            X0 = (best_box['xmax'] + best_box['xmin']) / 2  # BBox X Center
            Y0 = (best_box['ymax'] + best_box['ymin']) / 2  # BBox Y Center
            W = best_box['xmax'] - best_box['xmin']  # Width
            H = best_box['ymax'] - best_box['ymin']  # Height
            label_file.write('0 {} {} {} {}\n'.format(X0/x, Y0/y, W/x, H/y))

    # Record the image only if a label was actually written, together with the
    # confidence of that label. Previously every image with any detection was
    # recorded with the confidence of the *last* detection row, so images
    # whose best box was below p were later moved into the dataset with an
    # empty label file.
    if accepted:
        accepted_rows.append({
            'observation_id': observation_id,
            'image_path': file_path,
            'name': name,
            'label_txt': label_stem,
            'probability': best_box['confidence'],
        })

if accepted_rows:
    new_predictions = pd.DataFrame(accepted_rows, columns=all_predictions.columns)
    if all_predictions.empty:
        all_predictions = new_predictions
    else:
        all_predictions = pd.concat([all_predictions, new_predictions], ignore_index=True)

print('Errors: ', counter_error)

In [None]:
print('Errors: ', counter_error)

In [None]:
all_predictions.head()

In [None]:
all_predictions = all_predictions.dropna()

In [None]:
# save
all_predictions.to_csv('./images/all_predictions_run_{}.csv'.format(4)) # change number is you have more runs

In [None]:
all_predictions = pd.read_csv('./images/all_predictions_run_4.csv')

In [None]:
all_predictions = all_predictions[['image_path','label_txt','name','observation_id','probability']]

### Sort again

In [None]:
# Move every recorded image and its label file into the train or val folder,
# depending on whether its observation_id belongs to train_observation_id or
# val_observation_id.
counter_train = 0
counter_val = 0

# Sets give constant-time membership tests instead of scanning the id arrays
# once per row.
train_ids = set(train_observation_id)
val_ids = set(val_observation_id)

for _, prediction in tqdm(all_predictions.iterrows()):
    observation_id = prediction['observation_id']
    image_path = prediction['image_path']
    label_txt_path = './images/raw_labels/' + str(prediction['label_txt']) + '.txt'

    if observation_id in train_ids:
        split = 'train'
    elif observation_id in val_ids:
        split = 'val'
    else:
        print('Error in observation_id ',observation_id,image_path)
        continue

    try:
        shutil.move(image_path, './images/{}/images/'.format(split))
        shutil.move(label_txt_path, './images/{}/labels/'.format(split))
    except Exception as e:
        # Best effort: report the failed move and continue with the next row.
        print(e)
        continue

    # Only count rows for which both moves succeeded.
    if split == 'train':
        counter_train += 1
    else:
        counter_val += 1

In [None]:
print('New train images ', counter_train)
print('New val images ', counter_val)
print('Total: ',counter_train+counter_val)

### Train a new YOLOv5 model

In [None]:
# Import YOLOv5's `utils` package from inside the cloned repository, call its
# notebook_init() helper, then return to the parent directory.
%cd yolov5
import utils
display = utils.notebook_init()
%cd ..

In [None]:
!python yolov5/train.py --img 640 --batch 16 --cfg yolov5/models/yolov5l_snake.yaml --epochs 100 --data yolov5/data/snake_detection.yaml --weights yolov5l.pt #--cache


In [None]:
# Save the training run of this iteration by copying its folder out of the
# YOLOv5 checkout.
run = 'exp'  # name of the run folder below ./yolov5/runs/train/
model_name = 'yolov5l_Iteration_X'  # name of the destination folder below ./saved_runs/
# shutil.copytree raises if the destination folder already exists, so pick a
# new model_name for every iteration.
shutil.copytree('./yolov5/runs/train/{}'.format(run),'./saved_runs/{}'.format(model_name))

At this point you can jump to the entry point and perform a new iteration. Alternatively, 'train a new YOLOv5 model' can be used to train any other model with the existing data.