# 1. Setup & Data Import

## 1.1. Data Import into the execution environment

In [1]:
%cd '/content'

%mkdir RoadDamageDataset 
%cd RoadDamageDataset

# train set
!wget -c https://mycityreport.s3-ap-northeast-1.amazonaws.com/02_RoadDamageDataset/public_data/IEEE_bigdata_RDD2020/train.tar.gz
!tar xf train.tar.gz

/content
/content/RoadDamageDataset
--2022-09-05 08:04:05--  https://mycityreport.s3-ap-northeast-1.amazonaws.com/02_RoadDamageDataset/public_data/IEEE_bigdata_RDD2020/train.tar.gz
Resolving mycityreport.s3-ap-northeast-1.amazonaws.com (mycityreport.s3-ap-northeast-1.amazonaws.com)... 3.5.154.11
Connecting to mycityreport.s3-ap-northeast-1.amazonaws.com (mycityreport.s3-ap-northeast-1.amazonaws.com)|3.5.154.11|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1472626254 (1.4G) [application/x-tar]
Saving to: ‘train.tar.gz’


2022-09-05 08:06:01 (12.3 MB/s) - ‘train.tar.gz’ saved [1472626254/1472626254]



## 1.2. Google Drive Connection and yolov5 Clone

In [2]:
from google.colab import drive
drive.mount('/content/drive')
%cd '/content/drive/MyDrive'

# Creation of the folder
%mkdir RDD_Final 
%cd RDD_Final

# Git Clone
!git clone https://github.com/ultralytics/yolov5  # clone

Mounted at /content/drive
/content/drive/MyDrive
mkdir: cannot create directory ‘RDD_Final’: File exists
/content/drive/MyDrive/RDD_Final
fatal: destination path 'yolov5' already exists and is not an empty directory.


In [3]:
# Installing and importing libraries
# (run from inside the cloned yolov5 folder so that requirements.txt is found)
%cd '/content/drive/MyDrive/RDD_Final/yolov5'
%pip install -qr requirements.txt  # install

import torch
import utils  # presumably the yolov5 `utils` package, since the working directory is the yolov5 clone - TODO confirm
from IPython.display import Image  # for displaying images
import os 
import random
import shutil
from sklearn.model_selection import train_test_split
import xml.etree.ElementTree as ET
from xml.dom import minidom
from tqdm import tqdm
# NOTE(review): this import rebinds `Image` to PIL's module, shadowing the IPython.display import above
from PIL import Image, ImageDraw
import numpy as np
import matplotlib.pyplot as plt
from cmath import inf

# Seeds Python's built-in `random` module only
random.seed(108)

/content/drive/MyDrive/RDD_Final/yolov5
[K     |████████████████████████████████| 1.6 MB 15.2 MB/s 
[?25h

# 2. Data management

## 2.1. Moving files from country sub-folders to global folders

In [4]:
# Creating 2 folders to put together all the data (annotations and images) from the 3 countries
%cd /content/RoadDamageDataset/train/
%mkdir annotations 
%mkdir images 

/content/RoadDamageDataset/train


In [5]:
# Moving all images and annotations from Country folders to the main folders
Country = ["Czech", "India", "Japan"]

train_root = "/content/RoadDamageDataset/train/"

# (sub-folder inside each country folder, shared destination folder).
# For every country the annotations are moved first, then the images.
transfers = (
    ("/annotations/xmls/", train_root + "annotations/"),
    ("/images/", train_root + "images/"),
)

for c in Country:
    for sub_folder, destination in transfers:
        source = train_root + c + sub_folder
        # Move every entry of the country sub-folder into the shared folder
        for file in os.listdir(source):
            shutil.move(os.path.join(source, file), destination)


## 2.2. Annotation to YoloV5 format

In [6]:
# Function to get the data from XML Annotation

def _ordered_image_size(size_elem):
    """Return the integer children of a ``<size>`` element as a tuple.

    When both ``width`` and ``height`` tags are present the tuple is always
    ``(width, height[, depth], *other children in document order)``, whatever
    the order of the children in the file. Otherwise the values are returned
    in document order.
    """
    dims = [(child.tag, int(child.text)) for child in size_elem]
    by_tag = dict(dims)

    if "width" in by_tag and "height" in by_tag:
        ordered = [by_tag["width"], by_tag["height"]]
        if "depth" in by_tag:
            ordered.append(by_tag["depth"])
        ordered.extend(value for tag, value in dims
                       if tag not in ("width", "height", "depth"))
        return tuple(ordered)

    # Unfamiliar tag names: keep the values exactly as they appear in the file
    return tuple(value for _, value in dims)


def extract_info_from_xml(xml_file):
    """Parse one XML annotation file into a plain dictionary.

    Args:
        xml_file: path or file object accepted by ``ET.parse``.

    Returns:
        dict with the keys
          * ``'bboxes'``: list with one dict per ``<object>`` element. Each dict
            holds ``'class'`` (text of ``<name>``) and one integer entry per
            child of ``<bndbox>``, keyed by the child's tag.
          * ``'filename'``: text of the ``<filename>`` element (only if present).
          * ``'image_size'``: tuple of ints from ``<size>`` (only if present),
            ordered ``(width, height, depth)`` independently of the order used
            in the file.

    Raises:
        ValueError / TypeError: if a size or box value is not an integer string.
    """
    root = ET.parse(xml_file).getroot()

    # Initialise the info dict
    info_dict = {'bboxes': []}

    # Only the direct children of the root element are inspected
    for elem in root:
        if elem.tag == "filename":
            info_dict['filename'] = elem.text

        elif elem.tag == "size":
            # Read the dimensions by tag name so that files listing <depth>
            # first do not shift width/height to other positions.
            info_dict['image_size'] = _ordered_image_size(elem)

        elif elem.tag == "object":
            bbox = {}
            for subelem in elem:
                if subelem.tag == "name":
                    bbox["class"] = subelem.text
                elif subelem.tag == "bndbox":
                    for coord in subelem:
                        bbox[coord.tag] = int(coord.text)
            info_dict['bboxes'].append(bbox)

    return info_dict
  

In [7]:
# Dictionary that maps class names to IDs

# Each DXX class corresponds to one type of road defect:
# D00/D01 : longitudinal cracks
# D10/D11 : lateral cracks
# D20 : alligator cracks
# D40 : pot hole
# D43/D44 : white/yellow lines
# D50 : manholes
# D0w0 : others
class_name_to_id_mapping = {
    "D00": 0,
    "D01": 0,
    "D10": 1,
    "D11": 1,
    "D20": 2,
    "D40": 3,
    "D43": 4,
    "D44": 4,
    "D50": 5,
    "D0w0": 6,
}

# Values accepted as a channel count when guessing the layout of `image_size`
_CHANNEL_COUNTS = (1, 3, 4)


def _image_width_height(image_size):
    """Pick ``(width, height)`` out of an ``image_size`` tuple.

    ``image_size`` only carries positions, not tag names, so the layout is
    guessed: a tuple of three or more values whose first entry looks like a
    channel count (1, 3 or 4) while its last entry does not is read as
    ``(depth, width, height)``; anything else is read as
    ``(width, height[, depth])``.
    """
    depth_first = (
        len(image_size) >= 3
        and image_size[0] in _CHANNEL_COUNTS
        and image_size[-1] not in _CHANNEL_COUNTS
    )
    if depth_first:
        return image_size[1], image_size[2]
    return image_size[0], image_size[1]


# Convert the info dict to the required yolo format and write it to disk
def convert_to_yolov5(info_dict, path):
    """Write the boxes of one image as a YOLOv5 label file.

    Args:
        info_dict: dict with ``'bboxes'`` (list of dicts holding ``'class'``,
            ``'xmin'``, ``'ymin'``, ``'xmax'``, ``'ymax'``), ``'image_size'``
            and ``'filename'``.
        path: folder in which the label file is created.

    The label file is named after ``info_dict['filename']`` with its extension
    replaced by ``.txt``. It holds one line per box:
    ``class_id x_center y_center width height``, where x values are divided by
    the image width and y values by the image height (3 decimals).

    Boxes whose class is not in ``class_name_to_id_mapping`` are reported and
    skipped instead of being written with a missing or stale class id.
    """
    print_buffer = []

    # For each bounding box
    for b in info_dict["bboxes"]:
        class_name = b.get("class")
        if class_name not in class_name_to_id_mapping:
            print("Invalid Class. Must be one from ", class_name_to_id_mapping.keys())
            continue
        class_id = class_name_to_id_mapping[class_name]

        # Transform the bbox co-ordinates as per the format required by YOLO v5
        b_center_x = (b["xmin"] + b["xmax"]) / 2
        b_center_y = (b["ymin"] + b["ymax"]) / 2
        b_width = b["xmax"] - b["xmin"]
        b_height = b["ymax"] - b["ymin"]

        # Normalise x by the image width and y by the image height
        image_w, image_h = _image_width_height(info_dict["image_size"])
        b_center_x /= image_w
        b_center_y /= image_h
        b_width /= image_w
        b_height /= image_h

        # Queue the bbox details for the label file
        print_buffer.append(
            "{} {:.3f} {:.3f} {:.3f} {:.3f}".format(
                class_id, b_center_x, b_center_y, b_width, b_height
            )
        )

    # Swap only the extension, so a "jpg" elsewhere in the name is left alone
    stem, _ = os.path.splitext(info_dict["filename"])
    save_file_name = os.path.join(path, stem + ".txt")

    # Save the annotation to disk; the context manager closes the file handle
    with open(save_file_name, "w") as label_file:
        print("\n".join(print_buffer), file=label_file)

In [8]:
%cd '/content/RoadDamageDataset/train'

base_path = '/content/RoadDamageDataset/train/'

cls_names = []
total_images = 0
    
file_list = [filename for filename in os.listdir(base_path + '/annotations/') if not filename.startswith('.')]

for file in file_list:

      total_images = total_images + 1
      if file =='.DS_Store':
          pass
      else:
          infile_xml = open(base_path + '/annotations/' +file)
          tree = ET.parse(infile_xml)
          root = tree.getroot()
          for obj in root.iter('object'):
              cls_name = obj.find('name').text
              cls_names.append(cls_name)

print("total")
print("# of images：" + str(total_images))
print("# of labels：" + str(len(cls_names)))

/content/RoadDamageDataset/train
total
# of images：21041
# of labels：34702


In [9]:
base_path = '/content/RoadDamageDataset/train/'
annotations_dir = base_path + '/annotations/'

# Gather the XML annotation files, in sorted order
annotations = sorted(
    os.path.join(annotations_dir, name)
    for name in os.listdir(annotations_dir)
    if name.endswith("xml")
)

# Convert each annotation and save the resulting label file next to it
for xml_path in tqdm(annotations):
    convert_to_yolov5(extract_info_from_xml(xml_path), annotations_dir)

# From here on `annotations` lists the generated .txt label files
annotations = [
    os.path.join(annotations_dir, name)
    for name in os.listdir(annotations_dir)
    if name.endswith("txt")
]

100%|██████████| 21041/21041 [00:03<00:00, 6737.54it/s]


## 2.3. Dividing data into Train, Test, Validation sets

In [10]:
# Read images and annotations
base_path = '/content/RoadDamageDataset/train/'

images_dir = os.path.join(base_path + '/images')
labels_dir = os.path.join(base_path + '/annotations')

# Paths are relative to the train folder. Each variable is a one-element list
# wrapping the actual listing; the next cell unwraps it with [0].
images = [[os.path.join('images', name) for name in os.listdir(images_dir)]]
annotations = [
    [os.path.join('annotations', name) for name in os.listdir(labels_dir) if name.endswith("txt")]
]

In [11]:
# The previous cell wraps each listing in an outer list. Unwrap only while it
# is still wrapped, so that re-running this cell does not index into a path string.
if images and isinstance(images[0], list):
    images = images[0]
if annotations and isinstance(annotations[0], list):
    annotations = annotations[0]
images.sort()
annotations.sort()

# Images and labels are paired purely by position after sorting, so both
# lists must describe exactly the same samples in the same order. Otherwise an
# image and its label could end up in different splits.
image_stems = [os.path.splitext(os.path.basename(p))[0] for p in images]
annotation_stems = [os.path.splitext(os.path.basename(p))[0] for p in annotations]
assert image_stems == annotation_stems, "images and annotations do not line up one-to-one"

print(len(images))
# Split the dataset into train-valid-test splits:
# 80 % train, then the held-out 20 % is halved into validation and test
train_images, val_images, train_annotations, val_annotations = train_test_split(images, annotations, test_size = 0.2, random_state = 2)
val_images, test_images, val_annotations, test_annotations = train_test_split(val_images, val_annotations, test_size = 0.5, random_state = 2)

21041


In [12]:
%cd /content/RoadDamageDataset/

!mkdir Train Val Test 

/content/RoadDamageDataset


In [13]:
# Utility function to move files into a designated folder
def move_files_to_folder(list_of_files, destination_folder):
    """Move every path in ``list_of_files`` into ``destination_folder``.

    Args:
        list_of_files: iterable of file paths to move.
        destination_folder: folder (or target path) handed to ``shutil.move``.

    Raises:
        AssertionError: if a file cannot be moved. The offending path is
            printed first and the underlying error is chained as
            ``__cause__`` so the real reason stays visible in the traceback.
    """
    for f in list_of_files:
        try:
            shutil.move(f, destination_folder)
        except Exception as exc:
            # `Exception` (not a bare except) lets KeyboardInterrupt through.
            # An explicit raise, unlike `assert False`, is not stripped under -O.
            print(f)
            raise AssertionError(
                "could not move {!r} to {!r}".format(f, destination_folder)
            ) from exc

In [14]:
%cd /content/RoadDamageDataset/train/

# Move the images splits into their folders
move_files_to_folder(train_images, '/content/RoadDamageDataset/Train')
move_files_to_folder(val_images, '/content/RoadDamageDataset/Val')
move_files_to_folder(test_images, '/content/RoadDamageDataset/Test')

/content/RoadDamageDataset/train


In [15]:
# Move the annotations splits into their folders (train, then validation, then test)
for split_annotations, split_folder in (
    (train_annotations, '/content/RoadDamageDataset/Train'),
    (val_annotations, '/content/RoadDamageDataset/Val'),
    (test_annotations, '/content/RoadDamageDataset/Test'),
):
    move_files_to_folder(split_annotations, split_folder)

# 3. YoloV5

**Important:** at this point, copy the file `road_damage.yaml` into the folder `/content/drive/MyDrive/RDD_Final/yolov5/data` before running the cells below.

In [16]:
# Optional: connection to WandB (Weights & Biases)
%pip install wandb
# `votre_code_d_acces` (French for "your access code") is a placeholder: replace it
# with your own WandB key, and avoid sharing or committing the notebook with a real key in it.
!wandb login --relogin votre_code_d_acces

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wandb
  Downloading wandb-0.13.2-py2.py3-none-any.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 10.6 MB/s 
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.9.7-py2.py3-none-any.whl (157 kB)
[K     |████████████████████████████████| 157 kB 47.5 MB/s 
[?25hCollecting setproctitle
  Downloading setproctitle-1.3.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
Collecting shortuuid>=0.5.0
  Downloading shortuuid-1.0.9-py3-none-any.whl (9.4 kB)
Collecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting GitPython>=1.0.0
  Downloading GitPython-3.1.27-py3-none-any.whl (181 kB)
[K     |████████████████████████████████| 181 kB 60.0 MB/s 
[?25hCollecting gitdb<5,>=4.0.1
  Downloading gitdb-

In [17]:
%cd /content/drive/MyDrive/RDD_Final/yolov5/
# Parameters to update:
  # --epochs  : number of training epochs
  # --hyp     : hyperparameters to use - 3 files are available (hyp.scratch-high.yaml, hyp.scratch-med.yaml, hyp.scratch-low.yaml), but you can also create your own .yaml with your own hyperparameters
  # --weights : weights to start from - 5 are available (yolov5s.pt, yolov5m.pt, yolov5l.pt, yolov5x.pt, yolov5n.pt)
  # --project : name of the folder where the results of your different models are kept
  # --name    : name of the specific model you are running
!python train.py --hyp hyp.scratch-low.yaml --epochs 100 --data road_damage.yaml --weights yolov5l.pt --cache --project 'RDD - Yolov5' --name 'third_training_5l_100e_scrlow'

/content/drive/MyDrive/RDD_Final/yolov5
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice: (30 second timeout) 
[34m[1mwandb[0m: W&B disabled due to login timeout.
[34m[1mtrain: [0mweights=yolov5l.pt, cfg=, data=road_damage.yaml, hyp=hyp.scratch-low.yaml, epochs=100, batch_size=16, imgsz=640, rect=False, resume=False, nosave=False, noval=False, noautoanchor=False, noplots=False, evolve=None, bucket=, cache=ram, image_weights=False, device=, multi_scale=False, single_cls=False, optimizer=SGD, sync_bn=False, workers=8, project=RDD - Yolov5, name=third_training_5l_100e_scrlow, exist_ok=False, quad=False, cos_lr=False, label_smoothing=0.0, patience=100, freeze=[0], save_period=-1, seed=0, local_rank=-1, entity=None, upload_dataset=False, bbox_interval=-1, artifact_alias=latest
Command 'git fetch origin' timed out after 5 seconds
YOLOv5 🚀 v6.2-94-g1ae