Based on this tutorial [Train Your Own YoloV5 Object Detection Model | analyticsvidhya.com](https://www.analyticsvidhya.com/blog/2021/08/train-your-own-yolov5-object-detection-model/#h2_3)

## Creating Dataset

### Create annotations Using the VIA tool
Create annotations by using [this VIA (VGG Image Annotator) tool](https://drive.google.com/file/d/1rJx0fNgnnhODM7H3GP9RQQ5QEsWdkYEd/view?usp=sharing)

Using the tool:

1. First create an attribute named `char`.
2. Upload the image files.
3. Create the annotations.
4. Export the annotations by clicking menu > annotations > export annotations as CSV.


### Convert CSV annotations to COCO format

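To convert the CSV annotations into training labels you can use the following code cells. Note that, despite the heading, the files written below are YOLO-style text labels (one `class x_center y_center width height` line per box, normalized by the image size), stored in COCO-style `train2017` / `val2017` folders: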

Start with importing dependencies to create COCO dataset.

In [None]:
import json
import os
import shutil as sh
from pathlib import Path

import numpy as np
import pandas as pd
from PIL import Image
from tqdm.auto import tqdm

In [None]:
# Create the image directory and its `train` subdirectory.
# `-p` creates missing parents and does not fail when the cell is re-run.
!mkdir -p '/content/images/train'

Upload the annotation `.csv` file to the second directory created above (`/content/images/train`).

Upload the images zip file (`first_250_dataset.zip`) to the main path (`/content`).

In [None]:
# Extract the uploaded archive into /content, where the next line expects it.
# `-o` overwrites without prompting, so re-running the cell does not stall.
!unzip -o -qq '/content/first_250_dataset.zip' -d '/content'
# Copy the extracted images next to the annotations.
!cd '/content/first_250_dataset/' && cp -r * '/content/images/'

In [None]:
## Load the VIA CSV export and derive, per annotated box: the image id,
## the box corner/size, the box centre and an integer class id.
data_path = '/content/images/'
df = pd.read_csv(data_path + 'train/csv250_1.csv')


def _char_label(attrs):
    """Return the `char` value of one VIA `region_attributes` JSON string.

    Surrounding whitespace is stripped so that an annotation typo such as
    " Q" does not become a separate class next to "Q".
    Raises ValueError when the region has no `char` attribute.
    """
    parsed = json.loads(attrs)
    if 'char' not in parsed:
        raise ValueError('region without a "char" attribute: {}'.format(attrs))
    return str(parsed['char']).strip()


## parse the region geometry as JSON instead of relying on comma/colon positions
shapes = df['region_shape_attributes'].map(json.loads)
## rows without a rectangle (e.g. an empty '{}') cannot be turned into a label
box_keys = {'x', 'y', 'width', 'height'}
has_box = shapes.map(lambda shape: box_keys <= shape.keys()).astype(bool)
df = df[has_box].reset_index(drop=True)
shapes = shapes[has_box].reset_index(drop=True)
## create x, y, w, h columns
for column, key in [('x', 'x'), ('y', 'y'), ('w', 'width'), ('h', 'height')]:
    df[column] = [int(shape[key]) for shape in shapes]
## creating a column name image_id having images names as id
## (Path.stem drops only the final extension, so dotted names stay intact)
df['image_id'] = [Path(name).stem for name in df['filename']]
## creating two columns for storing x and y center values
df['x_center'] = df['x'] + df['w'] / 2
df['y_center'] = df['y'] + df['h'] / 2
## define number of classes: one id per distinct character, in order of first appearance
chars = df['region_attributes'].map(_char_label)
labels = chars.unique()
labels_to_dict = dict(zip(labels, range(len(labels))))
print('Lables Directory:', labels_to_dict)
df['classes'] = chars.map(labels_to_dict)
df = df[['image_id', 'x', 'y', 'w', 'h', 'x_center', 'y_center', 'classes']]
## set index of images: shuffled with a fixed seed so that the train/val split
## is random but identical on every run (a plain set() order changes per kernel)
index = np.random.default_rng(42).permutation(sorted(df['image_id'].unique())).tolist()

Lables Directory: {'{"char":"2"}': 0, '{"char":"A"}': 1, '{"char":"H"}': 2, '{"char":"N"}': 3, '{"char":"3"}': 4, '{"char":"L"}': 5, '{"char":"S"}': 6, '{"char":"G"}': 7, '{"char":"P"}': 8, '{"char":"6"}': 9, '{"char":"T"}': 10, '{"char":"J"}': 11, '{"char":"F"}': 12, '{"char":"R"}': 13, '{"char":"Q"}': 14, '{"char":"D"}': 15, '{"char":"5"}': 16, '{"char":"B"}': 17, '{"char":"7"}': 18, '{"char":"8"}': 19, '{"char":"E"}': 20, '{"char":"9"}': 21, '{"char":"Y"}': 22, '{"char":"4"}': 23, '{"char":"W"}': 24, '{"char":"K"}': 25, '{"char":"U"}': 26, '{"char":"X"}': 27, '{"char":"V"}': 28, '{"char":"C"}': 29, '{"char":"M"}': 30, '{"char":" Q"}': 31}


In [None]:
## Write one label file per image and copy the image into the matching
## train2017/ or val2017/ folder under convertor/fold{k}/.
## Each label line is "class x_center y_center w h", with the four box values
## divided by the image width/height.
image_extensions = ["png", "JPG", "PNG", "jpg"]

for fold in [0]:
    # Fold k uses the k-th fifth of `index` for validation; a set makes the
    # per-image membership test constant time.
    val_index = set(index[len(index) * fold // 5 : len(index) * (fold + 1) // 5])
    for name, mini in tqdm(df.groupby("image_id")):
        path2save = "val2017/" if name in val_index else "train2017/"
        labels_dir = "convertor/fold{}/labels/".format(fold) + path2save
        images_dir = "convertor/fold{}/images/{}".format(fold, path2save)
        os.makedirs(labels_dir, exist_ok=True)
        os.makedirs(images_dir, exist_ok=True)

        # Locate the image before writing anything, so a missing image does
        # not leave an orphan label file behind.
        for ext_ in image_extensions:
            imagename = data_path + "{}.{}".format(name, ext_)
            if os.path.exists(imagename):
                break
        else:
            raise FileNotFoundError(
                "no image for '{}' in {} (tried extensions: {})".format(
                    name, data_path, ", ".join(image_extensions)
                )
            )

        # Only the size is needed; the context manager closes the file handle.
        with Image.open(imagename) as image:
            img_width, img_height = image.size

        row = mini[["classes", "x_center", "y_center", "w", "h"]].astype(float).values
        row[:, [1, 3]] /= img_width
        row[:, [2, 4]] /= img_height

        with open(labels_dir + name + ".txt", "w") as f:
            for r in row:
                # the class id is written as an integer, the box values as floats
                fields = [str(int(r[0]))] + [str(value) for value in r[1:]]
                f.write(" ".join(fields))
                f.write("\n")

        sh.copy(imagename, images_dir + "{}.{}".format(name, ext_))


  0%|          | 0/250 [00:00<?, ?it/s]

In [None]:
# Clone YOLOv5 only when the checkout is missing, so re-running this cell does not fail.
![ -d yolov5 ] || git clone https://github.com/ultralytics/yolov5

Cloning into 'yolov5'...
remote: Enumerating objects: 11142, done.[K
remote: Total 11142 (delta 0), reused 0 (delta 0), pack-reused 11142[K
Receiving objects: 100% (11142/11142), 11.14 MiB | 33.74 MiB/s, done.
Resolving deltas: 100% (7703/7703), done.


In [None]:
# Work from the cloned repository so that the relative paths used below
# (data/, runs/, train.py, detect.py, val.py) resolve.
%cd /content/yolov5

/content/yolov5


In [None]:

# Summarise the image folder: the images found, the label text taken from each
# file name, and the set of characters those labels use.

# Path to the data directory
data_dir = Path("/content/images")

# The same extensions the label-conversion cell accepts, so both cells count
# the same images (globbing only "*.png" skips the jpg/JPG/PNG files).
image_suffixes = {".png", ".PNG", ".jpg", ".JPG"}

# Get list of all the images; the file name without extension is the label
image_paths = sorted(
    (path for path in data_dir.glob("*") if path.suffix in image_suffixes),
    key=str,
)
images = [str(path) for path in image_paths]
labels = [path.stem for path in image_paths]
all_characters = [char for label in labels for char in label]
characters = set(all_characters)

print("Number of images found: ", len(images))
print("Number of labels found: ", len(labels))
print("Number of unique characters: ", len(characters))
print("Characters present: ", sorted(characters))

# NOTE(review): the constants below are not referenced by the YOLOv5 commands
# in the visible cells (train.py is called with its own batch and image size);
# they are kept because other cells may rely on them.

# Batch size for training and validation
batch_size = 450

# Desired image dimensions
img_width = 300
img_height = 90

# Factor by which the image is going to be downsampled
# by the convolutional blocks. We will be using two
# convolution blocks and each block will have
# a pooling layer which downsample the features by a factor of 2.
# Hence total downsampling factor would be 4.
downsample_factor = 4

# Maximum length of any captcha in the dataset (0 when no image was found,
# instead of max() raising on an empty list)
max_length = max((len(label) for label in labels), default=0)


Number of images found:  236
Number of labels found:  236
Number of unique characters:  31
Characters present:  ['2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y']


### Creating YAML file for training

In [None]:
# train and val data as 1) directory: path/images/, 2) file: path/images.txt, or 3) list: [path1/images/, path2/images/]


In [None]:
%%writefile data/coco.yml
# Dataset description passed to train.py / val.py via --data.
train: /content/convertor/fold0/images/train2017
val: /content/convertor/fold0/images/val2017
nc: 32  # number of classes
# The order must follow the class ids printed as "Lables Directory:" by the preprocessing cell.
# Names are quoted so the digit classes stay strings instead of being parsed as integers.
# The final "Q" (id 31) corresponds to the '{"char":" Q"}' entry of that printout (a label typed with a leading space).
names: ['2','A','H','N','3','L','S','G','P','6','T','J','F','R','Q','D','5','B','7','8','E','9','Y','4','W','K','U','X','V','C','M','Q']

## Training

In [None]:
# Train for 50 epochs with batch size 12, starting from the yolov5x.pt weights,
# on the dataset described by data/coco.yml.
# Alternative starting weights from an earlier run: ./runs/train/exp2/weights/best.pt
!python train.py --data ./data/coco.yml --weights ./weights/yolov5x.pt --epochs 50 --batch-size 12

[34m[1mtrain: [0mweights=./weights/yolov5x.pt, cfg=, data=./data/coco.yml, hyp=data/hyps/hyp.scratch-low.yaml, epochs=50, batch_size=12, imgsz=640, rect=False, resume=False, nosave=False, noval=False, noautoanchor=False, evolve=None, bucket=, cache=None, image_weights=False, device=, multi_scale=False, single_cls=False, optimizer=SGD, sync_bn=False, workers=8, project=runs/train, name=exp, exist_ok=False, quad=False, cos_lr=False, label_smoothing=0.0, patience=100, freeze=[0], save_period=-1, local_rank=-1, entity=None, upload_dataset=False, bbox_interval=-1, artifact_alias=latest
[34m[1mgithub: [0mup to date with https://github.com/ultralytics/yolov5 ✅
YOLOv5 🚀 v6.1-11-g63ddb6f torch 1.10.0+cu111 CUDA:0 (Tesla T4, 15110MiB)

[34m[1mhyperparameters: [0mlr0=0.01, lrf=0.01, momentum=0.937, weight_decay=0.0005, warmup_epochs=3.0, warmup_momentum=0.8, warmup_bias_lr=0.1, box=0.05, cls=0.5, cls_pw=1.0, obj=1.0, obj_pw=1.0, iou_t=0.2, anchor_t=4.0, fl_gamma=0.0, hsv_h=0.015, hsv_s=0

## Inference

In [None]:
!python detect.py --img 300 --source /content/convertor/fold0/images/train2017 --weights ./runs/train/exp3/weights/best.pt --conf-thres 0.2

[34m[1mdetect: [0mweights=['./runs/train/exp3/weights/best.pt'], source=/content/convertor/fold0/images/train2017, data=data/coco128.yaml, imgsz=[300, 300], conf_thres=0.2, iou_thres=0.45, max_det=1000, device=, view_img=False, save_txt=False, save_conf=False, save_crop=False, nosave=False, classes=None, agnostic_nms=False, augment=False, visualize=False, update=False, project=runs/detect, name=exp, exist_ok=False, line_thickness=3, hide_labels=False, hide_conf=False, half=False, dnn=False
YOLOv5 🚀 v6.1-11-g63ddb6f torch 1.10.0+cu111 CUDA:0 (Tesla T4, 15110MiB)

Fusing layers... 
Model Summary: 444 layers, 86382013 parameters, 0 gradients
image 1/200 /content/convertor/fold0/images/train2017/2AHN3L.png: 96x320 1 2, Done. (0.034s)
image 2/200 /content/convertor/fold0/images/train2017/2AHSGS.png: 96x320 1 2, 1 S, Done. (0.025s)
image 3/200 /content/convertor/fold0/images/train2017/2ARQLL.png: 96x320 1 2, 1 G, 1 Q, Done. (0.024s)
image 4/200 /content/convertor/fold0/images/train2017/2A

## Testing

In [None]:
!python val.py --data ./data/coco.yml --weights ./runs/train/exp3/weights/best.pt

[34m[1mval: [0mdata=./data/coco.yml, weights=['./runs/train/exp3/weights/best.pt'], batch_size=32, imgsz=640, conf_thres=0.001, iou_thres=0.6, task=val, device=, workers=8, single_cls=False, augment=False, verbose=False, save_txt=False, save_hybrid=False, save_conf=False, save_json=False, project=runs/val, name=exp, exist_ok=False, half=False, dnn=False
YOLOv5 🚀 v6.1-11-g63ddb6f torch 1.10.0+cu111 CUDA:0 (Tesla T4, 15110MiB)

Fusing layers... 
Model Summary: 444 layers, 86382013 parameters, 0 gradients
[34m[1mval: [0mScanning '/content/convertor/fold0/labels/val2017.cache' images and labels... 50 found, 0 missing, 0 empty, 0 corrupt: 100% 50/50 [00:00<?, ?it/s]
               Class     Images     Labels          P          R     mAP@.5 mAP@.5:.95: 100% 2/2 [00:02<00:00,  1.01s/it]
                 all         50        300      0.334      0.669       0.43      0.249
                   2         50         55      0.864      0.982      0.988      0.631
                   A        