In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

'''import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))'''

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# https://www.freecodecamp.org/news/how-to-detect-objects-in-images-using-yolov8/
# https://underactuated.mit.edu/pend.html
# https://manipulation.csail.mit.edu/
# https://deepnote.com/workspace/Underactuated-2ed1518a-973b-4145-bd62-1768b49956a8/project/52e7e101-429f-4aef-a373-e4cca7980cfe/notebook/intro-8ca11815e8354b658378157cd15c1725
# https://www.amazon.com/Nonlinear-Dynamics-Chaos-Applications-Nonlinearity/dp/0738204536
# https://www.youtube.com/playlist?list=PLkx8KyIQkMfU5szP43GlE_S1QGSPQfL9s


In [None]:
import os
HOME = os.chdir('/kaggle/input')

!ls

In [None]:
%pip install ultralytics
import ultralytics
ultralytics.checks()

![image.png](attachment:5d5ce76b-708a-4def-b367-c5f2a2f5d336.png)

![image.png](attachment:978d5e3a-4c0b-46d8-966e-391d6cbcbecf.png)![image.png](attachment:f89fc233-c61d-4c24-9375-b069697b9f9e.png)

In [None]:
import os
import pandas as pd

from tabulate import tabulate
from shutil import copyfile
from glob import glob
from xml.etree import ElementTree as et
from sklearn.model_selection import train_test_split

In [None]:
def extract_img_text(xml_path: str) -> list[dict]:
    tree = et.parse(xml_path)
    root = tree.getroot()

    list_dicts = []

    objects = root.findall('object')
    for ind, obj in enumerate(objects):
        dict_info = {}

        dict_info['img_name'] = root.find('filename').text
        dict_info['img_width'] = root.find('size').find('width').text
        dict_info['img_height'] = root.find('size').find('height').text

        bndbox = obj.find('bndbox')
        dict_info['obj_name'] = obj.find('name').text
        dict_info['x_min'] = bndbox.find('xmin').text
        dict_info['x_max'] = bndbox.find('xmax').text
        dict_info['y_min'] = bndbox.find('ymin').text
        dict_info['y_max'] = bndbox.find('ymax').text

        list_dicts.append(dict_info)

    return list_dicts

In [None]:
annotations_dir = r'/kaggle/input/pcb-defects/PCB_DATASET/Annotations'
images_dir = r'/kaggle/input/pcb-defects/PCB_DATASET/images'

# Get source paths
list_anotations_src_dirs = [os.path.join(annotations_dir, class_name) for class_name in os.listdir(annotations_dir)]
replace_text = lambda x: x.replace('\\', '/')
list_anotations_src_dirs = list(map(replace_text, list_anotations_src_dirs))

list_images_src_dirs = [os.path.join(images_dir, class_name) for class_name in os.listdir(images_dir)]
list_images_src_dirs = list(map(replace_text, list_images_src_dirs))


# get all xml files paths
all_ann_files = []
for each_class in list_anotations_src_dirs:
    xml_files = glob(each_class + '/*.xml')
    all_ann_files += list(map(replace_text, xml_files))


# get all images files paths
all_img_files = []
for each_class in list_images_src_dirs:
    img_files = glob(each_class + '/*.jpg')
    all_img_files += list(map(replace_text, img_files))


# print(extract_img_text('kaggle_pcb_dataset/Annotations/Missing_hole/01_missing_hole_01.xml'))
all_annotations = []
for xml_file in all_ann_files:
    all_annotations += extract_img_text(xml_file)

df_all = pd.DataFrame(all_annotations)

# check dataframe info

In [None]:
    print('---------------------# Dataframe Head #---------------------\n')
    print(tabulate(df_all.head(), headers='keys', tablefmt='psql'))
    print('\n\n---------------------# Dataframe info #---------------------\n')
    print(df_all.info())
    print(f'\n\n## Columns: {df_all.columns}')
    print(f'\n\n# Total num objects: {df_all.shape[0]}')
    print('\n\n---------------------# Amount each object #---------------------\n')
    print(df_all['obj_name'].value_counts())


# fix dataframe types

In [None]:
class_map = {'missing_hole': 0, 'mouse_bite': 1, 'open_circuit': 2, 'short': 3, 'spur': 4, 'spurious_copper': 5}
def get_class_id(class_name: str) -> int:
    return class_map[class_name]

In [None]:
df_all['class_id'] = df_all['obj_name'].apply(get_class_id)

int_cols = ['img_width', 'img_height', 'x_min', 'x_max', 'y_min', 'y_max', 'class_id']
str_cols = ['img_name', 'obj_name']

df_all[int_cols] = df_all[int_cols].astype(int)
df_all[str_cols] = df_all[str_cols].astype(str)
print('\n\n---------------------# Dataframe info #---------------------\n')
print(df_all.info())

# Create YOLO format box coodinates and add class ids

In [None]:
df_all['center_x'] = ((df_all['x_min'] + df_all['x_max'])/2)/df_all['img_width']
df_all['center_y'] = ((df_all['y_min'] + df_all['y_max'])/2)/df_all['img_height']
df_all['box_width'] = (df_all['x_max'] - df_all['x_min'])/df_all['img_width']
df_all['box_height'] = (df_all['y_max'] - df_all['y_min'])/df_all['img_height']

In [None]:
print('\n\n---------------------# Dataframe Head #---------------------\n')
print(tabulate(df_all.head(), headers='keys', tablefmt='psql'))

# CREATING YOLO TRAIN / TEST / VALID FOLDERS

In [None]:
train_images, val_images = train_test_split(all_img_files, test_size=0.3, random_state=9) # 70% train
val_images, test_images = train_test_split(val_images, test_size=0.5, random_state=9) # 15% test / 15% validation

print(f'\n\n# Total Images: {len(all_img_files)}')
print(f'# test size: {len(train_images)} - {len(train_images) / len(all_img_files):.2f}')
print(f'# test size: {len(test_images)} - {len(test_images) / len(all_img_files):.2f}')
print(f'# test size: {len(val_images)} - {len(val_images) / len(all_img_files):.2f}')

## start creating YOLO_DATASET

In [None]:
os.chdir('/kaggle/working')
os.getcwd()

In [None]:
folders = ['', 'train', 'test', 'val']
for folder in folders:
    path = rf'/kaggle/working/yolo_pcb_dataset/{folder}'
    if not os.path.isdir(path):
        os.mkdir(path)

    if folder != '':
        img_path_exist = os.path.isdir(img_path := os.path.join(path, 'images'))
        ann_path_exist = os.path.isdir(ann_path := os.path.join(path, 'labels'))
        if not img_path_exist:
            os.mkdir(img_path)
        if not ann_path_exist:
            os.mkdir(ann_path)

## Copie images from original dataset to the new yolo dataset

In [None]:
def create_label(img_name: str, df: pd.DataFrame, dest_path: str) -> None:
    text_file = os.path.join(dest_path, img_name.replace("jpg","txt"))
    text_file = text_file.replace('\\', '/')

    cols2save = ['img_name', 'class_id', 'center_x', 'center_y', 'box_width', 'box_height']
    groupby_obj = df[cols2save].groupby('img_name')
    #print(tabulate(groupby_obj.get_group(img_name), headers='keys', tablefmt='psql'))
    groupby_obj.get_group(img_name).set_index('img_name').to_csv(text_file, sep=' ', index=False, header=False)

In [None]:
img_src_dict = {'train': train_images, 'test': test_images, 'val': val_images}
print_buffer = []
for folder in folders[1:]:
    path = rf'/kaggle/working/yolo_pcb_dataset/{folder}/images'
    for src_path in img_src_dict[folder]:
        dest_path = os.path.join(path, src_path.split('/')[-1])
        dest_path = replace_text(dest_path)
        copyfile(src_path, dest_path)
        print_buffer.append(f'copied \'{src_path}\' to \'{dest_path}\'')

with open('copies_log.txt', "w") as cpy_file:
    print("\n".join(print_buffer), file=cpy_file)

for folder in folders[1:]:
    img_path = rf'/kaggle/working/yolo_pcb_dataset/{folder}/images'
    ann_path = rf'/kaggle/working/yolo_pcb_dataset/{folder}/labels'
    for image_src_path in img_src_dict[folder]:
        img_name = image_src_path.split('/')[-1]
        create_label(img_name=img_name, df=df_all, dest_path=ann_path)

## create data.yaml file

In [None]:
import yaml

In [None]:
data = {
    'train': '/kaggle/working/yolo_pcb_dataset/train/images',
    'val': '/kaggle/working/yolo_pcb_dataset/val/images',
    'test': '/kaggle/working/yolo_pcb_dataset/test/images',
    'nc': 6,
    'names': ['missing_hole', 'mouse_bite', 'open_circuit', 'short', 'spur', 'spurious_copper']}

with open('/kaggle/working/data.yaml', 'w') as file:
    yaml.dump(data, file, default_flow_style=False)

## Tranning the yolo model

In [None]:
# !git clone https://github.com/ultralytics/ultralytics -b main

In [None]:
!yolo mode=train data='/kaggle/working/data.yaml' model=yolov8l.pt batch=8 name=pcb_err_model epochs=50 lr0=0.01

# downloading the trained model

In [None]:
import shutil
shutil.make_archive('err_pcb_model', 'zip', '/kaggle/working/runs')

In [15]:
annotations_dir = r'/kaggle/input/pcb-defects/PCB_DATASET/Annotations'
images_dir = r'/kaggle/input/pcb-defects/PCB_DATASET/images'

# Get source paths
list_anotations_src_dirs = [os.path.join(annotations_dir, class_name) for class_name in os.listdir(annotations_dir)]
replace_text = lambda x: x.replace('\\', '/')
list_anotations_src_dirs = list(map(replace_text, list_anotations_src_dirs))

list_images_src_dirs = [os.path.join(images_dir, class_name) for class_name in os.listdir(images_dir)]
list_images_src_dirs = list(map(replace_text, list_images_src_dirs))


# get all xml files paths
all_ann_files = []
for each_class in list_anotations_src_dirs:
    xml_files = glob(each_class + '/*.xml')
    all_ann_files += list(map(replace_text, xml_files))


# get all images files paths
all_img_files = []
for each_class in list_images_src_dirs:
    img_files = glob(each_class + '/*.jpg')
    all_img_files += list(map(replace_text, img_files))


# print(extract_img_text('kaggle_pcb_dataset/Annotations/Missing_hole/01_missing_hole_01.xml'))
all_annotations = []
for xml_file in all_ann_files:
    all_annotations += extract_img_text(xml_file)

df_all = pd.DataFrame(all_annotations)

# check dataframe info

In [16]:
    print('---------------------# Dataframe Head #---------------------\n')
    print(tabulate(df_all.head(), headers='keys', tablefmt='psql'))
    print('\n\n---------------------# Dataframe info #---------------------\n')
    print(df_all.info())
    print(f'\n\n## Columns: {df_all.columns}')
    print(f'\n\n# Total num objects: {df_all.shape[0]}')
    print('\n\n---------------------# Amount each object #---------------------\n')
    print(df_all['obj_name'].value_counts())


---------------------# Dataframe Head #---------------------

+----+----------------------+-------------+--------------+------------+---------+---------+---------+---------+
|    | img_name             |   img_width |   img_height | obj_name   |   x_min |   x_max |   y_min |   y_max |
|----+----------------------+-------------+--------------+------------+---------+---------+---------+---------|
|  0 | 01_mouse_bite_11.jpg |        3034 |         1586 | mouse_bite |    1199 |    1234 |     966 |    1004 |
|  1 | 01_mouse_bite_11.jpg |        3034 |         1586 | mouse_bite |    2042 |    2073 |     755 |     786 |
|  2 | 01_mouse_bite_11.jpg |        3034 |         1586 | mouse_bite |    1472 |    1507 |     607 |     639 |
|  3 | 01_mouse_bite_17.jpg |        3034 |         1586 | mouse_bite |    2168 |    2200 |     854 |     887 |
|  4 | 01_mouse_bite_17.jpg |        3034 |         1586 | mouse_bite |    1513 |    1545 |     919 |     957 |
+----+----------------------+-------------

# fix dataframe types

In [17]:
class_map = {'missing_hole': 0, 'mouse_bite': 1, 'open_circuit': 2, 'short': 3, 'spur': 4, 'spurious_copper': 5}
def get_class_id(class_name: str) -> int:
    return class_map[class_name]

In [18]:
df_all['class_id'] = df_all['obj_name'].apply(get_class_id)

int_cols = ['img_width', 'img_height', 'x_min', 'x_max', 'y_min', 'y_max', 'class_id']
str_cols = ['img_name', 'obj_name']

df_all[int_cols] = df_all[int_cols].astype(int)
df_all[str_cols] = df_all[str_cols].astype(str)
print('\n\n---------------------# Dataframe info #---------------------\n')
print(df_all.info())



---------------------# Dataframe info #---------------------

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2953 entries, 0 to 2952
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   img_name    2953 non-null   object
 1   img_width   2953 non-null   int64 
 2   img_height  2953 non-null   int64 
 3   obj_name    2953 non-null   object
 4   x_min       2953 non-null   int64 
 5   x_max       2953 non-null   int64 
 6   y_min       2953 non-null   int64 
 7   y_max       2953 non-null   int64 
 8   class_id    2953 non-null   int64 
dtypes: int64(7), object(2)
memory usage: 207.8+ KB
None


# Create YOLO format box coodinates and add class ids

In [19]:
df_all['center_x'] = ((df_all['x_min'] + df_all['x_max'])/2)/df_all['img_width']
df_all['center_y'] = ((df_all['y_min'] + df_all['y_max'])/2)/df_all['img_height']
df_all['box_width'] = (df_all['x_max'] - df_all['x_min'])/df_all['img_width']
df_all['box_height'] = (df_all['y_max'] - df_all['y_min'])/df_all['img_height']

In [20]:
print('\n\n---------------------# Dataframe Head #---------------------\n')
print(tabulate(df_all.head(), headers='keys', tablefmt='psql'))



---------------------# Dataframe Head #---------------------

+----+----------------------+-------------+--------------+------------+---------+---------+---------+---------+------------+------------+------------+-------------+--------------+
|    | img_name             |   img_width |   img_height | obj_name   |   x_min |   x_max |   y_min |   y_max |   class_id |   center_x |   center_y |   box_width |   box_height |
|----+----------------------+-------------+--------------+------------+---------+---------+---------+---------+------------+------------+------------+-------------+--------------|
|  0 | 01_mouse_bite_11.jpg |        3034 |         1586 | mouse_bite |    1199 |    1234 |     966 |    1004 |          1 |   0.400956 |   0.621059 |   0.0115359 |    0.0239596 |
|  1 | 01_mouse_bite_11.jpg |        3034 |         1586 | mouse_bite |    2042 |    2073 |     755 |     786 |          1 |   0.678148 |   0.485813 |   0.0102175 |    0.019546  |
|  2 | 01_mouse_bite_11.jpg |       

# CREATING YOLO TRAIN / TEST / VALID FOLDERS

In [21]:
train_images, val_images = train_test_split(all_img_files, test_size=0.3, random_state=9) # 70% train
val_images, test_images = train_test_split(val_images, test_size=0.5, random_state=9) # 15% test / 15% validation

print(f'\n\n# Total Images: {len(all_img_files)}')
print(f'# test size: {len(train_images)} - {len(train_images) / len(all_img_files):.2f}')
print(f'# test size: {len(test_images)} - {len(test_images) / len(all_img_files):.2f}')
print(f'# test size: {len(val_images)} - {len(val_images) / len(all_img_files):.2f}')



# Total Images: 693
# test size: 485 - 0.70
# test size: 104 - 0.15
# test size: 104 - 0.15


## start creating YOLO_DATASET

In [22]:
os.chdir('/kaggle/working')
os.getcwd()

'/kaggle/working'

In [23]:
folders = ['', 'train', 'test', 'val']
for folder in folders:
    path = rf'/kaggle/working/yolo_pcb_dataset/{folder}'
    if not os.path.isdir(path):
        os.mkdir(path)

    if folder != '':
        img_path_exist = os.path.isdir(img_path := os.path.join(path, 'images'))
        ann_path_exist = os.path.isdir(ann_path := os.path.join(path, 'labels'))
        if not img_path_exist:
            os.mkdir(img_path)
        if not ann_path_exist:
            os.mkdir(ann_path)

## Copie images from original dataset to the new yolo dataset

In [24]:
def create_label(img_name: str, df: pd.DataFrame, dest_path: str) -> None:
    text_file = os.path.join(dest_path, img_name.replace("jpg","txt"))
    text_file = text_file.replace('\\', '/')

    cols2save = ['img_name', 'class_id', 'center_x', 'center_y', 'box_width', 'box_height']
    groupby_obj = df[cols2save].groupby('img_name')
    #print(tabulate(groupby_obj.get_group(img_name), headers='keys', tablefmt='psql'))
    groupby_obj.get_group(img_name).set_index('img_name').to_csv(text_file, sep=' ', index=False, header=False)

In [25]:
img_src_dict = {'train': train_images, 'test': test_images, 'val': val_images}
print_buffer = []
for folder in folders[1:]:
    path = rf'/kaggle/working/yolo_pcb_dataset/{folder}/images'
    for src_path in img_src_dict[folder]:
        dest_path = os.path.join(path, src_path.split('/')[-1])
        dest_path = replace_text(dest_path)
        copyfile(src_path, dest_path)
        print_buffer.append(f'copied \'{src_path}\' to \'{dest_path}\'')

with open('copies_log.txt', "w") as cpy_file:
    print("\n".join(print_buffer), file=cpy_file)

for folder in folders[1:]:
    img_path = rf'/kaggle/working/yolo_pcb_dataset/{folder}/images'
    ann_path = rf'/kaggle/working/yolo_pcb_dataset/{folder}/labels'
    for image_src_path in img_src_dict[folder]:
        img_name = image_src_path.split('/')[-1]
        create_label(img_name=img_name, df=df_all, dest_path=ann_path)

## create data.yaml file

In [26]:
import yaml

In [27]:
data = {
    'train': '/kaggle/working/yolo_pcb_dataset/train/images',
    'val': '/kaggle/working/yolo_pcb_dataset/val/images',
    'test': '/kaggle/working/yolo_pcb_dataset/test/images',
    'nc': 6,
    'names': ['missing_hole', 'mouse_bite', 'open_circuit', 'short', 'spur', 'spurious_copper']}

with open('/kaggle/working/data.yaml', 'w') as file:
    yaml.dump(data, file, default_flow_style=False)

## Tranning the yolo model

In [2]:
# !git clone https://github.com/ultralytics/ultralytics -b main

Cloning into 'ultralytics'...
remote: Enumerating objects: 26708, done.[K
remote: Counting objects: 100% (1364/1364), done.[K
remote: Compressing objects: 100% (760/760), done.[K
remote: Total 26708 (delta 864), reused 969 (delta 600), pack-reused 25344[K
Receiving objects: 100% (26708/26708), 15.74 MiB | 27.00 MiB/s, done.
Resolving deltas: 100% (18818/18818), done.


In [32]:
!yolo mode=train data='/kaggle/working/data.yaml' model=yolov8l.pt batch=8 name=pcb_err_model epochs=50 lr0=0.01

Ultralytics YOLOv8.2.20 🚀 Python-3.10.13 torch-2.1.2 CUDA:0 (Tesla T4, 15102MiB)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=yolov8l.pt, data=/kaggle/working/data.yaml, epochs=50, time=None, patience=100, batch=8, imgsz=640, save=True, save_period=-1, cache=False, device=None, workers=8, project=None, name=pcb_err_model4, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=False, save_frames=False, save_txt=False, save_conf=False, save_crop=False, show_labels=True, show_co

# downloading the trained model

In [33]:
import shutil
shutil.make_archive('err_pcb_model', 'zip', '/kaggle/working/runs')

'/kaggle/working/err_pcb_model.zip'