In [None]:
import os
import shutil
import random

import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from ultralytics import YOLO

In [None]:
# Set seeds for reproducibility
random.seed(42)

# Paths
DATA_ROOT = "traffic_sign_data"  # root of TT100K dataset
OUTPUT_ROOT = "tt100k_yolo"

# DATA_ROOT = "/kaggle/input/tt100k-test/traffic_sign_data"
# OUTPUT_ROOT = "/kaggle/working/tt100k_yolo"

os.makedirs(OUTPUT_ROOT, exist_ok=True)

In [None]:
for split in ["train", "val", "test"]:
    os.makedirs(f"{OUTPUT_ROOT}/images/{split}", exist_ok=True)
    os.makedirs(f"{OUTPUT_ROOT}/labels/{split}", exist_ok=True)

In [None]:
train_df = pd.read_csv(f"{DATA_ROOT}/Train.csv")
test_df = pd.read_csv(f"{DATA_ROOT}/Test.csv")
meta_df = pd.read_csv(f"{DATA_ROOT}/Meta.csv")

train_df.head()

Unnamed: 0,Width,Height,Roi.X1,Roi.Y1,Roi.X2,Roi.Y2,ClassId,Path
0,27,26,5,5,22,20,20,Train/20/00020_00000_00000.png
1,28,27,5,6,23,22,20,Train/20/00020_00000_00001.png
2,29,26,6,5,24,21,20,Train/20/00020_00000_00002.png
3,28,27,5,6,23,22,20,Train/20/00020_00000_00003.png
4,28,26,5,5,23,21,20,Train/20/00020_00000_00004.png


In [None]:
def to_yolo(row):
    w, h = row["Width"], row["Height"]

    x_center = ((row["Roi.X1"] + row["Roi.X2"]) / 2) / w
    y_center = ((row["Roi.Y1"] + row["Roi.Y2"]) / 2) / h
    bw = (row["Roi.X2"] - row["Roi.X1"]) / w
    bh = (row["Roi.Y2"] - row["Roi.Y1"]) / h

    return f"{int(row['ClassId'])} {x_center:.6f} {y_center:.6f} {bw:.6f} {bh:.6f}"

In [None]:
train_df, val_df = train_test_split(
    train_df, test_size=0.2, random_state=42, stratify=train_df["ClassId"]
)

In [None]:
def process_split(df, split_name):
    for img_path, group in tqdm(df.groupby("Path")):
        img_name = os.path.basename(img_path)
        label_name = os.path.splitext(img_name)[0] + ".txt"

        # Write label
        label_file = f"{OUTPUT_ROOT}/labels/{split_name}/{label_name}"
        with open(label_file, "w") as f:
            for _, row in group.iterrows():
                f.write(to_yolo(row) + "\n")

        # Copy image from Kaggle input → working
        src_img = os.path.join(DATA_ROOT, img_path)
        dst_img = f"{OUTPUT_ROOT}/images/{split_name}/{img_name}"

        if os.path.exists(src_img):
            shutil.copy(src_img, dst_img)


process_split(train_df, "train")
process_split(val_df, "val")
process_split(test_df, "test")

100%|██████████| 31367/31367 [03:14<00:00, 160.99it/s] 
100%|██████████| 7842/7842 [00:50<00:00, 154.63it/s] 
100%|██████████| 12630/12630 [02:00<00:00, 104.80it/s]


In [None]:
class_ids = sorted(train_df["ClassId"].unique())
names = {int(cid): f"class_{cid}" for cid in class_ids}

yaml_text = f"""
    path: {OUTPUT_ROOT}
    train: images/train
    val: images/val
    test: images/test
    
    nc: {len(class_ids)}
    names: {names}
"""

with open(f"{OUTPUT_ROOT}/data.yaml", "w") as f:
    f.write(yaml_text)

print(yaml_text)


    path: /kaggle/working/tt100k_yolo
    train: images/train
    val: images/val
    test: images/test
    
    nc: 43
    names: {0: 'class_0', 1: 'class_1', 2: 'class_2', 3: 'class_3', 4: 'class_4', 5: 'class_5', 6: 'class_6', 7: 'class_7', 8: 'class_8', 9: 'class_9', 10: 'class_10', 11: 'class_11', 12: 'class_12', 13: 'class_13', 14: 'class_14', 15: 'class_15', 16: 'class_16', 17: 'class_17', 18: 'class_18', 19: 'class_19', 20: 'class_20', 21: 'class_21', 22: 'class_22', 23: 'class_23', 24: 'class_24', 25: 'class_25', 26: 'class_26', 27: 'class_27', 28: 'class_28', 29: 'class_29', 30: 'class_30', 31: 'class_31', 32: 'class_32', 33: 'class_33', 34: 'class_34', 35: 'class_35', 36: 'class_36', 37: 'class_37', 38: 'class_38', 39: 'class_39', 40: 'class_40', 41: 'class_41', 42: 'class_42'}



In [None]:
model = YOLO("yolov8n.yaml")

model.train(data=f"{OUTPUT_ROOT}/data.yaml", epochs=30, imgsz=640, batch=16, device=0)

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.
Ultralytics 8.3.241 🚀 Python-3.12.12 torch-2.8.0+cu126 CUDA:0 (Tesla P100-PCIE-16GB, 16269MiB)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=16, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, compile=False, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=/kaggle/working/tt100k_yolo/data.yaml, degrees=0.0, deterministic=True, device=0, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=30, erasing=0.4, exist_ok=False, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=64

ultralytics.utils.metrics.DetMetrics object with attributes:

ap_class_index: array([ 0,  1,  2,  3, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])
box: ultralytics.utils.metrics.Metric object
confusion_matrix: <ultralytics.utils.metrics.ConfusionMatrix object at 0x7b3f42ea8140>
curves: ['Precision-Recall(B)', 'F1-Confidence(B)', 'Precision-Confidence(B)', 'Recall-Confidence(B)']
curves_results: [[array([          0,    0.001001,    0.002002,    0.003003,    0.004004,    0.005005,    0.006006,    0.007007,    0.008008,    0.009009,     0.01001,    0.011011,    0.012012,    0.013013,    0.014014,    0.015015,    0.016016,    0.017017,    0.018018,    0.019019,     0.02002,    0.021021,    0.022022,    0.023023,
          0.024024,    0.025025,    0.026026,    0.027027,    0.028028,    0.029029,     0.03003,    0.031031,    0.032032,    0.033033,    0.034034,    0.035035,    0.036036,    0.037037,    0.038038,    0.039039,     0.04004,    0.041041,    0.

In [None]:
model = YOLO("runs/detect/train/weights/best.pt")

model.predict(source=f"{OUTPUT_ROOT}/images/test", conf=0.25, save=True)


Inference results will accumulate in RAM unless `stream=True` is passed, which can cause out-of-memory errors for large
sources or long-running streams and videos. See https://docs.ultralytics.com/modes/predict/ for help.

Example:
    results = model(source=..., stream=True)  # generator of Results objects
    for r in results:
        boxes = r.boxes  # Boxes object for bbox outputs
        masks = r.masks  # Masks object for segment masks outputs
        probs = r.probs  # Class probabilities for classification outputs

image 1/12630 /kaggle/working/tt100k_yolo/images/test/00000.png: 640x640 1 class_16, 6.0ms
image 2/12630 /kaggle/working/tt100k_yolo/images/test/00001.png: 640x608 1 class_1, 40.9ms
image 3/12630 /kaggle/working/tt100k_yolo/images/test/00002.png: 640x608 1 class_12, 5.1ms
image 4/12630 /kaggle/working/tt100k_yolo/images/test/00003.png: 640x608 1 class_2, 5.2ms
image 5/12630 /kaggle/working/tt100k_yolo/images/test/00004.png: 608x640 1 class_11, 38.8ms
image 6/12630 /

[ultralytics.engine.results.Results object with attributes:
 
 boxes: ultralytics.engine.results.Boxes object
 keypoints: None
 masks: None
 names: {0: 'class_0', 1: 'class_1', 2: 'class_2', 3: 'class_3', 4: 'class_4', 5: 'class_5', 6: 'class_6', 7: 'class_7', 8: 'class_8', 9: 'class_9', 10: 'class_10', 11: 'class_11', 12: 'class_12', 13: 'class_13', 14: 'class_14', 15: 'class_15', 16: 'class_16', 17: 'class_17', 18: 'class_18', 19: 'class_19', 20: 'class_20', 21: 'class_21', 22: 'class_22', 23: 'class_23', 24: 'class_24', 25: 'class_25', 26: 'class_26', 27: 'class_27', 28: 'class_28', 29: 'class_29', 30: 'class_30', 31: 'class_31', 32: 'class_32', 33: 'class_33', 34: 'class_34', 35: 'class_35', 36: 'class_36', 37: 'class_37', 38: 'class_38', 39: 'class_39', 40: 'class_40', 41: 'class_41', 42: 'class_42'}
 obb: None
 orig_img: array([[[173, 138, 115],
         [172, 138, 116],
         [169, 137, 116],
         ...,
         [135, 110,  86],
         [126, 101,  80],
         [105,  82