# 0. Imports and Load Data

In [7]:
import os
import glob
import random
import json
from pathlib import Path
import pandas as pd
import yaml
import cv2
import matplotlib.pyplot as plt
from collections import Counter
from ultralytics import YOLO
import warnings

warnings.filterwarnings('ignore')

In [None]:
# For local execution (Mac)
DATA_YAML_PATH = "../data/data.yaml"

# Raise explicitly instead of using `assert`: assertions are stripped when
# Python runs with -O, which would let a missing file slip through to open().
if not os.path.exists(DATA_YAML_PATH):
    raise FileNotFoundError(f"data.yaml not found in {DATA_YAML_PATH}")

# Parse the dataset description; explicit encoding keeps the read independent
# of the platform's default locale.
with open(DATA_YAML_PATH, "r", encoding="utf-8") as f:
    data_cfg = yaml.safe_load(f)

# Notebook display of the parsed configuration.
data_cfg

In [9]:
# Split locations as declared in data.yaml; a missing key yields None.
train_dir, val_dir, test_dir = (
    data_cfg.get(split_key) for split_key in ("train", "val", "test")
)
class_names = data_cfg.get("names", [])
# Use the declared class count when present, otherwise count the names.
nc = int(data_cfg.get("nc", len(class_names)))

# Emit the same five report lines in a single write.
print("\n".join(
    f"{caption} {value}"
    for caption, value in (
        ("Train images dir:", train_dir),
        ("Val images dir  :", val_dir),
        ("Test images dir :", test_dir),
        ("Classes (nc)     :", nc),
        ("Classes name:", class_names),
    )
))

# 1. Data Analysis: First Sight

In [10]:
def yolo_label_paths(img_dir: str) -> list:
    """Pair every image found in ``img_dir`` with its YOLO label path.

    Images are matched by extension (jpg, jpeg, png, bmp) directly inside
    ``img_dir``. The label path is derived by swapping the *last*
    ``/images/`` directory segment for ``/labels/`` and replacing the file
    extension with ``.txt``; label files are not checked for existence.

    Args:
        img_dir: Directory that contains the image files.

    Returns:
        A list of ``(image_path, label_path)`` tuples sorted by image path,
        so that callers sampling with a fixed seed get reproducible results.
    """
    img_paths = []
    for ext in ("*.jpg", "*.jpeg", "*.png", "*.bmp"):
        img_paths.extend(glob.glob(os.path.join(img_dir, ext)))

    images_segment = os.sep + "images" + os.sep
    labels_segment = os.sep + "labels" + os.sep

    label_paths = []
    # glob gives no ordering guarantee; sort for deterministic output.
    for ip in sorted(img_paths):
        # Only the last "/images/" segment is the split's image folder; an
        # ancestor directory that happens to be called "images" must stay.
        head, found, tail = ip.rpartition(images_segment)
        lp = head + labels_segment + tail if found else ip
        lp = os.path.splitext(lp)[0] + ".txt"
        label_paths.append((ip, lp))
    return label_paths

def read_yolo_labels(label_path: str):
    """Parse a YOLO label file into ``(cls_id, cx, cy, w, h)`` tuples.

    A missing file yields an empty list. Lines with fewer than five
    whitespace-separated fields are ignored; on longer lines only the first
    five fields are used. The class id is converted through ``float`` first,
    so a value written as ``"3.0"`` is read as ``3``.
    """
    if not os.path.exists(label_path):
        return []

    with open(label_path, "r") as handle:
        tokenized = (raw.strip().split() for raw in handle)
        return [
            (int(float(fields[0])), *map(float, fields[1:5]))
            for fields in tokenized
            if len(fields) >= 5
        ]


In [11]:
def analyze_split(img_dir: str, class_names: list):
    """Collect annotation statistics for the images under ``img_dir``.

    Returns a 3-tuple of DataFrames:
      * per-class object counts (``class_id``, ``count``, ``class_name``),
        sorted by ``count`` in descending order;
      * ``objects_per_image`` with one row per image;
      * ``bbox_area_norm`` with one row per box (``w * h`` of the label).
    """
    class_counts = Counter()
    boxes_per_image = []
    box_areas = []

    for _img_path, label_file in yolo_label_paths(img_dir):
        annotations = read_yolo_labels(label_file)
        boxes_per_image.append(len(annotations))
        class_counts.update(ann[0] for ann in annotations)
        box_areas.extend(ann[3] * ann[4] for ann in annotations)

    def _display_name(idx):
        # Ids beyond the known names fall back to their numeric form.
        if idx < len(class_names):
            return class_names[idx]
        return str(idx)

    df_classes = pd.DataFrame({
        "class_id": list(class_counts.keys()),
        "count": list(class_counts.values()),
    })
    df_classes["class_name"] = df_classes["class_id"].apply(_display_name)
    ranked_classes = df_classes.sort_values("count", ascending=False)

    df_objs = pd.DataFrame({"objects_per_image": boxes_per_image})
    df_areas = pd.DataFrame({"bbox_area_norm": box_areas})

    return ranked_classes, df_objs, df_areas

# Analyze the training split; the three frames are reused by the plots below.
df_classes_train, df_objs_train, df_areas_train = analyze_split(train_dir, class_names)
# Notebook display: top rows of the class table plus summary statistics of
# objects-per-image and normalized box areas.
df_classes_train.head(), df_objs_train.describe(), df_areas_train.describe()

### How is the data distributed?

In [12]:
# Bar chart: number of annotated objects per class in the training split.
plt.figure(figsize=(10, 5))
plt.bar(df_classes_train["class_name"], df_classes_train["count"])
plt.xticks(rotation=45, ha='right')
plt.title("Object count per class (train)")
plt.tight_layout()
plt.show()

# The two histograms share the same layout, so draw them from one spec table.
# Each entry: (frame, column, bins, title, x-axis label).
for _frame, _column, _bins, _title, _xlabel in (
    (df_objs_train, "objects_per_image", 20, "Objects per image (train)", "# objects"),
    (df_areas_train, "bbox_area_norm", 30, "Normalized bbox area distribution (train)", "w*h (normalized)"),
):
    plt.figure(figsize=(6, 4))
    plt.hist(_frame[_column], bins=_bins)
    plt.title(_title)
    plt.xlabel(_xlabel)
    plt.ylabel("frequency")  # was misspelled "frecuency" on the rendered axis
    plt.tight_layout()
    plt.show()

In [13]:
def draw_yolo_bbox(img, bbox, color=(0,255,0), thickness=2):
    """Draw one YOLO-format box on ``img`` and return ``img``.

    ``bbox`` is ``(cx, cy, bw, bh)``; the values are scaled by the image
    width/height taken from ``img.shape`` to obtain pixel corners, which are
    truncated to integers before being handed to ``cv2.rectangle``.
    """
    img_h, img_w = img.shape[:2]
    center_x, center_y, box_w, box_h = bbox

    # Corners are the center shifted by half the box extent on each axis.
    top_left = (
        int((center_x - box_w/2) * img_w),
        int((center_y - box_h/2) * img_h),
    )
    bottom_right = (
        int((center_x + box_w/2) * img_w),
        int((center_y + box_h/2) * img_h),
    )

    cv2.rectangle(img, top_left, bottom_right, color, thickness)
    return img

def visualize_samples(img_dir: str, class_names: list, n=4, seed=42):
    """Show up to ``n`` randomly chosen images with their YOLO annotations.

    Args:
        img_dir: Directory containing the images of one split.
        class_names: Class names indexed by class id; ids outside the range
            are shown as their number.
        n: Maximum number of images to display.
        seed: Seed for the private random generator used to pick images.

    Each box is drawn on the image and its class name is placed at the box's
    top-left corner. If there is nothing to sample a message is printed and
    no figure is created; an image that cannot be decoded gets a placeholder
    panel instead of aborting the whole figure.
    """
    rng = random.Random(seed)
    pairs = yolo_label_paths(img_dir)
    sample = rng.sample(pairs, min(n, len(pairs)))
    if not sample:
        # plt.subplots(1, 0) would raise; report and bail out instead.
        print(f"No images to display from {img_dir}")
        return

    # squeeze=False always returns a 2-D axes array, even for a single image.
    _, axes = plt.subplots(1, len(sample), figsize=(4*len(sample), 4), squeeze=False)
    for ax, (img_path, lbl_path) in zip(axes[0], sample):
        ax.set_title(Path(img_path).name)
        ax.axis('off')

        bgr = cv2.imread(img_path)
        if bgr is None:
            # cv2.imread signals an unreadable file by returning None.
            ax.text(0.5, 0.5, "unreadable image", ha='center', va='center',
                    transform=ax.transAxes)
            continue

        img = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        H, W = img.shape[:2]
        captions = []
        for (cls_id, cx, cy, w, h) in read_yolo_labels(lbl_path):
            img = draw_yolo_bbox(img, (cx, cy, w, h))
            name = class_names[cls_id] if 0 <= cls_id < len(class_names) else str(cls_id)
            # Anchor the caption to this box's top-left corner (clamped to the
            # image) so captions of different boxes do not pile up.
            x1 = max(int((cx - w/2) * W), 0)
            y1 = max(int((cy - h/2) * H), 0)
            captions.append((x1, y1, name))

        ax.imshow(img)
        for x1, y1, name in captions:
            ax.text(x1, y1, name, color='yellow', fontsize=8, va='bottom',
                    bbox=dict(facecolor='black', alpha=0.5))
    plt.tight_layout()
    plt.show()

# Preview up to four training images with their boxes; the seed fixes the selection.
visualize_samples(train_dir, class_names, n=4, seed=7)

### What methods would you use to verify the reliability of the labels?

To verify the reliability of the labels, I would start by looking directly at a sample of images from each class and checking whether the boxes and classes actually make sense, paying extra attention to the rare categories. This helps catch obvious issues like misaligned boxes, incorrect classes, or missing annotations.

I’d also rely on some basic statistical checks to spot anomalies: things like bounding boxes that are way too large or too small, images that seem to have an unusually high number of objects, or class distributions that don’t match what we would expect. These patterns often reveal systematic labeling mistakes.
Another useful step is to train a simple model and review the cases where the model is very confident but contradicts the label.
Also, if the dataset was labeled by multiple people, I’d compare their annotations to see how consistent they were. High agreement suggests reliable labeling; low agreement means we should review those samples.

From the current analysis, some red flags already stand out: strong class imbalance, extremely rare categories that might lack consistent labeling, and images with a very high number of objects. All of these deserve a closer look.

# 2. Training

In [None]:
# ==============================
# 🔧 HYPERPARAMETERS — EXPERIMENTAL SETUP
# ==============================

# TODO: Fill in the hyperparameters based on your dataset analysis.
# Justify your choices in the Markdown cell above.

EPOCHS = 20     # Adjusted for hardware constraints (Google Colab)
IMGSZ  = 640    # Standard YOLO size, good balance for industrial scenes
BATCH  = 16     # Optimized for Google Colab
DEVICE = "cuda"  # Google Colab GPU acceleration T4

# Try YOLO11; if not available use YOLOv8
weights_try = ["yolo11n.pt", "yolov8n.pt"]
model = None
for weights_name in weights_try:
    # Keep only the load inside the try so the fallback is triggered by a
    # failed load and nothing else.
    try:
        model = YOLO(weights_name)
    except Exception as e:
        print(f"Failed to load {weights_name}: {e}")
    else:
        print("Using:", weights_name)
        break

# Explicit raise instead of `assert`, which is stripped under `python -O`.
if model is None:
    raise RuntimeError(
        "Could not load a base model (yolo11n.pt / yolov8n.pt). Install ultralytics and make sure you have an active internet connection to download the weights."
    )


# ==============================
# 🚀 TRAINING — BASELINE EXPERIMENT
# ==============================
# The results object contains metrics, charts, and run directory info.
# Feel free to adjust and rerun with different hyperparameters.

results = model.train(data=DATA_YAML_PATH, epochs=EPOCHS, imgsz=IMGSZ, batch=BATCH, device=DEVICE)
results

### ✏️ Discussion

**1. Why did you choose these hyperparameters?**

I chose these hyperparameters based on the dataset size and the hardware I currently have available. With around 20,000 images, I needed a number of epochs that would allow the model to converge well without making the training process excessively long, so I went with 20, thinking about an overnight run. The resolution of 640 works well for industrial scenes where very large and very small objects coexist, and it’s also the YOLO standard that provides the best balance. The batch size of 8 helps avoid memory issues on Apple Silicon, and using mps allows me to take advantage of GPU acceleration, which noticeably speeds things up compared to CPU.

**2. How do they affect training time, GPU/CPU usage, and accuracy?**

In practice, training with MPS at 640px and a relatively small batch takes about 6–7 hours for 20 epochs. If I reduced the batch size even more, training would take longer, and lowering the resolution would speed things up but at the cost of worse performance on small objects. The GPU stays quite active during training (around 70–90%), memory usage is between 8 and 12 GB, and the CPU is mostly involved only in preprocessing. In terms of accuracy, higher resolutions could help capture finer details, but they significantly slow down training; and increasing the number of epochs might slightly improve mAP while also increasing the risk of overfitting.

**3. What would you try differently if you had more time or resources?**

With a bit more time, I would try larger models like YOLO11s or YOLO11m, adjust the loss function to compensate for class imbalance, and explore augmentations tailored to the lighting conditions in this dataset. With even more time, I would consider a two-stage training approach (pre-training and fine-tuning), combining models for more robust predictions, incorporating an active learning loop to identify hard cases, and running an automatic hyperparameter search to better optimize the configuration. I would also test TTA to improve inference performance.
If I also had more resources, especially more powerful hardware, I would train at higher resolutions (such as 1024) to improve small-object detection, use larger batch sizes for more stable training, and experiment with bigger architectures like YOLO11l or YOLO11x. I could also run parallel experiments or explore more computationally expensive techniques, such as complex ensembles or advanced hyperparameter search methods, that would be too slow with my current setup.

# 3. Evaluation


> 👉 **Task:** Evaluate your trained model using the validation set defined in `data.yaml`.

Run the following cell to compute key performance metrics.
Then, summarize your results and provide your interpretation.

**Guidelines for your analysis:**
- **Quantitative metrics** 
- **Error analysis**
- **Next steps**

In [None]:
# Put Here your model metrics
# TODO:
# - Run model validation on the dataset below.
# - Capture metrics and save a summary to artifacts/metrics_summary.json.
# - Optionally, add visual analysis (PR curves, confusion matrix).

# Put your model evaluation code here 👇

metrics = model.val(data=DATA_YAML_PATH, imgsz=IMGSZ, device=DEVICE)

# Both mAP values are read from the same place: the `box` sub-object when the
# result has one, otherwise the result itself. Reading `map50` from the
# top-level result (as before) always fell through to NaN for detection runs.
box_metrics = getattr(metrics, "box", metrics)
try:
    summary = {
        "metrics/mAP50-95(B)": float(getattr(box_metrics, "map", float("nan"))),
        "metrics/mAP50(B)"   : float(getattr(box_metrics, "map50", float("nan"))),
        "nc": int(nc),
        "classes": class_names,
        "epochs": EPOCHS,
        "imgsz": IMGSZ,
    }
except Exception as e:
    # Best effort: still write a summary file that records why metrics are missing.
    summary = {"error": str(e)}

os.makedirs("artifacts", exist_ok=True)
with open("artifacts/metrics_summary.json", "w") as f:
    json.dump(summary, f, indent=2)

# Notebook display of what was written to disk.
summary


### ✏️ Metrics Interpretation and Analysis

1. **Quantitative Summary:**
   - What are your `mAP50` and `mAP50-95` values?
   - Which classes achieved the highest and lowest detection performance?

The model achieved 27.5% mAP50 and 18.8% mAP50-95 after 20 epochs. Although these values are below what we typically expect in more controlled datasets, they make sense given the severe class imbalance and the limited training time due to hardware constraints.
Looking at the per-class results, a clear pattern appears: classes with more data, such as forklift (24,213 samples) and person (20,480 samples), performed noticeably better, reaching 72.6% and 47.4% mAP50. Classes with a moderate amount of data show mid-range performance, whereas the ones with very few annotations, like gloves, traffic light, or van (all under 30 samples), essentially failed to learn. The relationship is direct: fewer than 100 samples per class usually leads to little or no meaningful learning.

2. **Qualitative Analysis:**
   - Describe common failure cases (e.g., small objects missed, overlapping detections, background confusion).
   - Were there any label quality issues or inconsistencies you observed?

Beyond the metrics, several factors help explain these results. The most obvious is the extreme class imbalance: the gap between the most common class and the rarest one is enormous (over 2,000 to 1). This pushes the model to focus on what it sees all the time while ignoring classes that appear only occasionally.
There are also challenges with small object detection, such as license plates, QR codes, and gloves. Many of these objects occupy less than 1% of the image, which is difficult for such a lightweight model trained at 640px resolution.
On top of that, training for only 20 epochs was likely not enough for the model to fully converge, especially on underrepresented classes. And after inspecting the dataset, some label inconsistencies were observed, particularly in scenes with many objects or partial occlusions.

3. **Improvement Proposals:**
   - Suggest at least two improvements (data augmentation, loss tuning, class balancing, etc.).
   - How would you validate whether these changes actually help?

To improve performance, the first step is addressing class imbalance. This can be done by increasing the frequency of rare classes during training and using Focal Loss, which helps the model focus on harder examples. The goal would be to raise rare-class performance to at least ~15% mAP50.
It would also help to train at multiple scales and increase the input resolution to 1024px, which gives the model a better chance to capture small objects. Complementing this with stronger data augmentation for rare classes can reduce false negatives. Using TTA during inference can also improve robustness.
Another key improvement is extending training to 80-100 epochs with a learning-rate scheduler like cosine annealing, while enabling early stopping to avoid overfitting. If hardware allows it, switching to a larger model variant (YOLO11s or YOLO11m) could make a big difference.
Finally, there are some quick wins worth considering: generating synthetic samples for rare classes, adjusting confidence thresholds per class, or even adding a second detection stage specialized in small objects.

# 4. Inference

In [None]:
VAL_SHOW_N = 4
val_imgs = []
# Same extension set as yolo_label_paths so both cells see the same images.
for ext in ("*.jpg", "*.jpeg", "*.png", "*.bmp"):
    val_imgs.extend(glob.glob(os.path.join(val_dir, ext)))
# glob order is arbitrary; sort so the same images are shown on every run.
val_imgs = sorted(val_imgs)[:VAL_SHOW_N]

# Annotated predictions are written here as well as displayed.
out_dir = "runs/predict_display"
os.makedirs(out_dir, exist_ok=True)

if not val_imgs:
    # Nothing to predict on; predict()/subplots() would fail on an empty list.
    print(f"No validation images found in {val_dir}")
else:
    # Force CPU device to avoid MPS/CUDA compatibility issues with downloaded weights
    pred = model.predict(source=val_imgs, imgsz=IMGSZ, conf=0.25, device='cpu')

    # squeeze=False keeps `axes` two-dimensional even for a single image.
    fig, axes = plt.subplots(1, len(val_imgs), figsize=(4*len(val_imgs), 4), squeeze=False)
    for ax, img_path, r in zip(axes[0], val_imgs, pred):
        im = r.plot()  # numpy array with the annotations drawn (BGR order)
        cv2.imwrite(os.path.join(out_dir, Path(img_path).name), im)
        ax.imshow(cv2.cvtColor(im, cv2.COLOR_BGR2RGB))
        ax.axis('off')
    plt.tight_layout()
    plt.show()

# 5. Export and deploy

In [None]:

export_dir = Path("artifacts")
export_dir.mkdir(parents=True, exist_ok=True)

# Several runs may exist under runs/detect. Export the most recently written
# best.pt rather than whichever one the directory walk happens to yield first.
best_ckpt = max(
    Path("runs/detect").rglob("weights/best.pt"),
    key=lambda ckpt: ckpt.stat().st_mtime,
    default=None,
)

if best_ckpt is not None:
    target = export_dir / "model_best.pt"
    target.write_bytes(best_ckpt.read_bytes())
    print("Weights exported to:", target)
else:
    print("'best.pt' not found")

# Class metadata for the inference service.
with open(export_dir / "classes.json", "w") as f:
    json.dump({"nc": int(nc), "names": class_names}, f, indent=2)

# ONNX export is best effort: a failure is reported but does not stop the cell.
try:
    # Use the path reported by export() instead of searching the working tree
    # for any *.onnx file, which could pick up a stale or unrelated model.
    exported = model.export(format="onnx", imgsz=IMGSZ)
    onnx_file = Path(exported) if exported else None
    if onnx_file is not None and onnx_file.is_file():
        (export_dir / "model.onnx").write_bytes(onnx_file.read_bytes())
        print("ONNX export to:", export_dir / "model.onnx")
    else:
        print("ONNX export did not produce a file")
except Exception as e:
    print("Export ONNX not available:", e)


# 6. TODOs (for the candidate)
- [ ] Analyze the class imbalance and propose strategies (weighting, augmented sampling, focal loss).
- [ ] Tune hyperparameters (epochs, image size, augmentations) to improve mAP.
- [ ] Record key metrics and justify the final baseline.
- [ ] Prepare all necessary artifacts in artifacts/ for the inference service (API).

# 7. Appendix: Notes on relative paths
- This notebook reads data.yaml and infers the paths to images/ and labels/ for train/, val/, and test/.
- If you move data.yaml to another folder, adjust DATA_YAML_PATH.
- If the dataset was downloaded from Roboflow, keep the standard YOLO folder structure.