Paquetes necesarios

In [2]:
import csv
import math
import os

import cv2
import easyocr
import numpy as np
import torch
from IPython.display import Video, display
from matplotlib import pyplot as plt
from PIL import Image
from transformers import AutoProcessor, AutoModelForImageTextToText
from ultralytics import YOLO

  from .autonotebook import tqdm as notebook_tqdm


Extraemos las clases del modelo YOLO 11

In [None]:
# Run the stock YOLO11 nano model over the video, preview its detections and
# capture the class-id -> class-name mapping reported with the first result.
model = YOLO('yolo11n.pt')

vid = cv2.VideoCapture("C0142.MP4")
# Fail fast (as the OCR test cells do) instead of silently writing "None" to classes.txt.
if not vid.isOpened():
    raise Exception("No se pudo abrir el vídeo: C0142.MP4")

names = None

try:
    while vid.isOpened():
        ret, frame = vid.read()
        if not ret:
            # The video has ended (or a frame could not be read).
            break

        results = model(frame, show=False)
        if names is None:
            names = results[0].names
        annotated_frame = results[0].plot()
        cv2.imshow("Deteccion de YOLO", annotated_frame)

        # Stop when ESC is pressed or the preview window has been closed.
        if cv2.waitKey(1) & 0xFF == 27 or cv2.getWindowProperty("Deteccion de YOLO", cv2.WND_PROP_VISIBLE) < 1:
            break
finally:
    # Always free the capture and the window, even if inference raises.
    vid.release()
    cv2.destroyAllWindows()

# Save the class mapping (skipped when no frame was processed).
if names is not None:
    with open("classes.txt", "w") as f:
        f.write(str(names))

### Mostramos el funcionamiento de nuestro modelo entrenado

In [None]:
# Preview the detections of the trained weights (best.pt) on the same video.
model = YOLO('best.pt')

vid = cv2.VideoCapture("C0142.MP4")
# Fail fast (as the OCR test cells do) when the video cannot be opened.
if not vid.isOpened():
    raise Exception("No se pudo abrir el vídeo: C0142.MP4")

try:
    while vid.isOpened():
        ret, frame = vid.read()
        if not ret:
            # The video has ended (or a frame could not be read).
            break

        results = model(frame, show=False)
        annotated_frame = results[0].plot()
        cv2.imshow("Deteccion de YOLO", annotated_frame)

        # Stop when ESC is pressed or the preview window has been closed.
        if cv2.waitKey(1) & 0xFF == 27 or cv2.getWindowProperty("Deteccion de YOLO", cv2.WND_PROP_VISIBLE) < 1:
            break
finally:
    # Always free the capture and the window, even if inference raises.
    vid.release()
    cv2.destroyAllWindows()

### Usamos el modelo pre-entrenado de YOLO y el nuestro en conjunto 
Utilizamos el modelo preentrenado para detectar personas y vehículos. Cuando detectamos un vehículo, pasamos su recorte a nuestro modelo entrenado en matrículas para que localice la matrícula.

Código para las detecciones de OCR

In [None]:
# Código necesario para el VLM
from transformers import AutoProcessor, AutoModelForImageTextToText
import torch
# Device the VLM runs on; the attention implementation chosen below depends on this value.
device = "cpu"  # or "cuda"

# Load the SmolVLM-Instruct processor and model (bfloat16) and move the model to `device`.
# flash_attention_2 is requested only when device == "cuda"; otherwise eager attention is used.
# NOTE(review): later cells rebind the global `model` to a YOLO object, while ocr_vlm reads
# this global — confirm the cell execution order before calling ocr_vlm.
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
model = AutoModelForImageTextToText.from_pretrained("HuggingFaceTB/SmolVLM-Instruct",
                                                dtype=torch.bfloat16,
                                                _attn_implementation="flash_attention_2" if device == "cuda" else "eager").to(device)

In [None]:
import re

# Only used by the Tesseract path (remove it?)
def preprocess_for_ocr(img_crop, escala=4):
    """Upscale a BGR crop and binarise it for OCR.

    Args:
        img_crop: BGR image to prepare.
        escala: scale factor applied to both axes (bicubic interpolation).

    Returns:
        The inverted binary image produced by a Gaussian adaptive threshold
        (block size 13, constant 2) over the 3x3-blurred grayscale crop.
    """
    upscaled = cv2.resize(img_crop, None, fx=escala, fy=escala, interpolation=cv2.INTER_CUBIC)
    smoothed = cv2.GaussianBlur(cv2.cvtColor(upscaled, cv2.COLOR_BGR2GRAY), (3, 3), 0)
    return cv2.adaptiveThreshold(
        smoothed, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 13, 2
    )

# Module-level EasyOCR reader (Spanish model, CPU only) used by ocr_easy.
# NOTE(review): the "Pruebas OCR" cell further down rebinds `reader` with different settings.
reader = easyocr.Reader(['es'], gpu=False) 

def ocr_easy(placa_crop, frame, x1, y1, last_plate=None, cap=None):
    """Read a licence plate from a crop with EasyOCR and label it on ``frame``.

    The crop is upscaled 3x, converted to grayscale, histogram-equalised and
    contrast-boosted before being passed to the module-level ``reader``. Only
    the first OCR result is considered.

    Args:
        placa_crop: BGR image of the plate region.
        frame: image on which the plate text is drawn (modified in place).
        x1, y1: top-left corner of the plate in ``frame`` coordinates; the
            label is drawn just above it.
        last_plate: previously reported plate; when the new reading equals it,
            nothing is printed or drawn again.
        cap: optional ``cv2.VideoCapture`` used for the log timestamp. When
            omitted, the global ``vid`` is used, as the function always did.

    Returns:
        The plate text when its confidence is above 0.5 and it consists of
        four digits followed by three letters from ``BCDFGHJKLMNPRSTVWXYZ``;
        ``None`` in every other case (empty crop, no OCR result, low
        confidence or wrong format).
    """
    # cv2.resize raises on an empty image, so reject degenerate crops up front.
    if placa_crop is None or placa_crop.size == 0:
        return None

    escala = 3
    placa_crop = cv2.resize(placa_crop, None, fx=escala, fy=escala, interpolation=cv2.INTER_CUBIC)

    gray = cv2.cvtColor(placa_crop, cv2.COLOR_BGR2GRAY)
    gray = cv2.equalizeHist(gray)
    gray = cv2.convertScaleAbs(gray, alpha=1.5, beta=0)

    ocr_result = reader.readtext(
        gray,
        allowlist='ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789',
        detail=1
    )
    if not ocr_result:
        return None

    # With detail=1 each entry is (bounding box, text, confidence).
    _, text, prob = ocr_result[0]
    text = text.strip()

    plate_pattern = re.compile("^[0-9]{4}[BCDFGHJKLMNPRSTVWXYZ]{3}$")
    if prob <= 0.5 or not plate_pattern.match(text):
        return None

    if text != last_plate:
        source = cap if cap is not None else vid
        timestamp = source.get(cv2.CAP_PROP_POS_MSEC) / 1000
        print(f"[{timestamp:.2f}s] Matrícula: {text} (Conf: {prob:.2f})")
        # Clamp the baseline so the label stays inside the image for plates near the top edge.
        cv2.putText(frame, f'{text}', (x1, max(30, y1 - 10)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2, cv2.LINE_AA)

    return text

def ocr_vlm(crop, frame, x1, y1, x2, y2):
    """Read a licence plate from a crop with the vision-language model.

    Uses the module-level ``processor``, ``model`` and ``device``.

    Args:
        crop: BGR image of the plate region.
        frame: image on which the plate text is drawn (modified in place).
        x1, y1: top-left corner of the plate in ``frame`` coordinates.
        x2, y2: bottom-right corner; accepted for interface symmetry but unused.

    Returns:
        The text answered by the model when it consists of four digits
        followed by three letters from ``BCDFGHJKLMNPRSTVWXYZ``; ``None``
        otherwise.
    """
    plate_img = Image.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "Read the text on this license plate."}
            ]
        },
    ]

    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[plate_img], return_tensors="pt").to(device)

    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=10)
    decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

    # The decoded string may echo the prompt; keep only what follows the last
    # "Assistant:" marker, or the whole string when the marker is absent.
    raw_text = decoded.split("Assistant:")[-1].strip()

    # Validate the answer itself, not the prompt-prefixed string.
    plate_pattern = re.compile("^[0-9]{4}[BCDFGHJKLMNPRSTVWXYZ]{3}$")
    if not plate_pattern.match(raw_text):
        return None

    cv2.putText(frame, raw_text, (x1, max(30, y1 - 10)),
                cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)
    return raw_text

# UNUSED (to be removed once it is confirmed that the other two OCR back-ends are kept)

# Texts already reported by ocr_tesseract when the caller does not pass its own set.
_TESSERACT_SEEN_TEXTS = set()


def ocr_tesseract(placa_crop, frame_count, cap, crop_dir="crops/", last_texts=None,
                  frame=None, x1=0, y1=0):
    """Read a licence plate from a crop with Tesseract and save accepted crops.

    Args:
        placa_crop: BGR image of the plate region.
        frame_count: frame index, used in the saved crop's file name.
        cap: ``cv2.VideoCapture`` used for the log timestamp.
        crop_dir: folder where accepted, preprocessed crops are written.
        last_texts: set of texts already reported; updated in place. When
            omitted, a module-level set shared by all such calls is used.
        frame: optional image on which the text is drawn (modified in place).
        x1, y1: top-left corner of the plate in ``frame`` coordinates.

    Returns:
        The last newly accepted text (at least 7 characters, confidence above
        60, not seen before), or ``None`` when nothing was accepted.
    """
    # Imported here because this cell does not import pytesseract itself.
    import pytesseract
    from pytesseract import Output

    if last_texts is None:
        last_texts = _TESSERACT_SEEN_TEXTS
    if placa_crop is None or placa_crop.size == 0:
        return None

    gray = preprocess_for_ocr(placa_crop)

    ocr_result = pytesseract.image_to_data(
        gray,
        output_type=Output.DICT,
        config='--psm 7 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
    )

    detected = None
    for raw_text, raw_conf in zip(ocr_result['text'], ocr_result['conf']):
        text = raw_text.strip().replace(" ", "")
        conf = float(raw_conf)
        if len(text) >= 7 and conf > 60 and text not in last_texts:
            last_texts.add(text)
            timestamp = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000
            print(f"[{timestamp:.2f}s] Matrícula detectada: {text} (Conf: {conf:.2f})")

            if frame is not None:
                cv2.putText(frame, text, (x1, y1 - 10),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2, cv2.LINE_AA)

            # cv2.imwrite fails silently when the target folder does not exist.
            os.makedirs(crop_dir, exist_ok=True)
            crop_filename = os.path.join(crop_dir, f"{text}_{frame_count}.jpg")
            cv2.imwrite(crop_filename, gray)
            print(f"Guardada imagen: {crop_filename}")
            detected = text
    return detected

Using CPU. Note: This module is much faster with a GPU.


Código principal

In [None]:

# Weights of the stock YOLO11 model and of our trained plate detector, plus the input video.
BASE_MODEL_PATH = 'yolo11n.pt'
OUR_MODEL_PATH = 'best.pt'

VIDEO_PATH = "prueba_coches.mp4"
base_model = YOLO(BASE_MODEL_PATH)
our_model = YOLO(OUR_MODEL_PATH)
vid = cv2.VideoCapture(VIDEO_PATH)
# Fail fast (as the OCR test cells do) instead of producing an empty output video.
if not vid.isOpened():
    raise Exception(f"No se pudo abrir el vídeo: {VIDEO_PATH}")

frame_width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
# Keep the (possibly fractional) source frame rate; fall back to 20 fps when it is reported as 0.
fps = vid.get(cv2.CAP_PROP_FPS) or 20.0

output_path = 'resultados.mp4'

fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))

def data_to_csv(registros, ruta_salida="resultados.csv"):
    """Write detection records to a semicolon-separated CSV file.

    Args:
        registros: iterable of rows, each with one value per column listed in
            ``columnas`` below (15 values).
        ruta_salida: destination file; defaults to ``resultados.csv`` in the
            current working directory.
    """
    columnas = [
        "fotograma", "tipo_objeto", "confianza", "id_tracking",
        "x1", "y1", "x2", "y2",
        "matricula_detectada", "conf_ocr",
        "mx1", "my1", "mx2", "my2",
        "texto_matricula"
    ]

    # newline="" lets the csv module control line endings itself.
    with open(ruta_salida, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f, delimiter=';')
        writer.writerow(columnas)
        writer.writerows(registros)

    print(f"Archivo '{ruta_salida}' creado correctamente.")


classes = [0, 2, 3, 5, 7]    # Person, car, motorcycle, bus, truck

car_boxes = []
car_boxes_left_coords = []

# Tracker ids already counted, so each tracked object is counted once per class.
track_ids = set()
count_classes = {"person": 0, "car": 0, "motorcycle": 0, "bus": 0, "truck": 0}

# One row per detection, in the column order expected by data_to_csv.
save_csv = []
frame_count = 0

# Last plate read successfully; kept across frames so the overlay does not
# disappear on frames where OCR fails.
show_plate_text = ""
font = cv2.FONT_HERSHEY_SIMPLEX

try:
    while vid.isOpened():
        ret, frame = vid.read()
        if not ret:
            # The video has ended (or a frame could not be read).
            break
        frame_count += 1

        base_results = base_model.track(frame, persist=True, show=False, classes=classes)
        annotated_frame = base_results[0].plot()
        class_names = base_results[0].names

        # Black box in the top-left corner that shows the most recent plate.
        text_box_w = int(frame.shape[1]*0.2)
        text_box_h = int(frame.shape[0]*0.09)
        cv2.rectangle(annotated_frame, (0, 0), (text_box_w, text_box_h), (0, 0, 0), -1)

        for result in base_results:
            for box in result.boxes:
                name = class_names[int(box.cls[0])]
                conf = float(box.conf[0])
                # box.id is None while the tracker has not assigned an identity.
                track_id = str(int(box.id[0])) if box.id is not None else ""
                if track_id and track_id not in track_ids:
                    track_ids.add(track_id)
                    count_classes[name] = count_classes.get(name, 0) + 1
                x1, y1, x2, y2 = [int(item) for item in box.xyxy[0].tolist()]

                plate_found = False
                plate_conf, px1, py1, px2, py2, plate_text = "", "", "", "", "", ""
                if name != "person":
                    vehicle_box = frame[y1:y2, x1:x2]
                    # A degenerate box yields an empty crop, which cannot be fed to the detector.
                    if vehicle_box.size > 0:
                        plate_boxes = our_model(vehicle_box, show=False)[0].boxes
                        if len(plate_boxes) > 0:
                            plate_found = True
                            plate_conf = float(plate_boxes.conf[0])
                            # Plate coordinates are relative to the vehicle crop.
                            px1, py1, px2, py2 = [int(item) for item in plate_boxes.xyxy[0].tolist()]
                            plate = vehicle_box[py1:py2, px1:px2]
                            # Shift by the vehicle origin to get full-frame coordinates.
                            real_x1 = px1+x1
                            real_y1 = py1+y1
                            real_x2 = px2+x1
                            real_y2 = py2+y1
                            cv2.rectangle(annotated_frame, (real_x1, real_y1), (real_x2, real_y2), (0, 255, 0), 2)
                            if plate.size > 0:
                                # Draw on the annotated frame: that is the image written to the output video.
                                ocr_text = ocr_easy(plate, annotated_frame, real_x1, real_y1)
                                # ocr_easy may return None, so check before stripping.
                                if ocr_text:
                                    plate_text = ocr_text.strip()
                                    show_plate_text = plate_text

                save_csv.append([frame_count, name, conf, track_id, x1, y1, x2, y2,
                                 plate_found, plate_conf, px1, py1, px2, py2, plate_text])

        if show_plate_text:
            (text_width, text_height), baseline = cv2.getTextSize(show_plate_text, font, 0.8, 2)
            text_x = (text_box_w - text_width) // 2
            text_y = (text_box_h + text_height) // 2 - baseline
            cv2.putText(annotated_frame, show_plate_text, (text_x, text_y), font, 0.8, (255, 255, 255), 2)

        out.write(annotated_frame)
finally:
    # Release the capture and finalise the output file even if a frame fails.
    vid.release()
    out.release()
    cv2.destroyAllWindows()

# Columns: frame, object type, confidence, tracking id, x1, y1, x2, y2,
# plate detected, plate confidence, mx1, my1, mx2, my2, plate text
data_to_csv(save_csv)
print(count_classes)

### Pruebas OCR

#### Easy

In [None]:
import cv2
import numpy as np
import time
from ultralytics import YOLO
import easyocr
from IPython.display import Video, display
import os


# Inputs for the EasyOCR test: detector weights, source video and folder for saved crops.
model_path = "best.pt"
video_path = "C0142.MP4"
crop_dir   = "crops/"  
os.makedirs(crop_dir, exist_ok=True)

# NOTE(review): both assignments rebind globals used by earlier cells
# (`model` by ocr_vlm, `reader` by ocr_easy) — confirm this is intended.
model = YOLO(model_path)
reader = easyocr.Reader(['en'], gpu=True)

cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    raise Exception(f"No se pudo abrir el vídeo: {video_path}")

width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
# Fall back to 20 fps when the capture reports a frame rate of 0.
fps_in = cap.get(cv2.CAP_PROP_FPS) or 20.0

# Annotated frames are written to resultado.mp4 at the source resolution.
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter('resultado.mp4', fourcc, fps_in, (width, height))

# Fixed number of pixels added around each detection before cropping.
margin = 10
frame_count = 0
# OCR texts already reported, so each distinct string is logged and saved only once.
last_texts = set()

def preprocess_for_ocr(img_crop, escala=4):
    """Return an inverted binary version of ``img_crop`` prepared for OCR.

    The BGR crop is enlarged by ``escala`` with bicubic interpolation, turned
    into grayscale, blurred with a 3x3 Gaussian kernel and binarised with a
    Gaussian adaptive threshold (block size 13, constant 2).
    """
    enlarged = cv2.resize(img_crop, None, fx=escala, fy=escala, interpolation=cv2.INTER_CUBIC)
    grayscale = cv2.cvtColor(enlarged, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(grayscale, (3, 3), 0)
    binary = cv2.adaptiveThreshold(blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY_INV, 13, 2)
    return binary

# Detect plates frame by frame, run EasyOCR on an enlarged crop around each
# detection and write the annotated frames to the output video.
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame_count += 1

        results = model(frame, verbose=False)
        detections = results[0].boxes

        for box in detections:
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            conf = float(box.conf[0])
            if conf < 0.2:
                continue

            # Grow the box by a fixed margin plus 15% of its longest side,
            # clamped to the frame borders.
            w, h = x2 - x1, y2 - y1
            extra = int(max(w, h) * 0.15)
            x1m = max(0, x1 - margin - extra)
            y1m = max(0, y1 - margin - extra)
            x2m = min(frame.shape[1], x2 + margin + extra)
            y2m = min(frame.shape[0], y2 + margin + extra)

            placa_crop = frame[y1m:y2m, x1m:x2m]

            if placa_crop.size > 0:
                gray = preprocess_for_ocr(placa_crop)

                ocr_result = reader.readtext(
                    gray,
                    allowlist='ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789',
                    detail=1,
                    text_threshold=0.2
                )

                for (bbox, text, prob) in ocr_result:
                    text_clean = text.strip().replace(" ", "")
                    # Report each sufficiently long, confident text only once.
                    if prob > 0.7 and len(text_clean) >= 7 and text_clean not in last_texts:
                        last_texts.add(text_clean)
                        timestamp = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000

                        print(f"[{timestamp:.2f}s] Matrícula detectada: {text_clean} (Conf: {prob:.2f})")

                        cv2.putText(frame, text_clean, (x1, y1 - 10),
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2, cv2.LINE_AA)

                        crop_filename = os.path.join(crop_dir, f"{text_clean}_{frame_count}.jpg")
                        cv2.imwrite(crop_filename, gray)
                        print(f"Guardada imagen: {crop_filename}")

            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

        out.write(frame)
finally:
    # Release the capture and finalise the output file even if a frame fails.
    cap.release()
    out.release()

print(f"Procesamiento completado. Total frames: {frame_count}")
display(Video('resultado.mp4', embed=True))

#### Tesseract

In [None]:
import cv2
import numpy as np
from ultralytics import YOLO
import pytesseract
from pytesseract import Output
from IPython.display import Video, display
import os

# Set the executable path when Tesseract is not on PATH.
# NOTE(review): hard-coded, Windows-specific location — adjust on other machines.
pytesseract.pytesseract.tesseract_cmd = r'C:/Program Files/Tesseract-OCR/tesseract'

# Inputs for the Tesseract test: detector weights, source video and folder for saved crops.
model_path = "best.pt"
video_path = "C0142.MP4"
crop_dir   = "crops/"  
os.makedirs(crop_dir, exist_ok=True)

# NOTE(review): rebinds the global `model`, which ocr_vlm also reads.
model = YOLO(model_path)

cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    raise Exception(f"No se pudo abrir el vídeo: {video_path}")

width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
# Fall back to 20 fps when the capture reports a frame rate of 0.
fps_in = cap.get(cv2.CAP_PROP_FPS) or 20.0

# Annotated frames are written to resultado_tesseract.mp4 at the source resolution.
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter('resultado_tesseract.mp4', fourcc, fps_in, (width, height))

# Fixed number of pixels added around each detection before cropping.
margin = 10
frame_count = 0
# OCR texts already reported, so each distinct string is logged and saved only once.
last_texts = set()

def preprocess_for_ocr(img_crop, escala=4):
    """Scale, gray, blur and adaptively threshold a BGR crop for OCR.

    Args:
        img_crop: BGR image to prepare.
        escala: scale factor applied to both axes.

    Returns:
        The inverted binary image.
    """
    # Scale up (bicubic), then convert to grayscale and blur with a 3x3 Gaussian.
    scaled = cv2.resize(img_crop, None, fx=escala, fy=escala, interpolation=cv2.INTER_CUBIC)
    blurred_gray = cv2.GaussianBlur(cv2.cvtColor(scaled, cv2.COLOR_BGR2GRAY), (3, 3), 0)

    # Adaptive threshold: Gaussian method, inverted output, block size 13, constant 2.
    threshold_args = (255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 13, 2)
    return cv2.adaptiveThreshold(blurred_gray, *threshold_args)

# Detect plates frame by frame, run Tesseract on an enlarged crop around each
# detection and write the annotated frames to the output video.
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame_count += 1

        results = model(frame, verbose=False)
        detections = results[0].boxes

        for box in detections:
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            conf = float(box.conf[0])
            if conf < 0.2:
                continue

            # Grow the box by a fixed margin plus 15% of its longest side,
            # clamped to the frame borders.
            w, h = x2 - x1, y2 - y1
            extra = int(max(w, h) * 0.15)
            x1m = max(0, x1 - margin - extra)
            y1m = max(0, y1 - margin - extra)
            x2m = min(frame.shape[1], x2 + margin + extra)
            y2m = min(frame.shape[0], y2 + margin + extra)

            placa_crop = frame[y1m:y2m, x1m:x2m]

            if placa_crop.size > 0:
                gray = preprocess_for_ocr(placa_crop)

                # Using Tesseract
                ocr_result = pytesseract.image_to_data(
                    gray,
                    output_type=Output.DICT,
                    config='--psm 7 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
                )

                for raw_text, raw_conf in zip(ocr_result['text'], ocr_result['conf']):
                    text = raw_text.strip().replace(" ", "")
                    # Separate name so the detector confidence `conf` is not overwritten.
                    ocr_conf = float(raw_conf)
                    if len(text) >= 7 and ocr_conf > 60 and text not in last_texts:
                        last_texts.add(text)
                        timestamp = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000
                        print(f"[{timestamp:.2f}s] Matrícula detectada: {text} (Conf: {ocr_conf:.2f})")

                        cv2.putText(frame, text, (x1, y1 - 10),
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2, cv2.LINE_AA)

                        crop_filename = os.path.join(crop_dir, f"{text}_{frame_count}.jpg")
                        cv2.imwrite(crop_filename, gray)
                        print(f"Guardada imagen: {crop_filename}")

            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

        out.write(frame)
finally:
    # Release the capture and finalise the output file even if a frame fails.
    cap.release()
    out.release()

print(f"Procesamiento completado. Total frames: {frame_count}")
display(Video('resultado_tesseract.mp4', embed=True))

In [11]:
import csv

# Hand-written sample rows used to try out data_to_csv; each row has one value
# per CSV column (frame, object type, confidence, tracking id, x1, y1, x2, y2,
# plate detected, plate confidence, mx1, my1, mx2, my2, plate text).
registros = [
    [1, "auto", 0.92, 3, 120, 200, 360, 480, True, 0.88, 135, 215, 345, 465, "ABC1234"],
    [1, "moto", 0.85, 5, 400, 220, 500, 380, False, 0.00, 0, 0, 0, 0, ""],
    [2, "auto", 0.95, 3, 125, 205, 365, 485, True, 0.90, 140, 220, 350, 470, "ABC1234"],
    [2, "camioneta", 0.88, 7, 600, 250, 900, 550, True, 0.75, 620, 270, 880, 530, "XYZ9876"],
    [3, "auto", 0.93, 3, 130, 210, 370, 490, True, 0.85, 145, 225, 355, 475, "ABC1234"],
    [3, "moto", 0.80, 5, 405, 225, 505, 385, False, 0.00, 0, 0, 0, 0, ""]
]
def data_to_csv(registros, ruta_salida="resultados.csv"):
    """Write detection records to a semicolon-separated CSV file.

    Args:
        registros: iterable of rows, each with one value per column listed in
            ``columnas`` below (15 values).
        ruta_salida: destination file; defaults to ``resultados.csv`` in the
            current working directory.
    """
    columnas = [
        "fotograma", "tipo_objeto", "confianza", "id_tracking",
        "x1", "y1", "x2", "y2",
        "matricula_detectada", "conf_ocr",
        "mx1", "my1", "mx2", "my2",
        "texto_matricula"
    ]

    # newline="" lets the csv module control line endings itself.
    with open(ruta_salida, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f, delimiter=';')
        writer.writerow(columnas)
        writer.writerows(registros)

    print(f"Archivo '{ruta_salida}' creado correctamente.")

# Write the sample rows to resultados.csv to check the export format.
data_to_csv(registros)

Archivo 'resultados.csv' creado correctamente.


### Comparativa entre modelos

In [None]:
import os
import pandas as pd
from tqdm import tqdm
from PIL import Image

# This cell uses pytesseract but does not import it at the top; import it here
# so the cell does not depend on the Tesseract test cell having been run.
import pytesseract

# Initialize the EasyOCR reader (English model)
reader = easyocr.Reader(['en'])

# Load dataset
df = pd.read_csv("roboflow_labels.csv")  # or your path

# Collect the predictions in lists and assign each column once at the end,
# instead of writing cell by cell into the frame.
tesseract_texts = []
easyocr_texts = []

# Process each image
for img_path in tqdm(df['image_path'], total=len(df)):
    # The context manager closes the file handle; convert() returns a loaded copy.
    with Image.open(img_path) as img:
        rgb = img.convert("RGB")

    # Tesseract OCR
    tesseract_texts.append(pytesseract.image_to_string(rgb).strip().replace(" ", "").upper())

    # EasyOCR: readtext expects a path, bytes or a numpy array (treated as BGR),
    # not an arbitrary PIL image, so pass a BGR array.
    result = reader.readtext(cv2.cvtColor(np.array(rgb), cv2.COLOR_RGB2BGR))
    easyocr_texts.append("".join(res[1] for res in result).strip().replace(" ", "").upper())

df['ocr_tesseract'] = tesseract_texts
df['ocr_easyocr'] = easyocr_texts

# Save results
df.to_csv("ocr_results.csv", index=False)
