<a href="https://colab.research.google.com/github/Far-ch/Signals-and-Systems-Project/blob/main/Signal_and_Systems_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#YOLO
from ultralytics import YOLO
import cv2, torch, numpy as np, matplotlib.pyplot as plt

# Input clip, and the file the annotated first frame is written to.
VIDEO_IN  = "person1.mp4"
SNAPSHOT  = "first_detected.jpg"

# Use the GPU when torch reports one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL  = "yolov8m.pt"  # weights name handed to the ultralytics YOLO constructor
model  = YOLO(MODEL).to(DEVICE)
# Label for every class id requested from the YOLO model. Ids are consecutive
# (0..79), so the table is built from an ordered list; each row holds ten ids.
CLASSES = dict(enumerate([
    "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
    "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
    "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
    "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",
    "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
    "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed",
    "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven",
    "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush",
]))

# Drawing parameters used for every box and label.
BOX_THICK, FONT_SCALE, FONT_THICK = 3, 1.0, 2

# Every id present in CLASSES is requested from the detector.
CLASS_IDS = list(CLASSES)

# One color per class id. The generator is seeded so the palette is the same
# on every run; channel values are drawn from [64, 256).
rng = np.random.default_rng(42)
COLORS = {}
for cls_id in CLASS_IDS:
    COLORS[cls_id] = tuple(rng.integers(64, 256, 3).tolist())


def detect(img_rgb, imgsz=960, conf=0.3):
    """Run the YOLO model on one image and flatten the detections.

    Args:
        img_rgb: Image passed straight to ``model.predict`` (callers in this
            cell convert their BGR frames to RGB first).
        imgsz: Inference size forwarded to the model.
        conf: Confidence threshold forwarded to the model.

    Returns:
        A list of ``(x1, y1, x2, y2, class_id, conf)`` tuples: integer box
        corners, integer class id, float confidence.
    """
    result = model.predict(
        img_rgb,
        imgsz=imgsz,
        conf=conf,
        classes=CLASS_IDS,
        device=DEVICE,
        verbose=False,
    )[0]  # a single image goes in, so only the first result is used
    return [
        (*(int(v) for v in det.xyxy[0]), int(det.cls[0]), float(det.conf[0]))
        for det in result.boxes
    ]


def robust_detect(frame_bgr):
    """Detect objects in a BGR frame, escalating through three passes.

    The passes are tried in order and the first one that yields any box wins:
    the whole frame, then an upscaled center crop, then overlapping tiles
    (whose combined result is returned even when it is empty).

    Args:
        frame_bgr: Frame in BGR channel order; it is converted to RGB before
            every call to ``detect``.

    Returns:
        A list of ``(x1, y1, x2, y2, class_id, conf)`` tuples expressed in
        the coordinates of the full frame.
    """
    frame_h, frame_w = frame_bgr.shape[:2]

    # Pass 1: the whole frame at the largest inference size.
    found = detect(cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB), 1280, 0.3)
    if found:
        return found

    # Pass 2: central 60% x 70% of the frame, stretched to 1280x1280.
    crop_w = int(frame_w * 0.6)
    crop_h = int(frame_h * 0.7)
    off_x = (frame_w - crop_w) // 2
    off_y = (frame_h - crop_h) // 2
    center = frame_bgr[off_y:off_y + crop_h, off_x:off_x + crop_w]
    enlarged = cv2.resize(center, (1280, 1280))
    found = detect(cv2.cvtColor(enlarged, cv2.COLOR_BGR2RGB), 1280, 0.25)
    if found:
        # Undo the stretch, then shift back by the crop origin.
        scale_x = crop_w / 1280
        scale_y = crop_h / 1280
        mapped = []
        for bx1, by1, bx2, by2, cls, conf in found:
            mapped.append((int(bx1 * scale_x + off_x), int(by1 * scale_y + off_y),
                           int(bx2 * scale_x + off_x), int(by2 * scale_y + off_y),
                           cls, conf))
        return mapped

    # Pass 3: 640-pixel tiles stepped by 320 pixels; tiles at the right and
    # bottom edges are whatever the slice leaves, so they can be smaller.
    tile, stride = 640, 320
    collected = []
    for ty in range(0, frame_h, stride):
        for tx in range(0, frame_w, stride):
            window = frame_bgr[ty:ty + tile, tx:tx + tile]
            hits = detect(cv2.cvtColor(window, cv2.COLOR_BGR2RGB), 640, 0.25)
            # Tile-local boxes are shifted by the tile origin.
            collected.extend(
                (bx1 + tx, by1 + ty, bx2 + tx, by2 + ty, cls, conf)
                for bx1, by1, bx2, by2, cls, conf in hits
            )
    return collected


# Read only the first frame of the clip, then close the capture straight away.
cap = cv2.VideoCapture(VIDEO_IN)
ok, frame = cap.read()
cap.release()
assert ok, f" Couldn't read first frame of {VIDEO_IN}"

# Detection runs on a copy of the frame; the boxes are then drawn onto `frame` itself.
boxes = robust_detect(frame.copy())
for x1, y1, x2, y2, cls, conf in boxes:
    label_name = CLASSES.get(cls, "unknown")
    label_text = f"{label_name} {conf:.2f}"

    # (0, 255, 255) is the fallback color for a class id missing from COLORS.
    color = COLORS.get(cls, (0, 255, 255))
    cv2.rectangle(frame, (x1, y1), (x2, y2), color, BOX_THICK)

    # Filled strip sized to the text and placed directly above the box,
    # followed by the label itself in white.
    (tw, th), _ = cv2.getTextSize(label_text, cv2.FONT_HERSHEY_SIMPLEX,
                                  FONT_SCALE, FONT_THICK)
    cv2.rectangle(frame, (x1, y1 - th - 8), (x1 + tw, y1), color, -1)
    cv2.putText(frame, label_text, (x1, y1 - 5),
                cv2.FONT_HERSHEY_SIMPLEX, FONT_SCALE,
                (255, 255, 255), FONT_THICK, cv2.LINE_AA)


# Save the annotated frame, then display it (converted from BGR to RGB for matplotlib).
cv2.imwrite(SNAPSHOT, frame)
plt.figure(figsize=(12,6))
plt.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
plt.axis('off')
plt.show()

In [None]:
#Faster R-CNN (torchvision fasterrcnn_resnet50_fpn)
!pip -q install --upgrade torch torchvision torchaudio
import cv2, torch, matplotlib.pyplot as plt
import numpy as np
from torchvision import models, transforms

# COCO classes, keyed by the label ids that torchvision's COCO-trained
# detection models return. Those ids follow the original 91-slot COCO
# numbering, so ids 12, 26, 29, 30, 45, 66, 68, 69, 71 and 83 are unused and
# deliberately absent here (lookups for them fall back to the caller's default).
CLASSES = {
    0:"background", 1:"person", 2:"bicycle", 3:"car", 4:"motorcycle", 5:"airplane",
    6:"bus", 7:"train", 8:"truck", 9:"boat", 10:"trafficlight", 11:"firehydrant",
    13:"stopsign", 14:"parkingmeter", 15:"bench", 16:"bird", 17:"cat", 18:"dog",
    19:"horse", 20:"sheep", 21:"cow", 22:"elephant", 23:"bear", 24:"zebra",
    25:"giraffe", 27:"backpack", 28:"umbrella", 31:"handbag", 32:"tie",
    33:"suitcase", 34:"frisbee", 35:"skis", 36:"snowboard", 37:"sportsball",
    38:"kite", 39:"baseballbat", 40:"baseballglove", 41:"skateboard",
    42:"surfboard", 43:"tennisracket", 44:"bottle", 46:"wineglass", 47:"cup",
    48:"fork", 49:"knife", 50:"spoon", 51:"bowl", 52:"banana", 53:"apple",
    54:"sandwich", 55:"orange", 56:"broccoli", 57:"carrot", 58:"hotdog",
    59:"pizza", 60:"donut", 61:"cake", 62:"chair", 63:"couch", 64:"pottedplant",
    65:"bed", 67:"diningtable", 70:"toilet", 72:"tv", 73:"laptop", 74:"mouse",
    75:"remote", 76:"keyboard", 77:"cellphone", 78:"microwave", 79:"oven",
    80:"toaster", 81:"sink", 82:"refrigerator", 84:"book", 85:"clock", 86:"vase",
    87:"scissors", 88:"teddybear", 89:"hairdrier", 90:"toothbrush"
}

# Grab only the first frame of the clip; detection runs on this single image.
VIDEO_PATH = "person1.mp4"
cap = cv2.VideoCapture(VIDEO_PATH)
ok, frame_bgr = cap.read()
cap.release()
assert ok, "Could not read first frame"

# OpenCV delivers BGR; the tensor fed to the detector is built from an RGB copy.
frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)

device = "cuda" if torch.cuda.is_available() else "cpu"
model  = models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")
model.to(device).eval()

transform = transforms.Compose([transforms.ToTensor()])   # converts to [0,1] + CHW
img_tensor = transform(frame_rgb).to(device)


# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    preds = model([img_tensor])[0]   # dict with boxes, labels, scores

boxes   = preds["boxes"].cpu().numpy()
labels  = preds["labels"].cpu().numpy()
scores  = preds["scores"].cpu().numpy()

# Keep only detections at or above the confidence threshold.
CONF_TH = 0.50
keep = scores >= CONF_TH

boxes, labels, scores = boxes[keep], labels[keep], scores[keep]

# Draw on a copy so the untouched first frame stays available in frame_bgr.
OUT = frame_bgr.copy()
for (x1,y1,x2,y2), cls_id, conf in zip(boxes, labels, scores):
    cls_id = int(cls_id)
    name   = CLASSES.get(cls_id, f"id{cls_id}")
    label  = f"{name} {conf:.2f}"

    color = (0,255,0)   # green boxes; change if you like
    cv2.rectangle(OUT, (int(x1),int(y1)), (int(x2),int(y2)), color, 2)
    # Filled strip sized to the text above the box, with the label in black on top.
    (tw, th), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.7, 2)
    cv2.rectangle(OUT, (int(x1), int(y1)-th-8), (int(x1)+tw, int(y1)), color, -1)
    cv2.putText(OUT, label, (int(x1), int(y1)-4),
                cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0,0,0), 2)

# OUT is a copy of frame_bgr and therefore still BGR, which is the channel
# order cv2.imwrite expects. It is written as-is: reversing the channel axis
# here would store a file with red and blue swapped.
cv2.imwrite("fast_rcnn_detect.jpg", OUT)

# matplotlib is given an RGB conversion of the same image for display.
plt.figure(figsize=(12,6))
plt.imshow(cv2.cvtColor(OUT, cv2.COLOR_BGR2RGB))
plt.axis("off")
plt.title("Fast-R-CNN detection ")
plt.show()


In [None]:
#KCF_Tracker
from ultralytics import YOLO
import cv2
import numpy as np
import time
import uuid
from pathlib import Path

# Paths
downloads = Path.home() / "Downloads"
video_path = downloads / "videos" / "person1.mp4"  # ← change to your file if needed
output_path = downloads / "output.mp4"

# Parameters
YOLO_PERIOD = 5        # the detector runs on every YOLO_PERIOD-th frame (and whenever a track has misses)
IOU_MATCH = 0.5        # a detection is matched to a track only above this IoU
IOU_MERGE = 0.45       # a track overlapping an already-kept track by more than this IoU is dropped
CONFIDENCE = 0.15      # confidence threshold passed to model.predict
MIN_AREA_FRAC = 0.01   # detections not larger than this fraction of the frame area are ignored
SPEED_THRESH = 20      # center displacement per frame above which a track counts as fast
ROI_SCALE_FAST = 1.4   # factor applied to a fast track's box size when it is placed from the Kalman prediction
HIST_THRESH = 0.6      # histogram correlation a detection must exceed to inherit a lost track's id
MAX_MISSES = 30        # miss count beyond which a track is handed over to the lost list
MEMORY_AGE = 300       # lost entries older than this many frames are forgotten
HIST_BINS = (16, 16, 16)  # bins per channel of the 3-D HSV histogram

# --- Helper functions ---
def iou(b1, b2):
    """Return the intersection-over-union of two ``(x1, y1, x2, y2)`` boxes.

    Boxes that do not overlap (or only touch) give ``0.0``.
    """
    left, top = max(b1[0], b2[0]), max(b1[1], b2[1])
    right, bottom = min(b1[2], b2[2]), min(b1[3], b2[3])

    overlap_w = max(0, right - left)
    overlap_h = max(0, bottom - top)
    overlap = overlap_w * overlap_h
    if overlap == 0:
        return 0.0

    area1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
    area2 = (b2[2] - b2[0]) * (b2[3] - b2[1])
    union = area1 + area2 - overlap
    return overlap / union

def hsv_hist(img, bins=HIST_BINS):
    """Return the flattened, normalized 3-D HSV histogram of a BGR image.

    Args:
        img: BGR image (or patch); it is converted to HSV first.
        bins: Number of bins for the H, S and V channels.
    """
    channels = [0, 1, 2]
    ranges = [0, 180, 0, 256, 0, 256]  # H spans 0..180, S and V span 0..256
    hist = cv2.calcHist([cv2.cvtColor(img, cv2.COLOR_BGR2HSV)],
                        channels, None, bins, ranges)
    cv2.normalize(hist, hist)  # normalizes in place (src and dst are the same array)
    return hist.flatten()

def hist_corr(h1, h2):
    """Return the ``cv2.HISTCMP_CORREL`` score of two histograms, compared as float32."""
    first = h1.astype(np.float32)
    second = h2.astype(np.float32)
    return cv2.compareHist(first, second, cv2.HISTCMP_CORREL)

def create_kalman(cx, cy):
    """Build a 4-state / 2-measurement Kalman filter that starts at (cx, cy).

    The transition matrix adds state 2 to state 0 and state 3 to state 1 on
    every step (position plus per-step velocity); only the first two states
    are measured. The initial velocity is zero.
    """
    f32 = np.float32
    kalman = cv2.KalmanFilter(4, 2)

    kalman.transitionMatrix = np.array(
        [[1, 0, 1, 0],
         [0, 1, 0, 1],
         [0, 0, 1, 0],
         [0, 0, 0, 1]], f32)
    kalman.measurementMatrix = np.eye(2, 4, dtype=f32)

    kalman.processNoiseCov = np.eye(4, dtype=f32) * 1e-2
    kalman.measurementNoiseCov = np.eye(2, dtype=f32) * 1e-1

    # Both the predicted and the corrected state start at the given center.
    initial = np.array([[cx], [cy], [0], [0]], f32)
    kalman.statePre = initial
    kalman.statePost = initial.copy()
    return kalman

# Load model
model = YOLO('yolov8m.pt')  # more accurate than yolov8n.pt

# Open the input clip and fail early if it cannot be read.
cap = cv2.VideoCapture(str(video_path))
if not cap.isOpened():
    raise FileNotFoundError(f"Cannot open: {video_path}")

# Property ids 3 and 4 are the frame width and height
# (cv2.CAP_PROP_FRAME_WIDTH / cv2.CAP_PROP_FRAME_HEIGHT).
W = int(cap.get(3))
H = int(cap.get(4))
fps_in = cap.get(cv2.CAP_PROP_FPS) or 30  # 30 is used when the capture reports 0
out = cv2.VideoWriter(str(output_path), cv2.VideoWriter_fourcc(*'mp4v'), fps_in, (W, H))
min_area = MIN_AREA_FRAC * (W * H)  # smallest detection area (in pixels) that is accepted

tracks = []      # active tracks (dicts holding tracker, box, histogram, Kalman filter, ...)
lost = []        # summaries of dropped tracks, kept so their id/label can be reused
frame_idx = 0    # number of frames read so far
t0 = time.perf_counter()

# Main loop
# Per frame: (1) advance every track with its Kalman filter and KCF tracker,
# (2) retire tracks that have been missing too long, (3) periodically re-detect
# and re-associate, (4) suppress duplicate tracks, (5) draw and write the frame.
while True:
    ret, frame = cap.read()
    if not ret:
        break
    frame_idx += 1

    # --- 1. Advance existing tracks -------------------------------------
    for tr in tracks:
        # predict() returns a column vector; pull out plain floats so the
        # arithmetic below does not rely on 1-element-array-to-scalar coercion.
        pred = tr["kalman"].predict()
        px, py = float(pred[0, 0]), float(pred[1, 0])
        enlarge = ROI_SCALE_FAST if tr["speed"] > SPEED_THRESH else 1.0

        success, bbox = tr["tracker"].update(frame)
        if not success:
            # KCF lost the target: keep the previous box size (enlarged for
            # fast movers), center it on the Kalman prediction and clamp it
            # to the frame.
            tr["miss"] += 1
            w = tr["box"][2] - tr["box"][0]
            h = tr["box"][3] - tr["box"][1]
            w *= enlarge
            h *= enlarge
            x1, y1 = int(px - w / 2), int(py - h / 2)
            x2, y2 = int(px + w / 2), int(py + h / 2)
            tr["box"] = [max(0, x1), max(0, y1), min(W - 1, x2), min(H - 1, y2)]
        else:
            x, y, w, h = map(int, bbox)
            cx, cy = x + w // 2, y + h // 2
            tr["kalman"].correct(np.array([[cx], [cy]], np.float32))
            # Speed is the center displacement since the previous update.
            dx, dy = cx - tr["cx"], cy - tr["cy"]
            tr["speed"] = float(np.hypot(dx, dy))
            tr["cx"], tr["cy"] = cx, cy
            tr["box"] = [x, y, x + w, y + h]
            tr["miss"] = 0
            patch = frame[y:y + h, x:x + w]
            if patch.size:
                # Blend the stored appearance toward the current patch.
                tr["hist"] = 0.8 * tr["hist"] + 0.2 * hsv_hist(patch)

    # --- 2. Retire stale tracks ------------------------------------------
    # A track past MAX_MISSES leaves `tracks` and is remembered in `lost`
    # exactly once, so its id/label can be reclaimed by a later detection.
    alive = []
    for tr in tracks:
        if tr["miss"] <= MAX_MISSES:
            alive.append(tr)
        else:
            lost.append({
                "id": tr["id"], "label": tr["label"], "hist": tr["hist"], "last": frame_idx
            })
    tracks = alive
    lost = [lt for lt in lost if frame_idx - lt["last"] <= MEMORY_AGE]

    # --- 3. Detection and association ------------------------------------
    if frame_idx % YOLO_PERIOD == 0 or any(tr["miss"] > 0 for tr in tracks):
        res = model.predict(frame, conf=CONFIDENCE, verbose=False)[0]
        dets = res.boxes.xyxy.cpu().numpy().astype(int)
        cls = res.boxes.cls.cpu().numpy().astype(int)
        # Drop detections whose area does not exceed min_area.
        dets = [(x1, y1, x2, y2, int(c)) for (x1, y1, x2, y2), c in zip(dets, cls)
                if (x2 - x1) * (y2 - y1) > min_area]

        # Each track greedily takes the unmatched detection it overlaps most;
        # on a match the KCF tracker is re-created on the detected box.
        matched = set()
        for tr in tracks:
            best_iou, best_j = 0, -1
            for j, (x1, y1, x2, y2, cid) in enumerate(dets):
                if j in matched:
                    continue
                iou_val = iou(tr["box"], (x1, y1, x2, y2))
                if iou_val > best_iou:
                    best_iou, best_j = iou_val, j
            if best_iou > IOU_MATCH:
                x1, y1, x2, y2, cid = dets[best_j]
                trk = cv2.TrackerKCF_create()
                trk.init(frame, (x1, y1, x2 - x1, y2 - y1))
                cx, cy = (x1 + x2) // 2, (y1 + y2) // 2
                tr.update(dict(tracker=trk, box=[x1, y1, x2, y2],
                               cx=cx, cy=cy, miss=0))
                matched.add(best_j)

        # Every remaining detection starts a new track. It takes over the id
        # and label of the best-correlating lost entry when the correlation
        # exceeds HIST_THRESH; otherwise it gets a fresh id and the model's
        # class name.
        for j, (x1, y1, x2, y2, cid) in enumerate(dets):
            if j in matched:
                continue
            patch = frame[y1:y2, x1:x2]
            hh = hsv_hist(patch)
            best_score, best_lt = 0, None
            for lt in lost:
                score = hist_corr(hh, lt["hist"])
                if score > best_score:
                    best_score = score
                    best_lt = lt
            if best_score > HIST_THRESH:
                id_, label = best_lt["id"], best_lt["label"]
                lost.remove(best_lt)
            else:
                id_, label = str(uuid.uuid4())[:8], model.names.get(cid, f"class{cid}")
            trk = cv2.TrackerKCF_create()
            trk.init(frame, (x1, y1, x2 - x1, y2 - y1))
            cx, cy = (x1 + x2) // 2, (y1 + y2) // 2
            tracks.append(dict(id=id_, label=label, tracker=trk,
                               box=[x1, y1, x2, y2], hist=hh, miss=0,
                               speed=0, cx=cx, cy=cy,
                               kalman=create_kalman(cx, cy)))

    # --- 4. Duplicate suppression ----------------------------------------
    # Tracks with the fewest misses are considered first; a track that
    # overlaps an already-kept one by more than IOU_MERGE is discarded.
    tracks.sort(key=lambda t: t["miss"])
    filtered = []
    for tr in tracks:
        if any(iou(tr["box"], f["box"]) > IOU_MERGE for f in filtered):
            continue
        filtered.append(tr)
    tracks = filtered

    # --- 5. Draw and write ------------------------------------------------
    for tr in tracks:
        x1, y1, x2, y2 = map(int, tr["box"])
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(frame, tr["label"], (x1, y1 - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

    out.write(frame)

# Done
elapsed = time.perf_counter() - t0
print(f"Done. Frames: {frame_idx}, Time: {elapsed:.2f}s, Avg FPS: {frame_idx / elapsed:.2f}")
print(f"Output saved to: {output_path}")
cap.release()
out.release()


In [None]:
#MOSSE_Tracker
import time
import uuid
import cv2
import numpy as np
import torch
from torchvision import models, transforms
from pathlib import Path

#PATHS
downloads = Path.home() / "Downloads"
VIDEO_PATH = downloads / "videos" / "car1.mp4"
OUTPUT_PATH = downloads / "car1_mosse_final_output.mp4"

#PARAMETERS
CONF_TH = 0.5             # first-frame detections scoring below this are skipped
MAX_LOST = 250            # a track is dropped once its lost counter exceeds this
HIST_THRESH = 0.6         # minimum histogram correlation for a tracker result to be accepted
HIST_BINS = (16, 16, 16)  # bins per channel of the 3-D HSV histogram

#COCO LABELS
# Label id -> name, built from an ordered list so that the position in the
# list is the id. Empty strings mark ids that have no class name. Each row
# below holds ten consecutive ids.
COCO = dict(enumerate((
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat',
    'traffic light', 'fire hydrant', '', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse',
    'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', '', 'backpack', 'umbrella', '',
    '', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
    'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', '', 'wine glass', 'cup', 'fork', 'knife',
    'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', '', 'dining table', '', '',
    'toilet', '', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
    'toaster', 'sink', 'refrigerator', '', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',
    'toothbrush',
)))

#Histogram functions
def get_histogram(patch):
    """Return the flattened, normalized 3-D HSV histogram of a BGR patch (HIST_BINS bins per channel)."""
    channels = [0, 1, 2]
    ranges = [0, 180, 0, 256, 0, 256]  # H spans 0..180, S and V span 0..256
    as_hsv = cv2.cvtColor(patch, cv2.COLOR_BGR2HSV)
    histogram = cv2.calcHist([as_hsv], channels, None, HIST_BINS, ranges)
    cv2.normalize(histogram, histogram)  # in place: src and dst are the same array
    return histogram.flatten()

def compare_hist(hist1, hist2):
    """Return the ``cv2.HISTCMP_CORREL`` score of two histograms, compared as float32."""
    first = hist1.astype(np.float32)
    second = hist2.astype(np.float32)
    return cv2.compareHist(first, second, cv2.HISTCMP_CORREL)

#Kalman Filter
class KalmanFilter:
    """Thin wrapper around ``cv2.KalmanFilter`` for extrapolating a point.

    The underlying filter has 8 state variables and 4 measured ones. State
    ``i`` (for ``i`` in 0..3) is advanced by state ``i + 4`` on every step,
    and states 0..3 are the measured ones. Only slots 0 and 1 (x and y)
    carry data: ``correct`` always reports 0 for the other two measurements.
    """

    def __init__(self, x, y):
        """Create the filter with its state at ``(x, y)`` and everything else zero."""
        self.kf = cv2.KalmanFilter(8, 4)

        # The matrices are completed in NumPy before being handed to OpenCV,
        # so nothing depends on in-place writes through the cv2 properties.
        transition = np.eye(8, dtype=np.float32)
        measurement = np.zeros((4, 8), np.float32)
        for i in range(4):
            transition[i, i + 4] = 1   # state i += state i+4 each step
            measurement[i, i] = 1      # states 0..3 are observed directly
        self.kf.transitionMatrix = transition
        self.kf.measurementMatrix = measurement

        self.kf.processNoiseCov = np.eye(8, dtype=np.float32) * 1e-3
        self.kf.measurementNoiseCov = np.eye(4, dtype=np.float32) * 1e-2

        # Seed both the predicted and the corrected state. correct() works
        # from statePre, so leaving it at its zero default would pull the
        # estimate to the origin if correct() ran before the first predict().
        initial = np.array([[x], [y], [0], [0], [0], [0], [0], [0]], np.float32)
        self.kf.statePre = initial
        self.kf.statePost = initial.copy()

    def predict(self):
        """Advance the filter one step and return the predicted ``(x, y)`` as ints."""
        pred = self.kf.predict()
        return int(pred[0, 0]), int(pred[1, 0])

    def correct(self, x, y):
        """Feed the measured point ``(x, y)``; the two unused measurement slots are sent as 0."""
        self.kf.correct(np.array([[x], [y], [0], [0]], np.float32))

#Model
# Use the GPU when torch reports one; the detector is put in eval mode for inference.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT").to(device).eval()
transform = transforms.Compose([transforms.ToTensor()])

#Load video
cap = cv2.VideoCapture(str(VIDEO_PATH))
assert cap.isOpened(), f"Cannot open video: {VIDEO_PATH}"
# Property ids 3 and 4 are the frame width and height
# (cv2.CAP_PROP_FRAME_WIDTH / cv2.CAP_PROP_FRAME_HEIGHT).
W, H = int(cap.get(3)), int(cap.get(4))
fps_in = cap.get(cv2.CAP_PROP_FPS) or 25  # 25 is used when the capture reports 0
out = cv2.VideoWriter(str(OUTPUT_PATH), cv2.VideoWriter_fourcc(*'mp4v'), fps_in, (W, H))

#Initialize
# Detect once on the first frame and start one MOSSE tracker per detection.
tracks = []
frame_idx = 0
start_time = time.time()
ret, frame = cap.read()
frame_idx += 1
assert ret, "Can't read the first frame"
img_tensor = transform(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)).to(device)

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    preds = model([img_tensor])[0]

boxes = preds["boxes"].cpu().numpy()
labels = preds["labels"].cpu().numpy()
scores = preds["scores"].cpu().numpy()

# Annotations are drawn on a copy. Every tracker and histogram below is
# initialised from `frame`, so drawing on `frame` itself would bake the boxes
# and labels of earlier detections into the templates of later, overlapping ones.
vis = frame.copy()

for box, cls, score in zip(boxes, labels, scores):
    if score < CONF_TH:
        continue
    x1, y1, x2, y2 = map(int, box)
    w, h = x2 - x1, y2 - y1
    patch = frame[y1:y2, x1:x2]
    if patch.size == 0:
        # Degenerate box: there is nothing to track or to build a histogram from.
        continue
    tracker = cv2.legacy.TrackerMOSSE_create()
    tracker.init(frame, (x1, y1, w, h))
    kalman = KalmanFilter(x1 + w // 2, y1 + h // 2)  # starts at the box center
    hist = get_histogram(patch)
    label = COCO.get(int(cls), f"class{int(cls)}")
    tracks.append({
        "id": str(uuid.uuid4())[:8],
        "tracker": tracker,
        "label": label,
        "kalman": kalman,
        "hist": hist,
        "lost": 0
    })
    cv2.rectangle(vis, (x1, y1), (x2, y2), (0, 0, 255), 2)
    cv2.putText(vis, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)

out.write(vis)

#Track across video
# Per frame: update every MOSSE tracker, validate its result against the
# track's stored histogram, and draw either the tracked box or, while the
# track is lost, the Kalman-predicted center.
while True:
    ret, frame = cap.read()
    if not ret:
        break
    frame_idx += 1
    new_tracks = []

    # Draw on a copy so that trackers and histograms later in this loop see
    # the untouched frame rather than boxes drawn for earlier tracks.
    vis = frame.copy()

    for tr in tracks:
        # The Kalman filter is advanced exactly once per frame, before any
        # correction: cv2's correct() works from the state and covariance
        # produced by the preceding predict(), so predicting only while the
        # track is lost leaves every correction working on stale values.
        pred_x, pred_y = tr["kalman"].predict()

        ok, bbox = tr["tracker"].update(frame)
        x, y, w, h = map(int, bbox)
        x2, y2 = x + w, y + h

        if not ok or w <= 0 or h <= 0:
            # Tracker failure: keep the track alive (up to MAX_LOST frames)
            # and mark where the filter expects the target to be.
            tr["lost"] += 1
            if tr["lost"] <= MAX_LOST:
                cv2.circle(vis, (pred_x, pred_y), 4, (255, 255, 0), -1)
                new_tracks.append(tr)
            continue

        patch = frame[y:y+h, x:x+w]
        if patch.size > 0:
            hist_new = get_histogram(patch)
            sim = compare_hist(tr["hist"], hist_new)
            if sim >= HIST_THRESH:
                # Appearance still matches: accept the box, feed its center to
                # the filter and blend the stored histogram toward the new one.
                tr["kalman"].correct(x + w // 2, y + h // 2)
                tr["hist"] = 0.8 * tr["hist"] + 0.2 * hist_new
                tr["lost"] = 0
            else:
                # The tracker reported success but the patch no longer looks
                # like the target: treat it as lost.
                tr["lost"] += 1
                if tr["lost"] <= MAX_LOST:
                    cv2.circle(vis, (pred_x, pred_y), 4, (255, 0, 0), -1)
                    new_tracks.append(tr)
                continue

        cv2.rectangle(vis, (x, y), (x2, y2), (255, 0, 0), 2)
        cv2.putText(vis, tr["label"], (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 0), 2)
        new_tracks.append(tr)

    tracks = new_tracks

    # Running average over all frames processed so far.
    elapsed = time.time() - start_time
    fps = frame_idx / elapsed
    cv2.putText(vis, f"FPS: {fps:.1f}", (10, 30),
                cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 255), 2)

    out.write(vis)


cap.release()
out.release()
print(f"✅ Done. Output saved to: {OUTPUT_PATH}")
print(f"📈 Average FPS: {frame_idx / (time.time() - start_time):.2f}")

In [None]:
#CSRT_Tracker
from ultralytics import YOLO
import cv2
import numpy as np
import time

def extract_hsv_hist(patch, bins=(16,16,16)):
    """Return the flattened, normalized 3-D HSV histogram of a BGR patch.

    Args:
        patch: BGR image region; it is converted to HSV first.
        bins: Number of bins for the H, S and V channels.
    """
    channels = [0,1,2]
    ranges = [0,180,0,256,0,256]  # H spans 0..180, S and V span 0..256
    as_hsv = cv2.cvtColor(patch, cv2.COLOR_BGR2HSV)
    histogram = cv2.calcHist([as_hsv], channels, None, bins, ranges)
    cv2.normalize(histogram, histogram)  # in place: src and dst are the same array
    return histogram.flatten()

def iou(boxA, boxB):
    """Return the intersection-over-union of two ``[x1, y1, x2, y2]`` boxes.

    Boxes that do not overlap (or only touch) give ``0``.
    """
    left, top = max(boxA[0], boxB[0]), max(boxA[1], boxB[1])
    right, bottom = min(boxA[2], boxB[2]), min(boxA[3], boxB[3])

    overlap = max(0, right - left) * max(0, bottom - top)
    if overlap == 0:
        return 0

    area_a = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
    area_b = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
    union = area_a + area_b - overlap
    return overlap / float(union)

class Track:
    """One tracked object: a CSRT tracker, an appearance histogram and a Kalman filter.

    Attributes set by ``__init__``:
        tracker: ``cv2.TrackerCSRT`` initialised on the given box.
        bbox: The initial box as a tuple of plain ints ``(x, y, w, h)``.
            cv2 trackers only report a box through ``update()``, so this is
            kept for code that needs the box without updating the tracker;
            callers may overwrite it as the track moves.
        feature: Flattened HSV histogram of the initial patch.
        label: The label passed in.
        miss_count: Starts at 0.
        kalman: 4-state / 2-measurement ``cv2.KalmanFilter`` started at the
            box center with zero velocity.
    """

    def __init__(self, bbox, label, frame):
        """Start tracking ``bbox`` (``x, y, w, h``) in ``frame``."""
        # Cast before touching OpenCV: the detection loop builds boxes from
        # NumPy integers, and the rectangle argument of init() is safest as
        # plain Python ints.
        # NOTE(review): some OpenCV builds reportedly reject NumPy scalar
        # types here — confirm against the installed version.
        x,y,w,h = map(int, bbox)
        self.bbox = (x, y, w, h)

        self.tracker = cv2.TrackerCSRT_create()
        self.tracker.init(frame, self.bbox)
        patch = frame[y:y+h, x:x+w]
        self.feature = extract_hsv_hist(patch)
        self.label = label
        self.miss_count = 0

        # Position-plus-velocity model: states 0/1 are advanced by states 2/3
        # each step, and only states 0/1 are measured.
        cx, cy = x + w/2, y + h/2
        self.kalman = cv2.KalmanFilter(4,2)
        self.kalman.transitionMatrix = np.array([[1,0,1,0],[0,1,0,1],[0,0,1,0],[0,0,0,1]], np.float32)
        self.kalman.measurementMatrix = np.eye(2,4, dtype=np.float32)
        self.kalman.processNoiseCov = np.eye(4, dtype=np.float32) * 1e-2
        self.kalman.measurementNoiseCov = np.eye(2, dtype=np.float32) * 1e-1
        self.kalman.statePre  = np.array([[cx],[cy],[0],[0]], np.float32)
        self.kalman.statePost = self.kalman.statePre.copy()

    def predict(self):
        """Advance the Kalman filter one step and return its predicted state column."""
        return self.kalman.predict()

    def correct(self, cx, cy):
        """Feed the measured center ``(cx, cy)`` to the Kalman filter."""
        self.kalman.correct(np.array([[cx],[cy]], np.float32))

model = YOLO('yolov8n.pt')

# Fail early on an unreadable input, as the KCF and MOSSE cells do, instead of
# silently producing an empty output file.
cap = cv2.VideoCapture('person4.mp4')
if not cap.isOpened():
    raise FileNotFoundError("Cannot open: person4.mp4")

W = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
H = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
# A capture can report 0 FPS; fall back to 30 (the same default as the KCF
# cell) so the writer is never opened with a zero frame rate.
fps_in = cap.get(cv2.CAP_PROP_FPS) or 30
out = cv2.VideoWriter('output.mp4', cv2.VideoWriter_fourcc(*'mp4v'), fps_in, (W,H))

tracks = []                  # active Track objects
frame_idx = 0                # number of frames read so far
t_start = time.perf_counter()
DETECTION_INTERVAL = 20      # the detector runs on every DETECTION_INTERVAL-th frame
MAX_MISSES = 10              # a track is dropped once its miss count reaches this

# Per frame: every DETECTION_INTERVAL-th frame re-detect and associate the
# detections with existing tracks; then update every CSRT tracker, draw the
# result, and drop tracks that have failed too many times in a row.
while True:
    ret, frame = cap.read()
    if not ret:
        break
    frame_idx += 1

    # Draw on a copy so that tracker updates later in this iteration see the
    # untouched frame rather than boxes drawn for earlier tracks.
    vis = frame.copy()

    if frame_idx % DETECTION_INTERVAL == 0:
        # NOTE(review): in the YOLO class table of the first cell, ids 0 and 3
        # are "person" and "motorcycle" ("car" is 2) — confirm that this is
        # the intended pair of classes.
        res = model.predict(frame, conf=0.3, classes=[0,3], verbose=False)[0]
        boxes = res.boxes.xyxy.cpu().numpy().astype(int)
        classes = res.boxes.cls.cpu().numpy().astype(int)

        matches = set()
        for box, cls in zip(boxes, classes):
            # Plain Python ints for everything handed to OpenCV below.
            x1, y1, x2, y2 = map(int, box)
            w, h = x2 - x1, y2 - y1
            patch = frame[y1:y2, x1:x2]
            if patch.size == 0:
                # Degenerate box: no pixels to build a histogram or a tracker from.
                continue
            hist = extract_hsv_hist(patch)

            # Score every unmatched track by the average of box overlap and
            # histogram cosine similarity; a match needs a score above 0.5.
            best_match = None
            best_score = 0.5
            for i, tr in enumerate(tracks):
                if i in matches:
                    continue
                # cv2 trackers have no getter for their current box, so the
                # last box recorded on the track is used for the overlap.
                tx, ty, tw, th = tr.bbox
                tiou = iou([x1,y1,x2,y2], [tx,ty,tx+tw,ty+th])
                cosine = np.dot(hist, tr.feature) / (np.linalg.norm(hist)*np.linalg.norm(tr.feature) + 1e-6)
                score = 0.5 * tiou + 0.5 * cosine
                if score > best_score:
                    best_score = score
                    best_match = i

            if best_match is not None:
                # Re-anchor the matched track on the detected box.
                tr = tracks[best_match]
                tr.tracker = cv2.TrackerCSRT_create()
                tr.tracker.init(frame, (x1,y1,w,h))
                tr.bbox = (x1, y1, w, h)
                tr.correct(x1 + w/2, y1 + h/2)
                tr.feature = 0.7 * tr.feature + 0.3 * hist
                tr.miss_count = 0
                matches.add(best_match)
            else:
                new_track = Track((x1,y1,w,h), model.names[int(cls)], frame)
                new_track.bbox = (x1, y1, w, h)
                tracks.append(new_track)
                # A track born in this pass must not be claimed (and
                # re-initialised) by another detection of the same pass.
                matches.add(len(tracks) - 1)

    # Track update
    new_tracks = []
    for tr in tracks:
        pc = tr.predict()  # predicted state column; rows 0 and 1 are x and y
        ok, bbox = tr.tracker.update(frame)
        if ok:
            x,y,w,h = map(int, bbox)
            tr.bbox = (x, y, w, h)
            cx, cy = x + w/2, y + h/2
            tr.correct(cx, cy)
            cv2.rectangle(vis, (x,y), (x+w,y+h), (0,255,0), 2)
            cv2.putText(vis, tr.label, (x, y-10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0,255,0), 2)
            # Red dot: where the Kalman filter expected the center to be.
            cv2.circle(vis, (int(pc[0, 0]), int(pc[1, 0])), 4, (0,0,255), -1)
            tr.miss_count = 0
        else:
            tr.miss_count += 1

        if tr.miss_count < MAX_MISSES:
            new_tracks.append(tr)

    tracks = new_tracks
    out.write(vis)

t_end = time.perf_counter()
print(f"Processed {frame_idx} frames in {t_end - t_start:.2f}s — Avg FPS: {frame_idx / (t_end - t_start):.2f}")
cap.release()
out.release()
cv2.destroyAllWindows()