<a href="https://colab.research.google.com/github/Far-ch/Signals-and-Systems-Project/blob/main/Signal_and_Systems_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#YOLO
from ultralytics import YOLO
import cv2, torch, numpy as np, matplotlib.pyplot as plt

# Input clip and the file the annotated first frame is saved to.
VIDEO_IN = "person1.mp4"
SNAPSHOT = "first_detected.jpg"

# Use the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL = "yolov8m.pt"
model = YOLO(MODEL).to(DEVICE)

# Class-id -> display-name table; the position in the list is the class id.
CLASSES = dict(enumerate([
    "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train",
    "truck", "boat", "traffic light", "fire hydrant", "stop sign",
    "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
    "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag",
    "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite",
    "baseball bat", "baseball glove", "skateboard", "surfboard",
    "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon",
    "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot",
    "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant",
    "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote",
    "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
    "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
    "hair drier", "toothbrush",
]))

# Ids handed to the detector as its class filter.
CLASS_IDS = list(CLASSES)

# One reproducible colour per class (seeded generator, channels in 64..255).
rng = np.random.default_rng(42)
COLORS = {}
for cls_id in CLASS_IDS:
    COLORS[cls_id] = tuple(rng.integers(64, 256, 3).tolist())

# Drawing style shared by the annotation code below.
BOX_THICK = 3
FONT_SCALE = 1.0
FONT_THICK = 2


def detect(img_rgb, imgsz=960, conf=0.3):
    """Run the module-level ``model`` on one image and flatten its boxes.

    Args:
        img_rgb: Image passed unchanged as the first argument of
            ``model.predict``.
            NOTE(review): Ultralytics documents NumPy inputs as BGR, not
            RGB - confirm what callers pass.
        imgsz: Inference size forwarded to ``model.predict``.
        conf: Confidence threshold forwarded to ``model.predict``.

    Returns:
        A list of ``(x1, y1, x2, y2, class_id, conf)`` tuples, one per box of
        the first result; coordinates and class id are ``int``, the score is
        ``float``.
    """
    result = model.predict(
        img_rgb,
        imgsz=imgsz,
        conf=conf,
        classes=CLASS_IDS,
        device=DEVICE,
        verbose=False,
    )[0]

    detections = []
    for det in result.boxes:
        x1, y1, x2, y2 = (int(v) for v in det.xyxy[0])
        detections.append((x1, y1, x2, y2, int(det.cls[0]), float(det.conf[0])))
    return detections


def robust_detect(frame_bgr):
    """Detect objects in a BGR frame, escalating through three strategies.

    The first strategy that yields at least one box wins:

    1. the full frame at inference size 1280, confidence 0.3;
    2. a centred crop (60% of the width, 70% of the height) stretched to
       1280x1280, confidence 0.25, with boxes mapped back to frame
       coordinates;
    3. 640-pixel tiles on a 320-pixel stride, confidence 0.25, with boxes
       shifted by the tile origin (overlapping tiles may report the same
       object more than once; no de-duplication is done).

    Images are handed to ``detect`` in OpenCV's native BGR order: Ultralytics
    treats NumPy array inputs as BGR and performs the BGR->RGB swap itself,
    so converting to RGB beforehand would feed the network channel-swapped
    pixels.

    Args:
        frame_bgr: ``H x W x 3`` image array in BGR channel order.

    Returns:
        A list of ``(x1, y1, x2, y2, class_id, conf)`` tuples in the
        coordinates of ``frame_bgr``; empty when nothing was found.
    """
    height, width = frame_bgr.shape[:2]

    # 1) Full frame.
    boxes = detect(frame_bgr, 1280, 0.3)
    if boxes:
        return boxes

    # 2) Centre crop, upscaled (aspect ratio is not preserved, hence the
    #    separate x/y scale factors when mapping back).
    crop_w, crop_h = int(width * 0.6), int(height * 0.7)
    x0, y0 = (width - crop_w) // 2, (height - crop_h) // 2
    crop_up = cv2.resize(frame_bgr[y0:y0 + crop_h, x0:x0 + crop_w], (1280, 1280))
    boxes_crop = detect(crop_up, 1280, 0.25)
    if boxes_crop:
        sx, sy = crop_w / 1280, crop_h / 1280
        return [(int(x1 * sx + x0), int(y1 * sy + y0),
                 int(x2 * sx + x0), int(y2 * sy + y0), cls, conf)
                for x1, y1, x2, y2, cls, conf in boxes_crop]

    # 3) Sliding tiles; tiles at the right/bottom edge are simply smaller.
    out = []
    tile, stride = 640, 320
    for y in range(0, height, stride):
        for x in range(0, width, stride):
            # Copy the view so the detector receives a contiguous array.
            tile_bgr = np.ascontiguousarray(frame_bgr[y:y + tile, x:x + tile])
            for x1, y1, x2, y2, cls, conf in detect(tile_bgr, 640, 0.25):
                out.append((x1 + x, y1 + y, x2 + x, y2 + y, cls, conf))
    return out


# Read just the first frame of the clip; the capture is released before the
# success flag is checked.
cap = cv2.VideoCapture(VIDEO_IN)
ok, frame = cap.read()
cap.release()
assert ok, f" Couldn't read first frame of {VIDEO_IN}"

# Detection runs on a copy; the annotations are painted on `frame` itself.
boxes = robust_detect(frame.copy())
for x1, y1, x2, y2, cls, conf in boxes:
    color = COLORS.get(cls, (0, 255, 255))
    label_name = CLASSES.get(cls, "unknown")
    label_text = f"{label_name} {conf:.2f}"

    # Object outline.
    cv2.rectangle(frame, (x1, y1), (x2, y2), color, BOX_THICK)

    # Filled banner sitting on top of the box, sized to the label text.
    (tw, th), _ = cv2.getTextSize(
        label_text, cv2.FONT_HERSHEY_SIMPLEX, FONT_SCALE, FONT_THICK
    )
    banner_top_left = (x1, y1 - th - 8)
    banner_bottom_right = (x1 + tw, y1)
    cv2.rectangle(frame, banner_top_left, banner_bottom_right, color, -1)
    cv2.putText(
        frame, label_text, (x1, y1 - 5),
        cv2.FONT_HERSHEY_SIMPLEX, FONT_SCALE,
        (255, 255, 255), FONT_THICK, cv2.LINE_AA,
    )


# Save the annotated frame and show it inline (matplotlib wants RGB).
cv2.imwrite(SNAPSHOT, frame)
plt.figure(figsize=(12,6))
plt.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
plt.axis('off')
plt.show()

In [None]:
#Faster R-CNN (torchvision fasterrcnn_resnet50_fpn)
!pip -q install --upgrade torch torchvision torchaudio
import cv2, torch, matplotlib.pyplot as plt
import numpy as np
from torchvision import models, transforms

# COCO classes
# Label-id -> name table for torchvision's COCO-pretrained detectors.
# Those models emit the original 91-slot COCO numbering, in which ids
# 12, 26, 29, 30, 45, 66, 68, 69, 71 and 83 are unused, so the ids run up to
# 90 and do NOT match the contiguous 0..79 numbering used by YOLO.
CLASSES = {
    0: "background", 1: "person", 2: "bicycle", 3: "car", 4: "motorcycle",
    5: "airplane", 6: "bus", 7: "train", 8: "truck", 9: "boat",
    10: "trafficlight", 11: "firehydrant", 13: "stopsign", 14: "parkingmeter",
    15: "bench", 16: "bird", 17: "cat", 18: "dog", 19: "horse", 20: "sheep",
    21: "cow", 22: "elephant", 23: "bear", 24: "zebra", 25: "giraffe",
    27: "backpack", 28: "umbrella", 31: "handbag", 32: "tie", 33: "suitcase",
    34: "frisbee", 35: "skis", 36: "snowboard", 37: "sportsball", 38: "kite",
    39: "baseballbat", 40: "baseballglove", 41: "skateboard", 42: "surfboard",
    43: "tennisracket", 44: "bottle", 46: "wineglass", 47: "cup", 48: "fork",
    49: "knife", 50: "spoon", 51: "bowl", 52: "banana", 53: "apple",
    54: "sandwich", 55: "orange", 56: "broccoli", 57: "carrot", 58: "hotdog",
    59: "pizza", 60: "donut", 61: "cake", 62: "chair", 63: "couch",
    64: "pottedplant", 65: "bed", 67: "diningtable", 70: "toilet", 72: "tv",
    73: "laptop", 74: "mouse", 75: "remote", 76: "keyboard", 77: "cellphone",
    78: "microwave", 79: "oven", 80: "toaster", 81: "sink", 82: "refrigerator",
    84: "book", 85: "clock", 86: "vase", 87: "scissors", 88: "teddybear",
    89: "hairdrier", 90: "toothbrush",
}

# Read only the first frame of the clip.
VIDEO_PATH = "person1.mp4"
cap = cv2.VideoCapture(VIDEO_PATH)
ok, frame_bgr = cap.read()
cap.release()
assert ok, "Could not read first frame"

# The torchvision detector is fed RGB; OpenCV decodes to BGR.
frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)

device = "cuda" if torch.cuda.is_available() else "cpu"
model  = models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")
model.to(device).eval()

transform = transforms.Compose([transforms.ToTensor()])   # converts to [0,1] + CHW
img_tensor = transform(frame_rgb).to(device)


# Inference only - no gradients needed.
with torch.no_grad():
    preds = model([img_tensor])[0]   # dict with boxes, labels, scores

boxes   = preds["boxes"].cpu().numpy()
labels  = preds["labels"].cpu().numpy()
scores  = preds["scores"].cpu().numpy()

# Keep detections at or above the confidence threshold.
CONF_TH = 0.50
keep = scores >= CONF_TH

boxes, labels, scores = boxes[keep], labels[keep], scores[keep]

# Annotate a copy so the decoded frame itself stays untouched.
OUT = frame_bgr.copy()
for box, cls_id, conf in zip(boxes, labels, scores):
    # Convert once: OpenCV drawing calls want plain Python ints.
    x1, y1, x2, y2 = (int(v) for v in box)
    cls_id = int(cls_id)
    name   = CLASSES.get(cls_id, f"id{cls_id}")
    label  = f"{name} {conf:.2f}"

    color = (0,255,0)   # green boxes; change if you like
    cv2.rectangle(OUT, (x1, y1), (x2, y2), color, 2)
    (tw, th), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.7, 2)
    cv2.rectangle(OUT, (x1, y1 - th - 8), (x1 + tw, y1), color, -1)
    cv2.putText(OUT, label, (x1, y1 - 4),
                cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0,0,0), 2)

# OUT is BGR, which is exactly what cv2.imwrite expects; reversing the
# channels here would save a file with red and blue swapped.
cv2.imwrite("fast_rcnn_detect.jpg", OUT)

# matplotlib, unlike imwrite, wants RGB.
plt.figure(figsize=(12,6))
plt.imshow(cv2.cvtColor(OUT, cv2.COLOR_BGR2RGB))
plt.axis("off")
plt.title("Fast-R-CNN detection ")
plt.show()


In [None]:
#MOSSE_Tracker
import time
import uuid
import cv2
import numpy as np
import torch
from torchvision import models, transforms
from pathlib import Path

# PATHS: the input clip and the rendered output both live under ~/Downloads.
downloads = Path.home().joinpath("Downloads")
VIDEO_PATH = downloads.joinpath("videos", "car1.mp4")
OUTPUT_PATH = downloads.joinpath("car1_mosse_final_output.mp4")

# PARAMETERS
CONF_TH = 0.5            # minimum detector score for a first-frame box to get a tracker
MAX_LOST = 250           # a track is dropped once its lost counter exceeds this
HIST_THRESH = 0.6        # minimum histogram correlation for a tracker box to be accepted
HIST_BINS = (16, 16, 16)  # histogram bins per H, S, V channel

# COCO LABELS: list position is the label id; '' marks ids with no name.
COCO = dict(enumerate([
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', '', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', '', 'backpack', 'umbrella', '', '',
    'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
    'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
    'tennis racket', 'bottle', '', 'wine glass', 'cup', 'fork', 'knife',
    'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli',
    'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
    'potted plant', 'bed', '', 'dining table', '', '', 'toilet', '', 'tv',
    'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave',
    'oven', 'toaster', 'sink', 'refrigerator', '', 'book', 'clock', 'vase',
    'scissors', 'teddy bear', 'hair drier', 'toothbrush',
]))

#Histogram functions
def get_histogram(patch):
    """Return the flattened, normalised 3-D HSV histogram of a BGR patch.

    The patch is converted BGR->HSV, binned with the module-level
    ``HIST_BINS`` over all three channels, normalised in place with
    ``cv2.normalize`` defaults, and returned as a 1-D array.
    """
    hsv_patch = cv2.cvtColor(patch, cv2.COLOR_BGR2HSV)
    channels = [0, 1, 2]
    ranges = [0, 180, 0, 256, 0, 256]  # H upper bound 180, S and V upper bound 256
    histogram = cv2.calcHist([hsv_patch], channels, None, HIST_BINS, ranges)
    cv2.normalize(histogram, histogram)
    return histogram.flatten()

def compare_hist(hist1, hist2):
    """Return the correlation (``cv2.HISTCMP_CORREL``) of two histograms.

    Both inputs are cast to float32 first, which ``cv2.compareHist`` needs
    once a histogram has been blended with Python float weights.
    """
    lhs = hist1.astype(np.float32)
    rhs = hist2.astype(np.float32)
    return cv2.compareHist(lhs, rhs, cv2.HISTCMP_CORREL)

#Kalman Filter
class KalmanFilter:
    """Wrapper around ``cv2.KalmanFilter(8, 4)`` used to predict a 2-D point.

    The 8-element state pairs each of its first four components with a rate
    term four slots later (transition ``s[i] += s[i + 4]``); the four
    measured quantities are state components 0..3. Only components 0 and 1
    (x, y) are ever fed real values - the other two measurements are zero.
    """

    def __init__(self, x, y):
        """Create the filter with its posterior state at ``(x, y)``, rest zero."""
        self.kf = cv2.KalmanFilter(8, 4)

        # Identity plus ones on the +4 diagonal for the first four rows.
        transition = np.eye(8, dtype=np.float32)
        transition[np.arange(4), np.arange(4) + 4] = 1
        self.kf.transitionMatrix = transition

        # Measurement picks out state components 0..3.
        self.kf.measurementMatrix = np.eye(4, 8, dtype=np.float32)

        self.kf.processNoiseCov = 1e-3 * np.eye(8, dtype=np.float32)
        self.kf.measurementNoiseCov = 1e-2 * np.eye(4, dtype=np.float32)

        initial_state = np.zeros((8, 1), np.float32)
        initial_state[0, 0] = x
        initial_state[1, 0] = y
        self.kf.statePost = initial_state

    def predict(self):
        """Advance the filter one step and return the predicted ``(x, y)`` as ints."""
        prior = self.kf.predict()
        return int(prior[0, 0]), int(prior[1, 0])

    def correct(self, x, y):
        """Feed the measurement ``(x, y, 0, 0)`` to the filter."""
        measurement = np.array([[x], [y], [0], [0]], np.float32)
        self.kf.correct(measurement)

#Model
# Detector: torchvision Faster R-CNN in eval mode on the best available device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT").to(device).eval()
transform = transforms.Compose([transforms.ToTensor()])

#Load video
cap = cv2.VideoCapture(str(VIDEO_PATH))
assert cap.isOpened(), f"Cannot open video: {VIDEO_PATH}"
W = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
H = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps_in = cap.get(cv2.CAP_PROP_FPS) or 25  # some containers report 0 fps
out = cv2.VideoWriter(str(OUTPUT_PATH), cv2.VideoWriter_fourcc(*'mp4v'), fps_in, (W, H))

#Initialize
tracks = []
frame_idx = 0
start_time = time.time()
ret, frame = cap.read()
frame_idx += 1
assert ret, "Can't read the first frame"
img_tensor = transform(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)).to(device)

# Detect once, on the first frame only.
with torch.no_grad():
    preds = model([img_tensor])[0]

boxes = preds["boxes"].cpu().numpy()
labels = preds["labels"].cpu().numpy()
scores = preds["scores"].cpu().numpy()

# Overlays go on a copy: drawing on `frame` itself would leak the rectangles
# and labels of earlier detections into the trackers and colour histograms
# initialised for later (overlapping) detections.
annotated = frame.copy()

for box, cls, score in zip(boxes, labels, scores):
    if score < CONF_TH:
        continue
    x1, y1, x2, y2 = map(int, box)
    w, h = x2 - x1, y2 - y1
    patch = frame[y1:y2, x1:x2]
    if patch.size == 0:
        continue  # degenerate box, nothing to track
    tracker = cv2.legacy.TrackerMOSSE_create()
    tracker.init(frame, (x1, y1, w, h))
    kalman = KalmanFilter(x1 + w // 2, y1 + h // 2)  # seeded at the box centre
    hist = get_histogram(patch)
    label = COCO.get(int(cls), f"class{int(cls)}")
    tracks.append({
        "id": str(uuid.uuid4())[:8],
        "tracker": tracker,
        "label": label,
        "kalman": kalman,
        "hist": hist,
        "lost": 0
    })
    cv2.rectangle(annotated, (x1, y1), (x2, y2), (0, 0, 255), 2)
    cv2.putText(annotated, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)

out.write(annotated)

#Track across video
while True:
    ret, frame = cap.read()
    if not ret:
        break
    frame_idx += 1
    new_tracks = []

    # All overlays are painted on a copy so that trackers and histograms of
    # later tracks never see the boxes/dots drawn for earlier ones.
    vis = frame.copy()

    for tr in tracks:
        # Advance the Kalman filter on EVERY frame. cv2.KalmanFilter.correct()
        # works from the prior produced by predict(); calling correct()
        # without a preceding predict() uses a never-updated (zero) prior and
        # collapses the state to the origin.
        pred_x, pred_y = tr["kalman"].predict()

        ok, bbox = tr["tracker"].update(frame)
        x, y, w, h = map(int, bbox)
        x2, y2 = x + w, y + h

        # Tracker failure or degenerate box: coast on the Kalman prediction.
        if not ok or w <= 0 or h <= 0:
            tr["lost"] += 1
            if tr["lost"] <= MAX_LOST:
                cv2.circle(vis, (pred_x, pred_y), 4, (255, 255, 0), -1)
                new_tracks.append(tr)
            continue

        patch = frame[y:y+h, x:x+w]
        if patch.size > 0:
            hist_new = get_histogram(patch)
            sim = compare_hist(tr["hist"], hist_new)
            if sim < HIST_THRESH:
                # Appearance drifted too far: treat as lost for this frame.
                tr["lost"] += 1
                if tr["lost"] <= MAX_LOST:
                    cv2.circle(vis, (pred_x, pred_y), 4, (255, 0, 0), -1)
                    new_tracks.append(tr)
                continue
            # Accepted: refine the filter and blend the appearance model.
            tr["kalman"].correct(x + w // 2, y + h // 2)
            tr["hist"] = 0.8 * tr["hist"] + 0.2 * hist_new
            tr["lost"] = 0

        cv2.rectangle(vis, (x, y), (x2, y2), (255, 0, 0), 2)
        cv2.putText(vis, tr["label"], (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 0), 2)
        new_tracks.append(tr)

    tracks = new_tracks

    # Running average FPS since start_time (includes the initial detection).
    elapsed = time.time() - start_time
    fps = frame_idx / elapsed
    cv2.putText(vis, f"FPS: {fps:.1f}", (10, 30),
                cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 255), 2)

    out.write(vis)


cap.release()
out.release()
print(f"✅ Done. Output saved to: {OUTPUT_PATH}")
print(f"📈 Average FPS: {frame_idx / (time.time() - start_time):.2f}")

In [None]:
#CSRT_Tracker
from pathlib import Path
import time

import cv2
import numpy as np
from ultralytics import YOLO

# PATHS: input clip and rendered output, both under ~/Downloads.
downloads = Path.home().joinpath("Downloads")
VIDEO_PATH = downloads.joinpath("videos", "car1.mp4")
OUTPUT_PATH = downloads.joinpath("car1_CSRT_final_output.mp4")

# HYPER-PARAMETERS
CONF_THRESHOLD = 0.05       # confidence passed to the first-frame detector
IOU_THRESH = 0.50           # skip duplicate tracks if IoU > this
HIST_BINS = (16, 16, 16)    # histogram bins per H, S, V channel
MAX_LOST_FRAMES = 60        # keep predicting this many missed frames

def extract_hsv_hist(patch, bins=HIST_BINS):
    """Return the flattened, normalised 3-D HSV histogram of a BGR patch.

    Args:
        patch: BGR image region (converted with ``cv2.COLOR_BGR2HSV``).
        bins: Number of bins per H, S, V channel.
    """
    hsv_patch = cv2.cvtColor(patch, cv2.COLOR_BGR2HSV)
    channel_ranges = [0, 180, 0, 256, 0, 256]
    histogram = cv2.calcHist([hsv_patch], [0, 1, 2], None, bins, channel_ranges)
    cv2.normalize(histogram, histogram)  # in place, cv2 default norm
    return histogram.flatten()

def iou(a, b):
    """Intersection-over-union of two boxes given as ``(x1, y1, x2, y2)``.

    Returns ``0.0`` when the boxes do not overlap (or only touch).
    """
    overlap_w = max(0, min(a[2], b[2]) - max(a[0], b[0]))
    overlap_h = max(0, min(a[3], b[3]) - max(a[1], b[1]))
    inter = overlap_w * overlap_h
    if inter == 0:
        return 0.0

    def _area(box):
        return (box[2] - box[0]) * (box[3] - box[1])

    return inter / (_area(a) + _area(b) - inter)
class Track:
    """One tracked object: a CSRT tracker plus a constant-velocity Kalman filter.

    Attributes set by ``__init__``:
        tracker: ``cv2.TrackerCSRT`` initialised on the given frame and box.
        bbox:    current box as ``[x, y, w, h]``.
        label:   class name given by the caller.
        lost:    counter of consecutive missed frames (starts at 0).
        feature: HSV histogram of the initial box contents.
        kalman:  ``cv2.KalmanFilter(4, 2)`` with state ``(cx, cy, vx, vy)``
                 and measurement ``(cx, cy)``.
    """

    def __init__(self, bbox_xywh, label, frame):
        """Start tracking ``bbox_xywh`` (x, y, w, h) in ``frame``.

        Args:
            bbox_xywh: Box as ``(x, y, width, height)``; values are truncated
                to ``int``.
            label: Display name for the object.
            frame: BGR image the box refers to.
        """
        # Convert to plain Python ints BEFORE handing the box to OpenCV: the
        # caller passes NumPy scalars, which some OpenCV builds refuse to
        # parse as a bounding box.
        x, y, w, h = map(int, bbox_xywh)

        self.tracker = cv2.TrackerCSRT_create()
        self.tracker.init(frame, (x, y, w, h))

        self.bbox  = [x, y, w, h]
        self.label = label
        self.lost  = 0

        self.feature = extract_hsv_hist(frame[y:y+h, x:x+w])

        # Kalman state (cx,cy,vx,vy); position advances by velocity each step.
        cx, cy = x + w/2, y + h/2
        self.kalman = cv2.KalmanFilter(4, 2)
        self.kalman.transitionMatrix = np.array(
            [[1, 0, 1, 0],
             [0, 1, 0, 1],
             [0, 0, 1, 0],
             [0, 0, 0, 1]], np.float32)
        self.kalman.measurementMatrix   = np.eye(2, 4, dtype=np.float32)
        self.kalman.processNoiseCov     = np.eye(4, dtype=np.float32) * 1e-2
        self.kalman.measurementNoiseCov = np.eye(2, dtype=np.float32) * 1e-1
        # Seed both prior and posterior so either call order starts at the box centre.
        self.kalman.statePre  = np.array([[cx], [cy], [0], [0]], np.float32)
        self.kalman.statePost = self.kalman.statePre.copy()

    def predict_center(self):
        """Advance the Kalman filter one step and return the predicted ``(cx, cy)``."""
        p = self.kalman.predict()
        return float(p[0, 0]), float(p[1, 0])

    def correct(self, cx, cy):
        """Feed a measured centre ``(cx, cy)`` to the Kalman filter."""
        self.kalman.correct(np.array([[cx], [cy]], np.float32))

    def box_xyxy(self):
        """Return the current box as ``[x1, y1, x2, y2]``."""
        x, y, w, h = self.bbox
        return [x, y, x + w, y + h]

# First-frame detector.
model = YOLO("yolov8n.pt")

cap = cv2.VideoCapture(str(VIDEO_PATH))
assert cap.isOpened(), f"Cannot open {VIDEO_PATH}"
W, H   = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
# Some containers report 0 fps, which yields an unplayable output file;
# fall back to 25 like the other tracker cells do.
fps_in = cap.get(cv2.CAP_PROP_FPS) or 25

out = cv2.VideoWriter(str(OUTPUT_PATH),
                      cv2.VideoWriter_fourcc(*"mp4v"),
                      fps_in, (W,H))

# Loop state: live tracks, previous grayscale frame and its feature points
# (used for frame-to-frame optical flow), frame counter and wall-clock start.
tracks      = []
prev_gray   = None
prev_pts    = None
frame_idx   = 0
start_time  = time.perf_counter()

# cv2.goodFeaturesToTrack settings.
feature_params = dict(maxCorners=200, qualityLevel=0.3,
                      minDistance=7, blockSize=7)
# cv2.calcOpticalFlowPyrLK settings.
lk_params = dict(winSize=(15,15), maxLevel=2,
                 criteria=(cv2.TERM_CRITERIA_EPS|cv2.TERM_CRITERIA_COUNT,
                           10, 0.03))

#MAIN LOOP
while True:
    ret, frame = cap.read()
    if not ret:
        break
    frame_idx += 1
    t0 = time.perf_counter()
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    #First frame: detect & spawn CSRT trackers
    if frame_idx == 1:
        # NOTE(review): in YOLO's 0-based COCO numbering class 3 is
        # "motorcycle" and "car" is 2 - confirm [0, 3] is really intended
        # for this car clip (3 is "car" only in torchvision's numbering).
        det = model.predict(frame, conf=CONF_THRESHOLD,
                            classes=[0,3], verbose=False)[0]
        det_boxes   = det.boxes.xyxy.cpu().numpy().astype(int)
        det_classes = det.boxes.cls.cpu().numpy().astype(int)
        for box, cls in zip(det_boxes, det_classes):
            name = model.names[int(cls)]
            # Skip a detection that duplicates an existing same-label track.
            if any(tr.label == name and
                   iou(box, tr.box_xyxy()) > IOU_THRESH for tr in tracks):
                continue
            x1, y1, x2, y2 = map(int, box)
            tracks.append(Track((x1, y1, x2 - x1, y2 - y1), name, frame))
        # prev_gray / prev_pts for the next frame are set at the bottom of
        # the loop, so nothing more to do here.
    #Subsequent frames
    else:
        # 1) global motion compensation: estimate a partial-affine transform
        #    from the previous frame's features to this frame. Identity is
        #    used whenever the estimate is unavailable (no features found,
        #    flow failed, too few matches, or RANSAC returned nothing).
        # NOTE(review): M maps previous->current points and is applied
        # forward to the current frame - confirm this is the intended
        # direction for stabilisation.
        M = np.eye(2, 3, dtype=np.float32)
        if prev_pts is not None and len(prev_pts) > 0:
            curr_pts, st, _ = cv2.calcOpticalFlowPyrLK(prev_gray, gray,
                                                      prev_pts, None, **lk_params)
            if curr_pts is not None and st is not None:
                matched   = st.flatten() == 1
                good_prev = prev_pts[matched]
                good_curr = curr_pts[matched]
                if len(good_prev) >= 6:
                    estimate, _ = cv2.estimateAffinePartial2D(
                        good_prev, good_curr, method=cv2.RANSAC)
                    if estimate is not None:
                        M = estimate
        stab = cv2.warpAffine(frame, M, (W,H))
        invM = cv2.invertAffineTransform(M)  # same for every track this frame

        # 2) update each track
        new_tracks = []
        for tr in tracks:
            pcx, pcy = tr.predict_center()
            ok, bbox = tr.tracker.update(stab)
            if ok:
                tr.lost = 0
                tr.bbox = bbox
                x,y,w,h = map(int,bbox)
                tr.correct(x + w/2, y + h/2)
            else:
                tr.lost += 1
                if tr.lost > MAX_LOST_FRAMES:
                    continue  # give up on this track
                # fabricate a bbox around predicted center
                w,h = tr.bbox[2], tr.bbox[3]
                x,y = int(pcx - w/2), int(pcy - h/2)
                tr.bbox = [x,y,w,h]

            # draw back onto original (unstabilised) frame
            x,y,w,h = tr.bbox
            pts  = np.array([[x,y], [x+w,y+h]], np.float32).reshape(-1,1,2)
            ox1,oy1, ox2,oy2 = cv2.transform(pts, invM).reshape(-1,2).flatten()
            cv2.rectangle(frame, (int(ox1),int(oy1)),
                          (int(ox2),int(oy2)), (0,255,0), 2)
            cv2.putText(frame, tr.label, (int(ox1),int(oy1)-10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0,255,0), 2)
            cv2.circle(frame, (int(pcx),int(pcy)), 4, (0,0,255), -1)
            new_tracks.append(tr)
        tracks = new_tracks

    #FPS overlay (instantaneous, this frame only)
    fps = 1.0 / (time.perf_counter() - t0)
    cv2.putText(frame, f"FPS: {fps:.1f}",
                (10,30), cv2.FONT_HERSHEY_SIMPLEX,
                0.9, (0,255,255), 2)

    out.write(frame)
    # Features for the next iteration; may be None when no corners are found.
    prev_gray = gray.copy()
    prev_pts  = cv2.goodFeaturesToTrack(prev_gray, mask=None,
                                        **feature_params)

total = time.perf_counter() - start_time
print(f"Processed {frame_idx} frames in {total:.2f}s — "
      f"Avg FPS: {frame_idx/total:.2f}")
print(f"Output saved to: {OUTPUT_PATH}")

cap.release()
out.release()
cv2.destroyAllWindows()


In [None]:
#KCF_tracker
import cv2
import torch
import numpy as np
from torchvision import models, transforms
from pathlib import Path
import uuid
import time
# PATHS: input clip and rendered output, both under ~/Downloads.
downloads = Path.home().joinpath("Downloads")
VIDEO_PATH = downloads.joinpath("videos", "car1.mp4")
OUTPUT_PATH = downloads.joinpath("car1_kcf_final_output.mp4")

# CONSTANTS
CONF_TH = 0.6             # minimum detector score for a first-frame box to get a tracker
MAX_LOST = 180            # a track is dropped once its lost counter exceeds this
HIST_THRESHOLD = 0.5      # minimum histogram correlation for a tracker box to be accepted
HIST_BINS = (16, 16, 16)  # histogram bins per H, S, V channel

# COCO LABELS: list position is the label id; '' marks ids with no name.
COCO = dict(enumerate([
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', '', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', '', 'backpack', 'umbrella', '', '',
    'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
    'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
    'tennis racket', 'bottle', '', 'wine glass', 'cup', 'fork', 'knife',
    'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli',
    'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
    'potted plant', 'bed', '', 'dining table', '', '', 'toilet', '', 'tv',
    'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave',
    'oven', 'toaster', 'sink', 'refrigerator', '', 'book', 'clock', 'vase',
    'scissors', 'teddy bear', 'hair drier', 'toothbrush',
]))

def get_histogram(patch, bins=HIST_BINS):
    """Return the flattened, normalised 3-D HSV histogram of a BGR patch.

    Args:
        patch: BGR image region (converted with ``cv2.COLOR_BGR2HSV``).
        bins: Number of bins per H, S, V channel.
    """
    hsv_patch = cv2.cvtColor(patch, cv2.COLOR_BGR2HSV)
    channel_ranges = [0, 180, 0, 256, 0, 256]
    histogram = cv2.calcHist([hsv_patch], [0, 1, 2], None, bins, channel_ranges)
    cv2.normalize(histogram, histogram)  # in place, cv2 default norm
    return histogram.flatten()

def compare_hist(hist1, hist2):
    """Return the correlation (``cv2.HISTCMP_CORREL``) of two histograms.

    Both inputs are cast to float32 before the comparison.
    """
    first = hist1.astype(np.float32)
    second = hist2.astype(np.float32)
    return cv2.compareHist(first, second, cv2.HISTCMP_CORREL)

class StrongKalmanFilter:
    """Wrapper around ``cv2.KalmanFilter(8, 4)`` used to predict a 2-D point.

    The 8-element state pairs each of its first four components with a rate
    term four slots later (transition ``s[i] += s[i + 4]``); the four
    measured quantities are state components 0..3. Only components 0 and 1
    (x, y) are ever fed real values - the other two measurements are zero.
    """

    def __init__(self, x, y):
        """Create the filter with its posterior state at ``(x, y)``, rest zero."""
        self.kf = cv2.KalmanFilter(8, 4)

        # Identity plus ones on the +4 diagonal for the first four rows.
        transition = np.eye(8, dtype=np.float32)
        for i in range(4):
            transition[i, i + 4] = 1.0
        self.kf.transitionMatrix = transition

        # Measurement picks out state components 0..3.
        self.kf.measurementMatrix = np.eye(4, 8, dtype=np.float32)

        self.kf.processNoiseCov = np.eye(8, dtype=np.float32) * 1e-2
        self.kf.measurementNoiseCov = np.eye(4, dtype=np.float32) * 1e-1
        self.kf.errorCovPost = np.eye(8, dtype=np.float32)
        self.kf.statePost = np.array([[x], [y], [0], [0], [0], [0], [0], [0]], dtype=np.float32)

    def predict(self):
        """Advance the filter one step and return the predicted ``(x, y)`` as ints."""
        pred = self.kf.predict()
        # pred is an (8, 1) column: index the scalar explicitly. int() on a
        # 1-element array row is deprecated in recent NumPy releases.
        return int(pred[0, 0]), int(pred[1, 0])

    def correct(self, x, y):
        """Feed the measurement ``(x, y, 0, 0)`` to the filter."""
        self.kf.correct(np.array([[x], [y], [0], [0]], dtype=np.float32))

# MODEL: torchvision Faster R-CNN in eval mode on the best available device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")
model.to(device)
model.eval()
transform = transforms.Compose([transforms.ToTensor()])

# VIDEO: open the input and create a writer with the same frame size.
cap = cv2.VideoCapture(str(VIDEO_PATH))
assert cap.isOpened(), f"Could not open {VIDEO_PATH}"
W = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
H = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS) or 25  # fall back when the container reports 0
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(str(OUTPUT_PATH), fourcc, fps, (W, H))

# Loop state: live tracks, frame counter, start time and previous-frame time.
tracks = []
frame_idx = 0
t0 = prev = time.time()

while True:
    ret, frame = cap.read()
    if not ret:
        break
    frame_idx += 1

    # All overlays are painted on a copy so that trackers and histograms of
    # later tracks never see the boxes/dots drawn for earlier ones.
    vis = frame.copy()

    if frame_idx == 1:
        # First frame: run the detector once and spawn one KCF tracker,
        # one colour histogram and one Kalman filter per kept detection.
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        img_tensor = transform(rgb).to(device)
        with torch.no_grad():
            preds = model([img_tensor])[0]

        boxes = preds['boxes'].cpu().numpy()
        labels_raw = preds['labels'].cpu().numpy()
        scores = preds['scores'].cpu().numpy()
        keep = scores >= CONF_TH
        boxes, labels_raw = boxes[keep], labels_raw[keep]

        for box, cls_id in zip(boxes, labels_raw):
            x1, y1, x2, y2 = map(int, box)
            w, h = x2 - x1, y2 - y1
            patch = frame[y1:y2, x1:x2]
            if patch.size == 0:
                continue  # degenerate box, nothing to track
            tracker = cv2.TrackerKCF_create()
            tracker.init(frame, (x1, y1, w, h))
            hist = get_histogram(patch)
            kalman = StrongKalmanFilter(x1 + w//2, y1 + h//2)
            tracks.append({
                "id": str(uuid.uuid4())[:8],
                "tracker": tracker,
                "label": COCO.get(int(cls_id), f"class{cls_id}"),
                "hist": hist,
                "lost": 0,
                "kalman": kalman
            })
    else:
        new_tracks = []
        for tr in tracks:
            # Advance the Kalman filter on EVERY frame. cv2.KalmanFilter
            # .correct() works from the prior produced by predict(); without
            # a preceding predict() the prior is never propagated and the
            # correction is computed against a stale state.
            pred_x, pred_y = tr["kalman"].predict()

            ok, bbox = tr["tracker"].update(frame)
            x, y, w, h = map(int, bbox)
            x2, y2 = x + w, y + h

            # Tracker failure or box leaving the frame: coast on the prediction.
            if not ok or x < 0 or y < 0 or x2 > W or y2 > H:
                tr["lost"] += 1
                if tr["lost"] <= MAX_LOST:
                    cv2.circle(vis, (pred_x, pred_y), 4, (0, 0, 255), -1)
                    new_tracks.append(tr)
                continue

            patch = frame[y:y+h, x:x+w]
            if patch.size > 0:
                hist_new = get_histogram(patch)
                similarity = compare_hist(tr["hist"], hist_new)
                if similarity < HIST_THRESHOLD:
                    # Appearance drifted too far: treat as lost for this frame.
                    tr["lost"] += 1
                    if tr["lost"] <= MAX_LOST:
                        new_tracks.append(tr)
                    continue
                # Accepted: blend the appearance model and refine the filter.
                tr["lost"] = 0
                tr["hist"] = 0.8 * tr["hist"] + 0.2 * hist_new
                tr["kalman"].correct(x + w//2, y + h//2)

            cv2.rectangle(vis, (x, y), (x2, y2), (0, 255, 0), 2)
            cv2.putText(vis, tr["label"], (x, y - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)
            new_tracks.append(tr)

        tracks = new_tracks

    #FPS Overlay (instantaneous; the epsilon guards a zero time delta)
    now = time.time()
    fps_now = 1.0 / (now - prev + 1e-6)
    prev = now
    cv2.putText(vis, f"FPS: {fps_now:.1f}", (10, 30),
                cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 255), 2)

    out.write(vis)


cap.release()
out.release()
total_time = time.time() - t0
avg_fps = frame_idx / total_time
print(f"✅ Saved to: {OUTPUT_PATH}")
print(f"Average FPS: {avg_fps:.2f}")


In [None]:
#KJF_Tracker_Main
import cv2
import numpy as np
import torch
from torchvision import models, transforms
from collections import deque
from pathlib import Path
import time

# ----------- COCO classes -----------
# Label-id -> name table for torchvision's COCO-pretrained detectors.
# Those models emit the original 91-slot COCO numbering, in which ids
# 12, 26, 29, 30, 45, 66, 68, 69, 71 and 83 are unused, so the ids run up to
# 90 rather than forming a contiguous 0..79 range.
CLASSES = {
    0: "background", 1: "person", 2: "bicycle", 3: "car", 4: "motorcycle",
    5: "airplane", 6: "bus", 7: "train", 8: "truck", 9: "boat",
    10: "trafficlight", 11: "firehydrant", 13: "stopsign", 14: "parkingmeter",
    15: "bench", 16: "bird", 17: "cat", 18: "dog", 19: "horse", 20: "sheep",
    21: "cow", 22: "elephant", 23: "bear", 24: "zebra", 25: "giraffe",
    27: "backpack", 28: "umbrella", 31: "handbag", 32: "tie", 33: "suitcase",
    34: "frisbee", 35: "skis", 36: "snowboard", 37: "sportsball", 38: "kite",
    39: "baseballbat", 40: "baseballglove", 41: "skateboard", 42: "surfboard",
    43: "tennisracket", 44: "bottle", 46: "wineglass", 47: "cup", 48: "fork",
    49: "knife", 50: "spoon", 51: "bowl", 52: "banana", 53: "apple",
    54: "sandwich", 55: "orange", 56: "broccoli", 57: "carrot", 58: "hotdog",
    59: "pizza", 60: "donut", 61: "cake", 62: "chair", 63: "couch",
    64: "pottedplant", 65: "bed", 67: "diningtable", 70: "toilet", 72: "tv",
    73: "laptop", 74: "mouse", 75: "remote", 76: "keyboard", 77: "cellphone",
    78: "microwave", 79: "oven", 80: "toaster", 81: "sink", 82: "refrigerator",
    84: "book", 85: "clock", 86: "vase", 87: "scissors", 88: "teddybear",
    89: "hairdrier", 90: "toothbrush",
}

# ----------- Settings -----------
# Input clip and rendered output, both under ~/Downloads (kept as strings).
VIDEO_PATH = str(Path.home().joinpath("Downloads", "videos", "person4.mp4"))
OUTPUT_PATH = str(Path.home().joinpath("Downloads", "person4_frcnn_kjf_output.mp4"))
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

CONF_TH = 0.5        # Detection confidence threshold
MIN_LK_POINTS = 3    # Minimum good LK points to trust tracking
REINIT_KP = 2        # Frames before reinit keypoints
DROP_LIMIT = 500     # Drop track if lost too long
# Keypoint-extraction settings; presumably fed to get_distributed_kps
# (grid shape, corner budgets, minimum corner spacing) - their use is
# outside this excerpt, confirm there.
GRID = (4, 4)
MAX_CORNERS = [4000, 3000]
MIN_DIST = [3, 3]

# ----------- Kalman Filter Class -----------
class KalmanBox:
    """Kalman-filtered bounding box with a running id and a lost counter.

    State vector (10 values): ``[cx, cy, vx, vy, ax, ay, w, h, vw, vh]`` -
    centre with velocity and acceleration, size with a rate term. The four
    measured quantities are ``cx, cy, w, h`` (state slots 0, 1, 6, 7).
    The time step is fixed at 1/30.
    """

    count = 0  # class-wide counter handing out consecutive track ids

    def __init__(self, box):
        """Start a filter from ``box = (x1, y1, x2, y2)``."""
        self.id = KalmanBox.count
        KalmanBox.count += 1
        self.lost = 0
        self.kf = cv2.KalmanFilter(10, 4)

        # Measurement matrix: rows pick cx, cy, w, h out of the state.
        measure = np.zeros((4, 10), np.float32)
        for row, col in enumerate((0, 1, 6, 7)):
            measure[row, col] = 1
        self.kf.measurementMatrix = measure

        # Transition matrix: identity plus the dt / dt^2 coupling terms.
        dt = 1/30.0
        dt2 = 0.5*(1/30.0)**2
        transition = np.eye(10, dtype=np.float32)
        couplings = (
            (0, 2, dt), (0, 4, dt2),   # cx <- vx, ax
            (1, 3, dt), (1, 5, dt2),   # cy <- vy, ay
            (2, 4, dt), (3, 5, dt),    # vx <- ax, vy <- ay
            (6, 8, dt), (7, 9, dt),    # w  <- vw, h  <- vh
        )
        for row, col, value in couplings:
            transition[row, col] = value
        self.kf.transitionMatrix = transition

        self.kf.processNoiseCov = 0.01 * np.eye(10, dtype=np.float32)
        self.kf.measurementNoiseCov = 0.02 * np.eye(4, dtype=np.float32)

        cx, cy, w, h = self._center_size(box)
        self.kf.statePost = np.array(
            [cx, cy, 0, 0, 0, 0, w, h, 0, 0], np.float32
        ).reshape(-1, 1)
        self.last = self._to_box(self.kf.statePost.flatten())

    @staticmethod
    def _center_size(box):
        """Convert ``(x1, y1, x2, y2)`` to ``(cx, cy, w, h)``."""
        x1, y1, x2, y2 = box
        return (x1+x2)/2, (y1+y2)/2, x2-x1, y2-y1

    def _to_box(self, s):
        """Convert a flat state vector to an integer ``(x1, y1, x2, y2)`` box."""
        cx, cy = s[0], s[1]
        w, h = s[6], s[7]
        return (int(cx-w/2), int(cy-h/2), int(cx+w/2), int(cy+h/2))

    def predict(self):
        """Advance one step, count the frame as lost, and return the predicted box."""
        state = self.kf.predict().flatten()
        self.lost += 1
        self.last = self._to_box(state)
        return self.last

    def update(self, box):
        """Correct the filter with a measured ``(x1, y1, x2, y2)`` box and reset ``lost``."""
        cx, cy, w, h = self._center_size(box)
        measurement = np.array([cx, cy, w, h], np.float32).reshape(-1, 1)
        self.kf.correct(measurement)
        self.lost = 0
        self.last = self._to_box(self.kf.statePost.flatten())

# ----------- Helper Functions -----------
def get_distributed_kps(gray, grid, max_c, min_d):
    """Collect corner keypoints cell by cell so they spread over the image.

    The image is split into ``grid = (rows, cols)`` cells and
    ``cv2.goodFeaturesToTrack`` is run on each with an equal share of the
    corner budget; cell-local coordinates are shifted back to image
    coordinates.

    Args:
        gray: 2-D (single-channel) image.
        grid: ``(rows, cols)`` of the cell layout.
        max_c: Total corner budget, divided evenly among the cells.
        min_d: ``minDistance`` passed to ``cv2.goodFeaturesToTrack``.

    Returns:
        ``float32`` array of shape ``(N, 1, 2)`` with ``(x, y)`` points, or
        ``None`` when no cell produced a corner.
    """
    h, w = gray.shape
    rows, cols = grid
    per_cell = max_c // (rows * cols)

    pts = []
    for r in range(rows):
        top, bottom = r*h//rows, (r+1)*h//rows
        for c in range(cols):
            left, right = c*w//cols, (c+1)*w//cols
            corners = cv2.goodFeaturesToTrack(
                gray[top:bottom, left:right],
                maxCorners=per_cell,
                qualityLevel=0.05, minDistance=min_d)
            if corners is None:
                continue
            for [[x, y]] in corners:
                pts.append([x + left, y + top])

    if not pts:
        return None
    return np.array(pts, np.float32).reshape(-1, 1, 2)

def filter_pts(nw, old, max_m=25, thr=2.08, sk=100):
    """Build a boolean keep-mask for tracked points.

    A point is kept when all three tests pass:
      * its displacement ``|nw - old|`` is below ``max_m``;
      * its distance to the centroid of ``nw`` is at most ``thr`` times the
        mean centroid distance of all points except the first;
      * its displacement lies within ``sk`` standard deviations of the mean
        displacement.

    Args:
        nw: ``(N, 2)`` array of new point positions.
        old: ``(N, 2)`` array of the matching previous positions.
        max_m, thr, sk: thresholds described above.

    Returns:
        Boolean array of length N (empty when either input is empty).
        A single point is always rejected: there is no other point to
        derive the cluster scale from.
    """
    if len(nw) == 0 or len(old) == 0:
        return np.array([], bool)

    # Displacement is needed by two of the tests; compute it once.
    step = np.linalg.norm(nw - old, axis=1)
    moved_ok = step < max_m

    spread = np.linalg.norm(nw - nw.mean(0), axis=1)
    if len(spread) > 1:
        clustered = spread <= thr * np.mean(spread[1:])
    else:
        # Avoid mean-of-empty (NaN plus RuntimeWarning); same all-False result.
        clustered = np.zeros(len(spread), bool)

    mu, sigma = step.mean(), step.std()
    speed_ok = (step >= mu - sk * sigma) & (step <= mu + sk * sigma)
    return moved_ok & clustered & speed_ok

# ----------- Main Script -----------
cap = cv2.VideoCapture(VIDEO_PATH)
ret, frame_bgr = cap.read()
if not ret:
    cap.release()
    raise RuntimeError("Cannot read video")

# Take the grayscale reference BEFORE anything is drawn on the frame.
# Otherwise the green rectangles and label text become the strongest corners
# inside every box, and they do not exist in the next (clean) frame.
gray0 = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY)

# Run Faster R-CNN once on the first frame to seed the trackers.
frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
model = models.detection.fasterrcnn_resnet50_fpn(
    weights=models.detection.FasterRCNN_ResNet50_FPN_Weights.DEFAULT
).to(DEVICE).eval()
transform = transforms.Compose([transforms.ToTensor()])
img_t = transform(frame_rgb).to(DEVICE)

with torch.no_grad():
    preds = model([img_t])[0]

boxes  = preds["boxes"].cpu().numpy().astype(int)
labels = preds["labels"].cpu().numpy()
scores = preds["scores"].cpu().numpy()

# Keep only confident detections.
keep = scores >= CONF_TH
boxes, labels, scores = boxes[keep], labels[keep], scores[keep]

h0, w0 = frame_bgr.shape[:2]
out = cv2.VideoWriter(
    OUTPUT_PATH,
    cv2.VideoWriter_fourcc(*'mp4v'),
    cap.get(cv2.CAP_PROP_FPS),
    (w0, h0)
)
# Draw detection boxes on first frame (output only; tracking uses gray0)
for (x1,y1,x2,y2), cls_id, sc in zip(boxes, labels, scores):
    name = CLASSES.get(int(cls_id), str(int(cls_id)))
    label = f"{name} {sc:.2f}"
    cv2.rectangle(frame_bgr, (x1,y1), (x2,y2), (0,255,0), 2)
    (tw,th),_ = cv2.getTextSize(label,
                  cv2.FONT_HERSHEY_SIMPLEX, 0.7, 2)
    cv2.rectangle(frame_bgr,
                  (x1,y1-th-8),(x1+tw,y1),
                  (0,255,0), -1)
    cv2.putText(frame_bgr, label,
                (x1,y1-4),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.7, (0,0,0), 2)
out.write(frame_bgr)

# One tracker record per detection: Kalman box + LK keypoints + point history.
trackers = []
for i, (b, cid) in enumerate(zip(boxes, labels)):
    kf = KalmanBox(tuple(b))
    x1,y1,x2,y2 = b
    # Detections beyond the configured list reuse its last entry.
    kp = get_distributed_kps(
        gray0[y1:y2, x1:x2],
        GRID,
        MAX_CORNERS[min(i,len(MAX_CORNERS)-1)],
        MIN_DIST   [min(i,len(MIN_DIST)-1)]
    )
    if kp is not None and len(kp) >= MIN_LK_POINTS:
        # Keypoints come back in crop coordinates; shift to frame coordinates.
        kp[:,:,0] += x1; kp[:,:,1] += y1
        history = deque([kp.reshape(-1,2)], maxlen=REINIT_KP)
    else:
        kp, history = None, deque([], maxlen=REINIT_KP)
    trackers.append({
        'kf': kf,
        'p0': kp,
        'history': history,
        'cls': int(cid),
        'lost': 0,
        'age': 0   # age field for track memory
    })

old_gray = gray0.copy()
frame_count = 1
start_time = time.time()

while True:
    ret, frame = cap.read()
    if not ret: break
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    frame_count += 1

    # Iterate over a copy because tracks may be removed inside the loop.
    for obj in trackers[:]:
        kf = obj['kf']
        obj['age'] += 1

        # Kalman time update, once per frame for every track.
        # cv2.KalmanFilter.correct() works from the a-priori state that only
        # predict() refreshes; correcting without predicting first leaves the
        # filter on its initial (all-zero) a-priori state.
        x1,y1,x2,y2 = kf.predict()

        # LK Optical Flow
        if obj['p0'] is not None:
            p1, st, _ = cv2.calcOpticalFlowPyrLK(
                old_gray, gray, obj['p0'], None,
                winSize=(15,15), maxLevel=2,
                criteria=(cv2.TERM_CRITERIA_EPS|
                          cv2.TERM_CRITERIA_COUNT,10,0.03)
            )
        else:
            p1, st = None, 0

        # If enough points survive: correct the filter with the hull box
        tracked = False
        if p1 is not None and st.sum() >= MIN_LK_POINTS:
            newp = p1[st==1].reshape(-1,2)
            oldp = obj['p0'][st==1].reshape(-1,2)
            pts  = newp[filter_pts(newp,oldp)]
            if len(pts) >= MIN_LK_POINTS:
                obj['history'].append(pts)
                allpts = np.vstack(obj['history'])
                hull   = cv2.convexHull(allpts.astype(np.float32))
                bx,by,bw,bh = cv2.boundingRect(hull)
                # Measurement update from the hull-based box
                kf.update((bx,by,bx+bw,by+bh))
                x1, y1, x2, y2 = bx, by, bx+bw, by+bh
                obj['p0'] = pts.reshape(-1,1,2)
                obj['lost'] = 0
                tracked = True
        if not tracked:
            # Keep the predicted box and count the miss.
            obj['p0'], obj['lost'] = None, obj['lost']+1

        # Re-init keypoints after several lost frames
        # NOTE(review): '==' means re-init is attempted only once per loss
        # streak; confirm that no retry is intended.
        if obj['lost'] == REINIT_KP:
            xx1,yy1 = max(0,x1), max(0,y1)
            xx2,yy2 = min(w0,x2), min(h0,y2)
            newkp = None
            # Only search crops that give every grid cell at least one pixel;
            # a box that drifted off-frame or collapsed yields an empty crop.
            if (yy2-yy1) >= GRID[0] and (xx2-xx1) >= GRID[1]:
                newkp = get_distributed_kps(
                    gray[yy1:yy2,xx1:xx2],
                    GRID, 2000, 5
                )
            if newkp is not None and len(newkp) >= MIN_LK_POINTS:
                newkp[:,:,0]+=xx1; newkp[:,:,1]+=yy1
                obj['history'].clear()
                obj['history'].append(newkp.reshape(-1,2))
                obj['p0'], obj['lost'] = newkp, 0

        # Drop track if lost for too long
        if obj['lost'] >= DROP_LIMIT:
            trackers.remove(obj)
            continue

        # Draw TIGHT bounding box always around the object
        color = (0,255,0)
        if obj['p0'] is not None and len(obj['p0']) >= MIN_LK_POINTS:
            pts = obj['p0'].reshape(-1,2)
            hull = cv2.convexHull(pts.astype(np.float32))
            bx, by, bw, bh = cv2.boundingRect(hull)
            cv2.rectangle(frame, (bx,by), (bx+bw,by+bh), color, 2)
            label = CLASSES.get(obj['cls'], str(obj['cls']))
            cv2.putText(frame, f"{label} #{obj['kf'].id} age:{obj['age']}", (bx, by-6),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)
            for pt in obj['p0']:
                cv2.circle(frame, tuple(pt[0].astype(int)), 2, (0,0,255), -1)
        else:
            # Draw predicted box if no points
            x1, y1, x2, y2 = kf.last
            cv2.rectangle(frame, (x1,y1), (x2,y2), (255,0,0), 2)
            label = CLASSES.get(obj['cls'], str(obj['cls']))
            cv2.putText(frame, f"{label} #{obj['kf'].id} age:{obj['age']}", (x1, y1-6),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255,0,0), 2)

    out.write(frame)
    # 'gray' was taken before drawing, so the next LK step sees a clean image.
    old_gray = gray.copy()

cap.release()
out.release()
cv2.destroyAllWindows()

total_time = time.time() - start_time
fps = frame_count / total_time
print(f"✅ Done – saved to {OUTPUT_PATH}")
print(f"Average FPS: {fps:.2f}")

In [None]:
#KJF_unsuccessful
import cv2
import numpy as np
import torch
from torchvision import models, transforms
from torchvision.models import ResNet18_Weights
from collections import deque
from pathlib import Path
import time

#COCO classes
# Label ids as emitted by torchvision's COCO-trained Faster R-CNN: the
# original 91-slot COCO numbering with 0 as background. Ids 12, 26, 29, 30,
# 45, 66, 68, 69, 71 and 83 are unused in COCO and therefore absent here.
CLASSES = {
    0:"background", 1:"person", 2:"bicycle", 3:"car", 4:"motorcycle", 5:"airplane",
    6:"bus", 7:"train", 8:"truck", 9:"boat", 10:"trafficlight", 11:"firehydrant",
    13:"stopsign", 14:"parkingmeter", 15:"bench", 16:"bird", 17:"cat", 18:"dog",
    19:"horse", 20:"sheep", 21:"cow", 22:"elephant", 23:"bear", 24:"zebra",
    25:"giraffe", 27:"backpack", 28:"umbrella", 31:"handbag", 32:"tie",
    33:"suitcase", 34:"frisbee", 35:"skis", 36:"snowboard", 37:"sportsball",
    38:"kite", 39:"baseballbat", 40:"baseballglove", 41:"skateboard",
    42:"surfboard", 43:"tennisracket", 44:"bottle", 46:"wineglass", 47:"cup",
    48:"fork", 49:"knife", 50:"spoon", 51:"bowl", 52:"banana", 53:"apple",
    54:"sandwich", 55:"orange", 56:"broccoli", 57:"carrot", 58:"hotdog",
    59:"pizza", 60:"donut", 61:"cake", 62:"chair", 63:"couch", 64:"pottedplant",
    65:"bed", 67:"diningtable", 70:"toilet", 72:"tv", 73:"laptop", 74:"mouse",
    75:"remote", 76:"keyboard", 77:"cellphone", 78:"microwave", 79:"oven",
    80:"toaster", 81:"sink", 82:"refrigerator", 84:"book", 85:"clock",
    86:"vase", 87:"scissors", 88:"teddybear", 89:"hairdrier", 90:"toothbrush",
}

#Settings
# Input video and annotated output, both under the user's Downloads folder.
VIDEO_PATH = str(Path.home()/"Downloads"/"videos"/"person2.mp4")
OUTPUT_PATH = str(Path.home()/"Downloads"/"person2_frcnn_kjf_output.mp4")
# Run on the GPU when one is available.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Minimum detector score for a first-frame detection to be kept.
CONF_TH = 0.5
# Minimum number of LK points for a track to count as tracked.
MIN_LK_POINTS = 3
# Used twice: the 'lost' count at which keypoints are re-detected, and the
# maxlen of each track's point-history deque.
REINIT_KP = 2
# A track whose 'lost' count reaches this value is moved to lost_tracks.
DROP_LIMIT = 500
# (rows, cols) of the cell grid used when detecting keypoints.
GRID = (4, 4)
# Corner budget and minimum corner spacing per detection index; detections
# past the end of the list reuse the last entry.
MAX_CORNERS = [4000, 3000]
MIN_DIST = [3, 3]
# Lost tracks older than this many frames are forgotten.
REID_MAX_AGE = 60
# Re-ID requires histogram similarity (1 - Bhattacharyya distance) above
# 1 - REID_SIM_THRESH.
REID_SIM_THRESH = 0.30
# Re-ID requires HOG cosine similarity above this value.
HOG_SIM_THRESH = 0.6
DEEP_SIM_THRESH = 0.85  # Cosine similarity for deep features (higher = more similar)

# Appearance embedding network: ResNet-18 with its classification head
# replaced by an identity, so a forward pass returns the pooled features.
deep_model = models.resnet18(weights=ResNet18_Weights.DEFAULT)
deep_model.fc = torch.nn.Identity()
deep_model.eval()
deep_model = deep_model.to(DEVICE)

# Patch preprocessing: resize to 128x64 (height x width), convert to a
# tensor, then normalise with the mean/std listed below.
deep_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize(size=(128, 64)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225]),
])
def extract_deep_feature(img, box):
    """Return an L2-normalised deep embedding of the image patch inside ``box``.

    Args:
        img: BGR frame as read by OpenCV (H x W x 3).
        box: ``(x1, y1, x2, y2)`` with exclusive right/bottom edges; it is
            clipped to the image.

    Returns:
        1-D float array, or None when the clipped patch is smaller than
        16x16 or the network call fails.
    """
    x1, y1, x2, y2 = [int(e) for e in box]
    # Slice ends are exclusive, so clamp to the image size itself; clamping
    # to size-1 would cut the last row/column off boxes touching the border.
    x1 = max(0, x1); y1 = max(0, y1)
    x2 = min(img.shape[1], x2); y2 = min(img.shape[0], y2)
    patch = img[y1:y2, x1:x2]
    if patch.size == 0 or patch.shape[0] < 16 or patch.shape[1] < 16:
        return None
    try:
        # The ImageNet weights and normalisation constants are defined for
        # RGB input, while OpenCV frames are BGR.
        patch_rgb = cv2.cvtColor(patch, cv2.COLOR_BGR2RGB)
        inp = deep_transform(patch_rgb).unsqueeze(0).to(DEVICE)
        with torch.no_grad():
            feat = deep_model(inp).cpu().numpy().flatten()
    except (RuntimeError, ValueError, TypeError):
        # Best effort: a failed embedding just means "no deep feature".
        return None
    return feat / (np.linalg.norm(feat) + 1e-8)

def deep_sim(f1, f2):
    """Cosine similarity of two feature vectors.

    Returns 0 when either vector is None or their lengths differ.
    """
    if f1 is None or f2 is None:
        return 0
    if len(f1) != len(f2):
        return 0
    norm_a = np.linalg.norm(f1) + 1e-8
    norm_b = np.linalg.norm(f2) + 1e-8
    return float(np.dot(f1, f2) / norm_a / norm_b)

#Color histogram
def color_hist(img, box):
    """Return a flattened, normalised 8x8x8 HSV histogram of the patch in ``box``.

    Args:
        img: BGR frame (H x W x 3).
        box: ``(x1, y1, x2, y2)`` with exclusive right/bottom edges; it is
            clipped to the image.

    Returns:
        1-D array of 512 values, or None when the clipped patch is smaller
        than 5x5.
    """
    x1, y1, x2, y2 = [int(e) for e in box]
    # Slice ends are exclusive, so clamp to the image size itself; clamping
    # to size-1 would cut the last row/column off boxes touching the border.
    x1 = max(0, x1); y1 = max(0, y1)
    x2 = min(img.shape[1], x2); y2 = min(img.shape[0], y2)
    patch = img[y1:y2, x1:x2]
    if patch.size == 0 or patch.shape[0] < 5 or patch.shape[1] < 5:
        return None
    hsv = cv2.cvtColor(patch, cv2.COLOR_BGR2HSV)
    hist = cv2.calcHist([hsv], [0,1,2], None, [8,8,8], [0,180,0,256,0,256])
    cv2.normalize(hist, hist)  # normalised in place
    return hist.flatten()
def hist_sim(h1, h2):
    """Compare two flattened 8x8x8 histograms with HISTCMP_BHATTACHARYYA.

    Returns the value from ``cv2.compareHist``; the re-ID code in this file
    turns it into a similarity as ``1.0 - hist_sim(...)``. When either
    histogram is None the result is 1.0.
    """
    if h1 is not None and h2 is not None:
        first = h1.reshape(8, 8, 8)
        second = h2.reshape(8, 8, 8)
        return cv2.compareHist(first, second, cv2.HISTCMP_BHATTACHARYYA)
    return 1.0

#HOG descriptor
def compute_hog(img, box):
    """Return the HOG descriptor of the patch in ``box``, resized to 64x128.

    Args:
        img: BGR frame (H x W x 3).
        box: ``(x1, y1, x2, y2)`` with exclusive right/bottom edges; it is
            clipped to the image.

    Returns:
        Flattened descriptor, or None when the clipped patch is smaller
        than 16x16 or OpenCV returns no descriptor.
    """
    x1, y1, x2, y2 = [int(e) for e in box]
    # Slice ends are exclusive, so clamp to the image size itself; clamping
    # to size-1 would cut the last row/column off boxes touching the border.
    x1 = max(0, x1); y1 = max(0, y1)
    x2 = min(img.shape[1], x2); y2 = min(img.shape[0], y2)
    patch = img[y1:y2, x1:x2]
    WIN_W, WIN_H = 64, 128
    if patch.size == 0 or patch.shape[0] < 16 or patch.shape[1] < 16:
        return None
    gray = cv2.cvtColor(patch, cv2.COLOR_BGR2GRAY)
    # Every patch is resized to one window so descriptors share a length.
    resized = cv2.resize(gray, (WIN_W, WIN_H))
    hog = cv2.HOGDescriptor(_winSize=(WIN_W, WIN_H),
                            _blockSize=(16,16),
                            _blockStride=(8,8),
                            _cellSize=(8,8),
                            _nbins=9)
    desc = hog.compute(resized)
    return desc.flatten() if desc is not None else None
def hog_sim(h1, h2):
    """Cosine similarity of two HOG descriptors.

    Returns 0 when either descriptor is None or their lengths differ.
    """
    if h1 is None or h2 is None:
        return 0
    if len(h1) != len(h2):
        return 0
    unit_a = h1 / (np.linalg.norm(h1) + 1e-8)
    unit_b = h2 / (np.linalg.norm(h2) + 1e-8)
    return float(np.dot(unit_a, unit_b))

#Kalman Filter Class
class KalmanBox:
    """Kalman-filtered bounding box plus a short appearance memory.

    The filter has 10 state values and 4 measurements. The measurement
    matrix observes state indices 0, 1, 6 and 7, which ``_to_box`` reads as
    centre x, centre y, width and height.

    Each instance receives a unique ``id`` from the class-level ``count``.
    ``memory``, ``memory_hog`` and ``memory_deep`` hold recent colour
    histograms, HOG descriptors and deep features.
    """
    count = 0

    def __init__(self, box, hist, hog_desc, deep_feat):
        """Create a filter initialised on ``box`` = (x1, y1, x2, y2).

        ``hist``, ``hog_desc`` and ``deep_feat`` are the initial appearance
        features; any of them may be None.
        """
        self.id = KalmanBox.count; KalmanBox.count += 1
        self.lost = 0
        self.kf = cv2.KalmanFilter(10, 4)
        M = np.zeros((4,10), np.float32)
        M[[0,1,2,3],[0,1,6,7]] = 1
        self.kf.measurementMatrix = M
        # Fixed time step of 1/30 s for the transition model.
        dt, dt2 = 1/30.0, 0.5*(1/30.0)**2
        T = np.eye(10, dtype=np.float32)
        T[0,2],T[0,4]=dt,dt2; T[1,3],T[1,5]=dt,dt2
        T[2,4],T[3,5]=dt,dt;   T[6,8],T[7,9]=dt,dt
        self.kf.transitionMatrix    = T
        self.kf.processNoiseCov     = np.eye(10, dtype=np.float32)*0.01
        self.kf.measurementNoiseCov = np.eye(4,  dtype=np.float32)*0.02
        x1,y1,x2,y2 = box
        cx,cy = (x1+x2)/2,(y1+y2)/2
        w,h   = x2-x1,    y2-y1
        self.kf.statePost = np.array([cx,cy,0,0,0,0,w,h,0,0],np.float32).reshape(-1,1)
        self.last = self._to_box(self.kf.statePost.flatten())
        # Memory for appearance
        self.max_mem = 5
        self.hist = hist
        self.hog_desc = hog_desc
        self.deep_feat = deep_feat
        self.memory = [hist] if hist is not None else []
        self.memory_hog = [hog_desc] if hog_desc is not None else []
        self.memory_deep = [deep_feat] if deep_feat is not None else []
        self.age = 0
        self.last_frame = 0
        self.last_box = box

    def _to_box(self, s):
        """Convert a flat state vector to integer (x1, y1, x2, y2)."""
        cx,cy,w,h = s[0],s[1],s[6],s[7]
        return (int(cx-w/2), int(cy-h/2), int(cx+w/2), int(cy+h/2))

    def _remember(self, memory, item):
        """Append ``item`` to ``memory`` and trim it to ``max_mem`` entries."""
        memory.append(item)
        # Trim by the full excess, not one element: other code appends to
        # these lists directly, and popping one entry per update would leave
        # an over-long memory over-long forever.
        excess = len(memory) - self.max_mem
        if excess > 0:
            del memory[:excess]

    def predict(self):
        """Advance the filter one step, count a lost frame, return the box."""
        s = self.kf.predict().flatten()
        self.lost += 1
        self.last = self._to_box(s)
        return self.last

    def update(self, box, hist=None, hog_desc=None, deep_feat=None):
        """Correct the filter with ``box`` and store any features given.

        ``box`` is (x1, y1, x2, y2). Resets ``lost`` to 0. Each non-None
        feature becomes the current one and is added to its memory.
        """
        x1,y1,x2,y2 = box
        cx,cy = (x1+x2)/2,(y1+y2)/2
        w,h   = x2-x1,    y2-y1
        meas  = np.array([cx,cy,w,h],np.float32).reshape(-1,1)
        self.kf.correct(meas)
        self.lost = 0
        self.last = self._to_box(self.kf.statePost.flatten())
        self.last_box = box
        if hist is not None:
            self.hist = hist
            self._remember(self.memory, hist)
        if hog_desc is not None:
            self.hog_desc = hog_desc
            self._remember(self.memory_hog, hog_desc)
        if deep_feat is not None:
            self.deep_feat = deep_feat
            self._remember(self.memory_deep, deep_feat)

    def get_hist(self):
        """Mean of the remembered histograms, or the current one if none."""
        if len(self.memory) == 0: return self.hist
        return np.mean(self.memory, axis=0)

    def get_hog(self):
        """Mean of the remembered HOG descriptors, or the current one if none."""
        if len(self.memory_hog) == 0: return self.hog_desc
        return np.mean(self.memory_hog, axis=0)

    def get_deep(self):
        """Mean of the remembered deep features, or the current one if none."""
        if len(self.memory_deep) == 0: return self.deep_feat
        return np.mean(self.memory_deep, axis=0)

#Helper Functions
def get_distributed_kps(gray, grid, max_c, min_d):
    """Detect corners cell by cell so keypoints are spread over the image.

    Args:
        gray: 2-D grayscale image (may be a crop).
        grid: ``(rows, cols)`` number of cells to split ``gray`` into.
        max_c: total corner budget; each cell gets ``max_c // (rows*cols)``,
            but never less than 1.
        min_d: ``minDistance`` forwarded to ``cv2.goodFeaturesToTrack``.

    Returns:
        float32 array of shape ``(N, 1, 2)`` with (x, y) in the coordinates
        of ``gray``, or None when no corner was found.
    """
    h, w = gray.shape
    gh, gw = grid
    # goodFeaturesToTrack treats maxCorners <= 0 as "no limit", so a budget
    # smaller than the cell count must not round down to zero.
    per_cell = max(1, max_c // (gh * gw))
    pts = []
    for i in range(gh):
        y0, y1 = i * h // gh, (i + 1) * h // gh
        for j in range(gw):
            x0, x1 = j * w // gw, (j + 1) * w // gw
            roi = gray[y0:y1, x0:x1]
            # Crops smaller than the grid (or empty crops from a box that
            # left the frame) give zero-sized cells; OpenCV rejects those.
            if roi.size == 0:
                continue
            c = cv2.goodFeaturesToTrack(roi,
                maxCorners=per_cell,
                qualityLevel=0.05, minDistance=min_d)
            if c is not None:
                for [[x, y]] in c:
                    # Shift from cell-local back to crop coordinates.
                    pts.append([x + x0, y + y0])
    return np.array(pts, np.float32).reshape(-1, 1, 2) if pts else None

def filter_pts(nw, old, max_m=25, thr=2.08, sk=100):
    """Build a boolean keep-mask for tracked points.

    A point is kept when all three tests pass:
      * its displacement ``|nw - old|`` is below ``max_m``;
      * its distance to the centroid of ``nw`` is at most ``thr`` times the
        mean centroid distance of all points except the first;
      * its displacement lies within ``sk`` standard deviations of the mean
        displacement.

    Args:
        nw: ``(N, 2)`` array of new point positions.
        old: ``(N, 2)`` array of the matching previous positions.
        max_m, thr, sk: thresholds described above.

    Returns:
        Boolean array of length N (empty when either input is empty).
        A single point is always rejected: there is no other point to
        derive the cluster scale from.
    """
    if len(nw) == 0 or len(old) == 0:
        return np.array([], bool)

    # Displacement is needed by two of the tests; compute it once.
    step = np.linalg.norm(nw - old, axis=1)
    moved_ok = step < max_m

    spread = np.linalg.norm(nw - nw.mean(0), axis=1)
    if len(spread) > 1:
        clustered = spread <= thr * np.mean(spread[1:])
    else:
        # Avoid mean-of-empty (NaN plus RuntimeWarning); same all-False result.
        clustered = np.zeros(len(spread), bool)

    mu, sigma = step.mean(), step.std()
    speed_ok = (step >= mu - sk * sigma) & (step <= mu + sk * sigma)
    return moved_ok & clustered & speed_ok

#Main Script
cap = cv2.VideoCapture(VIDEO_PATH)
ret, frame_bgr = cap.read()
if not ret:
    cap.release()
    raise RuntimeError("Cannot read video")

# Keep an un-annotated copy of the first frame. Keypoints and appearance
# features must come from the real image, not from the rectangles and label
# text drawn onto frame_bgr below.
clean_bgr = frame_bgr.copy()
gray0 = cv2.cvtColor(clean_bgr, cv2.COLOR_BGR2GRAY)

# Run Faster R-CNN once on the first frame to seed the trackers.
frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
model = models.detection.fasterrcnn_resnet50_fpn(
    weights=models.detection.FasterRCNN_ResNet50_FPN_Weights.DEFAULT
).to(DEVICE).eval()
transform = transforms.Compose([transforms.ToTensor()])
img_t = transform(frame_rgb).to(DEVICE)

with torch.no_grad():
    preds = model([img_t])[0]

boxes  = preds["boxes"].cpu().numpy().astype(int)
labels = preds["labels"].cpu().numpy()
scores = preds["scores"].cpu().numpy()

# Keep only confident detections.
keep = scores >= CONF_TH
boxes, labels, scores = boxes[keep], labels[keep], scores[keep]

h0, w0 = frame_bgr.shape[:2]
out = cv2.VideoWriter(
    OUTPUT_PATH,
    cv2.VideoWriter_fourcc(*'mp4v'),
    cap.get(cv2.CAP_PROP_FPS),
    (w0, h0)
)
# Draw detection boxes on first frame (output only)
for (x1,y1,x2,y2), cls_id, sc in zip(boxes, labels, scores):
    name = CLASSES.get(int(cls_id), str(int(cls_id)))
    label = f"{name} {sc:.2f}"
    cv2.rectangle(frame_bgr, (x1,y1), (x2,y2), (0,255,0), 2)
    (tw,th),_ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.7, 2)
    cv2.rectangle(frame_bgr, (x1,y1-th-8),(x1+tw,y1), (0,255,0), -1)
    cv2.putText(frame_bgr, label, (x1,y1-4), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0,0,0), 2)
out.write(frame_bgr)

trackers = []
lost_tracks = []  # memory for lost tracks

# One tracker record per detection: Kalman box with appearance features,
# LK keypoints and point history.
for i, (b, cid) in enumerate(zip(boxes, labels)):
    hist = color_hist(clean_bgr, b)
    hog = compute_hog(clean_bgr, b)
    deep = extract_deep_feature(clean_bgr, b)
    kf = KalmanBox(tuple(b), hist, hog, deep)
    x1,y1,x2,y2 = b
    # Detections beyond the configured list reuse its last entry.
    kp = get_distributed_kps(
        gray0[y1:y2, x1:x2],
        GRID,
        MAX_CORNERS[min(i,len(MAX_CORNERS)-1)],
        MIN_DIST   [min(i,len(MIN_DIST)-1)]
    )
    if kp is not None and len(kp) >= MIN_LK_POINTS:
        # Keypoints come back in crop coordinates; shift to frame coordinates.
        kp[:,:,0] += x1; kp[:,:,1] += y1
        history = deque([kp.reshape(-1,2)], maxlen=REINIT_KP)
    else:
        kp, history = None, deque([], maxlen=REINIT_KP)
    trackers.append({
        'kf': kf,
        'p0': kp,
        'history': history,
        'cls': int(cid),
        'lost': 0,
        'age': 0,
        'missed': 0
    })

old_gray = gray0.copy()
frame_count = 1
start_time = time.time()

while True:
    ret, frame = cap.read()
    if not ret: break
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    frame_count += 1

    # Ids of tracks that received a measurement in this frame.
    active_ids = set()
    # Iterate over a copy because tracks may be removed inside the loop.
    for obj in trackers[:]:
        kf = obj['kf']
        obj['age'] += 1

        # Kalman time update, once per frame for every track.
        # cv2.KalmanFilter.correct() works from the a-priori state that only
        # predict() refreshes; correcting without predicting first leaves the
        # filter on its initial (all-zero) a-priori state.
        x1,y1,x2,y2 = kf.predict()

        # LK Optical Flow
        if obj['p0'] is not None:
            p1, st, _ = cv2.calcOpticalFlowPyrLK(
                old_gray, gray, obj['p0'], None,
                winSize=(15,15), maxLevel=2,
                criteria=(cv2.TERM_CRITERIA_EPS|
                          cv2.TERM_CRITERIA_COUNT,10,0.03)
            )
        else:
            p1, st = None, 0

        tracked = False
        if p1 is not None and st.sum() >= MIN_LK_POINTS:
            newp = p1[st==1].reshape(-1,2)
            oldp = obj['p0'][st==1].reshape(-1,2)
            pts  = newp[filter_pts(newp,oldp)]
            if len(pts) >= MIN_LK_POINTS:
                obj['history'].append(pts)
                allpts = np.vstack(obj['history'])
                hull   = cv2.convexHull(allpts.astype(np.float32))
                bx,by,bw,bh = cv2.boundingRect(hull)
                # Measurement update with the hull box and fresh appearance
                hist = color_hist(frame, (bx,by,bx+bw,by+bh))
                hog  = compute_hog(frame, (bx,by,bx+bw,by+bh))
                deep = extract_deep_feature(frame, (bx,by,bx+bw,by+bh))
                kf.update((bx,by,bx+bw,by+bh), hist, hog, deep)
                x1, y1, x2, y2 = bx, by, bx+bw, by+bh
                obj['p0'] = pts.reshape(-1,1,2)
                obj['lost'] = 0
                obj['missed'] = 0
                active_ids.add(kf.id)
                tracked = True
        if not tracked:
            # Keep the predicted box and count the miss.
            obj['p0'], obj['lost'] = None, obj['lost']+1
            obj['missed'] += 1

        # Wider search window for keypoint reinit
        if obj['lost'] == REINIT_KP:
            pad = 20  # search outside previous box
            xx1,yy1 = max(0,x1-pad), max(0,y1-pad)
            xx2,yy2 = min(w0,x2+pad), min(h0,y2+pad)
            newkp = None
            # Only search crops that give every grid cell at least one pixel;
            # a box that drifted off-frame yields an empty or inverted crop.
            if (yy2-yy1) >= GRID[0] and (xx2-xx1) >= GRID[1]:
                newkp = get_distributed_kps(
                    gray[yy1:yy2,xx1:xx2],
                    GRID, 2000, 5
                )
            if newkp is not None and len(newkp) >= MIN_LK_POINTS:
                newkp[:,:,0]+=xx1; newkp[:,:,1]+=yy1
                obj['history'].clear()
                obj['history'].append(newkp.reshape(-1,2))
                obj['p0'], obj['lost'] = newkp, 0

        # Retire tracks lost for too long, keeping their appearance for re-ID
        if obj['lost'] >= DROP_LIMIT:
            lost_tracks.append({
                'hist': kf.get_hist(),
                'hog': kf.get_hog(),
                'deep': kf.get_deep(),
                'cls': obj['cls'],
                'kf': kf,
                'last_box': (x1,y1,x2,y2),
                'last_seen': frame_count
            })
            trackers.remove(obj)
            continue

        color = (0,255,0)
        if obj['p0'] is not None and len(obj['p0']) >= MIN_LK_POINTS:
            # Tight box around the currently tracked points
            pts = obj['p0'].reshape(-1,2)
            hull = cv2.convexHull(pts.astype(np.float32))
            bx, by, bw, bh = cv2.boundingRect(hull)
            cv2.rectangle(frame, (bx,by), (bx+bw,by+bh), color, 2)
            label = CLASSES.get(obj['cls'], str(obj['cls']))
            cv2.putText(frame, f"{label} #{obj['kf'].id} age:{obj['age']}", (bx, by-6),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)
            for pt in obj['p0']:
                cv2.circle(frame, tuple(pt[0].astype(int)), 2, (0,0,255), -1)
        else:
            # No points: draw the filter's box instead
            x1, y1, x2, y2 = kf.last
            cv2.rectangle(frame, (x1,y1), (x2,y2), (255,0,0), 2)
            label = CLASSES.get(obj['cls'], str(obj['cls']))
            cv2.putText(frame, f"{label} #{obj['kf'].id} age:{obj['age']}", (x1, y1-6),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255,0,0), 2)

    #Improved matching for lost track recovery
    # Each retired track is compared with the live tracks that got no
    # measurement this frame; a match must share the class and clear all
    # three similarity thresholds.
    if len(lost_tracks) > 0:
        for lt in lost_tracks[:]:
            if frame_count - lt['last_seen'] > REID_MAX_AGE:
                lost_tracks.remove(lt)
                continue
            hist_lost = lt['hist']
            hog_lost = lt['hog']
            deep_lost = lt['deep']
            best_obj = None
            best_score = 0
            for obj in trackers:
                if obj['kf'].id in active_ids:
                    continue
                hist_obj = obj['kf'].get_hist()
                hog_obj = obj['kf'].get_hog()
                deep_obj = obj['kf'].get_deep()
                sim_hist = 1.0 - hist_sim(hist_lost, hist_obj)
                sim_hog = hog_sim(hog_lost, hog_obj)
                sim_deep = deep_sim(deep_lost, deep_obj)
                # Use weighted vote of all features
                total_sim = 0.4*sim_hist + 0.3*sim_hog + 0.3*sim_deep
                if sim_hist > (1-REID_SIM_THRESH) and sim_hog > HOG_SIM_THRESH and sim_deep > DEEP_SIM_THRESH and lt['cls'] == obj['cls']:
                    if total_sim > best_score:
                        best_obj = obj
                        best_score = total_sim
            if best_obj is not None:
                # Revive the matched track and merge the retired appearance.
                best_obj['lost'] = 0
                best_obj['missed'] = 0
                best_obj['kf'].hist = hist_lost
                best_obj['kf'].hog_desc = hog_lost
                best_obj['kf'].deep_feat = deep_lost
                best_obj['kf'].memory.append(hist_lost)
                best_obj['kf'].memory_hog.append(hog_lost)
                best_obj['kf'].memory_deep.append(deep_lost)
                lost_tracks.remove(lt)
                print(f"Re-identified track #{best_obj['kf'].id} at frame {frame_count}")

    out.write(frame)
    # 'gray' was taken before drawing, so the next LK step sees a clean image.
    old_gray = gray.copy()

cap.release()
out.release()
cv2.destroyAllWindows()

total_time = time.time() - start_time
fps = frame_count / total_time
print(f"✅ Done – saved to {OUTPUT_PATH}")
print(f"Average FPS: {fps:.2f}")

In [None]:
#CSRT_version2
import cv2
import torch
import time
import numpy as np
from ultralytics import YOLO

video_path = "car2.mp4"
output_video_path = "car2_csrt.mp4"
target_object_name = "car"
# The detector is tried on at most this many leading frames.
x_frames = 50

# Without the detector nothing below can run, so fail here with context
# instead of printing the error and hitting a NameError on 'model' later.
try:
    model = YOLO('yolov8n.pt')
except Exception as e:
    raise RuntimeError(f"Error loading YOLO model: {e}") from e
class_names = list(model.names.values())

cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    raise FileNotFoundError(f"Error: Could not open video at {video_path}")

width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))

tracker = None
frame_count = 0
detection_frame_count = 0
total_processing_time = 0
tracking_frame_count = 0

while True:
    ret, frame = cap.read()
    if not ret:
        break

    frame_count += 1
    start_time = time.time()

    if tracker is None:
        # Detection phase: look for the first box of the target class.
        if frame_count <= x_frames:
            results = model(frame, stream=True, verbose=False)
            for r in results:
                boxes = r.boxes
                for box in boxes:
                    cls = int(box.cls[0])
                    label = model.names[cls]
                    if label == target_object_name:
                        x1, y1, x2, y2 = box.xyxy[0]
                        # CSRT expects (x, y, w, h).
                        bbox = (int(x1), int(y1), int(x2 - x1), int(y2 - y1))
                        tracker = cv2.TrackerCSRT_create()
                        tracker.init(frame, bbox)
                        print(f"Target '{target_object_name}' found at {bbox}. Starting tracker.")
                        # Timing statistics cover the tracking phase only.
                        total_processing_time = 0
                        tracking_frame_count = 0
                        break
                if tracker is not None:
                    break
        else:
            print(f"\nCould not find '{target_object_name}' in the first {x_frames} frames. Exiting.")
            out.write(frame)
            break
    else:
        # Tracking phase: update CSRT and draw the result.
        success, bbox = tracker.update(frame)
        tracking_frame_count += 1
        if success:
            x, y, w, h = [int(v) for v in bbox]
            cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
            # Label above box
            cv2.putText(frame, target_object_name, (x, y - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)
        else:
            cv2.putText(frame, "Tracking Failure", (100, 80),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.75, (0, 0, 255), 2)

        end_time = time.time()
        total_processing_time += (end_time - start_time)

    out.write(frame)

cap.release()
out.release()

if tracking_frame_count > 0 and total_processing_time > 0:
    avg_fps = tracking_frame_count / total_processing_time
    print(f"\n✅ Done! Output saved to '{output_video_path}'")
    print(f"📈 Average Tracking FPS: {avg_fps:.2f}")
else:
    print("\nNo tracking data to compute FPS.")