Splitting the dataset into train, val and test sets.  
Cattle IDs with fewer than `MIN_PER_ID` images (3 in the run below) are skipped.

In [1]:
from pathlib import Path
import random
import shutil
from collections import defaultdict

# ====== USER SETTINGS ======
RAW_DATASET = r"D:\Abhishek\base_256"     # root folder; main() treats each sub-folder as one cattle ID
OUTPUT_DATASET = r"D:\Abhishek\split"  # where to save train/val/test
TRAIN_RATIO = 0.6
VAL_RATIO = 0.2
# NOTE: TEST_RATIO is not read by main(); the test split receives whatever
# remains after the train and val counts are taken.
TEST_RATIO = 0.2
SEED = 42
# IDs with fewer than this many images are skipped (and reported) by main().
MIN_PER_ID = 3
COPY_FILES = True   # set to False to move instead of copy
# ===========================

# File extensions (compared lower-cased) that count as images.
VALID_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff"}
# Seed the global RNG once so random.shuffle() in main() is repeatable.
random.seed(SEED)

def list_images(folder: Path):
    """Return the regular files directly inside *folder* that look like images.

    A file qualifies when its lower-cased suffix is in ``VALID_EXTS``.
    Sub-directories are not descended into.
    """
    found = []
    for entry in folder.iterdir():
        if entry.suffix.lower() not in VALID_EXTS:
            continue
        if entry.is_file():
            found.append(entry)
    return found

def safe_mkdir(p: Path):
    """Create directory *p* together with any missing parents.

    Does nothing if the directory already exists.
    """
    p.mkdir(exist_ok=True, parents=True)

def main():
    """Split every cattle-ID folder under RAW_DATASET into train/val/test.

    For each sub-folder of ``RAW_DATASET`` (one folder per cattle ID):

    * IDs with fewer than ``MIN_PER_ID`` images are reported and skipped.
    * The images are shuffled and divided using ``TRAIN_RATIO`` and
      ``VAL_RATIO``; the test split receives the remainder. Each split is
      given at least one image where the total allows it.
    * Files are copied (or moved when ``COPY_FILES`` is False) into
      ``OUTPUT_DATASET/<split>/<cattle_id>/``.

    A per-ID summary of the split sizes is printed at the end.
    """
    raw_root = Path(RAW_DATASET)
    out_root = Path(OUTPUT_DATASET)

    # Prepare output dirs
    for split in ("train", "val", "test"):
        safe_mkdir(out_root / split)

    op = shutil.copy2 if COPY_FILES else shutil.move

    # cattle_id -> {"train": n, "val": n, "test": n}
    split_counts = {}

    for cid_dir in sorted(raw_root.iterdir()):
        if not cid_dir.is_dir():
            continue
        cid = cid_dir.name
        # iterdir() yields entries in arbitrary, platform-dependent order.
        # Sorting before the seeded shuffle is what makes the split
        # reproducible for a given SEED.
        imgs = sorted(list_images(cid_dir))

        if len(imgs) < MIN_PER_ID:
            print(f"[skip] {cid}: only {len(imgs)} images (< {MIN_PER_ID})")
            continue

        random.shuffle(imgs)
        n = len(imgs)
        n_train = max(1, int(round(n * TRAIN_RATIO)))
        n_val = max(1, int(round(n * VAL_RATIO)))
        n_test = max(1, n - n_train - n_val)

        # The max(1, ...) floors can push the total above n for small n.
        # Shrink train first, then val, then test, never below one each.
        while n_train + n_val + n_test > n:
            if n_train > 1:
                n_train -= 1
            elif n_val > 1:
                n_val -= 1
            elif n_test > 1:
                n_test -= 1
            else:
                break

        partitions = {
            "train": imgs[:n_train],
            "val": imgs[n_train:n_train + n_val],
            "test": imgs[n_train + n_val:],
        }

        counts = {}
        for split_name, split_imgs in partitions.items():
            dst_dir = out_root / split_name / cid
            safe_mkdir(dst_dir)
            for img in split_imgs:
                op(str(img), str(dst_dir / img.name))
            counts[split_name] = len(split_imgs)
        split_counts[cid] = counts

    print("\n[done] Dataset split complete!")
    for cid, counts in split_counts.items():
        print(f"{cid}: train={counts['train']} val={counts['val']} test={counts['test']}")

if __name__ == "__main__":
    main()


[skip] cattle-014: only 1 images (< 3)
[skip] cattle-021: only 2 images (< 3)
[skip] cattle-025: only 2 images (< 3)
[skip] cattle-027: only 1 images (< 3)
[skip] cattle-030: only 2 images (< 3)
[skip] cattle-036: only 1 images (< 3)
[skip] cattle-038: only 2 images (< 3)
[skip] cattle-039: only 1 images (< 3)
[skip] cattle-040: only 1 images (< 3)
[skip] cattle-045: only 1 images (< 3)
[skip] cattle-050: only 2 images (< 3)
[skip] cattle-057: only 1 images (< 3)
[skip] cattle-058: only 2 images (< 3)
[skip] cattle-060: only 2 images (< 3)
[skip] cattle-061: only 2 images (< 3)
[skip] cattle-062: only 1 images (< 3)
[skip] cattle-063: only 2 images (< 3)
[skip] cattle-064: only 1 images (< 3)
[skip] cattle-065: only 1 images (< 3)
[skip] cattle-069: only 2 images (< 3)
[skip] cattle-077: only 2 images (< 3)
[skip] cattle-083: only 1 images (< 3)
[skip] cattle-084: only 2 images (< 3)
[skip] cattle-085: only 1 images (< 3)
[skip] cattle-089: only 2 images (< 3)
[skip] cattle-091: only 1

Number of images in the train, val and test splits

In [2]:
from pathlib import Path

# ==== Change this to your split dataset root ====
DATASET_ROOT = r"D:\Abhishek\split"
# ===============================================

VALID_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff"}

def count_images_in_split(split_path: Path):
    """Count the image entries in every ID sub-folder of *split_path*.

    An entry is counted when its lower-cased suffix is in ``VALID_EXTS``.
    Entries that sit directly in *split_path* are ignored.
    """
    return sum(
        1
        for cid_folder in split_path.iterdir()
        if cid_folder.is_dir()
        for img in cid_folder.iterdir()
        if img.suffix.lower() in VALID_EXTS
    )

def main():
    """Print the number of images found in each of the train/val/test folders."""
    root = Path(DATASET_ROOT)
    for split_name in ("train", "val", "test"):
        split_path = root / split_name
        if not split_path.exists():
            print(f"[warn] {split_name} folder not found.")
            continue
        count = count_images_in_split(split_path)
        print(f"{split_name}: {count} images")

if __name__ == "__main__":
    main()


train: 1488 images
val: 549 images
test: 577 images


Extracting features from the dataset using the following hand-crafted (heuristic) descriptors:  
1. LBP  
2. Gabor  
3. Blob  

In [10]:
#!/usr/bin/env python3
from pathlib import Path
import numpy as np
import pandas as pd
import cv2
from skimage.feature import local_binary_pattern, blob_log

# ============ USER SETTINGS ============
DATASET_ROOT = r"D:\Abhishek\split"   # contains train/, val/, test/
CSV_OUT      = r"D:\Abhishek\muzzle_features_LPB_Gabor_blob.csv"
IMG_SIZE     = 224
# LBP params (uniform -> histogram bins = P+2)
LBP_P = 16
LBP_R = 2
# Gabor params
GABOR_THETAS = [0, np.pi/4, np.pi/2, 3*np.pi/4]
GABOR_LAMBDAS = [4, 8, 16]
# ======================================

VALID_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff"}

def imread_rgb(p):
    """Read the image at *p* with OpenCV and return it in RGB channel order.

    Returns None when ``cv2.imread`` cannot load the file.
    """
    bgr = cv2.imread(str(p))
    return None if bgr is None else cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

def resize_keep_aspect(img, target_short=IMG_SIZE):
    """Scale *img* so its shorter side becomes *target_short*, keeping the aspect ratio.

    Both output dimensions are rounded and clamped to at least one pixel.
    Resampling uses ``cv2.INTER_AREA``.
    """
    height, width = img.shape[:2]
    shorter = max(min(height, width), 1)  # guard against a zero-sized side
    factor = target_short / shorter
    new_h = max(1, int(round(height * factor)))
    new_w = max(1, int(round(width * factor)))
    return cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)

def center_crop(img, size=IMG_SIZE):
    """Return a *size* x *size* patch taken from the centre of *img*.

    When the image is smaller than *size* along either axis, the available
    central region is resized to *size* x *size*.
    """
    rows, cols = img.shape[:2]
    top = max(0, (rows - size) // 2)
    left = max(0, (cols - size) // 2)
    bottom = min(rows, top + size)
    right = min(cols, left + size)
    patch = img[top:bottom, left:right]
    if patch.shape[:2] != (size, size):
        patch = cv2.resize(patch, (size, size), interpolation=cv2.INTER_AREA)
    return patch

def preprocess_for_texture(img_rgb):
    """Smooth an RGB image and equalise its lightness before texture analysis.

    Steps: bilateral filter, conversion to LAB, CLAHE on the L channel only,
    then conversion back to RGB.
    """
    smoothed = cv2.bilateralFilter(
        cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR), d=5, sigmaColor=40, sigmaSpace=40
    )
    lightness, chan_a, chan_b = cv2.split(cv2.cvtColor(smoothed, cv2.COLOR_BGR2LAB))
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    equalised = cv2.merge([clahe.apply(lightness), chan_a, chan_b])
    as_bgr = cv2.cvtColor(equalised, cv2.COLOR_LAB2BGR)
    return cv2.cvtColor(as_bgr, cv2.COLOR_BGR2RGB)

def feat_lbp(gray, P=LBP_P, R=LBP_R):
    """Return the normalised histogram of uniform LBP codes of *gray*.

    The result is a float32 vector of length ``P + 2`` whose entries sum to
    approximately one.
    """
    codes = local_binary_pattern(gray, P=P, R=R, method="uniform")
    edges = np.arange(0, P + 3)  # P + 3 edges -> P + 2 bins
    counts = np.histogram(codes, bins=edges, range=(0, P + 2), density=False)[0]
    counts = counts.astype(np.float32)
    counts /= (counts.sum() + 1e-6)  # epsilon avoids division by zero
    return counts

def feat_gabor(gray):
    """Return mean and standard deviation of Gabor responses over a filter bank.

    One 9x9 kernel is built per (theta, lambda) pair from ``GABOR_THETAS`` and
    ``GABOR_LAMBDAS``; the output holds ``[mean, std]`` for each pair, theta
    varying slowest.
    """
    # NOTE(review): the kernel size is fixed while sigma grows with lambda,
    # so the larger wavelengths are presumably truncated -- confirm intent.
    kernel_shape = (9, 9)
    aspect = 0.5
    stats = []
    for theta in GABOR_THETAS:
        for wavelength in GABOR_LAMBDAS:
            kern = cv2.getGaborKernel(
                kernel_shape, 0.56 * wavelength, theta, wavelength, aspect,
                psi=0, ktype=cv2.CV_32F,
            )
            response = cv2.filter2D(gray, cv2.CV_32F, kern)
            stats.append(float(response.mean()))
            stats.append(float(response.std()))
    return np.array(stats, dtype=np.float32)  # length = 2 * len(thetas) * len(lambdas)

def feat_blobs(gray):
    """Return ``[blob count, mean blob radius, blobs per pixel]`` for *gray*.

    Blobs are detected with ``blob_log`` on the image scaled to [0, 1].
    The radius is column 2 of each blob row multiplied by sqrt(2)
    (presumably sigma -> radius; confirm against the skimage docs).
    """
    unit = (gray.astype(np.float32) / 255.0).copy()
    found = blob_log(unit, min_sigma=1.5, max_sigma=6.0, num_sigma=5, threshold=0.05, overlap=0.5)
    count = len(found)
    if count > 0:
        mean_r = float((found[:, 2] * np.sqrt(2)).mean())
    else:
        mean_r = 0.0
    per_pixel = count / (gray.shape[0] * gray.shape[1])
    return np.array([count, mean_r, per_pixel], dtype=np.float32)  # length = 3

def extract_features(img_rgb):
    """Build the whole-image feature vector for one RGB image.

    The image is resized, centre-cropped to ``IMG_SIZE``, preprocessed and
    converted to grayscale. The float32 result concatenates the LBP
    histogram, Gabor statistics, blob statistics and the gray mean and
    standard deviation, in that order.
    """
    squared = center_crop(resize_keep_aspect(img_rgb, IMG_SIZE), IMG_SIZE)
    gray = cv2.cvtColor(preprocess_for_texture(squared), cv2.COLOR_RGB2GRAY)
    parts = [
        feat_lbp(gray),    # len = LBP_P+2
        feat_gabor(gray),  # len = 2*len(thetas)*len(lambdas)
        feat_blobs(gray),  # len = 3
        [float(gray.mean()), float(gray.std())],
    ]
    return np.concatenate(parts).astype(np.float32)

def walk_split(root: Path, split_name: str):
    """List ``(split_name, cattle_id, image_path)`` for every image in one split.

    Args:
        root: dataset root that contains the split folders.
        split_name: name of the split folder to scan.

    Returns:
        A list of tuples ordered by cattle ID and then by image path, or an
        empty list when the split folder does not exist.
    """
    split_dir = root / split_name
    if not split_dir.exists():
        return []
    rows = []
    for cid_dir in sorted(d for d in split_dir.iterdir() if d.is_dir()):
        cattle_id = cid_dir.name
        # iterdir() order is arbitrary; sort so the CSV row order is stable
        # across runs and machines.
        for img_path in sorted(cid_dir.iterdir()):
            if img_path.suffix.lower() not in VALID_EXTS:
                continue
            rows.append((split_name, cattle_id, img_path))
    return rows

def make_column_names(example_feat: np.ndarray):
    """Return the CSV column names: three metadata columns, then one per feature.

    Asserts that *example_feat* has the length implied by the LBP and Gabor
    settings plus the three blob and two intensity values.
    """
    len_lbp = LBP_P + 2
    len_gab = 2 * len(GABOR_THETAS) * len(GABOR_LAMBDAS)
    len_blob = 3
    len_int = 2
    expected = len_lbp + len_gab + len_blob + len_int
    assert example_feat.size == expected, \
        f"Feature length mismatch: got {example_feat.size}, expected {expected}"

    return [
        "split", "cattle_id", "image_path",
        *(f"lbp_{i}" for i in range(len_lbp)),
        *(f"gabor_{i}" for i in range(len_gab)),
        "blob_count", "blob_mean_r", "blob_density",
        "mean_intensity", "std_intensity",
    ]

def main():
    """Extract features for every image in train/val/test and write them to CSV_OUT."""
    root = Path(DATASET_ROOT)
    rows = [row for split in ("train", "val", "test") for row in walk_split(root, split)]
    if not rows:
        print(f"[error] No images found under {DATASET_ROOT} (expected train/val/test subfolders).")
        return

    records = []
    first_feat = None  # kept to validate the feature length when naming columns
    for split, cid, ip in rows:
        img = imread_rgb(ip)
        if img is None:
            print(f"[skip] unreadable: {ip}")
            continue
        vec = extract_features(img)
        if first_feat is None:
            first_feat = vec
        records.append([split, cid, str(ip), *vec])

    if not records:
        print("[error] No features extracted.")
        return

    # build columns dynamically to match feature lengths
    columns = make_column_names(first_feat)
    df = pd.DataFrame(records, columns=columns)

    Path(CSV_OUT).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(CSV_OUT, index=False)
    print(f"[ok] wrote features: {CSV_OUT}")
    print(f"[info] rows: {len(df)}, feature dims: {len(columns) - 3}")

if __name__ == "__main__":
    main()


[ok] wrote features: D:\Abhishek\muzzle_features_LPB_Gabor_blob.csv
[info] rows: 2614, feature dims: 47


SVM Classifier

In [11]:
#!/usr/bin/env python3
import os, json
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# ============ USER SETTINGS ============
CSV_PATH   = r"D:\Abhishek\muzzle_features_LPB_Gabor_blob.csv"
OUTPUT_DIR = r".\svm_artifacts"
# ======================================

# Auto-pick numeric features (exclude these columns)
EXCLUDE = {"cattle_id","image_path","split"}

def pick_features(df):
    """Return the names of the numeric columns of *df* that are not in ``EXCLUDE``.

    Raises:
        ValueError: if no such column exists.
    """
    feats = []
    for col in df.columns:
        if col in EXCLUDE:
            continue
        if pd.api.types.is_numeric_dtype(df[col]):
            feats.append(col)
    if not feats:
        raise ValueError("No numeric feature columns found.")
    return feats

def topk_accuracy(clf, X, y, le, k=5):
    """Return the fraction of samples whose true label is among the top-*k* scores.

    Args:
        clf: fitted classifier exposing ``decision_function`` or
            ``predict_proba``.
        X: feature matrix, one row per sample.
        y: true labels, in the same label space the classifier was fit on.
        le: label encoder; used only as a fallback when ``clf.classes_``
            is unavailable or does not match the score width.
        k: number of top-scoring classes to consider.

    Returns:
        The top-k accuracy as a float, or None when *X* is empty or the
        classifier exposes neither scoring method.
    """
    if X.shape[0] == 0:
        return None
    if hasattr(clf, "decision_function"):
        scores = clf.decision_function(X)
    elif hasattr(clf, "predict_proba"):
        scores = clf.predict_proba(X)
    else:
        return None
    scores = np.asarray(scores)
    if scores.ndim == 1:
        # binary case: expand the single margin into two class columns
        scores = np.vstack([-scores, scores]).T
    topk = np.argsort(scores, axis=1)[:, ::-1][:, :min(k, scores.shape[1])]

    # Score columns follow clf.classes_, not the encoder's class list. The two
    # differ whenever the encoder saw labels that were absent from training,
    # so map the true labels through clf.classes_ when it is available.
    classes = getattr(clf, "classes_", None)
    if classes is not None and len(classes) == scores.shape[1]:
        column_of = {label: col for col, label in enumerate(np.asarray(classes).tolist())}
        # labels the classifier never saw get -1, which can never be a hit
        y_idx = np.array([column_of.get(label, -1) for label in np.asarray(y).tolist()])
    else:
        y_idx = np.asarray(le.transform(y))

    hits = (topk == y_idx[:, None]).any(axis=1)
    return float(hits.mean())

def main():
    """Train an RBF SVM on the feature CSV, evaluate it on the test split, and save artifacts.

    Reads ``CSV_PATH``, grid-searches ``C`` and ``gamma`` with 5-fold
    stratified cross-validation on the train rows, prints top-1 accuracy,
    a classification report, the confusion matrix and top-5 accuracy for the
    test rows, then writes the model, scaler and class list to ``OUTPUT_DIR``.

    Raises:
        ValueError: if the CSV lacks the ``cattle_id`` or ``split`` column,
            or contains no numeric feature column.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    df = pd.read_csv(CSV_PATH)

    # ensure required columns
    for col in ["cattle_id","split"]:
        if col not in df.columns:
            raise ValueError(f"CSV must contain '{col}' column.")

    # Features are picked before label_idx is added, so the encoded label
    # never ends up among the inputs.
    feats = pick_features(df)
    print(f"[info] feature dims: {len(feats)}")

    le = LabelEncoder()
    df["label_idx"] = le.fit_transform(df["cattle_id"])

    # Use train for training and test for evaluation (val is unused here, but kept in CSV)
    train_df = df[df["split"].str.lower() == "train"].copy()
    test_df  = df[df["split"].str.lower() == "test"].copy()

    X_train = train_df[feats].values
    y_train = train_df["cattle_id"].values
    X_test  = test_df[feats].values
    y_test  = test_df["cattle_id"].values

    # scale (statistics come from the train rows only)
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_test_s  = scaler.transform(X_test)

    # grid search RBF SVM
    param_grid = {
        "C": [0.1, 1, 3, 10],
        "gamma": ["scale", 0.1, 0.03, 0.01],
        "kernel": ["rbf"],
    }
    base = SVC(probability=True, random_state=42)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    grid = GridSearchCV(base, param_grid, cv=cv, scoring="accuracy", n_jobs=-1, verbose=1)
    grid.fit(X_train_s, y_train)
    best = grid.best_estimator_

    print(f"\n[best] params: {grid.best_params_}")
    print(f"[best] cv accuracy: {grid.best_score_:.4f}")

    # evaluate
    if len(y_test):
        y_pred = best.predict(X_test_s)
        acc = accuracy_score(y_test, y_pred)
        print(f"\n[Test] Top-1 accuracy: {acc:.4f}")
        print("\n[Test] Classification report:")
        # zero_division=0 keeps the same numbers but silences the
        # undefined-metric warnings for classes that are never predicted.
        print(classification_report(y_test, y_pred, zero_division=0))

        cm = confusion_matrix(y_test, y_pred, labels=le.classes_)
        print("\n[Test] Confusion matrix (rows=true, cols=pred):")
        header = "pred→," + ",".join(le.classes_)
        print(header)
        for i, cls in enumerate(le.classes_):
            print(cls + "," + ",".join(str(x) for x in cm[i]))

        top5 = topk_accuracy(best, X_test_s, y_test, le, k=5)
        if top5 is not None:
            print(f"\n[Test] Top-5 accuracy: {top5:.4f}")
    else:
        print("\n[warn] No test rows found in CSV.")

    # save artifacts
    joblib.dump(best, Path(OUTPUT_DIR) / "svm_model.joblib")
    joblib.dump(scaler, Path(OUTPUT_DIR) / "scaler.joblib")
    (Path(OUTPUT_DIR) / "label_encoder.json").write_text(
        json.dumps({"classes": le.classes_.tolist()}, indent=2), encoding="utf-8"
    )
    print(f"\n[save] artifacts in: {OUTPUT_DIR}")

if __name__ == "__main__":
    main()


[info] feature dims: 47
Fitting 5 folds for each of 16 candidates, totalling 80 fits





[best] params: {'C': 10, 'gamma': 0.03, 'kernel': 'rbf'}
[best] cv accuracy: 0.5296

[Test] Top-1 accuracy: 0.5477

[Test] Classification report:
              precision    recall  f1-score   support

  cattle-001       1.00      0.50      0.67         2
  cattle-002       0.00      0.00      0.00         1
  cattle-003       1.00      1.00      1.00         1
  cattle-004       0.00      0.00      0.00         1
  cattle-005       0.00      0.00      0.00         1
  cattle-006       0.50      1.00      0.67         1
  cattle-007       0.00      0.00      0.00         1
  cattle-008       1.00      1.00      1.00         1
  cattle-009       0.00      0.00      0.00         1
  cattle-010       0.00      0.00      0.00         2
  cattle-011       0.50      1.00      0.67         1
  cattle-012       0.00      0.00      0.00         1
  cattle-013       1.00      1.00      1.00         1
  cattle-015       0.50      1.00      0.67         1
  cattle-016       0.00      0.00      0.0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



[Test] Top-5 accuracy: 0.7296

[save] artifacts in: .\svm_artifacts


-------------------------------------------------------------------------
Extracting features from each image using the following algorithms:  
1. LBP  
2. Gabor  
3. Blob  
4. HOG 

In [None]:
#!/usr/bin/env python3
from pathlib import Path
import numpy as np
import pandas as pd
import cv2
from tqdm import tqdm
from skimage.feature import local_binary_pattern, hog, blob_log
from scipy.stats import skew, kurtosis

# ============ USER SETTINGS ============
DATASET_ROOT = Path(r"C:\Users\Amrit Shah\Desktop\Muzzle based Identification\split")  # contains train/, val/, test/
CSV_OUT      = Path(r"C:\Users\Amrit Shah\Desktop\muzzle_features_LBP_Gabor_Blob_HOG.csv")

IMG_SIZE     = 224
GRID_SIZE    = 5
CELL_SIZE    = IMG_SIZE // GRID_SIZE
LBP_P = 24
LBP_R = 2
GABOR_THETAS = [0, np.pi/8, np.pi/4, 3*np.pi/8, np.pi/2, 5*np.pi/8, 3*np.pi/4, 7*np.pi/8]  # 8 directions
GABOR_LAMBDAS = [3, 5, 7, 9, 11, 13]  # 6 wavelengths (finer scale options)

VALID_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff"}

# ============ IMAGE HELPERS ============
def imread_rgb(p):
    """Load the image at path *p* and return it as RGB, or None if it cannot be read."""
    loaded = cv2.imread(str(p))
    if loaded is not None:
        return cv2.cvtColor(loaded, cv2.COLOR_BGR2RGB)
    return None

def resize_keep_aspect(img, target=IMG_SIZE):
    """Resize *img* so that its shorter side equals *target*, preserving aspect ratio.

    Output dimensions are rounded and never smaller than one pixel;
    ``cv2.INTER_AREA`` is used for resampling.
    """
    height, width = img.shape[:2]
    shorter = max(min(height, width), 1)  # avoid dividing by zero
    ratio = target / shorter
    out_h = max(1, int(round(height * ratio)))
    out_w = max(1, int(round(width * ratio)))
    return cv2.resize(img, (out_w, out_h), interpolation=cv2.INTER_AREA)

def center_crop(img, size=IMG_SIZE):
    """Cut a *size* x *size* window from the middle of *img*.

    If the image is too small along an axis, the central region that does
    exist is resized to *size* x *size*.
    """
    rows, cols = img.shape[:2]
    top = max(0, (rows - size) // 2)
    left = max(0, (cols - size) // 2)
    window = img[top:min(rows, top + size), left:min(cols, left + size)]
    if window.shape[:2] != (size, size):
        window = cv2.resize(window, (size, size), interpolation=cv2.INTER_AREA)
    return window

def preprocess_for_texture(img_rgb):
    """Denoise an RGB image and boost local contrast for texture descriptors.

    Applies a bilateral filter, then CLAHE on the L channel in LAB space,
    and returns the result in RGB.
    """
    bgr = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR)
    bgr = cv2.bilateralFilter(bgr, d=5, sigmaColor=40, sigmaSpace=40)
    channels = list(cv2.split(cv2.cvtColor(bgr, cv2.COLOR_BGR2LAB)))
    # only the lightness channel is equalised; a and b pass through unchanged
    channels[0] = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(channels[0])
    restored = cv2.cvtColor(cv2.merge(channels), cv2.COLOR_LAB2BGR)
    return cv2.cvtColor(restored, cv2.COLOR_BGR2RGB)

def split_into_grid(img, grid_size=GRID_SIZE):
    """Split a 2-D image into ``grid_size * grid_size`` square cells, row by row.

    The cell side is ``height // grid_size`` for both axes. Pixels beyond
    ``grid_size * step`` are left out.
    """
    h, _ = img.shape  # unpacking also enforces a 2-D input
    step = h // grid_size
    cells = []
    for row in range(grid_size):
        band = img[row * step:(row + 1) * step]
        for col in range(grid_size):
            cells.append(band[:, col * step:(col + 1) * step])
    return cells

# ============ FEATURE FUNCTIONS ============
def feat_lbp(gray):
    """Return the normalised uniform-LBP histogram of *gray* (``LBP_P + 2`` float32 bins)."""
    codes = local_binary_pattern(gray, P=LBP_P, R=LBP_R, method="uniform")
    edges = np.arange(0, LBP_P + 3)  # LBP_P + 3 edges -> LBP_P + 2 bins
    counts = np.histogram(codes, bins=edges, range=(0, LBP_P + 2), density=False)[0]
    counts = counts.astype(np.float32)
    counts /= (counts.sum() + 1e-6)  # epsilon avoids division by zero
    return counts

def feat_gabor(gray):
    """Return ``[mean, std]`` of the Gabor response for every (theta, lambda) pair.

    Pairs are enumerated with theta varying slowest; each uses a 9x9 kernel
    with ``sigma = 0.56 * lambda`` and ``gamma = 0.5``.
    """
    combos = [(th, lam) for th in GABOR_THETAS for lam in GABOR_LAMBDAS]
    feats = np.empty(2 * len(combos), dtype=np.float32)
    for slot, (th, lam) in enumerate(combos):
        kernel = cv2.getGaborKernel((9, 9), 0.56 * lam, th, lam, 0.5, psi=0, ktype=cv2.CV_32F)
        resp = cv2.filter2D(gray, cv2.CV_32F, kernel)
        feats[2 * slot] = float(resp.mean())
        feats[2 * slot + 1] = float(resp.std())
    return feats

def feat_hog(cell):
    """Return the flattened HOG descriptor of one grid cell.

    Uses 12 orientations, HOG cells of ``CELL_SIZE // 2`` pixels per side,
    2x2 blocks and L2-Hys block normalisation.
    """
    half = CELL_SIZE // 2
    options = {
        "orientations": 12,
        "pixels_per_cell": (half, half),
        "cells_per_block": (2, 2),
        "block_norm": 'L2-Hys',
        "visualize": False,
        "feature_vector": True,
    }
    return hog(cell, **options)


def feat_blobs(gray):
    """Return ``[blob count, mean blob radius, blobs per pixel]`` as float32.

    Detection runs ``blob_log`` on the image scaled to [0, 1]. The radius is
    column 2 of each detection multiplied by sqrt(2) (presumably
    sigma -> radius; confirm against the skimage docs).
    """
    scaled = gray.astype(np.float32) / 255.0
    detections = blob_log(scaled, min_sigma=1.5, max_sigma=6.0, num_sigma=5, threshold=0.05, overlap=0.5)
    count = len(detections)
    mean_r = 0.0
    if count > 0:
        mean_r = float((detections[:, 2] * np.sqrt(2)).mean())
    area = gray.shape[0] * gray.shape[1]
    return np.array([count, mean_r, count / area], dtype=np.float32)

def extract_features_grid(img_rgb):
    """Build the grid-based feature vector for one RGB image.

    The image is resized, centre-cropped to ``IMG_SIZE``, preprocessed and
    converted to grayscale, then split into grid cells. For every cell, in
    order, the vector holds: LBP histogram, Gabor statistics, HOG
    descriptor, blob statistics and the intensity mean, standard deviation,
    skewness and kurtosis.

    Returns:
        A float32 vector in which every non-finite value has been replaced
        by 0.0.
    """
    img = resize_keep_aspect(img_rgb, IMG_SIZE)
    img = center_crop(img, IMG_SIZE)
    img = preprocess_for_texture(img)
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

    features = []
    for cell in split_into_grid(gray):
        features.extend(feat_lbp(cell))
        features.extend(feat_gabor(cell))
        features.extend(feat_hog(cell))
        features.extend(feat_blobs(cell))
        flat = cell.flatten()
        features.extend([flat.mean(), flat.std(), skew(flat), kurtosis(flat)])

    vec = np.array(features, dtype=np.float32)
    # skew()/kurtosis() can return NaN for a cell with constant intensity.
    # A single NaN would make the downstream SVC reject the whole matrix,
    # so non-finite entries are mapped to 0.0 here.
    return np.nan_to_num(vec, nan=0.0, posinf=0.0, neginf=0.0)

# ============ DATASET WALKER ============
def walk_split(root: Path, split: str):
    """List ``(split, cattle_id, image_path)`` for every image under ``root/split``.

    Folders and images are visited in sorted order so that the resulting
    CSV row order is stable across runs and machines; ``glob`` alone gives
    no ordering guarantee.
    """
    out = []
    for cid_dir in sorted((root / split).glob("*")):
        if not cid_dir.is_dir():
            continue
        for img_path in sorted(cid_dir.glob("*")):
            if img_path.suffix.lower() in VALID_EXTS:
                out.append((split, cid_dir.name, img_path))
    return out

def make_column_names(example_feat):
    """Return the CSV column names for the grid-based feature vector.

    Three metadata columns come first, followed by the per-cell feature
    names in the order ``extract_features_grid`` emits them.

    Args:
        example_feat: unused; kept so existing callers keep working.
    """
    cols = ["split", "cattle_id", "image_path"]
    num_cells = GRID_SIZE * GRID_SIZE
    n_lbp = LBP_P + 2
    n_gabor = 2 * len(GABOR_THETAS) * len(GABOR_LAMBDAS)
    # The HOG length depends only on the cell size, so measure it once on a
    # blank cell instead of once per grid cell.
    n_hog = len(feat_hog(np.zeros((CELL_SIZE, CELL_SIZE), dtype=np.uint8)))
    for i in range(num_cells):
        cols += [f"lbp_{i}_{j}" for j in range(n_lbp)]
        cols += [f"gabor_{i}_{j}" for j in range(n_gabor)]
        cols += [f"hog_{i}_{j}" for j in range(n_hog)]
        cols += [f"blob_count_{i}", f"blob_mean_r_{i}", f"blob_density_{i}"]
        cols += [f"intensity_mean_{i}", f"intensity_std_{i}", f"intensity_skew_{i}", f"intensity_kurt_{i}"]
    return cols

# ============ MAIN ============
def main():
    """Extract grid-based features for every image in train/val/test and save them to CSV_OUT."""
    all_rows = [
        row
        for split in ("train", "val", "test")
        for row in walk_split(DATASET_ROOT, split)
    ]

    records = []
    first_feat = None

    for split, cid, ip in tqdm(all_rows, desc="Extracting features"):
        img = imread_rgb(ip)
        if img is None:
            print(f"[skip] unreadable: {ip}")
            continue
        vec = extract_features_grid(img)
        if first_feat is None:
            first_feat = vec
        records.append([split, cid, str(ip), *vec])

    if not records:
        print("No features extracted.")
        return

    columns = make_column_names(first_feat)
    df = pd.DataFrame(records, columns=columns)
    CSV_OUT.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(CSV_OUT, index=False)
    print(f"[OK] Saved features to: {CSV_OUT}")
    print(f"[INFO] Feature dimensions: {len(df.columns) - 3}, Rows: {len(df)}")

if __name__ == "__main__":
    main()


Training the SVM model on the extracted grid-based features (LBP + Gabor + Blob + HOG) and evaluating its performance.

In [None]:
#!/usr/bin/env python3
import os, json
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, LabelEncoder, label_binarize
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    roc_curve, auc
)
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

# ================= USER SETTINGS =================
CSV_PATH   = r"C:\Users\Amrit Shah\Desktop\Muzzle based Identification\Second dataset dataset\gamma0.9_muzzle_features_LBP_Gabor_Blob_HOG.csv"
OUTPUT_DIR = r".\svmwithHOG_gamma0.9_artifacts"
# =================================================

EXCLUDE = {"cattle_id","image_path","split"}  # non-feature columns


# ---------- FEATURE PICKER ----------
def pick_features(df):
    """Select the numeric columns of *df* that are not listed in ``EXCLUDE``.

    Raises:
        ValueError: when no column qualifies.
    """
    feats = []
    for name in df.columns:
        if name in EXCLUDE:
            continue
        if pd.api.types.is_numeric_dtype(df[name]):
            feats.append(name)
    if not feats:
        raise ValueError("No numeric feature columns found.")
    return feats


# ---------- SCORE HELPERS ----------
def topk_accuracy_from_scores(y_true, y_score, le, k=5):
    """Compute Top-k accuracy given class scores (n_samples x n_classes).

    Returns None when *y_score* is None or not two-dimensional. Column
    indices are compared against ``le.transform(y_true)``.
    """
    if y_score is None or y_score.ndim != 2:
        return None
    width = min(k, y_score.shape[1])
    ranked = np.argsort(y_score, axis=1)[:, ::-1][:, :width]
    true_idx = le.transform(y_true)
    hits = [true_idx[row] in ranked[row] for row in range(len(true_idx))]
    return float(np.mean(hits))

def get_scores(clf, X):
    """Return per-class scores for *X* from *clf*, or None if it offers none.

    ``decision_function`` is preferred over ``predict_proba``. A 1-D
    decision output is expanded into two columns ``[-s, s]``.
    """
    if hasattr(clf, "decision_function"):
        raw = clf.decision_function(X)
        # binary fallback
        return np.vstack([-raw, raw]).T if raw.ndim == 1 else raw
    if hasattr(clf, "predict_proba"):
        return clf.predict_proba(X)
    return None


# ---------- PLOTS / SAVERS ----------
def save_topk_bar(top1, top5, out_path):
    """Save a two-bar chart of Top-1 and Top-5 accuracy to *out_path*."""
    heights = [top1, top5]
    positions = np.arange(2)
    fig, ax = plt.subplots(figsize=(4, 4))
    ax.bar(positions, heights, color=["#1f77b4", "#ff7f0e"])
    ax.set_xticks(positions)
    ax.set_xticklabels(["Top-1", "Top-5"])
    ax.set_ylim(0, 1)
    ax.set_ylabel("Accuracy")
    ax.set_title("Top-k Accuracy")
    # print each value just above its bar
    for slot, value in enumerate(heights):
        ax.text(slot, value + 0.01, f"{value:.3f}", ha="center", va="bottom", fontsize=9)
    fig.tight_layout()
    fig.savefig(out_path, dpi=200)
    plt.close(fig)

def save_per_class_f1(df_report, out_csv, out_png_bottom30):
    """Write the per-class metrics to CSV and plot the 30 lowest F1-scores.

    *df_report* must provide the ``class`` and ``f1-score`` columns.
    """
    df_report.to_csv(out_csv, index=False)

    ranked = df_report.sort_values("f1-score", ascending=True)
    worst = ranked.head(min(30, len(ranked)))

    # the figure grows with the number of bars, with a floor of six inches
    height = max(6, 0.20 * len(worst) + 2)
    fig, ax = plt.subplots(figsize=(10, height))
    ax.barh(worst["class"], worst["f1-score"], color="#2ca02c")
    ax.set_xlabel("F1-score")
    ax.set_title("Per-class F1 (lowest 30)")
    for row, score in enumerate(worst["f1-score"].values):
        ax.text(score + 0.005, row, f"{score:.2f}", va="center", fontsize=8)
    fig.tight_layout()
    fig.savefig(out_png_bottom30, dpi=200)
    plt.close(fig)

def save_multiclass_roc(clf, X_test, y_test, le, out_path):
    """Plot micro- and macro-averaged one-vs-rest ROC curves and save them to *out_path*.

    Scores come from ``get_scores(clf, X_test)``; true labels are encoded
    with ``le`` and binarized against ``range(len(le.classes_))``.

    Raises:
        RuntimeError: if the classifier yields no scores, or the number of
            score columns differs from ``len(le.classes_)``.
    """
    y_score = get_scores(clf, X_test)
    if y_score is None:
        raise RuntimeError("Classifier has neither decision_function nor predict_proba.")
    n_classes = len(le.classes_)
    if y_score.shape[1] != n_classes:
        raise RuntimeError(f"Score shape {y_score.shape} != number of classes {n_classes}")

    # One indicator column per class.
    # NOTE(review): assumes score column i corresponds to le.classes_[i] -- confirm
    # that the classifier was fit on the same label set as the encoder.
    y_idx = le.transform(y_test)
    y_true_bin = label_binarize(y_idx, classes=np.arange(n_classes))

    # Per-class ROC curve and AUC.
    # NOTE(review): a class with no positive sample in y_test presumably yields an
    # undefined TPR, which would propagate into the macro average -- verify.
    fpr, tpr, roc_auc = {}, {}, {}
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_true_bin[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Micro average: pool every (sample, class) decision into one curve.
    fpr["micro"], tpr["micro"], _ = roc_curve(y_true_bin.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Macro average: interpolate every class curve onto the union of FPR
    # points, then take the unweighted mean of the TPRs.
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    mean_tpr /= n_classes
    fpr["macro"], tpr["macro"] = all_fpr, mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Only the two averaged curves are drawn, plus the chance diagonal.
    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(111)
    ax.plot(fpr["micro"], tpr["micro"], lw=2, label=f"micro-avg (AUC={roc_auc['micro']:.3f})")
    ax.plot(fpr["macro"], tpr["macro"], lw=2, label=f"macro-avg (AUC={roc_auc['macro']:.3f})")
    ax.plot([0, 1], [0, 1], "k--", lw=1)
    ax.set_xlabel("False Positive Rate"); ax.set_ylabel("True Positive Rate")
    ax.set_title("Multi-class ROC (One-vs-Rest)")
    ax.legend(loc="lower right", fontsize=8)
    fig.tight_layout()
    fig.savefig(out_path, dpi=200)
    plt.close(fig)

def save_confusion_matrix(y_true, y_pred, le, out_csv, out_png):
    """Save the confusion matrix as a labelled CSV and as a heatmap image.

    Rows are true labels and columns are predicted labels, both ordered as
    ``le.classes_``.
    """
    labels = le.classes_
    df_cm = pd.DataFrame(
        confusion_matrix(y_true, y_pred, labels=labels),
        index=labels,
        columns=labels,
    )
    # Save CSV
    df_cm.to_csv(out_csv)

    # Save heatmap
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(df_cm, annot=False, cmap="Blues", fmt="d", ax=ax)
    ax.set_title("Confusion Matrix")
    ax.set_ylabel("True label")
    ax.set_xlabel("Predicted label")
    fig.tight_layout()
    fig.savefig(out_png, dpi=200)
    plt.close(fig)


# ---------- MAIN ----------
def main():
    """Train an RBF SVM on the feature CSV, evaluate on the test split, and save everything.

    Reads ``CSV_PATH``, grid-searches ``C`` and ``gamma`` with 5-fold
    stratified cross-validation on the train rows, and evaluates the best
    model on the test rows. Reports and plots (top-k accuracy, per-class
    metrics, confusion matrix, ROC) plus the model, scaler, class list and
    a metrics summary are written to ``OUTPUT_DIR``.

    Raises:
        ValueError: if the CSV lacks the ``cattle_id`` or ``split`` column,
            or contains no numeric feature column.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    df = pd.read_csv(CSV_PATH)

    for col in ["cattle_id","split"]:
        if col not in df.columns:
            raise ValueError(f"CSV must contain '{col}' column.")

    # Features are picked before label_idx is added, so the encoded label
    # never ends up among the inputs.
    feats = pick_features(df)
    print(f"[info] feature dims: {len(feats)}")

    le = LabelEncoder()
    df["label_idx"] = le.fit_transform(df["cattle_id"])

    train_df = df[df["split"].str.lower() == "train"].copy()
    test_df  = df[df["split"].str.lower() == "test"].copy()

    X_train = train_df[feats].values
    y_train = train_df["cattle_id"].values
    X_test  = test_df[feats].values
    y_test  = test_df["cattle_id"].values

    # scaling statistics come from the train rows only
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_test_s  = scaler.transform(X_test)

    param_grid = {
        "C": [0.1, 1, 3, 10],
        "gamma": ["scale", 0.1, 0.03, 0.01],
        "kernel": ["rbf"],
    }
    base = SVC(probability=True, random_state=42)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    grid = GridSearchCV(base, param_grid, cv=cv, scoring="accuracy", n_jobs=-1, verbose=1)
    grid.fit(X_train_s, y_train)
    best = grid.best_estimator_

    print(f"\n[best] params: {grid.best_params_}")
    print(f"[best] cv accuracy: {grid.best_score_:.4f}")

    metrics_summary = {"best_params": grid.best_params_, "cv_accuracy": float(grid.best_score_)}
    if len(y_test):
        y_pred = best.predict(X_test_s)
        acc = accuracy_score(y_test, y_pred)
        print(f"\n[Test] Top-1 accuracy: {acc:.4f}")

        # Console classification report. zero_division=0 keeps the same
        # numbers but silences the warnings for never-predicted classes.
        print("\n[Test] Classification report:")
        print(classification_report(y_test, y_pred, zero_division=0))

        # Console confusion matrix
        print("\n[Test] Confusion matrix (rows=true, cols=pred):")
        cm = confusion_matrix(y_test, y_pred, labels=le.classes_)
        header = "pred→," + ",".join(le.classes_)
        print(header)
        for i, cls in enumerate(le.classes_):
            print(cls + "," + ",".join(str(x) for x in cm[i]))

        # Per-class metrics DataFrame. labels= must accompany target_names:
        # without it sklearn infers the label set from y_test/y_pred and
        # raises when that set is smaller than le.classes_.
        report = classification_report(
            y_test, y_pred,
            labels=le.classes_, target_names=le.classes_,
            output_dict=True, zero_division=0,
        )
        per_class_rows = []
        for cls in le.classes_:
            if cls in report:
                row = report[cls]
                per_class_rows.append({
                    "class": cls,
                    "precision": row.get("precision", 0.0),
                    "recall": row.get("recall", 0.0),
                    "f1-score": row.get("f1-score", 0.0),
                    "support": int(row.get("support", 0)),
                })
        df_per_class = pd.DataFrame(per_class_rows)

        # Scores for Top-k and ROC
        y_score = get_scores(best, X_test_s)
        top5 = topk_accuracy_from_scores(y_test, y_score, le, k=5) or 0.0
        print(f"\n[Test] Top-5 accuracy: {top5:.4f}")

        out_dir = Path(OUTPUT_DIR)
        out_dir.mkdir(parents=True, exist_ok=True)

        # Save plots/CSVs
        save_topk_bar(top1=acc, top5=top5, out_path=out_dir / "topk_accuracy.png")
        pd.DataFrame({"metric":["Top-1","Top-5"], "value":[acc, top5]}).to_csv(out_dir / "topk_accuracy.csv", index=False)

        save_per_class_f1(
            df_report=df_per_class,
            out_csv=out_dir / "per_class_metrics.csv",
            out_png_bottom30=out_dir / "per_class_f1_bottom30.png"
        )

        save_confusion_matrix(y_test, y_pred, le, out_csv=out_dir/"confusion_matrix.csv", out_png=out_dir/"confusion_matrix.png")

        # The ROC plot is best-effort: a failure is reported, not fatal.
        try:
            save_multiclass_roc(best, X_test_s, y_test, le, out_dir / "roc_curve.png")
        except Exception as e:
            print(f"[warn] ROC could not be generated: {e}")

        metrics_summary.update({
            "test_top1_acc": float(acc),
            "test_top5_acc": float(top5),
            "n_classes": int(len(le.classes_)),
            "n_test": int(len(y_test)),
        })
    else:
        print("\n[warn] No test rows found in CSV.")

    # Save artifacts
    joblib.dump(best, Path(OUTPUT_DIR) / "svm_model.joblib")
    joblib.dump(scaler, Path(OUTPUT_DIR) / "scaler.joblib")
    (Path(OUTPUT_DIR) / "label_encoder.json").write_text(
        json.dumps({"classes": le.classes_.tolist()}, indent=2), encoding="utf-8"
    )
    (Path(OUTPUT_DIR) / "metrics.json").write_text(json.dumps(metrics_summary, indent=2), encoding="utf-8")

    print(f"\n[save] artifacts and plots in: {OUTPUT_DIR}")


if __name__ == "__main__":
    main()


-----------------------------------------------------------------------------------------------------
Testing the saved SVM artifacts for the LBP + Gabor + Blob + HOG features

In [None]:
import json
from pathlib import Path
import cv2
import numpy as np
import joblib
from skimage.feature import local_binary_pattern, hog, blob_log
from scipy.stats import skew, kurtosis

# ====== USER SETTINGS ======
# Image to classify and the folder that holds the saved SVM artifacts
# (svm_model.joblib, scaler.joblib, label_encoder.json).
IMAGE_PATH = r"C:\Users\Amrit Shah\Desktop\Muzzle based Identification\Second dataset dataset\split\test\cattle_4680\cattle_4680_DSCF1256.jpg"
SVM_DIR    = r"C:\Users\Amrit Shah\Desktop\Muzzle based Identification\Second dataset dataset\svmwithHOG_artifacts"

# Geometry: the image is brought to IMG_SIZE x IMG_SIZE and cut into a
# GRID_SIZE x GRID_SIZE grid of cells.
# NOTE(review): these values presumably have to match the ones used when the
# model was trained -- confirm against the feature-extraction cell.
IMG_SIZE, GRID_SIZE = 224, 5
CELL_SIZE = IMG_SIZE // GRID_SIZE  # 224 // 5 == 44 (integer division)

# LBP params: number of sampling points (P) and radius (R)
LBP_P, LBP_R = 24, 2

# Gabor params: 8 orientations in steps of pi/8 over [0, pi), 6 wavelengths
GABOR_THETAS  = [k * np.pi / 8 for k in range(8)]
GABOR_LAMBDAS = list(range(3, 14, 2))

# ---------------- IMAGE HELPERS ----------------
def imread_rgb(p):
    """Read the image at ``p`` and return it in RGB channel order.

    Args:
        p: Path (``str`` or path-like) of the image file.

    Returns:
        The image converted from OpenCV's BGR order to RGB, or ``None`` when
        ``cv2.imread`` could not load the file.
    """
    bgr = cv2.imread(str(p))
    # cv2.imread reports failure by returning None rather than raising.
    return None if bgr is None else cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

def resize_keep_aspect(img, target=IMG_SIZE):
    """Resize ``img`` so that its shorter side becomes ``target`` pixels.

    The same scale factor is applied to both axes, so the aspect ratio is
    preserved up to rounding. Each output dimension is at least 1 pixel.

    Args:
        img: Image array whose first two axes are (height, width).
        target: Desired length of the shorter side, in pixels.

    Returns:
        The resized image (area interpolation).
    """
    height, width = img.shape[:2]
    # Guard against a zero-length side so the division below cannot fail.
    shorter_side = max(1, min(height, width))
    factor = target / shorter_side
    new_height = max(1, int(round(height * factor)))
    new_width = max(1, int(round(width * factor)))
    # cv2.resize takes the destination size as (width, height).
    return cv2.resize(img, (new_width, new_height), interpolation=cv2.INTER_AREA)

def center_crop(img, size=IMG_SIZE):
    """Return a ``size`` x ``size`` patch taken from the centre of ``img``.

    When the image is smaller than ``size`` along either axis, the largest
    available centred region is taken and then resized to ``size`` x ``size``.

    Args:
        img: Image array whose first two axes are (height, width).
        size: Side length of the square output, in pixels.

    Returns:
        The cropped (and, if necessary, resized) image.
    """
    height, width = img.shape[:2]
    top = max(0, (height - size) // 2)
    left = max(0, (width - size) // 2)
    bottom = min(height, top + size)
    right = min(width, left + size)

    patch = img[top:bottom, left:right]
    if patch.shape[:2] == (size, size):
        return patch
    # Source was too small on at least one axis: stretch to the target size.
    return cv2.resize(patch, (size, size), interpolation=cv2.INTER_AREA)

def preprocess_for_texture(img_rgb):
    """Denoise an RGB image and equalise its lightness channel.

    Steps: bilateral filtering, conversion to LAB, CLAHE on the L channel
    only (A and B are left untouched), conversion back to RGB.

    Args:
        img_rgb: Image in RGB channel order.

    Returns:
        The processed image in RGB channel order.
    """
    bgr = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR)
    smoothed = cv2.bilateralFilter(bgr, d=5, sigmaColor=40, sigmaSpace=40)

    lightness, chan_a, chan_b = cv2.split(cv2.cvtColor(smoothed, cv2.COLOR_BGR2LAB))
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
    equalised_lab = cv2.merge([clahe.apply(lightness), chan_a, chan_b])

    equalised_bgr = cv2.cvtColor(equalised_lab, cv2.COLOR_LAB2BGR)
    return cv2.cvtColor(equalised_bgr, cv2.COLOR_BGR2RGB)

def split_into_grid(img, grid_size=GRID_SIZE):
    """Cut a 2-D image into ``grid_size`` x ``grid_size`` square cells.

    The cell side is ``height // grid_size`` and is used for both axes, so
    any remainder rows/columns at the bottom/right edge are not covered.

    Args:
        img: 2-D (single-channel) image array.
        grid_size: Number of cells along each axis.

    Returns:
        A list of ``grid_size ** 2`` array slices in row-major order.
    """
    # Unpacking exactly two values also rejects inputs that are not 2-D.
    height, _width = img.shape
    side = height // grid_size
    offsets = [k * side for k in range(grid_size)]
    return [img[top:top + side, left:left + side] for top in offsets for left in offsets]

# ---------------- FEATURE FUNCTIONS ----------------
def feat_lbp(gray):
    """Return the normalised histogram of uniform LBP codes of ``gray``.

    Args:
        gray: 2-D grayscale image (or grid cell).

    Returns:
        A float32 vector with ``LBP_P + 2`` bins whose entries sum to ~1.
    """
    codes = local_binary_pattern(gray, P=LBP_P, R=LBP_R, method="uniform")
    # Integer bin edges 0, 1, ..., LBP_P + 2 give one bin per LBP code.
    edges = np.arange(0, LBP_P + 3)
    counts = np.histogram(codes, bins=edges, range=(0, LBP_P+2), density=False)[0]
    counts = counts.astype(np.float32)
    # The small epsilon keeps the division defined for an empty histogram.
    counts /= (counts.sum() + 1e-6)
    return counts

def feat_gabor(gray):
    """Return mean/std of Gabor filter responses over a filter bank.

    One 9x9 kernel is built per (theta, lambda) pair from ``GABOR_THETAS``
    and ``GABOR_LAMBDAS`` (theta is the outer loop), with sigma = 0.56 * lambda
    and aspect ratio 0.5.

    Args:
        gray: 2-D grayscale image (or grid cell).

    Returns:
        A float32 vector of ``[mean, std]`` pairs, one pair per kernel.
    """
    kernel_shape = (9, 9)
    aspect_ratio = 0.5

    stats = []
    for theta in GABOR_THETAS:
        for wavelength in GABOR_LAMBDAS:
            bandwidth = 0.56 * wavelength
            gabor = cv2.getGaborKernel(
                kernel_shape, bandwidth, theta, wavelength, aspect_ratio,
                psi=0, ktype=cv2.CV_32F,
            )
            response = cv2.filter2D(gray, cv2.CV_32F, gabor)
            stats.append(float(response.mean()))
            stats.append(float(response.std()))
    return np.array(stats, dtype=np.float32)

def feat_hog(cell):
    """Return the HOG descriptor of one grid cell as a flat vector.

    Uses 12 orientation bins, HOG cells of ``CELL_SIZE // 2`` pixels per side,
    2x2-cell blocks and L2-Hys block normalisation.

    Args:
        cell: 2-D grayscale grid cell.

    Returns:
        The flattened HOG feature vector produced by ``skimage.feature.hog``.
    """
    half_cell = CELL_SIZE // 2
    hog_options = {
        "orientations": 12,
        "pixels_per_cell": (half_cell, half_cell),
        "cells_per_block": (2, 2),
        "block_norm": 'L2-Hys',
        "visualize": False,
        "feature_vector": True,
    }
    return hog(cell, **hog_options)

def feat_blobs(gray):
    """Return simple blob statistics from Laplacian-of-Gaussian detection.

    Args:
        gray: 2-D grayscale image (or grid cell); values are divided by 255
            before detection.

    Returns:
        A float32 vector ``[blob_count, mean_radius, blobs_per_pixel]``.
        ``mean_radius`` is 0.0 when no blob is found.
    """
    normalised = gray.astype(np.float32) / 255.0
    detections = blob_log(
        normalised,
        min_sigma=1.5,
        max_sigma=6.0,
        num_sigma=5,
        threshold=0.05,
        overlap=0.5,
    )
    count = len(detections)

    if count > 0:
        # Column 2 holds each blob's sigma; sigma * sqrt(2) is used as its radius.
        mean_radius = float((detections[:, 2] * np.sqrt(2)).mean())
    else:
        mean_radius = 0.0

    pixel_count = gray.shape[0]*gray.shape[1]
    return np.array([count, mean_radius, count / pixel_count], dtype=np.float32)

def extract_features_grid(img_rgb):
    """Build the full feature vector for one RGB image.

    The image is resized, centre-cropped to ``IMG_SIZE``, texture-preprocessed
    and converted to grayscale. For every grid cell (row-major order) the
    following are appended, in this order: LBP histogram, Gabor statistics,
    HOG descriptor, blob statistics, then mean / std / skewness / kurtosis of
    the cell's pixel values.

    Args:
        img_rgb: Image in RGB channel order.

    Returns:
        A 1-D float32 feature vector.
    """
    prepared = preprocess_for_texture(
        center_crop(resize_keep_aspect(img_rgb, IMG_SIZE), IMG_SIZE)
    )
    gray = cv2.cvtColor(prepared, cv2.COLOR_RGB2GRAY)

    # The order of descriptors defines the layout of the feature vector.
    descriptors = (feat_lbp, feat_gabor, feat_hog, feat_blobs)

    features = []
    for cell in split_into_grid(gray):
        for describe in descriptors:
            features.extend(describe(cell))
        pixels = cell.flatten()
        features.extend([pixels.mean(), pixels.std(), skew(pixels), kurtosis(pixels)])
    return np.array(features, dtype=np.float32)

# ---------------- LOAD SVM ARTIFACTS ----------------
# Load the trained classifier, the feature scaler and the class-name list
# from SVM_DIR.
# NOTE: joblib.load unpickles arbitrary Python objects -- only point SVM_DIR
# at artifact folders you trust.
svm_model = joblib.load(Path(SVM_DIR) / "svm_model.joblib")
scaler    = joblib.load(Path(SVM_DIR) / "scaler.joblib")
# read_text opens and closes the file itself, so no file handle is left open
# (the previous json.load(open(...)) never closed it).
le_data   = json.loads((Path(SVM_DIR) / "label_encoder.json").read_text(encoding="utf-8"))
classes   = le_data["classes"]

# ---------------- RUN INFERENCE ----------------
# Read the query image; imread_rgb returns None when the file cannot be loaded.
img = imread_rgb(IMAGE_PATH)
if img is None:
    raise ValueError(f"Could not read image: {IMAGE_PATH}")

# Shape the feature vector as a single-row 2-D array before scaling.
features = extract_features_grid(img).reshape(1, -1)
features_scaled = scaler.transform(features)

pred_idx = svm_model.predict(features_scaled)[0]

# Class probabilities are optional: only query them if the model exposes them.
probs = (
    svm_model.predict_proba(features_scaled)[0]
    if hasattr(svm_model, "predict_proba")
    else None
)

# Translate the predicted label into its position within the model's class
# list, then look up the class name stored at that position.
# NOTE(review): assumes `classes` is ordered like svm_model.classes_ -- confirm.
pred_class = classes[svm_model.classes_.tolist().index(pred_idx)]
print(f"Predicted class: {pred_class}")

if probs is not None:
    # Indices of the (up to) five largest probabilities, highest first.
    top5_idx = np.argsort(probs)[::-1][:5]
    print("Top-5 predictions:")
    for i in top5_idx:
        print(f"  {classes[i]}: {probs[i]:.4f}")
