In [1]:
from pathlib import Path
import random
import shutil
from collections import defaultdict

# ====== USER SETTINGS ======
# Input root: main() treats every sub-folder of RAW_DATASET as one identity
# and uses the sub-folder name as the class id.
RAW_DATASET = r"D:\Abhishek\base_256"     
OUTPUT_DATASET = r"D:\Abhishek\split"  # where to save train/val/test
# Per-identity fractions used to size the train and val slices.
TRAIN_RATIO = 0.6
VAL_RATIO = 0.2
# NOTE(review): TEST_RATIO is not read anywhere in this cell; the test slice
# is whatever remains after the train and val slices are taken.
TEST_RATIO = 0.2
SEED = 42  # passed to random.seed() below
MIN_PER_ID = 3  # identities with fewer images than this are skipped
COPY_FILES = True   # set to False to move instead of copy
# ===========================

# Lower-cased file suffixes that are accepted as images.
VALID_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff"}
random.seed(SEED)

def list_images(folder: Path):
    """Return the image files directly inside ``folder``, sorted by path.

    Only regular files whose lower-cased suffix is in ``VALID_EXTS`` are
    returned; sub-folders are not descended into.

    ``Path.iterdir()`` yields entries in arbitrary, platform-dependent order.
    Sorting here gives callers a stable starting order, so a seeded shuffle of
    the result produces the same split on every machine.
    """
    return sorted(
        p for p in folder.iterdir()
        if p.is_file() and p.suffix.lower() in VALID_EXTS
    )

def safe_mkdir(p: Path):
    """Create directory ``p`` together with any missing parents.

    Does nothing if the directory already exists.
    """
    p.mkdir(exist_ok=True, parents=True)

def _split_sizes(n: int):
    """Return ``(n_train, n_val, n_test)`` for an identity with ``n`` images.

    Train and val sizes are ``round(n * ratio)`` with a floor of one image
    each; the test size is the remainder, also floored at one. If the floors
    push the total above ``n``, images are taken back from train, then val,
    then test (never below one each) until the total fits or nothing more can
    be removed.
    """
    n_train = max(1, int(round(n * TRAIN_RATIO)))
    n_val = max(1, int(round(n * VAL_RATIO)))
    n_test = max(1, n - n_train - n_val)

    while n_train + n_val + n_test > n:
        if n_train > 1:
            n_train -= 1
        elif n_val > 1:
            n_val -= 1
        elif n_test > 1:
            n_test -= 1
        else:
            break
    return n_train, n_val, n_test


def main():
    """Split every identity folder of RAW_DATASET into train/val/test folders.

    For each sub-folder of ``RAW_DATASET`` with at least ``MIN_PER_ID`` images,
    the images are shuffled and copied (or moved, when ``COPY_FILES`` is
    false) into ``OUTPUT_DATASET/<split>/<identity>/``. A per-identity summary
    is printed at the end.

    Raises:
        FileNotFoundError: if ``RAW_DATASET`` is not an existing directory.
    """
    raw_root = Path(RAW_DATASET)
    out_root = Path(OUTPUT_DATASET)

    # Fail before touching the output tree if the input is missing.
    if not raw_root.is_dir():
        raise FileNotFoundError(f"RAW_DATASET is not a directory: {raw_root}")

    split_names = ("train", "val", "test")

    # Prepare output dirs
    for split in split_names:
        safe_mkdir(out_root / split)

    # Files left over from an earlier run (different seed or ratios) are never
    # removed, so the same image could end up in two splits. Warn about it.
    stale = [s for s in split_names if any((out_root / s).iterdir())]
    if stale:
        print(
            f"[warn] output already contains data in: {', '.join(stale)}; "
            "files from an earlier run are not removed and may overlap with the new split"
        )

    op = shutil.copy2 if COPY_FILES else shutil.move

    # A dedicated generator makes the split depend only on SEED, not on how
    # much of the global random state was consumed earlier in the kernel.
    rng = random.Random(SEED)

    split_counts = {}

    for cid_dir in sorted(raw_root.iterdir()):
        if not cid_dir.is_dir():
            continue
        cid = cid_dir.name
        # Sort first: directory listing order is arbitrary, and the shuffle is
        # only reproducible when it starts from a fixed order.
        imgs = sorted(list_images(cid_dir))

        if len(imgs) < MIN_PER_ID:
            print(f"[skip] {cid}: only {len(imgs)} images (< {MIN_PER_ID})")
            continue

        rng.shuffle(imgs)
        n_train, n_val, _ = _split_sizes(len(imgs))

        # The test slice takes everything after train and val.
        per_split = {
            "train": imgs[:n_train],
            "val": imgs[n_train:n_train + n_val],
            "test": imgs[n_train + n_val:],
        }

        split_counts[cid] = {}
        for split_name in split_names:
            split_imgs = per_split[split_name]
            dst_dir = out_root / split_name / cid
            safe_mkdir(dst_dir)
            for img in split_imgs:
                op(str(img), str(dst_dir / img.name))
            split_counts[cid][split_name] = len(split_imgs)

    print("\n[done] Dataset split complete!")
    for cid, counts in split_counts.items():
        print(f"{cid}: train={counts['train']} val={counts['val']} test={counts['test']}")

if __name__ == "__main__":
    main()


[skip] cattle-014: only 1 images (< 3)
[skip] cattle-021: only 2 images (< 3)
[skip] cattle-025: only 2 images (< 3)
[skip] cattle-027: only 1 images (< 3)
[skip] cattle-030: only 2 images (< 3)
[skip] cattle-036: only 1 images (< 3)
[skip] cattle-038: only 2 images (< 3)
[skip] cattle-039: only 1 images (< 3)
[skip] cattle-040: only 1 images (< 3)
[skip] cattle-045: only 1 images (< 3)
[skip] cattle-050: only 2 images (< 3)
[skip] cattle-057: only 1 images (< 3)
[skip] cattle-058: only 2 images (< 3)
[skip] cattle-060: only 2 images (< 3)
[skip] cattle-061: only 2 images (< 3)
[skip] cattle-062: only 1 images (< 3)
[skip] cattle-063: only 2 images (< 3)
[skip] cattle-064: only 1 images (< 3)
[skip] cattle-065: only 1 images (< 3)
[skip] cattle-069: only 2 images (< 3)
[skip] cattle-077: only 2 images (< 3)
[skip] cattle-083: only 1 images (< 3)
[skip] cattle-084: only 2 images (< 3)
[skip] cattle-085: only 1 images (< 3)
[skip] cattle-089: only 2 images (< 3)
[skip] cattle-091: only 1

In [2]:
from pathlib import Path

# ==== Change this to your split dataset root ====
# Folder expected to contain the train/, val/ and test/ sub-folders.
DATASET_ROOT = r"D:\Abhishek\split"
# ===============================================

# Lower-cased file suffixes that are counted as images.
VALID_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff"}

def count_images_in_split(split_path: Path):
    """Count the image files one level below ``split_path``.

    Every sub-folder of ``split_path`` is scanned (non-recursively) for
    regular files whose lower-cased suffix is in ``VALID_EXTS``. Entries that
    are not regular files are ignored even if their name ends in an image
    suffix, matching the filter used when the split was created.

    Returns:
        int: total number of image files across all sub-folders.
    """
    return sum(
        1
        for cid_folder in split_path.iterdir()
        if cid_folder.is_dir()
        for img in cid_folder.iterdir()
        if img.is_file() and img.suffix.lower() in VALID_EXTS
    )

def main():
    """Print the image count of each split folder under DATASET_ROOT.

    A warning line is printed for any of train/val/test that does not exist.
    """
    dataset_root = Path(DATASET_ROOT)
    for split_name in ("train", "val", "test"):
        split_path = dataset_root / split_name
        if not split_path.exists():
            print(f"[warn] {split_name} folder not found.")
            continue
        count = count_images_in_split(split_path)
        print(f"{split_name}: {count} images")

if __name__ == "__main__":
    main()


train: 1488 images
val: 549 images
test: 577 images


In [10]:
#!/usr/bin/env python3
from pathlib import Path
import numpy as np
import pandas as pd
import cv2
from skimage.feature import local_binary_pattern, blob_log

# ============ USER SETTINGS ============
DATASET_ROOT = r"D:\Abhishek\split"   # contains train/, val/, test/
# Destination of the per-image feature table written by main().
CSV_OUT      = r"D:\Abhishek\muzzle_features_LPB_Gabor_blob.csv"
# Target short-side length for resizing and side length of the centre crop.
IMG_SIZE     = 224
# LBP params (uniform -> histogram bins = P+2)
LBP_P = 16  # passed as P to local_binary_pattern
LBP_R = 2   # passed as R to local_binary_pattern
# Gabor params
# One filter is built per (theta, lambda) pair; each contributes a mean and a
# std, so the Gabor part has 2 * len(GABOR_THETAS) * len(GABOR_LAMBDAS) values.
GABOR_THETAS = [0, np.pi/4, np.pi/2, 3*np.pi/4]
GABOR_LAMBDAS = [4, 8, 16]
# ======================================

# Lower-cased file suffixes that are accepted as images.
VALID_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff"}

def imread_rgb(p):
    """Read the image at path ``p`` and return it with RGB channel order.

    Returns ``None`` when ``cv2.imread`` cannot load the file.
    """
    bgr = cv2.imread(str(p))
    return None if bgr is None else cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

def resize_keep_aspect(img, target_short=IMG_SIZE):
    """Resize ``img`` so its shorter side equals ``target_short``.

    Both sides are scaled by the same factor, so the aspect ratio is kept up
    to rounding; each output side is at least one pixel.

    The interpolation depends on the direction of the resize. INTER_AREA is
    kept for shrinking (and for a no-op resize). Enlarging uses INTER_LINEAR,
    because INTER_AREA behaves like nearest-neighbour when enlarging and the
    resulting blockiness distorts texture descriptors computed afterwards.
    """
    h, w = img.shape[:2]
    short = min(h, w)
    # Guard against a zero-length side so the division is always defined.
    scale = target_short / (short if short > 0 else 1)
    nh, nw = max(1, int(round(h*scale))), max(1, int(round(w*scale)))
    interpolation = cv2.INTER_AREA if scale <= 1.0 else cv2.INTER_LINEAR
    return cv2.resize(img, (nw, nh), interpolation=interpolation)

def center_crop(img, size=IMG_SIZE):
    """Return a ``size`` x ``size`` patch taken from the centre of ``img``.

    If the image is smaller than ``size`` along either axis, the largest
    available centred patch is taken and then resized to ``size`` x ``size``.
    """
    height, width = img.shape[:2]
    top = max(0, (height - size) // 2)
    left = max(0, (width - size) // 2)
    bottom = min(height, top + size)
    right = min(width, left + size)
    patch = img[top:bottom, left:right]
    if patch.shape[0] == size and patch.shape[1] == size:
        return patch
    # The image was too small on at least one axis: stretch to the target.
    return cv2.resize(patch, (size, size), interpolation=cv2.INTER_AREA)

def preprocess_for_texture(img_rgb):
    """Smooth an RGB image and equalise the contrast of its lightness channel.

    Steps: bilateral filter on the BGR image, conversion to LAB, CLAHE on the
    L channel only, then conversion back to RGB.
    """
    bgr = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR)
    smoothed = cv2.bilateralFilter(bgr, d=5, sigmaColor=40, sigmaSpace=40)
    lightness, chan_a, chan_b = cv2.split(cv2.cvtColor(smoothed, cv2.COLOR_BGR2LAB))
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
    lab_equalized = cv2.merge([clahe.apply(lightness), chan_a, chan_b])
    bgr_equalized = cv2.cvtColor(lab_equalized, cv2.COLOR_LAB2BGR)
    return cv2.cvtColor(bgr_equalized, cv2.COLOR_BGR2RGB)

def feat_lbp(gray, P=LBP_P, R=LBP_R):
    """Return the normalised histogram of uniform LBP codes of ``gray``.

    The histogram has ``P + 2`` unit-width bins and is returned as float32,
    divided by its sum (plus a small epsilon to avoid division by zero).
    """
    codes = local_binary_pattern(gray, P=P, R=R, method="uniform")
    # Explicit edges 0, 1, ..., P+2 define the P+2 bins; with explicit edges
    # np.histogram does not consult a separate range argument.
    edges = np.arange(P + 3)
    counts = np.histogram(codes, bins=edges)[0].astype(np.float32)
    counts /= (counts.sum() + 1e-6)
    return counts

def feat_gabor(gray):
    """Return mean and std of the Gabor response for each (theta, lambda).

    Filters are applied in the order of GABOR_THETAS (outer) and
    GABOR_LAMBDAS (inner). The result is a float32 vector of length
    ``2 * len(GABOR_THETAS) * len(GABOR_LAMBDAS)``, laid out as
    ``[mean, std, mean, std, ...]``.
    """
    # NOTE(review): the kernel is fixed at 9x9 while sigma = 0.56 * lambda
    # reaches about 9 px for lambda = 16, so the largest filters look
    # truncated. Left unchanged here because altering it changes every
    # extracted feature; confirm whether this is intended.
    kernel_shape = (9, 9)
    aspect_ratio = 0.5
    stats = []
    for theta in GABOR_THETAS:
        for wavelength in GABOR_LAMBDAS:
            kern = cv2.getGaborKernel(
                kernel_shape, 0.56 * wavelength, theta, wavelength, aspect_ratio,
                psi=0, ktype=cv2.CV_32F,
            )
            response = cv2.filter2D(gray, cv2.CV_32F, kern)
            stats.append(float(response.mean()))
            stats.append(float(response.std()))
    return np.array(stats, dtype=np.float32)

def feat_blobs(gray):
    """Return ``[blob count, mean blob radius, blobs per pixel]`` as float32.

    Blobs are detected with ``blob_log`` on the image rescaled by 1/255.
    Each radius is the detected sigma (third column of the ``blob_log``
    result) times sqrt(2); the mean radius is 0.0 when nothing is detected.
    """
    unit_scaled = gray.astype(np.float32) / 255.0
    detections = blob_log(
        unit_scaled, min_sigma=1.5, max_sigma=6.0, num_sigma=5, threshold=0.05, overlap=0.5
    )
    count = len(detections)
    if count > 0:
        mean_radius = float((detections[:, 2] * np.sqrt(2)).mean())
    else:
        mean_radius = 0.0
    blobs_per_pixel = count / (gray.shape[0]*gray.shape[1])
    return np.array([count, mean_radius, blobs_per_pixel], dtype=np.float32)

def extract_features(img_rgb):
    """Compute the full float32 feature vector for one RGB image.

    The image is resized (short side IMG_SIZE), centre-cropped, preprocessed
    and converted to grayscale. The vector concatenates, in order: the LBP
    histogram, the Gabor statistics, the blob statistics, and the grayscale
    mean and standard deviation.
    """
    prepared = preprocess_for_texture(
        center_crop(resize_keep_aspect(img_rgb, IMG_SIZE), IMG_SIZE)
    )
    gray = cv2.cvtColor(prepared, cv2.COLOR_RGB2GRAY)
    parts = [
        feat_lbp(gray),      # LBP_P + 2 values
        feat_gabor(gray),    # 2 per (theta, lambda) pair
        feat_blobs(gray),    # 3 values
    ]
    parts.append([float(gray.mean()), float(gray.std())])
    return np.concatenate(parts).astype(np.float32)

def walk_split(root: Path, split_name: str):
    """List ``(split_name, cattle_id, image_path)`` for every image in a split.

    ``root / split_name`` is expected to contain one sub-folder per identity.
    Only regular files whose lower-cased suffix is in ``VALID_EXTS`` are
    listed. Both identities and images are visited in sorted order, because
    ``Path.iterdir()`` has no defined order and the row order of the resulting
    CSV should not depend on the file system.

    Returns:
        list of tuples; empty when the split folder does not exist.
    """
    split_dir = root / split_name
    if not split_dir.exists():
        return []
    rows = []
    for cid_dir in sorted(d for d in split_dir.iterdir() if d.is_dir()):
        cattle_id = cid_dir.name
        for img_path in sorted(cid_dir.iterdir()):
            if not img_path.is_file() or img_path.suffix.lower() not in VALID_EXTS:
                continue
            rows.append((split_name, cattle_id, img_path))
    return rows

def make_column_names(example_feat: np.ndarray):
    """Build the CSV header matching the layout of one feature vector.

    The header is three metadata columns followed by the LBP, Gabor, blob and
    intensity columns. An assertion checks that ``example_feat`` has exactly
    as many entries as there are feature columns.
    """
    n_lbp = LBP_P + 2
    n_gabor = 2 * len(GABOR_THETAS) * len(GABOR_LAMBDAS)
    blob_cols = ["blob_count", "blob_mean_r", "blob_density"]
    intensity_cols = ["mean_intensity", "std_intensity"]

    expected = n_lbp + n_gabor + len(blob_cols) + len(intensity_cols)
    assert example_feat.size == expected, \
        f"Feature length mismatch: got {example_feat.size}, expected {expected}"

    return (
        ["split", "cattle_id", "image_path"]
        + [f"lbp_{i}" for i in range(n_lbp)]
        + [f"gabor_{i}" for i in range(n_gabor)]
        + blob_cols
        + intensity_cols
    )

def main():
    """Extract features for every image in train/val/test and write CSV_OUT.

    Unreadable images are reported and skipped. Nothing is written when no
    image is found or no feature vector could be extracted.
    """
    dataset_root = Path(DATASET_ROOT)
    samples = [
        sample
        for split in ("train", "val", "test")
        for sample in walk_split(dataset_root, split)
    ]
    if not samples:
        print(f"[error] No images found under {DATASET_ROOT} (expected train/val/test subfolders).")
        return

    records = []
    first_feat = None
    for split, cattle_id, img_path in samples:
        rgb = imread_rgb(img_path)
        if rgb is None:
            print(f"[skip] unreadable: {img_path}")
            continue
        vector = extract_features(rgb)
        if first_feat is None:
            # Kept only to validate the header length below.
            first_feat = vector
        records.append([split, cattle_id, str(img_path), *vector])

    if not records:
        print("[error] No features extracted.")
        return

    # build columns dynamically to match feature lengths
    columns = make_column_names(first_feat)
    table = pd.DataFrame(records, columns=columns)

    Path(CSV_OUT).parent.mkdir(parents=True, exist_ok=True)
    table.to_csv(CSV_OUT, index=False)
    print(f"[ok] wrote features: {CSV_OUT}")
    print(f"[info] rows: {len(table)}, feature dims: {len(columns) - 3}")

if __name__ == "__main__":
    main()


[ok] wrote features: D:\Abhishek\muzzle_features_LPB_Gabor_blob.csv
[info] rows: 2614, feature dims: 47


In [11]:
#!/usr/bin/env python3
import os, json
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# ============ USER SETTINGS ============
# Feature table to read; main() requires 'cattle_id' and 'split' columns.
CSV_PATH   = r"D:\Abhishek\muzzle_features_LPB_Gabor_blob.csv"
# Folder that receives the fitted model, the scaler and the class list.
OUTPUT_DIR = r".\svm_artifacts"
# ======================================

# Auto-pick numeric features (exclude these columns)
# NOTE(review): any other numeric column present in the frame when
# pick_features() runs is treated as a feature.
EXCLUDE = {"cattle_id","image_path","split"}

def pick_features(df):
    """Return the names of the numeric columns of ``df`` not in EXCLUDE.

    Column order follows ``df.columns``.

    Raises:
        ValueError: if no such column exists.
    """
    numeric_cols = []
    for name in df.columns:
        if name in EXCLUDE:
            continue
        if pd.api.types.is_numeric_dtype(df[name]):
            numeric_cols.append(name)
    if len(numeric_cols) == 0:
        raise ValueError("No numeric feature columns found.")
    return numeric_cols

def topk_accuracy(clf, X, y, le, k=5):
    """Return the fraction of rows whose true label is among the top-k scores.

    Scores come from ``clf.decision_function`` when available, otherwise from
    ``clf.predict_proba``. A 1-D score vector (binary case) is expanded to two
    columns ``[-s, s]``.

    Score columns are ordered like ``clf.classes_``, so that array translates
    column indices into labels. ``le`` is used only as a fallback when the
    classifier exposes no ``classes_``. Indexing through ``le`` directly is
    wrong whenever the encoder was fitted on more labels than the classifier
    was trained on, because the two index spaces then differ.

    Args:
        clf: fitted classifier.
        X: 2-D feature array, one row per sample.
        y: true labels, in the same label space as ``clf.classes_``.
        le: fitted label encoder (fallback source of the class order).
        k: number of top-scoring classes to accept; capped at the number of
            score columns.

    Returns:
        float in [0, 1], or ``None`` when ``X`` has no rows or the classifier
        offers neither scoring method. A true label the classifier does not
        know counts as a miss.
    """
    if X.shape[0] == 0:
        return None
    if hasattr(clf, "decision_function"):
        scores = clf.decision_function(X)
    elif hasattr(clf, "predict_proba"):
        scores = clf.predict_proba(X)
    else:
        return None
    scores = np.asarray(scores)
    if scores.ndim == 1:
        # Binary case: column 0 scores classes_[0], column 1 scores classes_[1].
        scores = np.vstack([-scores, scores]).T

    classes = getattr(clf, "classes_", None)
    if classes is None:
        classes = le.classes_
    classes = np.asarray(classes)

    k_eff = min(k, scores.shape[1])
    topk_cols = np.argsort(scores, axis=1)[:, ::-1][:, :k_eff]
    topk_labels = classes[topk_cols]                      # (n_samples, k_eff)
    y_true = np.asarray(y).reshape(-1, 1)
    hits = (topk_labels == y_true).any(axis=1)
    return float(np.mean(hits))

def main():
    """Train an RBF SVM on the train split, evaluate on test, save artifacts.

    Reads CSV_PATH, selects the numeric feature columns, standardises them
    with statistics from the train rows, and grid-searches ``C`` and ``gamma``
    with 5-fold stratified cross-validation on the train rows. It then reports
    top-1 / top-5 accuracy, a classification report and a confusion matrix on
    the test rows. The best model, the scaler and the class list are written
    to OUTPUT_DIR.

    Raises:
        ValueError: if a required column is missing or there are no train rows.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    df = pd.read_csv(CSV_PATH)

    # ensure required columns
    for col in ["cattle_id","split"]:
        if col not in df.columns:
            raise ValueError(f"CSV must contain '{col}' column.")

    feats = pick_features(df)
    print(f"[info] feature dims: {len(feats)}")

    # The encoder is fitted on every identity in the CSV; it supplies the
    # label order of the confusion matrix and the saved class list.
    le = LabelEncoder()
    le.fit(df["cattle_id"])

    # Use train for training and test for evaluation (val is unused here, but kept in CSV)
    split_lower = df["split"].str.lower()
    train_df = df[split_lower == "train"]
    test_df  = df[split_lower == "test"]

    # Fail with a clear message instead of an error from deep inside sklearn.
    if train_df.empty:
        raise ValueError("No train rows found in CSV; cannot fit the classifier.")

    X_train = train_df[feats].values
    y_train = train_df["cattle_id"].values
    X_test  = test_df[feats].values
    y_test  = test_df["cattle_id"].values

    # scale
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    # Skip the transform on an empty test set; it would raise before the
    # "[warn] No test rows" branch below could run.
    X_test_s  = scaler.transform(X_test) if len(y_test) else X_test

    # grid search RBF SVM
    param_grid = {
        "C": [0.1, 1, 3, 10],
        "gamma": ["scale", 0.1, 0.03, 0.01],
        "kernel": ["rbf"],
    }
    # NOTE(review): probability=True makes every fit slower, and nothing in
    # this cell calls predict_proba. Kept because the saved model may be
    # consumed elsewhere; confirm before removing.
    base = SVC(probability=True, random_state=42)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    grid = GridSearchCV(base, param_grid, cv=cv, scoring="accuracy", n_jobs=-1, verbose=1)
    grid.fit(X_train_s, y_train)
    best = grid.best_estimator_

    print(f"\n[best] params: {grid.best_params_}")
    print(f"[best] cv accuracy: {grid.best_score_:.4f}")

    # evaluate
    if len(y_test):
        y_pred = best.predict(X_test_s)
        acc = accuracy_score(y_test, y_pred)
        print(f"\n[Test] Top-1 accuracy: {acc:.4f}")
        print("\n[Test] Classification report:")
        # Many identities have one or two test images and are never predicted.
        # zero_division=0 reports their undefined precision as 0.0 (the same
        # value as before) without one warning per metric.
        print(classification_report(y_test, y_pred, zero_division=0))

        cm = confusion_matrix(y_test, y_pred, labels=le.classes_)
        print("\n[Test] Confusion matrix (rows=true, cols=pred):")
        header = "pred→," + ",".join(le.classes_)
        print(header)
        for i, cls in enumerate(le.classes_):
            print(cls + "," + ",".join(str(x) for x in cm[i]))

        top5 = topk_accuracy(best, X_test_s, y_test, le, k=5)
        if top5 is not None:
            print(f"\n[Test] Top-5 accuracy: {top5:.4f}")
    else:
        print("\n[warn] No test rows found in CSV.")

    # save artifacts
    out_dir = Path(OUTPUT_DIR)
    joblib.dump(best, out_dir / "svm_model.joblib")
    joblib.dump(scaler, out_dir / "scaler.joblib")
    (out_dir / "label_encoder.json").write_text(
        json.dumps({"classes": le.classes_.tolist()}, indent=2), encoding="utf-8"
    )
    print(f"\n[save] artifacts in: {OUTPUT_DIR}")

if __name__ == "__main__":
    main()


[info] feature dims: 47
Fitting 5 folds for each of 16 candidates, totalling 80 fits





[best] params: {'C': 10, 'gamma': 0.03, 'kernel': 'rbf'}
[best] cv accuracy: 0.5296

[Test] Top-1 accuracy: 0.5477

[Test] Classification report:
              precision    recall  f1-score   support

  cattle-001       1.00      0.50      0.67         2
  cattle-002       0.00      0.00      0.00         1
  cattle-003       1.00      1.00      1.00         1
  cattle-004       0.00      0.00      0.00         1
  cattle-005       0.00      0.00      0.00         1
  cattle-006       0.50      1.00      0.67         1
  cattle-007       0.00      0.00      0.00         1
  cattle-008       1.00      1.00      1.00         1
  cattle-009       0.00      0.00      0.00         1
  cattle-010       0.00      0.00      0.00         2
  cattle-011       0.50      1.00      0.67         1
  cattle-012       0.00      0.00      0.00         1
  cattle-013       1.00      1.00      1.00         1
  cattle-015       0.50      1.00      0.67         1
  cattle-016       0.00      0.00      0.0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



[Test] Top-5 accuracy: 0.7296

[save] artifacts in: .\svm_artifacts
