# 1: Base Data Inspection

Feel free to minimise from Base Data Inspection, I got lost down rabbit holes before I realised this was a machine learning subject not feature engineering. 

That said I found there was ~100,000 duplicate rows, 0 duplicated columns, 0 constant columns. 

Chose not to remove the duplicate rows as, I was to *consider that the amount of data for each species in the database available is an indication of its abundance or rarity*

Making it a Hierarchal Multiclassification, edge detection problem, given the scarcity of specific wood types, the target to inditify would be less likely to have duplicates then others, resulting in more complicated classification in this scenario.

## 1.01 | Installing Required Libraries

In [None]:
# Bootstrap Package Installs
import importlib
import subprocess
import sys
libraries = {
    "numpy": "numpy",
    "polars": "polars",
    "matplotlib": "matplotlib",
    "seaborn": "seaborn",
    "scikit-learn": "sklearn",
    "xgboost": "xgboost",
    "joblib": "joblib",
    "opencv-python": "cv2",
    "cvxopt": "cvxopt",
    "graphviz": "graphviz",
    "pickle": "pickle",
    "logging": "logging",
    "imbalanced-learn": "imbalanced-learn",
    "numba": "numba",
    "gc": "gc",
}

for pip_name, import_name in libraries.items():
    try:
        importlib.import_module(import_name)
        print(f"{pip_name} Already Installed")
    except ImportError:
        print(f"{pip_name} Installing")
        subprocess.check_call([sys.executable, "-m","pip","install","--upgrade", pip_name])
print("Libraries Ready")


In [None]:
# For simplicity:
import os
import gc
import numpy as np
import numba as noomba
import seaborn as sns


import polars as pl
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

from math import log2
from matplotlib.lines import Line2D

import logging
logging.basicConfig(
    level= logging.INFO,
    format="%(levelname)s: %(message)s"
)
ilog = logging.info

FIGSIZE = (10,6)
FONTSIZE = 18

## 1.02 | Definitions / Unfinished

#### Helper | Check Delete

In [None]:
def check_delete(file):
    if file.exists():
        file.unlink()
        print(f"Removed {file}")

#### **Information Gain**:

#### Helper | Entropy

In [None]:
def entropy(y):
    counts = np.bincount(y)
    probabilities = counts / len(y)
    return -np.sum([p * log2(p) for p in probabilities if p > 0])

Helper | Information Gain

In [None]:
def information_gain(df, feature, target):
    total_entropy = entropy(df[target].to_numpy())
    values, counts = np.unique(df[feature].to_numpy(), return_counts=True)
    weighted_entropy = np.sum([
        (counts[i] / np.sum(counts)) * entropy(df.loc[df[feature] == v, target].to_numpy())
        for i, v in enumerate(values)
    ])
    return total_entropy - weighted_entropy

#### Helper | Intrinsic Information

In [None]:
def intrinsic_information(df, feature):
    values, counts = np.unique(df[feature].to_numpy(), return_counts=True)
    probabilities = counts / len(df)
    return -np.sum([p * log2(p) for p in probabilities if p > 0])

#### Helper | gain_ratio

In [None]:
def gain_ratio(df, feature, target):
    ig = information_gain(df, feature, target)
    ii = intrinsic_information(df, feature)
    return ig / ii if ii != 0 else 0

#### Helper | Stats

In [None]:
def stats(dataframe, features):
    if isinstance(features, str):
        features = [features]
    for f in features:
        mean = dataframe[f].mean()
        median = dataframe[f].median()
        std = dataframe[f].std()
        print(f"{f} Statistics: Mean: {mean:.4f}, Median: {median:.4f}, Standard Deviation: {std:.4f}")

Helper | Squish

In [None]:
def squish(INS: Path, OUT: Path, CLASS_HEADERS: list[str]):      
    files = [
"Data\BGLBP.parquet",
"Data\CSLBP.parquet",
"Data\CSSILTP.parquet",
"Data\OLBP.parquet",
"Data\SCSLBP.parquet",
"Data\SILTP.parquet",
"Data\Tchebyshev.parquet",
]
    dfs = [pl.read_parquet(f) for f in files]
    master = dfs[0]
    for df in dfs[1:]:
        feat_cols = [c for c in df.columns if c.lower() not in CLASS_HEADERS]
        master = master.hstack(df.select(feat_cols))
    master.write_parquet(OUT)

Helper | Check File Counts

In [None]:
def check_file_counts(path, filter:str):
    return len(list(path.glob(filter)))

## 1.03 | Config / Basic Raw_File Cleaning:

In [None]:
BASE = Path("Data")
RAW = BASE / "Raw"

directories = [
    BASE, 
    RAW,
]
for directory in directories:
    directory.mkdir(parents = True, exist_ok = True) 
    print(f"Directory Check: {directory}")

File Name Cleaning & Parquet transformation for Local Machine Memory

In [None]:
before = check_file_counts(RAW, "*.csv")
print(f"Before: {before}")

In [None]:
CLASS_HEADERS = ["family", "genus", "species"]

In [None]:
import polars as pl
for file in RAW.glob("*.csv"):
    name = file.stem.split("_")[1]
    print(name)
    df = pl.read_csv(file, has_header = False)
    ncols = len(df.columns) 
    new_headers = CLASS_HEADERS + [f"{name}_{i}" for i in range(1, ncols - len(CLASS_HEADERS) + 1)]
    df = df.rename(dict(zip(df.columns, new_headers)))
    df.write_parquet(BASE / f"{name}.parquet")
    print(f"Rows x Cols: {len(df)} x {ncols}")
    

In [None]:
after = check_file_counts(BASE, "*.parquet")
print(f"After: {after}")

Create master Dataset

In [None]:
MASTER = BASE / "master.parquet"
check_delete(MASTER)
squish(BASE, MASTER, CLASS_HEADERS)

Read

In [None]:
data = pl.read_parquet(MASTER)
data = data.to_pandas()

## 1.04 | Base Data Inspection: Introduction

stats

In [None]:
data.describe()

classes visualisation

In [None]:
data.hist(column = classes, xlabelsize = 10, ylabelsize = 10, figsize = (14,14), bins = 21)

class line

In [None]:
newdata = data.copy()
newdata["index"] = range(1, len(newdata) + 1)
np_data = newdata.loc[:,["family", "genus", "species", "index"]].to_numpy()
family, genus, species, index = np_data[:,0], np_data[:,1], np_data[:,2], np_data[:, 3]

In [None]:
FIGURESIZE = (10,6)
FONTSIZE = 18
nind = len(newdata["index"])

In [None]:
plt.figure(figsize=FIGURESIZE)
plt.plot(index, family, "r", label="Family")
plt.plot(index, genus, "g", label="Genus")
plt.plot(index, species, "b", label="Species")
plt.fill_between(index, family, 0, color="r", alpha=0.3)
plt.fill_between(index, genus, family, color="g", alpha=0.3)
plt.fill_between(index, species, genus, color="b", alpha=0.3)
plt.ylabel("Instance as Table Index", fontsize=FONTSIZE)
plt.xlabel("Index", fontsize=FONTSIZE)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=FIGURESIZE)
plt.plot(index, family, "r", label="Family")
plt.plot(index, genus, "g", label="Genus")
plt.plot(index, species, "b", label="Species")
plt.xlim(0, nind)
plt.ylim(-10, 70)
plt.fill_between(index, family, 0, color="r", alpha=0.3)
plt.fill_between(index, genus, family, color="g", alpha=0.3)
plt.fill_between(index, species, genus, color="b", alpha=0.3)
plt.ylabel("Instance as Table Index", fontsize=FONTSIZE)
plt.xlabel("Index", fontsize=FONTSIZE)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
files = [
"Data\BGLBP.parquet",
"Data\CSLBP.parquet",
"Data\CSSILTP.parquet",
"Data\OLBP.parquet",
"Data\SCSLBP.parquet",
"Data\SILTP.parquet",
"Data\Tchebyshev.parquet",
]

Histograms and Correlation Matrix

In [None]:
for file in files:
    df = pd.read_parquet(file)
    print(file)
    matrix = df.corr()
    classes = ["family", "genus", "species"]
    features = [c for c in df.columns if c not in classes]
    df.hist(column = features, xlabelsize = 5, ylabelsize = 5, figsize = (10,6), bins = 21)
    plt.figure(figsize=(40,40))
    sns.heatmap(matrix, annot = True, cmap = "coolwarm", fmt = ".2f", linewidths = 0.5)
    plt.title(f"Correlation Matrix: {file}")

In [None]:
df = data.copy()
classes = ["family", "genus", "species"]
features = [col for col in df.columns if col not in classes]

In [None]:
for c in classes:
    min, max = df[c].min(), df[c].max()
    exp = set(range(min, max + 1))
    present = set(df[c].unique())
    missing = sorted(exp - present)
    unique = len(df[c].unique())
    print("_____________________________________________________________________")
    print(c)
    print(f"Uniques {unique}, Minimum: {min}, Maximum: {max}, Missing: {missing}") 

In [None]:
for feature in features:
    dracula = len(df[feature].unique())
    if dracula <= 2:
        print(f"{feature}: {dracula}")
    else:
        None

In [None]:
print(f"Shape {len(df.columns)} x {len(df)}")

In [None]:
gc.collect()

## 1.05 | Base Data Inspection: Key-Takeaways

The labels, whilst being numerical, are just labels, so the line plots might give an understanding of frequency per class pair offers limited insight into the data being presented.

DataFrame Dimensions: 293830 x 536 (R x C)

Constant Columns: None / All columns had at least 2 Unique values

* Class Histograms:
    - Families:
        - 58 Unique
        - 7 familily groups (groups being ~ 3 familiys) saw (less) than 5_000 observations. 
        - 5 familily groups (groups being ~ 3 familiys) saw (more) than 25_000 observations. 
        - Missing families [1, 28]
    - Genus:
        - 191 Unique
        - 8 Genus groups (groups being ~ 9 Geni) saw (less) than 10_000 observations. 
        - 4 Genus groups (groups being ~ 9 Geni) saw (more) than 20_000 observations. 
        - Missing Genus [30, 62, 70, 110, 141]
    - Species:
        - 925 Unique
        - 9 Species groups (groups being ~ 44 Species) saw (less) than 10_000 observations. 
        - 6 Species groups (groups being ~ 44 Species) saw (more) than 20_000 observations.
        - Missing species [114, 229] 

Classes & features are both very skewed and  hierarchal (family -> genus -> species) and multi-class (classes are non-binary) in nature

Using all files and features

## Additional Data Inspection

#### Helper | Distribution vs Expected

In [None]:
def distribution_vs_expected(df: pl.DataFrame, classes: str):
    exp_even = len(df) / df.select(pl.col(classes).n_unique()).item()
    LIMITS = {"family": 60, "genus": 128, "species": 927}
    full   = pl.DataFrame({classes: range(1, LIMITS[classes] + 1)})
    counts = df.group_by(classes).agg(pl.len().alias("n"))
    joined = full.join(counts, on=classes, how="left")
    missing = joined.filter(pl.col("n").is_null()).get_column(classes).to_list()
    dev = (
        joined.with_columns((pl.col("n").fill_null(0) - exp_even).alias("dev"))
            .sort(classes)
    )
    plt.figure(figsize=(11,4))
    plt.bar(dev.get_column(classes).to_list(), dev["dev"].to_list(), width=0.9, label="Actual − Expected")
    plt.axhline(0, linewidth=1)
    handle = Line2D([0],[0], color='none')
    label  = f"Missing {classes}: " + (", ".join(map(str, missing)) if missing else "None")
    plt.legend([handle], [label], loc='center left', bbox_to_anchor=(1.0, 0.5), frameon=False)
    N = LIMITS[classes]
    step = 5 if N <= 60 else (10 if N <= 150 else 50)
    plt.xticks(range(step, N + 1, step))

    plt.xlabel(classes.capitalize()); plt.ylabel("Actual − Expected")
    plt.title(f"Deviation per {classes.capitalize()} (expected ≈ {exp_even:.2f}/{classes})")
    plt.tight_layout()
    plt.show()

###  Distribution vs Expected (mean):

In [None]:
from pathlib import Path
import polars as pl
DATA = Path("Data/master.parquet")

In [None]:
classes = ["family", "genus", "species"]
for c in classes:
    data = pl.read_parquet(DATA)
    distribution_vs_expected(data, c)

The most performative indicators are the classes themselves (no surprise).
As the data is hierarchal in structure:

Family -> Genus -> Species

For many datasets, samples are not drawn uniformly from the feature space

* 0 D: Clusters
* 1 D: Line segments - curves
* 2 D: Planes / surfaces

A ***manifold*** is a term used to descrie a group of samples that locally vary in some dimensions, but not in others

## Data Handling / Preprocessing

In [None]:
# Went a different direction

## Entropy's:

In [None]:
# Different Direction

In [None]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from pathlib import Path

In [None]:
spec = data.iloc[:, 2].to_numpy()
u, inv = np.unique(spec, return_inverse=True)
cnt = np.bincount(inv)

In [None]:
x1 = np.log1p(cnt)
x2 = np.argsort(np.argsort(cnt)) / (len(cnt) - 1 + 1e-9)
X_all = np.c_[x1, x2]

In [None]:
k = 2
TESTSIZE = 0.4

In [None]:
km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(x1.reshape(-1,1))
order = np.argsort(km.cluster_centers_.ravel())
y_all = np.empty_like(km.labels_)
for r, c in enumerate(order): y_all[km.labels_ == c] = r
X, ins, y, outs = train_test_split(
    X_all, y_all, test_size=TESTSIZE, stratify=y_all, random_state=0
)

In [None]:
scaler = StandardScaler()
X = scaler.fit_transform(X)
ins = scaler.transform(ins)
y, outs = y, outs

In [None]:
KNN = KNeighborsClassifier(n_neighbors=3)
KNN.fit(X, y)
xpred   = KNN.predict(X)
outpred = KNN.predict(ins)
print("Train:", accuracy_score(y, xpred))
print("Test :", accuracy_score(outs, outpred))

In [None]:
conf = confusion_matrix(outs, outpred)
conf

In [None]:
check = conf[0,0] + conf[1,1] + conf[2,2] + conf[3,3]
print(check * 2.5)

In [None]:
FIGSIZE = (10,6)

In [None]:
x_min, x_max = X[:,0].min()-0.5, X[:,0].max()+0.5
y_min, y_max = X[:,1].min()-0.5, X[:,1].max()+0.5
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                     np.arange(y_min, y_max, 0.02))
Z = KNN.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

plt.figure(figsize=FIGSIZE)
plt.contourf(xx, yy, Z, alpha=0.25)
plt.scatter(X[:,0], X[:,1], c=y, s=10, edgecolors='k')
plt.scatter(ins[:,0], ins[:,1], c=outs, s=10, edgecolors='r')
plt.title("KNN rarity (0=rarest)")
plt.xlabel("log1p(count)") 
plt.ylabel("species frequency")

# 1.05 | Base Data Inspection: Key-Takeaways (repeated)

The labels, whilst being numerical, are just labels, so the line plots might give an understanding of frequency per class pair offers limited insight into the data being presented.

DataFrame Dimensions: 293830 x 536 (R x C)

Constant Columns: None / All columns had at least 2 Unique values

* Class Histograms:
    - Families:
        - 58 Unique
        - 7 familily groups (groups being ~ 3 familiys) saw (less) than 5_000 observations. 
        - 5 familily groups (groups being ~ 3 familiys) saw (more) than 25_000 observations. 
        - Missing families [1, 28]
    - Genus:
        - 191 Unique
        - 8 Genus groups (groups being ~ 9 Geni) saw (less) than 10_000 observations. 
        - 4 Genus groups (groups being ~ 9 Geni) saw (more) than 20_000 observations. 
        - Missing Genus [30, 62, 70, 110, 141]
    - Species:
        - 925 Unique
        - 9 Species groups (groups being ~ 44 Species) saw (less) than 10_000 observations. 
        - 6 Species groups (groups being ~ 44 Species) saw (more) than 20_000 observations.
        - Missing species [114, 229] 

Classes & features are both very skewed and  hierarchal (family -> genus -> species) and multi-class (classes are non-binary) in nature

# 2. | Modelling

1. **Boosting models**: AdaBoost and XGBoost.
2. **Bagging models**: two Random Forests (one with default settings and one with optimised hyperparameters).
3. **Support Vector Machines**: linear and RBF kernels.z

In [1]:
gc.collect()

NameError: name 'gc' is not defined

# Method: 

1. First and foremost, is this wood potentially rare? Or the easier question, is this wood, common?

    - Use SVM classifiers to partition the data, not optimised, reiterated settings.
    - Exploits hierarchal structure of the data by instead of directly targeting species (1:~900), targets family (1:60)
    - Doesnt re-class or add systemic bias.
    - Risk the SVM mis-classification on rare wood type as being common, or misclassifying unseen wood types altogether
    - SVM classifier to initially distinguish families with rare species (species < 1750 observations)
	- Using both linear and polynomial, exploring weighted? 

2. Filtering data based on SVM classification, we know the family might be rare, but does what genus is each family have a rare member?
	- Random Forest Bagging, & subagging & hyperparameters.

3. We have now established that both the family and genus (predicted) may contain a rare species. to not misspecify the species we look to using;
	- XGboosted decision tree 
	or
	- adaboost decision tree stump
	or 
	- a weighted average of both and/or all methods to make the final judgement, hoping to capture both rarity, misclassification metrics per sample and class-classification
	

### Step 1 | Classifying Rarity species rarity with Support Vector Machine

Definetely not up to writing my own custom algorithms at this point, but I definetely understand the limitations of being unfamiliar with the math.

Probably could've chosen a better library for SVM's but we're here so just going to run with it. I looked at SVMlib for parallelisation  but found the artifact preservation to be a little puzzling.

In [26]:
# Libs
import gc
import matplotlib.pyplot as plt
import numpy as np, pandas as pd
from pathlib import Path
from collections import Counter

from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsRegressor
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import (
    RandomForestClassifier, BaggingClassifier,
    AdaBoostClassifier, HistGradientBoostingClassifier
)

from sklearn.metrics import (
    confusion_matrix, balanced_accuracy_score, accuracy_score, f1_score, roc_curve, auc, log_loss
)

# from sklearn.calibration import CalibratedClassifierCV
from joblib import dump

In [27]:
RS = 1234

In [28]:
# Helpers

# LIBRARY

In [29]:
def load_xy(drop_cols = ("family", "genus", "species")):
    data = pd.read_parquet(DATA)
    X = data.drop(columns = list(drop_cols), errors = "ignore").to_numpy(dtype=np.float32)
    return data, X

In [30]:
def initial_rare_labels(species, method="gmm", random_state=RS, save_model=True, return_counts=False):
    cnt = Counter(species)
    counts = np.fromiter((cnt[s] for s in species), dtype=np.int32, count=len(species))
    x = np.log1p(counts).reshape(-1, 1)

    if method == "gmm":
        model = GaussianMixture(n_components=2, random_state=random_state).fit(x)
        rare_comp = np.argmin(model.means_.ravel())
        hard = (model.predict(x) == rare_comp).astype(np.int32)
        soft = model.predict_proba(x)[:, rare_comp]
        if save_model: dump(model, MODELS / "01_species_gmm.joblib")

    elif method == "kmeans":
        model = KMeans(n_clusters=2, n_init=10, random_state=random_state).fit(x)
        rare_comp = np.argmin(model.cluster_centers_.ravel())
        hard = (model.labels_ == rare_comp).astype(np.int32)
        d = np.linalg.norm(x - model.cluster_centers_[rare_comp], axis=1)
        soft = (d.max() - d) / (d.max() - d.min() + 1e-9)
        if save_model: dump(model, MODELS / "01_species_kmeans.joblib")

    elif method == "knn":
        k = 10
        model = KNeighborsRegressor(n_neighbors=k)
        model.fit(x, counts)
        local_density = model.predict(x)
        soft = 1 - (local_density - local_density.min()) / (local_density.max() - local_density.min() + 1e-9)
        hard = (soft > np.median(soft)).astype(np.int32)
        if save_model: dump(model, MODELS / "01_species_knn.joblib")

    else:
        raise ValueError("method must be 'gmm', 'kmeans', or 'knn'")

    return (hard, soft, counts) if return_counts else (hard, soft)

# helpers

In [31]:
def _latest(prefix):
    paths = sorted(MODELS.glob(prefix + "*.joblib"))
    if not paths: raise FileNotFoundError(f"No saved model with prefix {prefix}")
    return paths[-1]

In [32]:
def binary_metrics(outs, proba, threshold, csv_path=None,
                   classifier_name=None, method=None, train_size=None):
    pred = proba[:, 0] >= threshold
    tn, fp, fn, tp = confusion_matrix(outs, pred).ravel()

    eps = 1e-12
    n = tn + fp + fn + tp

    acc  = (tp + tn) / n
    prec = tp / (tp + fp + eps)
    rec  = tp / (tp + fn + eps)
    spec = tn / (tn + fp + eps)
    f1   = 2 * prec * rec / (prec + rec + eps)
    fpr  = fp / (fp + tn + eps)
    fnr  = fn / (fn + tp + eps)
    bal  = 0.5 * (rec + spec)
    mcc  = (tp * tn - fp * fn) / np.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn) + eps)
    ll   = log_loss(outs, proba)

    metrics = {
        "Classifier": classifier_name,
        "Method": method,
        "TrainSize": train_size,
        "Threshold": threshold,
        "TP": tp, "TN": tn, "FP": fp, "FN": fn,
        "Accuracy": acc, "Precision": prec, "Recall": rec,
        "Specificity": spec, "F1": f1,
        "FPR": fpr, "FNR": fnr,
        "Balanced_Acc": bal, "MCC": mcc,
        "LogLoss": ll
    }

    if csv_path:
        df = pd.DataFrame([metrics])
        df.to_csv(csv_path, mode="a", index=False,
                  header=not pd.io.common.file_exists(csv_path))
        print(f"Appended → {csv_path}")

    return metrics

In [33]:
def threshold(y_true, proba, thr):
    p = proba[:,1] if getattr(proba, "ndim", 1) == 2 else proba
    pred = (p >= thr).astype(int)
    cm = confusion_matrix(np.asarray(y_true).astype(int), pred, labels=[0,1])
    tn, fp, fn, tp = cm.ravel()
    return tp, tn, fp, fn

In [34]:
# ChatGPT
def timed_job(fn, classifier, method, train_size):
    cname = classifier.steps[-1][1].__class__.__name__ if hasattr(classifier, "steps") else classifier.__class__.__name__
    print(f"▶ Running: {method} – {cname} – {train_size}")
    t0 = time.perf_counter()
    result = fn(classifier, method, train_size)
    t1 = time.perf_counter()
    print(f"✅ Finished: {method} – {cname} – {train_size} in {t1 - t0:.2f}s")
    return result, t1 - t0

All credit for **binary plots** belongs to https://www.youtube.com/@machinelearningpractice2089

In [35]:
def binary_plots(outs, proba, img1path = None, img2path=None, classifier_name=None, method=None, train_size=None):
        # --- use positive class probabilities ---
    if proba.ndim > 1:
        proba_pos = proba[:, 1]
    else:
        proba_pos = proba

    fpr, tpr, thresholds = roc_curve(outs, proba_pos)
    auc_score = roc_auc_score(outs, proba_pos)

    # --- TPR/FPR vs threshold ---
    fig, ax = plt.subplots(figsize=FIGSIZE)
    ax.plot(thresholds, tpr, color="b", label="TPR")
    ax.plot(thresholds, fpr, color="r", label="FPR")
    ax.plot(thresholds, tpr - fpr, color="g", label="TPR−FPR")
    ax.invert_xaxis()
    ax.set_xlabel("Threshold", fontsize=FONTSIZE)
    ax.set_ylabel("Fraction", fontsize=FONTSIZE)
    ax.legend()
    ax.grid(alpha=0.3)
    path1 = img1path
    fig.savefig(path1, dpi=300, bbox_inches="tight")
    plt.close(fig)
    print("roc_plots")
    # --- ROC curve ---
    fig, ax = plt.subplots(figsize=FIGSIZE)
    ax.plot(fpr, tpr, color="b", lw=2, label=f"ROC (AUC={auc_score:.3f})")
    ax.plot([0, 1], [0, 1], "r--", lw=1)
    ax.set_xlabel("FPR", fontsize=FONTSIZE)
    ax.set_ylabel("TPR", fontsize=FONTSIZE)
    ax.set_aspect("equal", "box")
    ax.legend()
    ax.grid(alpha=0.3)
    path2 = img2path
    fig.savefig(path2, dpi=300, bbox_inches="tight")
    plt.close(fig)

In [36]:
def save_model(classifier, X, model_path):
    artifact = {
        "model": classifier,
        "features": list(X.columns) if hasattr(X, "columns") else None
    }
    dump(artifact, model_path, compress=3)
    print(f"Model saved → {model_path}")

In [37]:
def load_model(model_path):
    obj = load(model_path)
    return obj["model"], obj.get("features"), obj.get("classes")

In [38]:
def safe_to_csv(df, path):
    for _ in range(5):
        try:
            df.to_csv(path, mode="a", index=False, header=not os.path.exists(path))
            return
        except PermissionError:
            time.sleep(0.5)
    raise

## Stage 1 | Config

In [43]:
# Config
DATA = Path("Data/master.parquet")
RS = RS

FIGSIZE = (10,6)
FONTSIZE = 10

MODELS = Path("clusters")
plot_dir = Path("Plots")
CSV = Path("output")
SVM = Path("SVM")

for dir in [plot_dir, CSV, SVM, MODELS]:
    dir.mkdir(parents=True, exist_ok=True)
    print(f"Made {dir}")

Made Plots
Made output
Made SVM
Made clusters


In [44]:
#====================================================================================
# - 1) Binary classification - is this a potentially rare species?
#====================================================================================

## SVM: Definitions

In [45]:
MAXITER = 2_500

# --- Linear Function
SVMLIN = Pipeline([
    ("mm", MinMaxScaler()),
    ("lda", LinearDiscriminantAnalysis(n_components=1)), 
    ("svc", SVC(
        kernel="linear",
        probability=True,
        tol=1e-2,
        max_iter= MAXITER,
        decision_function_shape="ovr"
    ))
])
SVMLIN.name = "SVMLINEAR"

# --- Radial Basis Function
SVMRBF = Pipeline([
    ("mm", MinMaxScaler()),
    ("lda", LinearDiscriminantAnalysis(n_components=1)), 
    ("svc", SVC(
        kernel="rbf",
        probability=True,         
        tol=1e-2,
        max_iter= MAXITER,
        decision_function_shape="ovr"
        )
    )
])
SVMRBF.name = "SVMRBF"

# --- Poly degree 2
SVMPoly2 = Pipeline([
    ("mm", MinMaxScaler()),
    ("lda", LinearDiscriminantAnalysis(n_components=1)),  
    ("svc", SVC(
        kernel="poly",
        degree = 2,
        probability=True,
        tol=1e-2,
        max_iter= MAXITER,
        decision_function_shape="ovr"
    ))
])
SVMPoly2.name = "SVMPoly2"

# --- Poly degree 3
SVMPoly3 = Pipeline([
    ("mm", MinMaxScaler()),
    ("lda", LinearDiscriminantAnalysis(n_components=1)),  
    ("svc", SVC(
        kernel="poly",
        degree = 3,
        probability=True,
        tol=1e-2,
        max_iter= MAXITER,
        decision_function_shape="ovr"
    ))
])
SVMPoly3.name = "SVMPoly3"

# --- Poly degree 4
SVMPoly4 = Pipeline([
    ("mm", MinMaxScaler()),
    ("lda", LinearDiscriminantAnalysis(n_components=1)),  
    ("svc", SVC(
        kernel="poly",
        degree = 4,
        probability=True,
        tol=1e-2,
        max_iter= MAXITER,
        decision_function_shape="ovr"
    ))
])
SVMPoly4.name = "SVMPoly4"

# --- Poly degree 5
SVMPoly5 = Pipeline([
    ("mm", MinMaxScaler()),
    ("lda", LinearDiscriminantAnalysis(n_components=1)),  
    ("svc", SVC(
        kernel="poly",
        degree = 5,
        probability=True,
        tol=1e-2,
        max_iter= MAXITER,
        decision_function_shape="ovr"
    ))
])
SVMPoly5.name = "SVMPoly5"

In [46]:
def SVM_TRAIN_LOG(CLASSIFIER, METHOD, TRAINSIZE):
    j = getattr(CLASSIFIER, "name", None)
   
    print(f"Classifier name: {j}")
    
    data, X = load_xy()
    species = data["species"].to_numpy()
    y, p_rare = initial_rare_labels(species, method=METHOD)
    X, ins, y, outs = train_test_split(
        X, y, test_size=(1 - (0.01 * TRAINSIZE)),
        stratify=y, random_state=RS
    )
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RS)
    _ = cross_val_predict(
        CLASSIFIER,
        X, y,
        cv=cv, 
        method="predict",
        n_jobs = 1,
    )

    CLASSIFIER.fit(X, y)
    proba = CLASSIFIER.predict_proba(ins)
    ROC = Path("Plots/ROC")
    ROC1 = ROC / f"{method}_{classifier_name}_{train_size}_roc.png"
    ROC2 = ROC / f"{method}_{classifier_name}_{train_size}_tprfpr.png" 
    
    
    # save model
    mpath = SVM / f"{METHOD}_{j}_{TRAINSIZE}.joblib"
    save_model(CLASSIFIER, X, mpath)

    binary_plots(outs, proba, img1path = ROC2, img2path = ROC1,classifier_name=j, method=METHOD, train_size=TRAINSIZE)
    # metrics + plots
    binary_metrics(
        outs, proba, THRESHOLD,
        csv_path= CSV / "all_metrics.csv",
        classifier_name=j,
        method=METHOD,
        train_size=TRAINSIZE
    )
    thresholds = np.linspace(0.35, 0.65, 10001)
    TP, TN, FP, FN = [], [], [], []
    for thr in thresholds:
        tp, tn, fp, fn = threshold(outs, proba, thr)
        TP.append(tp); TN.append(tn); FP.append(fp); FN.append(fn)

    plt.figure(figsize=FIGSIZE)
    plt.plot(thresholds, TP, "g", label="TP")
    plt.plot(thresholds, TN, "r", label="TN")
    plt.plot(thresholds, FP, "m", label="FP")
    plt.plot(thresholds, FN, "c", label="FN")
    plt.xlim([0, 1])
    plt.xlabel("Threshold"); plt.ylabel("Count"); plt.legend()
    plot_path = plot_dir / f"{METHOD}_{j}_{TRAINSIZE}_thresholds.png"
    plt.savefig(plot_path, dpi=300, bbox_inches="tight")
    plt.close()

    # All ChatGPT this bit
    if TRAINSIZE == 75:
        try:
            fig, ax = plt.subplots(figsize=(7, 5))
            ax.contourf(xx, yy, Z, alpha=0.3, cmap="Pastel1", levels=25)
            ax.contour(xx, yy, Z, colors="k", linewidths=1, levels=[0.5])

            # sample to avoid overcrowding
            N = 2000  # adjustable
            idx = np.random.choice(len(ins2D), size=min(N, len(ins2D)), replace=False)

            common = (y_test_common_rare == 0)
            rare = (y_test_common_rare == 1)

            ax.scatter(ins2D[np.intersect1d(np.where(common)[0], idx), 0],
                    ins2D[np.intersect1d(np.where(common)[0], idx), 1],
                    s=10, c="green", alpha=0.5, label="Common")

            ax.scatter(ins2D[np.intersect1d(np.where(rare)[0], idx), 0],
                    ins2D[np.intersect1d(np.where(rare)[0], idx), 1],
                    s=10, c="blue", alpha=0.5, label="Rare")

            mis_idx = np.intersect1d(np.where(misclassified)[0], idx)
            ax.scatter(ins2D[mis_idx, 0],
                    ins2D[mis_idx, 1],
                    s=40, marker="x", c="red", lw=1.2, label="Misclassified")

            ax.legend(frameon=False)
            ax.set_title(f"Decision Boundary – {METHOD} – {j}")
            ax.set_xlabel("PC1")
            ax.set_ylabel("PC2")
            ax.grid(alpha=0.3)
        
        except Exception as e:
            print(f"⚠️ Boundary plot skipped ({METHOD} – {j}): {e}")

        
    print(f"✅ Done: {METHOD} – {j} – {TRAINSIZE}")
    return (METHOD, j, TRAINSIZE)

# Stage 1 | SVM Models

In [None]:
import time, os
from concurrent.futures import ThreadPoolExecutor, as_completed
from itertools import product
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import time

FIGSIZE = (10,6)
FONTSIZE = 10

THRESHOLD = 0.501
# TRAINS = [25, 50, 75]
TRAINS = [75]
methods = ["gmm", "kmeans", "knn"]
classifiers = [SVMLIN, SVMRBF, SVMPoly2, SVMPoly3, SVMPoly4, SVMPoly5]


tasks = list(product(classifiers, methods, TRAINS))
start = time.perf_counter()
with ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(SVM_TRAIN_LOG, *args) for args in tasks]
    for f in as_completed(futures):
        print(f.result())

end = time.perf_counter()
print(f"\n⏱ total time: {end - start:.2f} sec")

Classifier name: SVMLINEAR
Classifier name: SVMLINEAR
Classifier name: SVMLINEAR
Classifier name: SVMRBF
Classifier name: SVMRBF




In [None]:
# Earlier workings on SVM
# for CLASSIFIER in classifiers:
#     for METHOD in methods:
#         for TRAINSIZE in TRAINS:
#             j = CLASSIFIER.__class__.__name__
            
#             data, X = load_xy()
#             species = data["species"].to_numpy()    
#             y, p_rare = initial_rare_labels(species, method = METHOD) 
#             X, ins, y, outs = train_test_split(X, y, test_size=(1-(0.01*TRAINSIZE)), stratify=y, random_state=RS)
#             CLASSIFIER.fit(X, y)
#             proba = CLASSIFIER.predict_proba(ins)

#             mpath = Path(f"Models/SVM/{METHOD}_{j}_{TRAINSIZE}.joblib")
#             mpath.parent.mkdir(parents=True, exist_ok=True)  
#             save_model(CLASSIFIER, X, mpath)

#             binary_metrics(
#                 outs, proba, THRESHOLD,
#                 csv_path=CSV,
#                 classifier_name=CLASSIFIER.__class__.__name__,
#                 method=METHOD,
#                 train_size=TRAINSIZE
#             )
#             binary_metrics(outs, proba, THRESHOLD)

#             thresholds = np.linspace(0.35, 0.65, 10001)
#             TP, TN, FP, FN = [], [], [], []

#             for thr in thresholds:
#                 tp, tn, fp, fn = threshold(outs, proba, thr)
#                 TP.append(tp)
#                 TN.append(tn)
#                 FP.append(fp)
#                 FN.append(fn)

#             plt.figure(figsize=FIGSIZE)
#             plt.plot(thresholds, TP, "g", label="TP")
#             plt.plot(thresholds, TN, "r", label="TN")
#             plt.plot(thresholds, FP, "m", label="FP")
#             plt.plot(thresholds, FN, "c", label="FN")
#             plt.xlim([0.4, 0.6])
#             plt.xlabel("Threshold")
#             plt.ylabel("Count")
#             plt.legend()
#             plot_path = plot_dir / f"{METHOD}_{j}_{TRAINSIZE:.2f}_thresholds.png"
#             plt.savefig(plot_path, dpi=300, bbox_inches="tight")
#             plt.close()
#             binary_plots(
#                 outs, proba,
#                 classifier_name=j,
#                 method=METHOD,
#                 train_size=TRAINSIZE
#             )

In [None]:
import pandas as pd
gate_metrics = pd.read_csv("all_metrics.csv").sort_values("Classifier")
gate_metrics

In [None]:
from IPython.display import Image

## Using Kmeans and RBF kernel (3 fold cross validation):

![title](Plots/Thresholds/kmeans_SVMRBF_75_thresholds.png)

Do I even need to say much else. Can record list at 0.40 and 0.60 to determine most likely candidates.

## Using KNN and RBF kernel (3 fold cross validation):

![title](Plots/Thresholds/knn_SVMRBF_75_thresholds.png)

Best threshold around ~0.515

Benefit from the increase in TN detection, and lacked following by FN.

Whilst TP starts to plateau and TN's are at high's.

Both used RBF as expected due to the high data volume and RBF margins seperating before refined calibration.

# Stage 2 | Random Forest Ensemble for Family/Genus Classification

Now that rarity has been measured, we can start looking at class-probabilities

In [None]:
gc.collect()

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
from collections import Counter
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_predict
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    roc_auc_score, roc_curve, precision_recall_curve
)
RS = 1234
plt.style.use("seaborn-v0_8-whitegrid")

In [None]:
FIGSIZE = (10, 6)
FONTSIZE = 10
plot_RF = Path("Plots/RF"); plot_dir.mkdir(parents=True, exist_ok=True)
model_RF = Path("Models/RF"); model_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Helpers

In [None]:
def load_xy(drop_cols = ("family", "genus", "species")):
    data = pd.read_parquet(DATA)
    X = data.drop(columns = list(drop_cols), errors = "ignore").to_numpy(dtype=np.float32)
    return data, X

In [None]:
def initial_rare_labels(species, method="kmeans", random_state=RS, save_model=True, return_counts=False):
    cnt = Counter(species)
    counts = np.fromiter((cnt[s] for s in species), dtype=np.int32, count=len(species))
    x = np.log1p(counts).reshape(-1, 1)

    if method == "gmm":
        model = GaussianMixture(n_components=2, random_state=random_state).fit(x)
        rare_comp = np.argmin(model.means_.ravel())
        hard = (model.predict(x) == rare_comp).astype(np.int32)
        soft = model.predict_proba(x)[:, rare_comp]
        if save_model: dump(model, MODELS / "01_species_gmm.joblib")

    elif method == "kmeans":
        model = KMeans(n_clusters=2, n_init=10, random_state=random_state).fit(x)
        rare_comp = np.argmin(model.cluster_centers_.ravel())
        hard = (model.labels_ == rare_comp).astype(np.int32)
        d = np.linalg.norm(x - model.cluster_centers_[rare_comp], axis=1)
        soft = (d.max() - d) / (d.max() - d.min() + 1e-9)
        if save_model: dump(model, MODELS / "01_species_kmeans.joblib")

    elif method == "knn":
        k = 10
        model = KNeighborsRegressor(n_neighbors=k)
        model.fit(x, counts)
        local_density = model.predict(x)
        soft = 1 - (local_density - local_density.min()) / (local_density.max() - local_density.min() + 1e-9)
        hard = (soft > np.median(soft)).astype(np.int32)
        if save_model: dump(model, MODELS / "01_species_knn.joblib")

    else:
        raise ValueError("method must be 'gmm', 'kmeans', or 'knn'")

    return (hard, soft, counts) if return_counts else (hard, soft)

In [None]:
def entropy_row(p):
    p = np.clip(p, 1e-12, 1.0)
    return -(p * np.log(p)).sum(axis=1)

In [None]:
def top_margin(p):
    s = np.sort(p, axis=1)[:, ::-1]
    return s[:, 0] - s[:, 1]

In [None]:
def hard_vote(preds):
    preds = np.stack(preds, axis=1)
    return np.array([np.bincount(row).argmax() for row in preds])

In [None]:
def soft_vote(prob_list):
    P = np.mean(prob_list, axis=0)
    return P, P.argmax(axis=1)

In [None]:
def oracle_bound(preds, y_true):
    preds = np.stack(preds, axis=1)
    ok = (preds == y_true[:, None]).any(axis=1)
    return ok.mean()

In [None]:
def plot_confusion(y_true, y_pred, title, labels=None):
    cm = confusion_matrix(y_true, y_pred, normalize="true")
    plt.figure(figsize=(7,6))
    sns.heatmap(cm, cmap="YlGnBu", annot=False, cbar=True)
    plt.title(f"{title}\nNormalized Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

In [None]:
def plot_roc(y_true, y_score, n_classes, title):
    y_true_bin = pd.get_dummies(y_true)
    plt.figure(figsize=(7,6))
    for i in range(n_classes):
        fpr, tpr, _ = roc_curve(y_true_bin.iloc[:, i], y_score[:, i])
        auc = roc_auc_score(y_true_bin.iloc[:, i], y_score[:, i])
        plt.plot(fpr, tpr, lw=1, label=f"Class {i} (AUC={auc:.2f})")
    plt.plot([0,1],[0,1],"--",c="grey")
    plt.title(f"ROC Curves – {title}")
    plt.xlabel("False Positive Rate"); plt.ylabel("True Positive Rate")
    plt.legend()
    plt.show()

In [None]:
def plot_pr(y_true, y_score, n_classes, title):
    y_true_bin = pd.get_dummies(y_true)
    plt.figure(figsize=(7,6))
    for i in range(n_classes):
        pr, rc, _ = precision_recall_curve(y_true_bin.iloc[:, i], y_score[:, i])
        plt.plot(rc, pr, lw=1, label=f"Class {i}")
    plt.title(f"Precision–Recall – {title}")
    plt.xlabel("Recall"); plt.ylabel("Precision")
    plt.legend()
    plt.show()

In [None]:

data, X = load_xy()
species = data["species"].to_numpy()
y_family = data["family"].to_numpy()
y_genus  = data["genus"].to_numpy()
_, p_rare = initial_rare_labels(species, method="kmeans")

# Split consistent with Stage 1
X, ins, yF, outsF = train_test_split(X, y_family, test_size=0.2, stratify=y_family, random_state=RS)
_, insG, yG, outsG = train_test_split(X, y_genus,  test_size=0.2, stratify=y_genus,  random_state=RS)
cv = StratifiedKFold(n_splits=K, shuffle=True, random_state=RS)


In [None]:
base_family = [
    ("RF",  RandomForestClassifier(n_estimators=300, bootstrap=True,  n_jobs=-1, random_state=RS)),
    ("Sub", RandomForestClassifier(n_estimators=300, bootstrap=False, n_jobs=-1, random_state=RS)),
    ("Bag", BaggingClassifier(estimator=DecisionTreeClassifier(random_state=RS),
                              n_estimators=200, bootstrap=True, n_jobs=-1, random_state=RS)),
]

In [None]:
base_genus = [
    ("RF",  RandomForestClassifier(n_estimators=400, bootstrap=True,  n_jobs=-1, random_state=RS)),
    ("Sub", RandomForestClassifier(n_estimators=400, bootstrap=False, n_jobs=-1, random_state=RS)),
    ("Bag", BaggingClassifier(estimator=DecisionTreeClassifier(random_state=RS),
                              n_estimators=300, bootstrap=True, n_jobs=-1, random_state=RS)),
]

In [None]:
def train_meta_ensemble(X, y, ins, outs, base_models, label="family"):
    n_classes = len(np.unique(y))
    oof_list, preds_list, prob_list = [], [], []

    print(f"\nTraining base models for {label.upper()} ...")
    for name, est in base_models:
        oof = cross_val_predict(est, X, y, cv=cv, method="predict_proba", n_jobs=-1)
        oof_list.append(oof)
        est.fit(X, y)
        p = est.predict(ins)
        P = est.predict_proba(ins)
        preds_list.append(p); prob_list.append(P)
        print(f"  {name:<4} acc = {accuracy_score(outs, p):.4f}")

    # --- stack meta features ---
    stack_train = np.hstack(oof_list)
    stack_test  = np.hstack(prob_list)
    stack_train = np.hstack([stack_train,
                             entropy_row(np.mean(oof_list, axis=0)).reshape(-1,1),
                             top_margin(np.mean(oof_list, axis=0)).reshape(-1,1)])
    stack_test  = np.hstack([stack_test,
                             entropy_row(np.mean(prob_list, axis=0)).reshape(-1,1),
                             top_margin(np.mean(prob_list, axis=0)).reshape(-1,1),
                             p_rare[:len(ins)].reshape(-1,1)])  # gate prob as meta feature

    stacker = LogisticRegression(max_iter=200, n_jobs=-1, random_state=RS)
    stacker.fit(stack_train, y)
    y_pred_stack = stacker.predict(ins)
    y_proba_stack = stacker.predict_proba(ins)

    # --- metrics ---
    acc_vote = accuracy_score(outs, hard_vote(preds_list))
    acc_soft = accuracy_score(outs, soft_vote(prob_list)[1])
    acc_stack = accuracy_score(outs, y_pred_stack)
    acc_oracle = oracle_bound(preds_list, outs)

    print(f"\nResults – {label.upper()}")
    print(f"  Hard vote  : {acc_vote:.4f}")
    print(f"  Soft vote  : {acc_soft:.4f}")
    print(f"  Stacker    : {acc_stack:.4f}")
    print(f"  Oracle     : {acc_oracle:.4f}\n")
    print(classification_report(outs, y_pred_stack, digits=3))

    # --- plots ---
    plot_confusion(outs, y_pred_stack, f"{label.upper()} – Stacker")
    plot_roc(outs, y_proba_stack, n_classes, f"{label.upper()} – Stacker")
    plot_pr(outs, y_proba_stack, n_classes, f"{label.upper()} – Stacker")

    return stacker, y_pred_stack, y_proba_stack

In [None]:
t0 = time()
fam_stacker, fam_pred, fam_proba = train_meta_ensemble(X, yF, ins, outsF, base_family, label="family")
gen_stacker, gen_pred, gen_proba = train_meta_ensemble(X, yG, insG, outsG, base_genus, label="genus")

In [None]:
def build_mapping_family_to_genus(family, genus):
    mapping = {}
    for f, g in zip(family, genus):
        mapping.setdefault(int(f), set()).add(int(g))
    return mapping

def enforce_hierarchy(pred_family, proba_genus, mapping):
    fixed = proba_genus.copy()
    for i, f in enumerate(pred_family):
        allowed = mapping.get(int(f), None)
        if allowed is None: continue
        mask = np.ones(fixed.shape[1], dtype=bool)
        mask[list(allowed)] = False
        fixed[i, mask] = 0
        s = fixed[i].sum()
        if s > 0: fixed[i] /= s
    return fixed, fixed.argmax(axis=1)

fam2gen = build_mapping_family_to_genus(y_family, y_genus)
gen_fixed_proba, gen_fixed_pred = enforce_hierarchy(fam_pred, gen_proba, fam2gen)

print(f"\n[Hierarchy] genus corrections {(gen_fixed_pred != gen_pred).mean():.3f} of samples changed.")
print(f"[Hierarchy] corrected genus acc = {accuracy_score(outsG, gen_fixed_pred):.4f}")

plot_confusion(outsG, gen_fixed_pred, "GENUS – Hierarchy-Corrected")

# Stage 3 | 

# Stage 3: Boosting Classifier for Final Species Identification

Purpose: The final stage predicts the exact species. This is the most fine-grained classification, and we use a boosting ensemble to maximize accuracy. Boosting works by sequentially training “weak” learners and focusing on mistakes of the previous ones
scikit-learn.org
. The two common options are AdaBoost (Adaptive Boosting) and Gradient Boosting. Scikit-learn provides AdaBoostClassifier (which by default uses shallow decision trees as base estimators) and the more recent HistGradientBoostingClassifier – a fast implementation of gradient-boosted trees that is great for tabular data
scikit-learn.org
. We can choose either; here we’ll demonstrate with AdaBoost (for multi-class, scikit-learn uses the SAMME.R algorithm).

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from joblib import dump
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

FIGSIZE = (9,6)
RS = 42
plot_dir = Path("Plots/RF")
plot_dir.mkdir(parents=True, exist_ok=True)
model_dir = Path("Models/RF")
model_dir.mkdir(parents=True, exist_ok=True)

In [None]:
from sklearn.ensemble import AdaBoostClassifier  # or HistGradientBoostingClassifier

species_clf = AdaBoostClassifier(n_estimators=100, base_estimator=DecisionTreeClassifier(max_depth=3),
                                 random_state=42)
species_clf.fit(X_train_species, y_train_species)  # y_train_species has the final species labels
dump(species_clf, "models/species_boost_model.joblib")


In [None]:
y_pred_species = species_clf.predict(X_test_species)
cm_species = confusion_matrix(y_test_species, y_pred_species, labels=species_clf.classes_)
ConfusionMatrixDisplay(confusion_matrix=cm_species,
                      display_labels=species_clf.classes_).plot(cmap="Blues", xticks_rotation="vertical")
plt.title("Species Classifier Confusion Matrix")
plt.show()


In [None]:
# def binary_plots(outs, proba, classifier_name=None, method=None, train_size=None):

#     fpr,tpr, thresholds = roc_curve(outs, proba[:,0])
#         # TPR/FPR
#     print(tpr-fpr)
#     fig,ax = plt.subplots(figsize=FIGSIZE)
#     ax.plot(thresholds, tpr, color = "b")
#     ax.plot(thresholds, fpr, color = "r")
#     ax.plot(thresholds, (tpr-fpr)-0.1, color = "g")
#     ax.invert_xaxis()
#     ax.set_xlabel("threshold", fontsize=FONTSIZE)
#     ax.set_ylabel("fraction", fontsize=FONTSIZE)
#     ax.legend(["TPR", "FPR", "distance"])
#     plot_dir = Path("Plots")
#     thresh_path = plot_dir / f"{method}_{classifier_name}_{train_size}_tprfpr.png"
#     fig.savefig(thresh_path, dpi=300, bbox_inches="tight")

#     # ROC plot
#     fig,ax = plt.subplots(figsize=FIGSIZE)
#     ax.plot(fpr,tpr, color="b")
#     ax.plot([0,1], [0,1], "r--")
#     ax.set_xlabel("FPR", fontsize=FONTSIZE)
#     ax.set_ylabel("TPR", fontsize=FONTSIZE)
#     ax.set_aspect("equal", "box")
#     roc_path = plot_dir / f"{method}_{classifier_name}_{train_size}_roc.png"
#     fig.savefig(roc_path, dpi=300, bbox_inches="tight")

# Best Model 

# Best Model:

In [None]:
# Initial Gate
# MODEL = "models/rare_gate_supervised.joblib"
# def predict_rare(X_new, model_path=MODEL):
#     m = joblib.load(model_path)
#     pred = m.predict(X_new.astype(np.float32))       # 1=rare, 0=common
#     return pred
# X_new = ...  # numpy array [n_samples, n_features]
# rare_flags = predict_rare(X_new)  # 1=rare, 0=common


Should be madatory watching






https://www.youtube.com/@machinelearningpractice2089