In [9]:
#import necessary libraries
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import Pipeline
from pathlib import Path
from collections import Counter
import pandas as pd, joblib


In [10]:
CLASSES = ['LEFT','CENTRE','RIGHT']

def _order_cols_3(est_classes):
    est_classes = np.asarray(est_classes)
    # numeric classes like [0,1,2]
    if np.issubdtype(est_classes.dtype, np.integer):
        pos = {int(c): j for j, c in enumerate(est_classes)}  # 0->col,1->col,2->col
        return [pos[0], pos[1], pos[2]]
    # string classes like ['LEFT','CENTRE','RIGHT'] or 'center'
    canon = [str(c).upper().replace("CENTER","CENTRE") for c in est_classes]
    pos = {c: j for j, c in enumerate(canon)}
    return [pos['LEFT'], pos['CENTRE'], pos['RIGHT']]

def get_oof_probas_3_fast(
    estimator, X, y,
    n_splits=3, random_state=42,
    calib_cv=2, max_features_override=50000, verbose=True
):
    """
    Fast, leak-safe OOF probabilities for 3-class base models.
    - y: array-like of ints in {0,1,2} (LEFT,CENTRE,RIGHT)
    - Works for tabular or text Pipelines; calibrates if no predict_proba.
    """
    y_arr = np.asarray(y, dtype=int).ravel()
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    oof = np.zeros((len(y_arr), 3), dtype=float)

    for k, (tr, va) in enumerate(skf.split(np.zeros(len(y_arr)), y_arr), 1):
        est = clone(estimator)

        # Speed-up for text: cap TF-IDF features inside each fold if present
        if isinstance(est, Pipeline) and 'tfidf' in est.named_steps and max_features_override is not None:
            try:
                est.named_steps['tfidf'].set_params(max_features=max_features_override)
            except Exception:
                pass

        # Add calibration if the base lacks predict_proba (e.g., LinearSVC)
        needs_cal = not hasattr(est, "predict_proba") and hasattr(est, "decision_function")
        if needs_cal:
            est = CalibratedClassifierCV(est, method='sigmoid', cv=calib_cv)

        Xi_tr = X.iloc[tr] if hasattr(X, "iloc") else np.asarray(X)[tr]
        Xi_va = X.iloc[va] if hasattr(X, "iloc") else np.asarray(X)[va]

        est.fit(Xi_tr, y_arr[tr])
        p = est.predict_proba(Xi_va)

        order3 = _order_cols_3(getattr(est, "classes_", np.arange(p.shape[1])))
        p3 = p[:, order3]

        s = p3.sum(axis=1, keepdims=True); s[s==0] = 1.0
        oof[va] = p3 / s

        if verbose:
            print(f"Fold {k}/{n_splits} done (calibrated={needs_cal})")

    return oof

def _entropy(a): a = np.clip(a, 1e-12, 1.0); return (-a*np.log(a)).sum(axis=1)
def _margin(a):  t2 = np.sort(a, axis=1)[:, -2:]; return t2[:,1] - t2[:,0]

def probs_to_df(p, prefix, n_rows):
    if p is None:
        df = pd.DataFrame({f'{prefix}_p_{c}': np.nan for c in CLASSES}, index=range(n_rows))
        df[f'{prefix}_entropy'] = np.nan; df[f'{prefix}_margin'] = np.nan
        return df
    df = pd.DataFrame(p, columns=[f'{prefix}_p_{c}' for c in CLASSES])
    df[f'{prefix}_entropy'] = _entropy(p)
    df[f'{prefix}_margin']  = _margin(p)
    return df

def make_block(y, dataset_name, p_nela=None, p_factoid=None, p_anes=None):
    n = len(y)
    X = pd.concat([
        probs_to_df(p_nela,    'nela',    n),
        probs_to_df(p_factoid, 'factoid', n),
        probs_to_df(p_anes,    'anes',    n),
    ], axis=1)
    X['dataset'] = dataset_name
    return X, pd.Series(y)

In [12]:
ROOT = Path.home() / "Desktop" / "Dissertation"
ART  = ROOT / "anes_artifacts"

pipeline_path = ART / "anes_pipeline.joblib"            # 3-class spec you saved
labels_path   = ART / "anes_train_labels_3.csv"         # <-- 3-class
parquet_path  = ART / "anes_train_features.parquet"
csv_path      = ART / "anes_train_features.csv"

# Load features (prefer Parquet, fall back to CSV)
X_anes = pd.read_parquet(parquet_path) if parquet_path.exists() else pd.read_csv(csv_path)

y_anes_3  = pd.read_csv(labels_path)['label'].astype(str)   # <-- 3-class labels
model_anes = joblib.load(pipeline_path)

print("X_anes shape:", X_anes.shape)
print("y_anes_3 counts:\n", y_anes_3.value_counts())

X_anes shape: (8277, 113)
y_anes_3 counts:
 label
CENTRE    3784
LEFT      3363
RIGHT     1130
Name: count, dtype: int64


In [13]:
# Encode 3-class labels to 0..2 in fixed order
ORDER3 = ['LEFT','CENTRE','RIGHT']
y_anes_codes = pd.Categorical(
    pd.Series(y_anes_3).astype(str).str.upper().str.replace("CENTER","CENTRE", regex=False),
    categories=ORDER3, ordered=True
).codes
assert (y_anes_codes != -1).all(), "Unmapped labels present in y_anes_3."

# Align indices
X_anes = X_anes.reset_index(drop=True)
y_anes_codes = pd.Series(y_anes_codes).reset_index(drop=True).astype(int).values

# Sanity checks
assert len(X_anes) == len(y_anes_codes), f"Length mismatch: X={len(X_anes)} vs y={len(y_anes_codes)}"
print("X_anes:", X_anes.shape,
      "| unique y:", sorted(np.unique(y_anes_codes)),
      "| counts:", Counter(y_anes_codes))

# Choose CV splits (≤ smallest class; min 2, cap at 3 for speed)
cls_counts = pd.Series(y_anes_codes).value_counts()
n_splits_anes = int(max(2, min(3, cls_counts.min())))
print("Using n_splits_anes =", n_splits_anes)


X_anes: (8277, 113) | unique y: [np.int64(0), np.int64(1), np.int64(2)] | counts: Counter({np.int64(1): 3784, np.int64(0): 3363, np.int64(2): 1130})
Using n_splits_anes = 3


In [14]:
# OOF (3-class) for ANES
anes_oof = get_oof_probas_3_fast(
    model_anes, X_anes, y_anes_codes,   # y_anes_codes from y_anes_3 → 0,1,2
    n_splits=n_splits_anes, calib_cv=2, max_features_override=None, verbose=True
)
print("anes_oof:", anes_oof.shape, "| row-sum min/max:",
      np.round(anes_oof.sum(axis=1).min(),4), np.round(anes_oof.sum(axis=1).max(),4))

# Build ANES meta-block (use 3-class labels)
Xa, ya = make_block(
    pd.Series(y_anes_3).str.upper().str.replace("CENTER","CENTRE", regex=False),
    'anes',
    p_anes=anes_oof
)
print("Xa:", Xa.shape)
print("ya counts:\n", ya.value_counts())


Fold 1/3 done (calibrated=False)
Fold 2/3 done (calibrated=False)
Fold 3/3 done (calibrated=False)
anes_oof: (8277, 3) | row-sum min/max: 1.0 1.0
Xa: (8277, 16)
ya counts:
 label
CENTRE    3784
LEFT      3363
RIGHT     1130
Name: count, dtype: int64


In [16]:
# ===== N1 (FAST): Load NELA artifacts → fast OOF → meta block =====
from pathlib import Path
import numpy as np, pandas as pd, joblib, time
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import Pipeline

ROOT  = Path.home() / "Desktop" / "Dissertation"
ARTN  = ROOT / "nela_artifacts"

# 1) Load artifacts
nela_texts = pd.read_csv(ARTN / "nela_train_text.csv")["text"].astype(str).fillna("")
y_nela_3   = pd.read_csv(ARTN / "nela_train_labels_3.csv")["label"].astype(str)
model_nela = joblib.load(ARTN / "nela_pipeline.joblib")
print("Loaded NELA:", len(nela_texts), "texts")

# 2) Encode labels to 0..2 (LEFT,CENTRE,RIGHT)
ORDER3 = ['LEFT','CENTRE','RIGHT']
y_nela_codes = pd.Categorical(
    y_nela_3.str.upper().str.replace("CENTER","CENTRE", regex=False),
    categories=ORDER3, ordered=True
).codes

# 3) Pick fast CV folds (≤ smallest class, cap at 3; min 2)
cls_counts = pd.Series(y_nela_codes).value_counts()
n_splits_nela = int(max(2, min(3, cls_counts.min())))
print("NELA splits (fast):", n_splits_nela, "| counts:", cls_counts.sort_index().to_dict())

def _order_cols_3(est_classes):
    est_classes = np.array(est_classes)
    if np.issubdtype(est_classes.dtype, np.integer):
        m = {int(c): j for j, c in enumerate(est_classes)}  # [0,1,2]
        return [m[0], m[1], m[2]]
    canon = [str(c).upper().replace("CENTER","CENTRE") for c in est_classes]
    m = {c: j for j, c in enumerate(canon)}
    return [m['LEFT'], m['CENTRE'], m['RIGHT']]

def get_oof_probas_any_fast(
    estimator, X, y,
    n_splits=3, calib_cv=2, max_features_override=50000, verbose=True
):
    """Fast OOF: fewer folds, lighter calibration, cap TF-IDF max_features during OOF."""
    y_arr = np.asarray(y, dtype=int).ravel()
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    oof = np.zeros((len(y_arr), 3), dtype=float)

    for fold, (tr, va) in enumerate(skf.split(np.zeros(len(y_arr)), y_arr), 1):
        t0 = time.time()
        est = clone(estimator)

        # If Pipeline with 'tfidf', cap features for speed (only inside this fold)
        if isinstance(est, Pipeline) and 'tfidf' in est.named_steps and max_features_override is not None:
            try:
                est.named_steps['tfidf'].set_params(max_features=max_features_override)
            except Exception:
                pass

        # Wrap with calibrator if no predict_proba but has decision_function (e.g., LinearSVC)
        needs_cal = not hasattr(est, "predict_proba") and hasattr(est, "decision_function")
        if needs_cal:
            est = CalibratedClassifierCV(est, method='sigmoid', cv=calib_cv)

        Xi_tr = X.iloc[tr] if hasattr(X, "iloc") else np.asarray(X)[tr]
        Xi_va = X.iloc[va] if hasattr(X, "iloc") else np.asarray(X)[va]

        est.fit(Xi_tr, y_arr[tr])
        p = est.predict_proba(Xi_va)
        est_classes = getattr(est, "classes_", np.arange(p.shape[1]))

        if p.shape[1] == 5:
            # Ensure [EL, L, C, R, ER] then collapse to 3
            if np.issubdtype(np.asarray(est_classes).dtype, np.integer):
                order5 = np.argsort(est_classes)  # 0..4
                p5 = p[:, order5]
            else:
                canon = [str(c).upper().replace(' ', '_') for c in est_classes]
                order5 = [canon.index(k) for k in ['EXTREME_LEFT','LEFT','CENTRE','RIGHT','EXTREME_RIGHT']]
                p5 = p[:, order5]
            p3 = np.stack([p5[:,0]+p5[:,1], p5[:,2], p5[:,3]+p5[:,4]], axis=1)
        else:
            p3 = p[:, _order_cols_3(est_classes)]

        s = p3.sum(axis=1, keepdims=True); s[s==0] = 1.0
        oof[va] = p3 / s

        if verbose:
            print(f"Fold {fold}/{n_splits} done in {time.time()-t0:.1f}s (calibrated={needs_cal})")
    return oof

# 4) FAST OOF call
nela_oof = get_oof_probas_any_fast(
    model_nela, nela_texts, y_nela_codes,
    n_splits=n_splits_nela, calib_cv=2, max_features_override=50000, verbose=True
)
print("nela_oof:", nela_oof.shape, "| row-sum min/max:",
      np.round(nela_oof.sum(axis=1).min(),4), np.round(nela_oof.sum(axis=1).max(),4))

# 5) Build NELA meta-block (define make_block fallback if missing)
if 'make_block' not in globals():
    def _entropy(a): a = np.clip(a, 1e-12, 1.0); return (-a*np.log(a)).sum(axis=1)
    def _margin(a): t2 = np.sort(a, axis=1)[:, -2:]; return t2[:,1]-t2[:,0]
    def probs_to_df(p, prefix, n):
        CLASSES = ['LEFT','CENTRE','RIGHT']
        df = pd.DataFrame(p, columns=[f'{prefix}_p_{c}' for c in CLASSES])
        df[f'{prefix}_entropy'] = _entropy(p); df[f'{prefix}_margin'] = _margin(p)
        return df
    def make_block(y, dataset_name, p_nela=None, p_factoid=None, p_anes=None):
        n = len(y)
        CLASSES = ['LEFT','CENTRE','RIGHT']
        frames = []
        for pfx, p in [('nela', p_nela), ('factoid', p_factoid), ('anes', p_anes)]:
            if p is None:
                df = pd.DataFrame({f'{pfx}_p_{c}': np.nan for c in CLASSES}, index=range(n))
                df[f'{pfx}_entropy'] = np.nan; df[f'{pfx}_margin'] = np.nan
            else:
                df = probs_to_df(p, pfx, n)
            frames.append(df)
        X = pd.concat(frames, axis=1); X['dataset'] = dataset_name
        return X, pd.Series(y)

Xn, yn = make_block(
    y_nela_3.str.upper().str.replace("CENTER","CENTRE", regex=False),
    'nela',
    p_nela=nela_oof
)
print("Xn:", Xn.shape)
print("yn counts:\n", yn.value_counts())


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Loaded NELA: 1772948 texts
NELA splits (fast): 3 | counts: {0: 765787, 1: 338588, 2: 668573}


AttributeError: 'Pipeline' object has no attribute 'transform_input'

In [15]:
# ===== N1 (FAST): Load NELA artifacts → fast OOF → meta block =====
from pathlib import Path
import numpy as np, pandas as pd, joblib, time
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import Pipeline

ROOT  = Path.home() / "Desktop" / "Dissertation"
ARTN  = ROOT / "nela_artifacts"

# 1) Load artifacts
nela_texts = pd.read_csv(ARTN / "nela_train_text.csv")["text"].astype(str).fillna("")
y_nela_3   = pd.read_csv(ARTN / "nela_train_labels_3.csv")["label"].astype(str)
model_nela = joblib.load(ARTN / "nela_pipeline.joblib")
print("Loaded NELA:", len(nela_texts), "texts")

# 2) Encode labels to 0..2 (LEFT,CENTRE,RIGHT)
ORDER3 = ['LEFT','CENTRE','RIGHT']
y_nela_codes = pd.Categorical(
    y_nela_3.str.upper().str.replace("CENTER","CENTRE", regex=False),
    categories=ORDER3, ordered=True
).codes

# 3) Pick fast CV folds (≤ smallest class, cap at 3; min 2)
cls_counts = pd.Series(y_nela_codes).value_counts()
n_splits_nela = int(max(2, min(3, cls_counts.min())))
print("NELA splits (fast):", n_splits_nela, "| counts:", cls_counts.sort_index().to_dict())

def _order_cols_3(est_classes):
    est_classes = np.array(est_classes)
    if np.issubdtype(est_classes.dtype, np.integer):
        m = {int(c): j for j, c in enumerate(est_classes)}  # [0,1,2]
        return [m[0], m[1], m[2]]
    canon = [str(c).upper().replace("CENTER","CENTRE") for c in est_classes]
    m = {c: j for j, c in enumerate(canon)}
    return [m['LEFT'], m['CENTRE'], m['RIGHT']]

def get_oof_probas_any_fast(
    estimator, X, y,
    n_splits=3, calib_cv=2, max_features_override=50000, verbose=True
):
    """Fast OOF: fewer folds, lighter calibration, cap TF-IDF max_features during OOF."""
    y_arr = np.asarray(y, dtype=int).ravel()
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    oof = np.zeros((len(y_arr), 3), dtype=float)

    for fold, (tr, va) in enumerate(skf.split(np.zeros(len(y_arr)), y_arr), 1):
        t0 = time.time()
        est = clone(estimator)

        # If Pipeline with 'tfidf', cap features for speed (only inside this fold)
        if isinstance(est, Pipeline) and 'tfidf' in est.named_steps and max_features_override is not None:
            try:
                est.named_steps['tfidf'].set_params(max_features=max_features_override)
            except Exception:
                pass

        # Wrap with calibrator if no predict_proba but has decision_function (e.g., LinearSVC)
        needs_cal = not hasattr(est, "predict_proba") and hasattr(est, "decision_function")
        if needs_cal:
            est = CalibratedClassifierCV(est, method='sigmoid', cv=calib_cv)

        Xi_tr = X.iloc[tr] if hasattr(X, "iloc") else np.asarray(X)[tr]
        Xi_va = X.iloc[va] if hasattr(X, "iloc") else np.asarray(X)[va]

        est.fit(Xi_tr, y_arr[tr])
        p = est.predict_proba(Xi_va)
        est_classes = getattr(est, "classes_", np.arange(p.shape[1]))

        if p.shape[1] == 5:
            # Ensure [EL, L, C, R, ER] then collapse to 3
            if np.issubdtype(np.asarray(est_classes).dtype, np.integer):
                order5 = np.argsort(est_classes)  # 0..4
                p5 = p[:, order5]
            else:
                canon = [str(c).upper().replace(' ', '_') for c in est_classes]
                order5 = [canon.index(k) for k in ['EXTREME_LEFT','LEFT','CENTRE','RIGHT','EXTREME_RIGHT']]
                p5 = p[:, order5]
            p3 = np.stack([p5[:,0]+p5[:,1], p5[:,2], p5[:,3]+p5[:,4]], axis=1)
        else:
            p3 = p[:, _order_cols_3(est_classes)]

        s = p3.sum(axis=1, keepdims=True); s[s==0] = 1.0
        oof[va] = p3 / s

        if verbose:
            print(f"Fold {fold}/{n_splits} done in {time.time()-t0:.1f}s (calibrated={needs_cal})")
    return oof

# 4) FAST OOF call
nela_oof = get_oof_probas_any_fast(
    model_nela, nela_texts, y_nela_codes,
    n_splits=n_splits_nela, calib_cv=2, max_features_override=50000, verbose=True
)
print("nela_oof:", nela_oof.shape, "| row-sum min/max:",
      np.round(nela_oof.sum(axis=1).min(),4), np.round(nela_oof.sum(axis=1).max(),4))

# 5) Build NELA meta-block (define make_block fallback if missing)
if 'make_block' not in globals():
    def _entropy(a): a = np.clip(a, 1e-12, 1.0); return (-a*np.log(a)).sum(axis=1)
    def _margin(a): t2 = np.sort(a, axis=1)[:, -2:]; return t2[:,1]-t2[:,0]
    def probs_to_df(p, prefix, n):
        CLASSES = ['LEFT','CENTRE','RIGHT']
        df = pd.DataFrame(p, columns=[f'{prefix}_p_{c}' for c in CLASSES])
        df[f'{prefix}_entropy'] = _entropy(p); df[f'{prefix}_margin'] = _margin(p)
        return df
    def make_block(y, dataset_name, p_nela=None, p_factoid=None, p_anes=None):
        n = len(y)
        CLASSES = ['LEFT','CENTRE','RIGHT']
        frames = []
        for pfx, p in [('nela', p_nela), ('factoid', p_factoid), ('anes', p_anes)]:
            if p is None:
                df = pd.DataFrame({f'{pfx}_p_{c}': np.nan for c in CLASSES}, index=range(n))
                df[f'{pfx}_entropy'] = np.nan; df[f'{pfx}_margin'] = np.nan
            else:
                df = probs_to_df(p, pfx, n)
            frames.append(df)
        X = pd.concat(frames, axis=1); X['dataset'] = dataset_name
        return X, pd.Series(y)

Xn, yn = make_block(
    y_nela_3.str.upper().str.replace("CENTER","CENTRE", regex=False),
    'nela',
    p_nela=nela_oof
)
print("Xn:", Xn.shape)
print("yn counts:\n", yn.value_counts())

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Loaded NELA: 1772948 texts
NELA splits (fast): 3 | counts: {0: 765787, 1: 338588, 2: 668573}


AttributeError: 'Pipeline' object has no attribute 'transform_input'

In [None]:
# ==== FACTOID: LOAD ARTIFACTS ====
from pathlib import Path
import pandas as pd, numpy as np, joblib

ROOT = Path.home() / "Desktop" / "Dissertation"
ARTF = ROOT / "factoid_artifacts"

fact_texts   = pd.read_csv(ARTF / "factoid_train_text.csv")["text"]
y_fact_3     = pd.read_csv(ARTF / "factoid_train_labels_3.csv")["label"].astype(str)
model_factoid = joblib.load(ARTF / "factoid_pipeline.joblib")

# Encode labels to 0..2 in fixed order
ORDER3 = ['LEFT','CENTRE','RIGHT']
y_fact_codes = pd.Categorical(y_fact_3.str.upper(), categories=ORDER3, ordered=True).codes

# Choose CV splits safely (can’t exceed smallest class size)
cls_counts = pd.Series(y_fact_codes).value_counts()
n_splits_fact = int(max(2, min(5, cls_counts.min())))
print("FACTOID splits:", n_splits_fact, "| counts:", cls_counts.sort_index().to_dict())

# ==== FACTOID: BUILD OOF (3-class) ====
factoid_oof = get_oof_probas_any_fast(model_factoid, fact_texts, y_fact_codes, n_splits=n_splits_fact)
print("factoid_oof:", factoid_oof.shape, "| row-sum min/max:",
      np.round(factoid_oof.sum(axis=1).min(),4), np.round(factoid_oof.sum(axis=1).max(),4))

# ==== FACTOID: BUILD META-BLOCK ====
Xf, yf = make_block(y_fact_3.str.upper().str.replace("CENTER","CENTRE", regex=False), 'factoid',
                    p_factoid=factoid_oof)

print("Xf:", Xf.shape)
print("yf counts:\n", yf.value_counts())


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


FACTOID splits: 5 | counts: {0: 2989, 1: 617, 2: 295}


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


NameError: name 'get_oof_probas_any_fast' is not defined

In [None]:
# === N2 (ALL-THREE): train meta on Xn+Xf+Xa ===
import pandas as pd, numpy as np, joblib
from pathlib import Path
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Concatenate available blocks
blocks = [blk for blk in [Xn if 'Xn' in globals() else None,
                          Xf if 'Xf' in globals() else None,
                          Xa if 'Xa' in globals() else None] if blk is not None]
labels = [lbl for lbl in [yn if 'yn' in globals() else None,
                          yf if 'yf' in globals() else None,
                          ya if 'ya' in globals() else None] if lbl is not None]

X_meta = pd.concat(blocks, axis=0, ignore_index=True)
y_meta = pd.concat(labels, axis=0, ignore_index=True)
print("Included datasets:", X_meta['dataset'].value_counts().to_dict())

# One-hot dataset tag
X_meta = pd.get_dummies(X_meta, columns=['dataset'], drop_first=False)
META_COLUMNS = X_meta.columns.tolist()

# Encode labels
CLASSES = ['LEFT','CENTRE','RIGHT']
le = LabelEncoder().fit(CLASSES)
y_enc = le.transform(y_meta)

# Optional class weights
cls_counts = pd.Series(y_enc).value_counts()
cls_weight = {i: (len(y_enc)/(len(cls_counts)*cls_counts[i])) for i in cls_counts.index}
w = pd.Series(y_enc).map(cls_weight).values

# Train/val split + train meta
X_tr, X_va, y_tr, y_va, w_tr, w_va = train_test_split(
    X_meta, y_enc, w, test_size=0.2, stratify=y_enc, random_state=42
)
meta = XGBClassifier(
    objective='multi:softprob', num_class=3,
    n_estimators=700, learning_rate=0.035, max_depth=4,
    subsample=0.9, colsample_bytree=0.9, reg_lambda=1.0,
    tree_method='hist', eval_metric='mlogloss', n_jobs=-1
)
meta.fit(X_tr, y_tr, sample_weight=w_tr)

# Eval + save
pred = meta.predict(X_va)
pred_lbl, true_lbl = le.inverse_transform(pred), le.inverse_transform(y_va)
print("Meta Accuracy:", accuracy_score(true_lbl, pred_lbl))
print(classification_report(true_lbl, pred_lbl, digits=3, labels=CLASSES))
print(pd.DataFrame(confusion_matrix(true_lbl, pred_lbl, labels=CLASSES),
                   index=CLASSES, columns=CLASSES))

META_DIR = Path.home() / "Desktop" / "Dissertation" / "meta_artifacts"
META_DIR.mkdir(parents=True, exist_ok=True)
joblib.dump(meta, META_DIR / "meta_model.joblib")
joblib.dump(META_COLUMNS, META_DIR / "meta_columns.joblib")
joblib.dump(le, META_DIR / "meta_label_encoder.joblib")
print("Saved:", [p.name for p in META_DIR.iterdir()])


ValueError: No objects to concatenate

In [None]:
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score, brier_score_loss

# meta_df must have at least: ['dataset','y_true_LCR','p_left','p_centre','p_right']
fact = meta[meta['dataset']=='FACTOID'].copy()

# 1) score
fact['s_meta'] = 1 - fact['p_centre']

# 2) true binary
y_true_bin = (fact['y_true_LCR'] != 'CENTRE').astype(int).to_numpy()

# 3) pick a threshold (reuse 0.56 to mirror your standalone, or tune on a FACTOID val split)
t = 0.56
y_hat_bin = (fact['s_meta'] >= t).astype(int)

# 4) metrics
print(classification_report(y_true_bin, y_hat_bin, target_names=['CENTRE','POLARIZED']))
print("AUROC:", roc_auc_score(y_true_bin, fact['s_meta']))
print("AP:", average_precision_score(y_true_bin, fact['s_meta']))
print("Brier:", brier_score_loss(y_true_bin, fact['s_meta']))
print("Confusion matrix:\n", confusion_matrix(y_true_bin, y_hat_bin))


NameError: name 'meta' is not defined