
# Unified Multiclass Phase Prediction — Cross‑Validated Baselines (LogReg, RF, SVM, XGB, CatBoost)

Note: This code is *not* polished. It's very RAM hungry and will stall your computer if you run out. Be warned!



## Configuration


In [None]:
from pathlib import Path

# Choose one:
DATASET_PATH = Path('dataset_engineered.csv')

# General settings
TARGET_COL = 'phase'
MIN_CLASS_COUNT = 2
TEST_SIZE = 0.20
RANDOM_SEED = 42

# CV & search
CV_FOLDS = 5
N_ITER = 20
N_JOBS = 8
OUTPUT_DIR = Path('outputs_unified')
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)



## Imports


In [None]:

import json
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from dataclasses import dataclass
from pathlib import Path
from typing import List, Dict, Any

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import (
    classification_report, ConfusionMatrixDisplay,
    accuracy_score, balanced_accuracy_score, f1_score
)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

try:
    from xgboost import XGBClassifier
    HAVE_XGB = True
except Exception:
    HAVE_XGB = False

try:
    from catboost import CatBoostClassifier
    HAVE_CAT = True
except Exception:
    HAVE_CAT = False

warnings.filterwarnings('ignore')
plt.rcParams['figure.figsize'] = (8, 6)



## 1) Load & Clean


In [3]:

# Load
df = pd.read_csv(DATASET_PATH)
if TARGET_COL not in df.columns:
    raise ValueError(f"Target column '{TARGET_COL}' not found in {DATASET_PATH}")

# Drop ultra-rare classes
vc = df[TARGET_COL].value_counts()
rare = vc[vc < MIN_CLASS_COUNT].index.tolist()
if len(rare) > 0:
    print(f"Dropping rare classes (<{MIN_CLASS_COUNT} samples): {rare}")
    df = df[~df[TARGET_COL].isin(rare)].copy()

# Split
y = df[TARGET_COL].astype(str)
X = df.drop(columns=[TARGET_COL])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y
)

# Column types
numeric_cols = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
categorical_cols = [c for c in X.columns if c not in numeric_cols]

print(f"Train shape: {X_train.shape} | Test shape: {X_test.shape}")
print(f"Numeric cols: {len(numeric_cols)} | Categorical cols: {len(categorical_cols)}")


Dropping rare classes (<2 samples): ['wurtzite', 'rutile-type', 'hexagonal', 'scheelite', 'p2-type layered', 'disilicates', 'columbite', 'pseudocubic t phase structure']
Train shape: (591, 276) | Test shape: (148, 276)
Numeric cols: 276 | Categorical cols: 0



## 2) Shared Preprocessing


In [4]:

numeric_tf = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler(with_mean=False)),
])
categorical_tf = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=True)),
])

preprocess = ColumnTransformer(
    transformers=[
        ('num', numeric_tf, numeric_cols),
        ('cat', categorical_tf, categorical_cols),
    ],
    remainder='drop',
    sparse_threshold=0.3,
)



## 3) Helper for Numeric‑label Models


In [5]:

from sklearn.base import BaseEstimator, ClassifierMixin, clone

class LabelEncodedClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, base_estimator):
        self.base_estimator = base_estimator
        self.le_ = LabelEncoder()
        self.est_ = None

    def fit(self, X, y):
        y_enc = self.le_.fit_transform(y)
        self.est_ = clone(self.base_estimator)
        self.est_.fit(X, y_enc)
        return self

    def predict(self, X):
        y_enc = self.est_.predict(X)
        return self.le_.inverse_transform(y_enc)

    def predict_proba(self, X):
        proba = self.est_.predict_proba(X)
        # Columns are in encoded class order; map back
        return proba



## 4) Models & Search Spaces


In [6]:

from sklearn.metrics import make_scorer
SCORER = make_scorer(f1_score, average='macro')

def build_models() -> Dict[str, Any]:
    models = {}

    # Logistic Regression (OvR)
    models['logreg'] = LogisticRegression(
        multi_class='ovr', random_state=RANDOM_SEED
    )
    # Random Forest
    models['rf'] = RandomForestClassifier(
        n_jobs=-1, random_state=RANDOM_SEED
    )
    # SVM (RBF)
    models['svm'] = SVC(
        kernel='rbf', probability=True, random_state=RANDOM_SEED
    )
    # XGBoost (wrapped)
    if HAVE_XGB:
        models['xgb'] = LabelEncodedClassifier(
            XGBClassifier(
                objective='multi:softprob',
                tree_method='hist',
                random_state=RANDOM_SEED,
                n_jobs=-1,
            )
        )
    # CatBoost (can handle string labels but we use default sklearn API)
    if HAVE_CAT:
        models['cat'] = CatBoostClassifier(
            random_state=RANDOM_SEED,
            allow_writing_files=False,
            loss_function='MultiClass',
            verbose=False,
        )
    return models


def search_spaces() -> Dict[str, Dict[str, list]]:
    spaces = {
        'logreg': {
            'clf__C': list(np.logspace(-2, 2, 7)),
            'clf__penalty': ['l2'],
            'clf__solver': ['lbfgs'],
            'clf__max_iter': [2000],
            'clf__class_weight': [None, 'balanced'],
        },
        'rf': {
            'clf__n_estimators': [200, 300, 500],
            'clf__max_depth': [None, 8, 12, 16, 24],
            'clf__min_samples_split': [2, 5, 10],
            'clf__min_samples_leaf': [1, 2, 4],
            'clf__max_features': ['sqrt', 'log2', 0.5],
            'clf__class_weight': [None, 'balanced_subsample'],
        },
        'svm': {
            'clf__C': list(np.logspace(-2, 2, 7)),
            'clf__gamma': ['scale', 'auto'],
            'clf__class_weight': [None, 'balanced'],
        },
    }
    if HAVE_XGB:
        spaces['xgb'] = {
            'clf__base_estimator__n_estimators': [300, 500, 800],
            'clf__base_estimator__learning_rate': [0.03, 0.05, 0.1],
            'clf__base_estimator__max_depth': [4, 6, 8],
            'clf__base_estimator__subsample': [0.7, 0.9, 1.0],
            'clf__base_estimator__colsample_bytree': [0.6, 0.8, 1.0],
            'clf__base_estimator__reg_lambda': [0.0, 1.0, 5.0, 10.0],
            'clf__base_estimator__reg_alpha': [0.0, 0.5, 1.0],
        }
    if HAVE_CAT:
        spaces['cat'] = {
            'clf__depth': [4, 6, 8, 10],
            'clf__learning_rate': [0.03, 0.05, 0.1],
            'clf__l2_leaf_reg': [1, 3, 5, 9],
            'clf__bagging_temperature': [0.0, 0.5, 1.0],
            'clf__iterations': [500, 800, 1200],
        }
    return spaces



## 5) Train‑and‑Evaluate



In [7]:

import joblib


def train_and_eval(model_key: str,
                   model,
                   param_space: Dict[str, list]) -> Dict[str, Any]:
    print(f"=== {model_key.upper()} ===")
    pipe = Pipeline([('pre', preprocess), ('clf', model)])

    cv = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_SEED)

    search = RandomizedSearchCV(
        estimator=pipe,
        param_distributions=param_space,
        n_iter=N_ITER,
        scoring=SCORER,
        cv=cv,
        n_jobs=N_JOBS,
        random_state=RANDOM_SEED,
        refit=True,
        verbose=1
    )
    search.fit(X_train, y_train)

    best_pipe = search.best_estimator_

    # Test evaluation
    y_pred = best_pipe.predict(X_test)
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'balanced_accuracy': balanced_accuracy_score(y_test, y_pred),
        'macro_f1': f1_score(y_test, y_pred, average='macro'),
        'weighted_f1': f1_score(y_test, y_pred, average='weighted'),
        'report': classification_report(y_test, y_pred, output_dict=True),
        'best_params': search.best_params_,
        'cv_best_score_macro_f1': search.best_score_,
    }

    # Save artefacts
    out_dir = OUTPUT_DIR / model_key
    out_dir.mkdir(parents=True, exist_ok=True)

    pd.DataFrame(search.cv_results_).to_csv(out_dir / f"{model_key}_cv_results.csv", index=False)
    (out_dir / f"{model_key}_test_metrics.json").write_text(json.dumps(metrics, indent=2))

    # Confusion matrix
    fig, ax = plt.subplots(figsize=(10, 8))
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax, xticks_rotation=90, colorbar=False)
    plt.tight_layout()
    fig.savefig(out_dir / f"{model_key}_confusion_matrix.png", dpi=160)
    plt.close(fig)

    # Save fitted end-to-end pipeline
    joblib.dump(best_pipe, out_dir / f"{model_key}_pipeline.pkl")

    print(json.dumps({k: v for k, v in metrics.items() if k not in ['report']}, indent=2))
    return metrics



## 6) Run All Models
We train and tune: **logreg**, **rf**, **svm**, and (if installed) **xgb**, **cat**.


In [None]:

models = build_models()
spaces = search_spaces()

results = {}
for key, mdl in models.items():
    try:
        res = train_and_eval(key, mdl, spaces[key])
        results[key] = res
    except Exception as e:
        print(f"[WARN] Skipping {key}: {e}")


summary_rows = []
for k, r in results.items():
    summary_rows.append({
        'model': k,
        'cv_macro_f1_best_mean': r.get('cv_best_score_macro_f1', None),
        'test_macro_f1': r['macro_f1'],
        'test_weighted_f1': r['weighted_f1'],
        'test_accuracy': r['accuracy'],
    })
summary_df = pd.DataFrame(summary_rows).sort_values('test_macro_f1', ascending=False)
summary_df


=== LOGREG ===
Fitting 5 folds for each of 14 candidates, totalling 70 fits
{
  "accuracy": 0.6959459459459459,
  "balanced_accuracy": 0.36437096577761535,
  "macro_f1": 0.3816873112039244,
  "weighted_f1": 0.6413896373571559,
  "best_params": {
    "clf__solver": "lbfgs",
    "clf__penalty": "l2",
    "clf__max_iter": 2000,
    "clf__class_weight": null,
    "clf__C": 0.01
  },
  "cv_best_score_macro_f1": NaN
}
=== RF ===
Fitting 5 folds for each of 20 candidates, totalling 100 fits
{
  "accuracy": 0.7972972972972973,
  "balanced_accuracy": 0.5691694068931921,
  "macro_f1": 0.5946413493627115,
  "weighted_f1": 0.7654795388348019,
  "best_params": {
    "clf__n_estimators": 300,
    "clf__min_samples_split": 5,
    "clf__min_samples_leaf": 1,
    "clf__max_features": "sqrt",
    "clf__max_depth": 16,
    "clf__class_weight": null
  },
  "cv_best_score_macro_f1": NaN
}
=== SVM ===
Fitting 5 folds for each of 20 candidates, totalling 100 fits
{
  "accuracy": 0.6283783783783784,
  "balanc