## Comparative Spectral Classification Pipeline

Authors: 
- Fábio D. Pacheco, up202502538

Date: 24/11/2025

### Description

This notebook is used for the following purposes:

#### Comparative Spectral Classification Pipeline

In [None]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Loading bar
from tqdm         import tqdm
from tqdm_joblib  import tqdm_joblib

# sklearn
from sklearn.model_selection        import StratifiedKFold, cross_validate, cross_val_score, GridSearchCV
from sklearn.preprocessing          import StandardScaler
from sklearn.pipeline               import Pipeline
from sklearn.metrics                import accuracy_score, balanced_accuracy_score, precision_recall_fscore_support 
from sklearn.metrics                import confusion_matrix, classification_report, log_loss
from sklearn.preprocessing          import LabelEncoder
from sklearn.calibration            import calibration_curve
from sklearn.base                   import BaseEstimator, TransformerMixin, clone

# Classifiers
from sklearn.neighbors              import KNeighborsClassifier
from sklearn.svm                    import SVC
from sklearn.discriminant_analysis  import LinearDiscriminantAnalysis

# Calibration
from sklearn.calibration            import CalibratedClassifierCV

# Savitzky–Golay
from scipy.signal                   import savgol_filter

# mRMR
from sklearn_mrmr.mrmr              import MRMRFeatureSelector

#### Dataset Loading

In [None]:
dataset_dir_path = "../dataset/"

xtrain_path = dataset_dir_path + "xtrain.csv"
xtest_path  = dataset_dir_path + "xtest.csv"

ytrain_path = dataset_dir_path + "ytrain.csv"
ytest_path  = dataset_dir_path + "ytest.csv"

if( os.path.exists(xtrain_path) ):  X_train  = pd.read_csv( xtrain_path, index_col=0 )
if( os.path.exists(ytrain_path) ):  y_train_ = pd.read_csv( ytrain_path, index_col=0 )
if( os.path.exists(xtest_path) ):   X_test   = pd.read_csv( xtest_path, index_col=0 )
if( os.path.exists(ytest_path) ):   y_test_  = pd.read_csv( ytest_path, index_col=0 )

print(f"Train shape of X:{X_train.shape}, y:{y_train_.shape}")
print(f"Test  shape of X:{X_test.shape}, y:{y_test_.shape}")

le = LabelEncoder( )

y_train = le.fit_transform( y_train_["polymer"] )
y_test  = le.transform( y_test_["polymer"] )

Train shape of X:(839, 2151), y:(839, 1)
Test  shape of X:(210, 2151), y:(210, 1)


#### Savitzky–Golay Transformer

In [18]:
class SavitzkyGolayTransformer(BaseEstimator, TransformerMixin):
  def __init__(
    self, 
    window_length=15, 
    polyorder=2, 
    deriv=0
  ):
    self.window_length = window_length
    self.polyorder = polyorder
    self.deriv = deriv

  def fit(self, X, y=None):
    return self

  def transform(self, X):
    return savgol_filter(
      X,
      window_length=self.window_length,
      polyorder=self.polyorder,
      deriv=self.deriv,
      axis=1
    )


#### 1.) Preprocessing Selection Stage

In [62]:
cv = StratifiedKFold(
  n_splits=8, 
  shuffle=True, 
  random_state=0
)

pipeline = Pipeline([
  ("scaler", StandardScaler()),
  ("sg", "passthrough"),
  ("mrmr", "passthrough"),
  ("pca", "passthrough"),
  ("clf", KNeighborsClassifier(
    n_neighbors=3,
    metric="euclidean"
  ))
])

param_grid = [
  # None
  {
    "sg":   ["passthrough"],
    "mrmr": ["passthrough"],
    "pca":  ["passthrough"],
  },

  # SG only
  {
    "sg":   [SavitzkyGolayTransformer(polyorder=1)],
    "mrmr": ["passthrough"],
    "pca":  ["passthrough"],
  },

  # PCA only
  {
    "sg":   ["passthrough"],
    "mrmr": ["passthrough"],
    "pca":  [PCA(n_components=0.99)],
  },

  # SG + PCA
  {
    "sg":   [SavitzkyGolayTransformer(polyorder=1)],
    "mrmr": ["passthrough"],
    "pca":  [PCA(n_components=0.99)],
  },

  # mRMR only
  {
    "sg":   ["passthrough"],
    "mrmr": [MRMRFeatureSelector(n_features_to_select=20, method="ftest")],
    "pca":  ["passthrough"],
  },

  # SG + mRMR
  {
    "sg":   [SavitzkyGolayTransformer(polyorder=1)],
    "mrmr": [MRMRFeatureSelector(n_features_to_select=20, method="ftest")],
    "pca":  ["passthrough"],
  },
]

grid = GridSearchCV(
  pipeline,
  param_grid,
  scoring="balanced_accuracy",
  cv=cv,
  n_jobs=-1,
  return_train_score=False
)

n_candidates = len(param_grid)
total_fits = n_candidates * cv.get_n_splits( )
with tqdm_joblib(tqdm(desc="GridSearch preprocessing", total=total_fits)):
  grid.fit(X_train, y_train)


[A

100%|██████████| 48/48 [01:41<00:00,  2.12s/it]


In [65]:
cv_results = pd.DataFrame(grid.cv_results_)

cv_results[[
  "mean_test_score",
  "std_test_score",
  "param_sg",
  "param_mrmr",
  "param_pca"
]].sort_values( "mean_test_score", ascending=False )

best_stage1 = [ ]
best_stage1.append( cv_results.iloc[
  cv_results["mean_test_score"].idxmax()
])
best_stage1.append( cv_results.iloc[
  cv_results["std_test_score"].idxmin()
])

print("=== Max mean")
print( best_stage1[0].params )
print( "Mean: ", best_stage1[0].mean_test_score )
print( "Std:  ", best_stage1[0].std_test_score )
print( "Fit time mean (s): ", best_stage1[0].mean_fit_time )
print( "Fit time std (s): ", best_stage1[0].std_fit_time )

print("=== Min variance")
print( best_stage1[1].params )
print( "Accuracy mean: ", best_stage1[1].mean_test_score )
print( "Accuracy std:  ", best_stage1[1].std_test_score )
print( "Fit time mean (s): ", best_stage1[1].mean_fit_time )
print( "Fit time std (s): ", best_stage1[1].std_fit_time )


=== Max mean
{'mrmr': 'passthrough', 'pca': PCA(n_components=0.99), 'sg': SavitzkyGolayTransformer(polyorder=1)}
Mean:  0.8471260830027992
Std:   0.0375653793833751
Fit time mean (s):  3.7591810822486877
Fit time std (s):  0.5541723486868213
=== Min variance
{'mrmr': 'passthrough', 'pca': 'passthrough', 'sg': SavitzkyGolayTransformer(polyorder=1)}
Accuracy mean:  0.844730969587949
Accuracy std:   0.031527647407635404
Fit time mean (s):  0.24008944630622864
Fit time std (s):  0.06027216426724537


#### 2.) Preprocessing Hyperparameter Optimization Stage

In [73]:
grids_stage2 = []
seen_params  = []
unique_best  = []

for best in best_stage1:
  if best.params not in seen_params:
    unique_best.append(best)
    seen_params.append(best.params)


for best in unique_best:
  pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("sg",    best.params["sg"]),
    ("mrmr",  best.params["mrmr"]),
    ("pca",   best.params["pca"]),
    ("clf",   KNeighborsClassifier(n_neighbors=3, metric="euclidean"))
  ])

  sg_list = ["passthrough"]
  if best.params["sg"] != "passthrough":
    sg_list = [
      SavitzkyGolayTransformer(window_length=11, polyorder=1, deriv=0), 
      SavitzkyGolayTransformer(window_length=11, polyorder=1, deriv=1),
      SavitzkyGolayTransformer(window_length=51, polyorder=1, deriv=0),
      SavitzkyGolayTransformer(window_length=51, polyorder=1, deriv=1)
    ]

  mrmr_list = ["passthrough"]
  if best.params["mrmr"] != "passthrough":
    mrmr_list = [
      MRMRFeatureSelector(n_features_to_select=40, method="ftest"),
      MRMRFeatureSelector(n_features_to_select=100, method="ftest"),
      MRMRFeatureSelector(n_features_to_select=200, method="ftest")
    ]

  pca_list = ["passthrough"]
  if best.params["pca"] != "passthrough":
    pca_list = [
      PCA(n_components=0.95),
      PCA(n_components=0.99),
      PCA(n_components=0.999)
    ]

  param_grid = [{
    "sg": sg_list,
    "mrmr": mrmr_list,
    "pca": pca_list,
  }]

  grid = GridSearchCV(
    pipeline,
    param_grid,
    scoring="balanced_accuracy",
    cv=cv,
    n_jobs=-1,
    return_train_score=False
  )

  n_candidates = len(sg_list) * len(mrmr_list) * len(pca_list)
  total_fits = n_candidates * cv.get_n_splits()

  with tqdm_joblib( tqdm( desc="GridSearch preprocessing", total=total_fits ) ):
    grid.fit( X_train, y_train )

  grids_stage2.append( grid )

GridSearch preprocessing:   0%|          | 0/96 [00:00<?, ?it/s]
[A

100%|██████████| 96/96 [01:51<00:00,  1.16s/it]

[A

100%|██████████| 32/32 [00:04<00:00,  7.11it/s]


In [86]:
best_stage2 = [ ]

for grid in grids_stage2: 
  cv_results = pd.DataFrame(grid.cv_results_)

  cv_results[[
    "mean_test_score",
    "std_test_score",
    "param_sg",
    "param_mrmr",
    "param_pca"
  ]].sort_values( "mean_test_score", ascending=False )

  opt = []

  opt.append( cv_results.iloc[
    cv_results["mean_test_score"].idxmax()
  ])
  opt.append( cv_results.iloc[
    cv_results["std_test_score"].idxmin()
  ])

  best_stage2.append( opt )

  print("=== Max mean")
  print( opt[0].params )
  print( "Mean: ", opt[0].mean_test_score )
  print( "Std:  ", opt[0].std_test_score )
  print( "Fit time mean (s): ", opt[0].mean_fit_time )
  print( "Fit time std (s): ", opt[0].std_fit_time )

  print("=== Min variance")
  print( opt[1].params )
  print( "Accuracy mean: ", opt[1].mean_test_score )
  print( "Accuracy std:  ", opt[1].std_test_score )
  print( "Fit time mean (s): ", opt[1].mean_fit_time )
  print( "Fit time std (s): ", opt[1].std_fit_time )


=== Max mean
{'mrmr': 'passthrough', 'pca': PCA(n_components=0.95), 'sg': SavitzkyGolayTransformer(deriv=1, polyorder=1, window_length=51)}
Mean:  0.9168467083584361
Std:   0.0370828831913902
Fit time mean (s):  3.586889773607254
Fit time std (s):  0.2549902371621424
=== Min variance
{'mrmr': 'passthrough', 'pca': PCA(n_components=0.999), 'sg': SavitzkyGolayTransformer(deriv=1, polyorder=1, window_length=51)}
Accuracy mean:  0.8570415233387201
Accuracy std:   0.020582263293456617
Fit time mean (s):  4.909534752368927
Fit time std (s):  1.0226899746928908
=== Max mean
{'mrmr': 'passthrough', 'pca': 'passthrough', 'sg': SavitzkyGolayTransformer(deriv=1, polyorder=1, window_length=51)}
Mean:  0.8532400877938635
Std:   0.019799229826236945
Fit time mean (s):  0.40205472707748413
Fit time std (s):  0.29010776412000744
=== Min variance
{'mrmr': 'passthrough', 'pca': 'passthrough', 'sg': SavitzkyGolayTransformer(deriv=1, polyorder=1, window_length=51)}
Accuracy mean:  0.8532400877938635
Accur

#### 3.) Classification Hyperparameter Optimization Stage

In [None]:
grids_stage3 = []
seen_params  = []
unique_best  = []

for stage1 in best_stage2:
  for best in stage1: 
    if best.params not in seen_params:
      unique_best.append(best)
      seen_params.append(best.params)
      
for best in unique_best:
  pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("sg",    best.params["sg"]),
    ("mrmr",  best.params["mrmr"]),
    ("pca",   best.params["pca"]),
    ("clf",   "passthrough") 
  ])

  param_grid = [
    # kNN
    #{ 
    #  "clf": [KNeighborsClassifier()],
    #  "clf__n_neighbors": [3, 5, 7, 9],
    #  "clf__metric": ['euclidean', 'manhattan', 'cosine'],
    #  "clf__weights": ['uniform', 'distance']
    #},

    # SVM
    {
      "clf": [SVC(probability=True, decision_function_shape="ovo", class_weight="balanced")],
      # "clf__C": [1000],
      # "clf__kernel": ["linear"]
    },

    # LDA
    #{
    #  "clf": [LinearDiscriminantAnalysis()],
    #  "clf__solver": ["svd", "lsqr"]
    #}
  ]

  grid = GridSearchCV(
    pipeline,
    param_grid,
    scoring="balanced_accuracy",
    cv=cv,
    n_jobs=-1
  )

  # Calculate total fits: Sum of combinations in each dict * folds
  n_candidates = 0
  for pg in param_grid:
    combos = 1
    for key, values in pg.items():
      combos *= len(values)
    n_candidates += combos

  total_fits = n_candidates * cv.get_n_splits()

  with tqdm_joblib(tqdm(desc=f"Comparing Classifiers", total=total_fits)):
    grid.fit(X_train, y_train)

  grids_stage3.append(grid)









[A[A[A[A[A[A[A[A








Comparing Classifiers:   0%|          | 0/8 [01:35<?, ?it/s]
Comparing Classifiers:   0%|          | 0/8 [01:20<?, ?it/s]
100%|██████████| 8/8 [00:15<00:00,  1.89s/it]

[A

100%|██████████| 8/8 [00:16<00:00,  2.09s/it]


[A[A








100%|██████████| 8/8 [00:45<00:00,  5.73s/it]


In [118]:
best_stage3 = [ ]

for grid in grids_stage3: 
  cv_results = pd.DataFrame(grid.cv_results_)

  opt = []

  opt.append( cv_results.iloc[
    cv_results["mean_test_score"].idxmax()
  ])
  opt.append( cv_results.iloc[
    cv_results["std_test_score"].idxmin()
  ])

  best_stage3.append( opt )

  print("====== Model preprocessor")
  print(f"SG: {grid.estimator.named_steps['sg']}")
  print(f"PCA: {grid.estimator.named_steps['pca']}")
  print(f"mRMR: {grid.estimator.named_steps['mrmr']}")

  print("=== Max mean")
  print( opt[0].params )
  print( "Mean: ", opt[0].mean_test_score )
  print( "Std:  ", opt[0].std_test_score )
  print( "Fit time mean (s): ", opt[0].mean_fit_time )
  print( "Fit time std (s): ", opt[0].std_fit_time )

  print("=== Min variance")
  print( opt[1].params )
  print( "Accuracy mean: ", opt[1].mean_test_score )
  print( "Accuracy std:  ", opt[1].std_test_score )
  print( "Fit time mean (s): ", opt[1].mean_fit_time )
  print( "Fit time std (s): ", opt[1].std_fit_time )
  print( "" )


SG: SavitzkyGolayTransformer(deriv=1, polyorder=1, window_length=51)
PCA: PCA(n_components=0.95)
mRMR: passthrough
=== Max mean
{'clf': SVC(probability=True)}
Mean:  0.5643046117159389
Std:   0.044032038108131714
Fit time mean (s):  4.886230647563934
Fit time std (s):  0.40615739663878603
=== Min variance
{'clf': SVC(probability=True)}
Accuracy mean:  0.5643046117159389
Accuracy std:   0.044032038108131714
Fit time mean (s):  4.886230647563934
Fit time std (s):  0.40615739663878603

SG: SavitzkyGolayTransformer(deriv=1, polyorder=1, window_length=51)
PCA: PCA(n_components=0.999)
mRMR: passthrough
=== Max mean
{'clf': SVC(probability=True)}
Mean:  0.5696234702241567
Std:   0.039878943128092016
Fit time mean (s):  5.470881402492523
Fit time std (s):  0.6487531539711545
=== Min variance
{'clf': SVC(probability=True)}
Accuracy mean:  0.5696234702241567
Accuracy std:   0.039878943128092016
Fit time mean (s):  5.470881402492523
Fit time std (s):  0.6487531539711545

SG: SavitzkyGolayTransfor

#### Train the best performer models 

In [None]:
best_stage3_models = []

for i, grid in enumerate(grids_stage3):
  cv_results = pd.DataFrame(grid.cv_results_)
  
  # 1. Capture the Max Mean (Best Estimator is already fitted by GridSearch)
  best_stage3_models.append({
    "desc": f"Strategy_{i}_Max_Mean",
    "model": grid.best_estimator_, 
    "score": grid.best_score_
  })
  
  # 2. Capture the Min Variance (Might be different from Max Mean)
  idx_min_var = cv_results["std_test_score"].idxmin( )
  best_var_params = cv_results.loc[idx_min_var, "params"]
  
  # Check if it's the same as the best_estimator_ to avoid double work
  if idx_min_var != cv_results["mean_test_score"].idxmax():
    min_var_model = clone(grid.estimator).set_params(**best_var_params)
    min_var_model.fit(X_train, y_train)
    
    best_stage3_models.append({
        "desc": f"Strategy_{i}_Min_Var",
        "model": min_var_model,
        "score": cv_results.loc[idx_min_var, "mean_test_score"]
    })

print(f"Captured {len(best_stage3_models)} winners.")

#### 4.) Calibration with Platt Scaling Stage

In [None]:
calibration_results = []

for item in best_stage3_models:
  base_model = item['model']
  name = item['desc']
  
  # 1. Create the Calibrated version
  # We use cv=5 to perform an internal cross-validation for the scaling
  calibrated_model = CalibratedClassifierCV(base_model, method='sigmoid', cv=5)
  calibrated_model.fit(X_train, y_train)
  
  calibration_results.append({
    "name": name,
    "raw_model": base_model,
    "calibrated_model": calibrated_model
  })

print(f"Calibration complete for {len(calibration_results)} models.")