# Permutation feature importance for Classification and Regression Models

In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
from specvae.model import BaseModel
import specvae.dataset as dt
import specvae.utils as utils

## Load model

In [3]:
# Parameters
# NOTE(review): the "# Parameters" header suggests this cell is injected by a
# parameterization tool (e.g. papermill) — confirm before hand-editing the values.
dataset = "MoNA"  # dataset key; a later cell branches on 'MoNA' / 'HMDB'
model_name = "clf_518-345-207-129-12 (24-11-2021_21-29-35)"  # matched against 'full_model_name' when saving results
model_dir = "d:\\Workspace\\SpecVAE\\.model\\MoNA\\clf\\clf_518-345-207-129-12 (24-11-2021_21-29-35)"  # directory that holds 'model.pth'
csv_path = "d:\\Workspace\\SpecVAE\\.model\\MoNA\\clf\\experiment.csv"  # experiment log updated by the last cell
n_samples = 5000  # forwarded to the specvae data loader; presumably a sample cap — TODO confirm

In [4]:
# Pick the torch device through the project helper. CUDA is explicitly disabled here,
# so the model and the data loaded below are placed on the CPU.
device, cpu = utils.device(use_cuda=False)

Device in use:  cpu


In [5]:
# The checkpoint file sits directly inside the model directory.
model_path = os.path.join(model_dir, 'model.pth')

print("Load model: %s..." % model_name)
model = BaseModel.load(model_path, device)

# Put the network into evaluation mode; being the last expression of the cell,
# the call also displays the module structure.
model.eval()

Load model: clf_518-345-207-129-12 (24-11-2021_21-29-35)...


BaseClassifier(
  (layers): Sequential(
    (lin_1): Linear(in_features=518, out_features=345, bias=True)
    (lin_batchnorm_1): BatchNorm1d(345, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act_1): ReLU()
    (lin_2): Linear(in_features=345, out_features=207, bias=True)
    (lin_batchnorm_2): BatchNorm1d(207, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act_2): ReLU()
    (lin_3): Linear(in_features=207, out_features=129, bias=True)
    (lin_batchnorm_3): BatchNorm1d(129, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act_3): ReLU()
    (lin_4): Linear(in_features=129, out_features=12, bias=True)
  )
  (loss): BaseClassifierCriterium(
    (out): LogSoftmax(dim=None)
    (loss): CrossEntropyLoss()
  )
  (out): LogSoftmax(dim=None)
)

In [6]:
# Display the configuration stored with the model (input columns, their sizes,
# target column, ...); later cells read several of these keys.
model.config

{'name': 'clf',
 'layer_config': array([518, 345, 207, 129]),
 'n_classes': 12,
 'dropout': 0.0,
 'class_weights': tensor([2.5667e-04, 1.2626e-03, 7.8125e-04, 1.3514e-02, 1.0309e-03, 1.2330e-03,
         7.6923e-02, 3.2258e-02, 1.0000e+00, 1.2987e-03, 7.3855e-04, 9.0909e-02]),
 'target_column': 'instrument',
 'target_column_id': 'instrument_id',
 'input_columns': ['spectrum',
  'collision_energy',
  'total_exact_mass',
  'precursor_mz',
  'ionization_mode_id',
  'instrument_type_id',
  'precursor_type_id',
  'kingdom_id',
  'superclass_id',
  'class_id'],
 'input_sizes': [100, 1, 1, 1, 2, 39, 73, 2, 19, 280],
 'types': [torch.float32,
  torch.float32,
  torch.float32,
  torch.float32,
  torch.float32,
  torch.float32,
  torch.float32,
  torch.float32,
  torch.float32,
  torch.float32,
  torch.int64],
 'dataset': 'MoNA',
 'transform': Compose(
     <specvae.dataset.SplitSpectrum object at 0x0000020D4A7317C8>
     <specvae.dataset.TopNPeaks object at 0x0000020D4A731908>
     <specvae.dat

## Load and transform data

In [7]:
# Resolve the metadata file that belongs to the selected dataset.
if dataset == 'MoNA':
    base_path = utils.get_project_path() / '.data' / 'MoNA'
    metadata_path = base_path / 'MoNA_meta.npy'
elif dataset == 'HMDB':
    base_path = utils.get_project_path() / '.data' / 'HMDB'
    metadata_path = base_path / 'HMDB_meta.npy'
else:
    # Fail right here with a clear message; previously an unknown dataset only
    # surfaced a few lines later as a NameError on `metadata_path`.
    raise ValueError("Unsupported dataset %r, expected 'MoNA' or 'HMDB'" % (dataset,))

# Metadata is optional: it stays None when the file has not been generated.
metadata = None
if os.path.exists(metadata_path):
    # NOTE: allow_pickle=True unpickles arbitrary objects from the file, which can
    # execute code on load. Only point this at metadata files you trust.
    # The file stores a single pickled object (0-d array), hence the .item().
    metadata = np.load(metadata_path, allow_pickle=True).item()

In [8]:
# Target definition as stored in the model configuration.
target_column = model.config['target_column']
target_column_id = model.config['target_column_id']

# 'class_subset' is an optional config entry; without it an empty list is used.
class_subset = model.config.get('class_subset', [])

In [9]:
from specvae.classifier import BaseClassifier
from specvae.regressor import BaseRegressor

# Column layout and tensor dtypes exactly as the model was configured with.
input_columns = model.config['input_columns']
columns = input_columns + [target_column_id]
types = model.config['types']

# Load the data with the loader that matches the model family. Both loaders return
# the train/valid/test loaders and replace the `metadata` variable set earlier;
# the classification loader additionally returns class weights (`cw`).
# NOTE(review): the positional arguments after `model.transform` (n_samples, int(1e7),
# True, ...) follow the specvae loader signature, which is not visible here — confirm
# their meaning against specvae.dataset.
if isinstance(model, BaseClassifier):
    train_data, valid_data, test_data, metadata, cw = dt.load_data_classification(
        dataset, model.transform, n_samples, int(1e7), True, device, input_columns, types, target_column_id, True, class_subset)
elif isinstance(model, BaseRegressor):
    train_data, valid_data, test_data, metadata = dt.load_data_regression(
        dataset, model.transform, n_samples, int(1e7), True, device, input_columns, types, target_column, True)
else:
    # Without this branch an unexpected model type left `train_data` undefined and
    # the notebook failed later with an unrelated-looking NameError.
    raise TypeError(
        "Expected a BaseClassifier or BaseRegressor, got %s" % type(model).__name__)

Reject samples with 'no-class' assigned
Select classes from class_subset:  [0, 129, 130, 155, 161, 133, 122, 157, 136, 116, 115, 135]
Relabel classes  [0, 129, 130, 155, 161, 133, 122, 157, 136, 116, 115, 135]  to  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
Load train data
Load and transform...
Progress: 5%
Convert data to pytorch tensors...
Load valid data
Load and transform...
Progress: 5%
Progress: 10%
Progress: 15%
Progress: 20%
Progress: 25%
Progress: 30%
Progress: 35%
Progress: 40%
Progress: 45%
Progress: 50%
Progress: 55%
Progress: 60%
Progress: 65%
Progress: 70%
Progress: 75%
Progress: 80%
Progress: 85%
Progress: 90%
Progress: 95%
Convert data to pytorch tensors...
Load test data
Load and transform...
Progress: 5%
Progress: 10%
Progress: 15%
Progress: 20%
Progress: 25%
Progress: 30%
Progress: 35%
Progress: 40%
Progress: 45%
Convert data to pytorch tensors...


  class_weights_ = 1. / torch.tensor(class_count, dtype=torch.float32)


In [10]:
# Use the first batch yielded by the loader as the evaluation set for the importances.
# NOTE(review): the variables are named *_test but the batch is drawn from `train_data`,
# so the importances below are measured on training samples. Confirm this is intended;
# a `test_data` loader is also available from the loading cell.
X_test, y_test, ids_test = next(iter(train_data))

In [11]:
# Sanity check: (number of samples, flattened feature width). The width should equal
# the sum of model.config['input_sizes'].
X_test.shape

torch.Size([5002, 518])

## Permutation Feature Importance

In [12]:
from sklearn.inspection import permutation_importance

In [13]:
# Permute every feature column of X_test in turn and record the change in the
# model's score. No `scoring` argument is given, so scikit-learn relies on the
# estimator's own scoring.
pi = permutation_importance(
    model,
    X_test,
    y_test,
    n_repeats=10,    # shuffles per feature
    random_state=0,  # fixed seed so the shuffles are repeatable
)

  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)
  out = self.out(x)


In [14]:
# Widths of the input-column groups inside the flat feature vector, with a leading 0
# so that prefix sums of `u` give each group's start offset.
u = np.array([0, *model.config['input_sizes']])
u

array([  0, 100,   1,   1,   1,   2,  39,  73,   2,  19, 280])

In [15]:
# Collapse the per-feature importances into one value per input column: every column
# owns a contiguous slice of the flat feature vector whose width is listed in `u`
# (u[0] is the leading 0, u[k] the width of input_columns[k - 1]).
group_importance = {}
start = u[0]
for k in range(1, len(u)):
    stop = start + u[k]
    group_importance[input_columns[k - 1]] = pi.importances_mean[start:stop].sum()
    start = stop

# Order the columns from least to most important.
s = dict(sorted(group_importance.items(), key=lambda pair: pair[1]))
s

{'kingdom_id': 0.0,
 'ionization_mode_id': 0.039544182327069174,
 'superclass_id': 0.10997600959616156,
 'precursor_type_id': 0.11941223510595785,
 'total_exact_mass': 0.12261095561775288,
 'class_id': 0.17405037984806113,
 'precursor_mz': 0.18434626149540181,
 'spectrum': 0.18742502998800603,
 'collision_energy': 0.20149940023990404,
 'instrument_type_id': 0.43354658136745305}

In [16]:
import plotly.express as px

# Horizontal bar chart with one bar per input column, in the sorted order of `s`.
# The dict views are materialized as lists, and a title plus axis labels are set so
# the figure is readable on its own ('x' / 'y' are the names plotly express assigns
# to array-like inputs, which `labels` then renames).
fig = px.bar(
    x=list(s.values()),
    y=list(s.keys()),
    orientation='h',
    title='Permutation feature importance by input column',
    labels={'x': 'Summed mean importance (score decrease)', 'y': 'Input column'},
)
fig.show()

## Save results

In [17]:
# Store the per-column importances in the experiment log, on the row of this model.
# Nothing is written when the log file does not exist.
if os.path.exists(csv_path):
    df = pd.read_csv(csv_path)
    # Cast to built-in floats so the stored text contains plain numbers regardless of
    # the NumPy version (NumPy >= 2 renders scalars as "np.float64(...)" in a dict repr).
    importance_text = str({name: float(value) for name, value in s.items()})
    df.loc[df['full_model_name'] == model_name, 'feature_importance'] = importance_text
    # index=False: otherwise the row index is written as an extra unnamed column,
    # and every read/write round trip of this cell adds another "Unnamed: N" column.
    df.to_csv(csv_path, index=False)