# MultiSensor Dataset Preparation 
- Experiment data: March 2024. Aluminum, Laser-Wire DED
- Aurthor: Chen Lequn

### Notebook 3a: Feature selection, ML modelling and ablation study (feature-based audio model)
- Full Dataset: All features (from previous notebook 2b)
- Ablated Dataset 1: Features after removing those with Pearson correlation > 0.97 (within audio modality).
- Ablated Dataset 2: Features after removing those with Pearson correlation > 0.9 (within audio modality).
- Ablated Dataset 3: Features from Ablated Dataset 1 with MI > 0.05.
- Ablated Dataset 4: Features from Ablated Dataset 1 with MI > 0.1.

In [1]:
# from sklearnex import patch_sklearn, config_context
# import dpctl
# patch_sklearn()

In [2]:
# import wandb

In [3]:
import os
import glob
import sys
import math
import numpy as np
import pandas as pd
from pandas import DataFrame
import scipy as sp
import itertools
from collections import defaultdict
from pprint import pprint
import pickle
from joblib import dump, load

# Plotting
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.pyplot import gca
from pylab import *
import seaborn as sns
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
%matplotlib inline

# Scikit-learn general
from sklearn import preprocessing, metrics, svm, datasets, tree, neighbors
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, confusion_matrix, roc_curve, auc, classification_report
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_validate, KFold, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, scale
from sklearn.ensemble import HistGradientBoostingClassifier

# Scikit-learn models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, SVR
from sklearn.multiclass import OneVsRestClassifier

# Imbalanced learn
from imblearn.ensemble import BalancedRandomForestClassifier, RUSBoostClassifier

# XGBoost and LightGBM
import xgboost as xgb
from xgboost import XGBClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier

# Utilities
from scipy.stats import uniform, randint
from scipy.interpolate import griddata
from sklearn.utils import shuffle, resample, class_weight
from utils import *


In [4]:
PROJECT_ROOT_DIR = "../"
IMAGE_PATH = os.path.join(PROJECT_ROOT_DIR, "result_images", 'defect classification', 'ML-baseline', 'ablation study')
model_path = os.path.join(PROJECT_ROOT_DIR, "trained_models", 'defect classification', 'ML-baseline', 'ablation study')
scaler_path = os.path.join(PROJECT_ROOT_DIR, "saved_scalers", 'defect classification',  'ML-baseline', 'ablation study')
os.makedirs(IMAGE_PATH, exist_ok=True)
os.makedirs(model_path, exist_ok=True)
os.makedirs(scaler_path, exist_ok=True)


# Change to desirable location of the raw dataset
Multimodal_dataset_PATH = "/home/chenlequn/pan1/Dataset/Laser-Wire-DED-ThermalAudio-Dataset"
Annotation_file_path = os.path.join(Multimodal_dataset_PATH, "Annotation")
Dataset_path = os.path.join(Multimodal_dataset_PATH, 'Dataset')
final_audio_dataset = os.path.join(Multimodal_dataset_PATH, 'Dataset', "audio")
final_image_dataset = os.path.join(Multimodal_dataset_PATH, 'Dataset', "thermal_images")
              

## function for automatically save the diagram/graph into the folder 
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGE_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

plt.rcParams["axes.edgecolor"] = "black"
plt.rcParams["axes.linewidth"] = 2.50

In [5]:
# # Initialize wandb
# wandb.init(project="LW-DED_audio-classification")

## Step 1: Dataset preparation

In [7]:
# Read the data back from the HDF5 file into a new DataFrame
df_audio_dataset = pd.read_hdf(os.path.join(Dataset_path, 'df_audio_dataset_with_annotations(raw_audio).h5'), key='df')

df_audio_dataset

Unnamed: 0,audio_name,image_name,experiment_number,label_1,label_2,label_3,rms_energy,amplitude_envelope_mean,amplitude_envelope_std,zero_crossing_rate,...,mfcc_8_mean,mfcc_8_std,mfcc_9_mean,mfcc_9_std,mfcc_10_mean,mfcc_10_std,mfcc_11_mean,mfcc_11_std,mfcc_12_mean,mfcc_12_std
0,Exp_17_1_10.wav,Exp_17_1_10.jpg,17_1,Non-defective,,,0.031746,0.038982,0.014304,0.064399,...,-1.955670,7.565694,6.784209,13.263989,-4.241771,7.623967,10.760045,6.365362,-4.255302,6.843529
1,Exp_17_1_9.wav,Exp_17_1_9.jpg,17_1,Non-defective,,,0.042696,0.046830,0.018378,0.048526,...,2.400870,8.425775,10.965203,9.471119,-3.135444,8.011312,13.603483,6.959040,-5.711248,7.241501
2,Exp_17_1_8.wav,Exp_17_1_8.jpg,17_1,Non-defective,,,0.035057,0.042734,0.011573,0.075737,...,1.150261,9.403788,14.162999,9.516252,-8.762444,7.957384,12.542859,4.588312,-11.905286,5.806112
3,Exp_17_1_7.wav,Exp_17_1_7.jpg,17_1,Non-defective,,,0.030621,0.034609,0.014976,0.087528,...,-1.952435,7.164872,11.940722,13.199364,-8.201095,7.420465,10.894137,8.839649,-8.552934,7.470698
4,Exp_17_1_6.wav,Exp_17_1_6.jpg,17_1,Non-defective,,,0.029938,0.039680,0.008860,0.091837,...,-2.629555,10.869521,10.822444,7.406270,-7.824436,7.594238,11.877039,7.440609,-7.672489,7.914073
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6812,Exp_April_16_test_4_4.wav,Exp_April_16_test_4_4.jpg,April_16_test_4,Laser-off,Shielding Gas-off,,0.012902,0.015071,0.005879,0.099093,...,2.168951,6.490287,1.928032,8.401035,-1.724736,10.288368,-0.060851,8.381465,-12.953316,5.754806
6813,Exp_April_16_test_4_1.wav,Exp_April_16_test_4_1.jpg,April_16_test_4,Laser-off,Shielding Gas-off,,0.013222,0.015257,0.005407,0.084127,...,0.476796,9.227147,0.945553,7.366714,-1.322515,7.652929,-0.733449,5.396196,-14.786466,7.339162
6814,Exp_April_16_test_4_2.wav,Exp_April_16_test_4_2.jpg,April_16_test_4,Laser-off,Shielding Gas-off,,0.012246,0.014805,0.005100,0.093878,...,-0.026498,7.503857,2.117017,7.400688,-2.097013,8.523264,0.141502,6.387466,-10.734591,8.097878
6815,Exp_April_16_test_4_3.wav,Exp_April_16_test_4_3.jpg,April_16_test_4,Laser-off,Shielding Gas-off,,0.013367,0.016546,0.005384,0.087302,...,1.089101,6.276635,0.685349,5.679959,-5.039695,5.392179,-2.741401,4.763264,-13.847133,7.911690


In [8]:
# Extract labels
y = df_audio_dataset['label_1'].to_list()
y_array = np.array(y)
le = LabelEncoder()
le.fit(y)
Y_encoded = le.transform(y)

# Non-feature columns to be dropped
non_feature_columns = ['audio_name', 'image_name', 'experiment_number',
                       'label_1', 'label_2', 'label_3']

# Extract features for the full dataset
X_all = df_audio_dataset.drop(columns=non_feature_columns).to_numpy()

X_all.shape

(6817, 83)

In [9]:
class_weights = class_weight.compute_class_weight('balanced',
                                                 classes = np.unique(y),
                                                 y = y)

class_weights_encoded = class_weight.compute_class_weight('balanced',
                                                 classes = np.unique(Y_encoded),
                                                 y = Y_encoded)


class_names = (le.classes_).tolist()
print(f"class weights: {class_weights}")
print(f"class weights encoded: {class_weights_encoded}")
print(f"class        : {class_names}")

class weights: [2.99385156 0.60003521 1.00058711]
class weights encoded: [2.99385156 0.60003521 1.00058711]
class        : ['Balling', 'Laser-off', 'Non-defective']


In [10]:
# Given features based on previous analysis results
features_dropped_090 = ['spectral_centroid_mean', 'spectral_centroid_std',
       'spectral_complexity_std', 'spectral_contrast_0_mean',
       'spectral_contrast_0_std', 'spectral_contrast_1_mean',
       'spectral_contrast_1_std', 'spectral_contrast_2_mean',
       'spectral_contrast_2_std', 'spectral_contrast_3_mean',
       'spectral_contrast_3_std', 'spectral_contrast_4_mean',
       'spectral_contrast_4_std', 'spectral_contrast_5_mean',
       'spectral_valley_0_std', 'spectral_valley_1_std',
       'spectral_valley_2_mean', 'spectral_valley_2_std',
       'spectral_valley_3_std', 'spectral_valley_4_std',
       'spectral_valley_5_mean', 'spectral_energy_band_ratio_std',
       'spectral_flatness_mean', 'spectral_flatness_std',
       'spectral_rolloff_std', 'spectral_strong_peak_mean',
       'spectral_strong_peak_std', 'spectral_variance_mean',
       'spectral_variance_std', 'spectral_skewness_std',
       'spectral_kurtosis_std', 'spectral_crest_factor_mean',
       'spectral_crest_factor_std', 'mfcc_0_std', 'mfcc_1_mean', 'mfcc_1_std',
       'mfcc_2_mean', 'mfcc_2_std', 'mfcc_3_mean', 'mfcc_3_std', 'mfcc_4_mean',
       'mfcc_4_std', 'mfcc_5_mean', 'mfcc_5_std', 'mfcc_6_mean', 'mfcc_6_std',
       'mfcc_7_mean', 'mfcc_7_std', 'mfcc_8_mean', 'mfcc_8_std', 'mfcc_9_mean',
       'mfcc_9_std', 'mfcc_10_mean', 'mfcc_10_std', 'mfcc_11_mean',
       'mfcc_11_std', 'mfcc_12_mean', 'mfcc_12_std']

features_dropped_097 = ['amplitude_envelope_std', 'zero_crossing_rate',
       'loudness_vickers', 'spectral_centroid_mean', 'spectral_centroid_std',
       'spectral_complexity_mean', 'spectral_complexity_std',
       'spectral_contrast_0_mean', 'spectral_contrast_0_std',
       'spectral_contrast_1_mean', 'spectral_contrast_1_std',
       'spectral_contrast_2_mean', 'spectral_contrast_2_std',
       'spectral_contrast_3_mean', 'spectral_contrast_3_std',
       'spectral_contrast_4_mean', 'spectral_contrast_4_std',
       'spectral_contrast_5_mean', 'spectral_contrast_5_std',
       'spectral_valley_0_mean', 'spectral_valley_0_std',
       'spectral_valley_1_mean', 'spectral_valley_1_std',
       'spectral_valley_2_mean', 'spectral_valley_2_std',
       'spectral_valley_3_mean', 'spectral_valley_3_std',
       'spectral_valley_4_mean', 'spectral_valley_4_std',
       'spectral_valley_5_mean', 'spectral_valley_5_std',
       'spectral_decrease_mean', 'spectral_energy_band_ratio_mean',
       'spectral_energy_band_ratio_std', 'spectral_flatness_mean',
       'spectral_flatness_std', 'spectral_flux_std', 'spectral_rolloff_mean',
       'spectral_rolloff_std', 'spectral_strong_peak_mean',
       'spectral_strong_peak_std', 'spectral_variance_mean',
       'spectral_variance_std', 'spectral_skewness_std',
       'spectral_kurtosis_std', 'spectral_crest_factor_mean',
       'spectral_crest_factor_std', 'mfcc_0_mean', 'mfcc_0_std', 'mfcc_1_mean',
       'mfcc_1_std', 'mfcc_2_mean', 'mfcc_2_std', 'mfcc_3_mean', 'mfcc_3_std',
       'mfcc_4_mean', 'mfcc_4_std', 'mfcc_5_mean', 'mfcc_5_std', 'mfcc_6_mean',
       'mfcc_6_std', 'mfcc_7_mean', 'mfcc_7_std', 'mfcc_8_mean', 'mfcc_8_std',
       'mfcc_9_mean', 'mfcc_9_std', 'mfcc_10_mean', 'mfcc_10_std',
       'mfcc_11_mean', 'mfcc_11_std', 'mfcc_12_mean', 'mfcc_12_std']


features_dropped_090_mi_filtered_01 = ['spectral_centroid_mean', 'spectral_centroid_std',
       'spectral_complexity_std', 'spectral_contrast_3_mean',
       'spectral_contrast_5_mean', 'spectral_valley_2_mean',
       'spectral_valley_5_mean', 'spectral_flatness_mean',
       'spectral_flatness_std', 'spectral_variance_mean',
       'spectral_skewness_std', 'spectral_crest_factor_mean',
       'spectral_crest_factor_std', 'mfcc_1_mean', 'mfcc_2_mean',
       'mfcc_3_mean', 'mfcc_5_mean', 'mfcc_7_mean', 'mfcc_9_mean',
       'mfcc_11_mean']


features_dropped_090_mi_filtered_005 = ['spectral_centroid_mean', 'spectral_centroid_std',
       'spectral_complexity_std', 'spectral_contrast_3_mean',
       'spectral_contrast_4_mean', 'spectral_contrast_5_mean',
       'spectral_valley_2_mean', 'spectral_valley_5_mean',
       'spectral_energy_band_ratio_std', 'spectral_flatness_mean',
       'spectral_flatness_std', 'spectral_rolloff_std',
       'spectral_strong_peak_std', 'spectral_variance_mean',
       'spectral_skewness_std', 'spectral_kurtosis_std',
       'spectral_crest_factor_mean', 'spectral_crest_factor_std',
       'mfcc_1_mean', 'mfcc_2_mean', 'mfcc_3_mean', 'mfcc_5_mean',
       'mfcc_6_mean', 'mfcc_7_mean', 'mfcc_7_std', 'mfcc_8_mean',
       'mfcc_9_mean', 'mfcc_9_std', 'mfcc_11_mean', 'mfcc_12_mean']

# Create ablated datasets
X_dropped_097 = df_audio_dataset[features_dropped_097].to_numpy()
X_dropped_090 = df_audio_dataset[features_dropped_090].to_numpy()
X_dropped_090_mi_filtered_01 = df_audio_dataset[features_dropped_090_mi_filtered_01].to_numpy()
X_dropped_090_mi_filtered_005 = df_audio_dataset[features_dropped_090_mi_filtered_005].to_numpy()


# Get the shapes of these additional ablated datasets
X_dropped_097.shape, X_dropped_090.shape, X_dropped_090_mi_filtered_005.shape, X_dropped_090_mi_filtered_01.shape,

((6817, 73), (6817, 58), (6817, 30), (6817, 20))

- ALL Dataset (X_all):  `83 features`
- Ablated Dataset 1 (X_dropped_097):  `73 features`
- Ablated Dataset 2 (X_dropped_090):  `58 features`
- Ablated Dataset 3 (X_dropped_090_mi_filtered_005): `30 features`
- Ablated Dataset 4 (X_dropped_090_mi_filtered_01):  `20 features`

### Train val test split
- Train 80%, Val 10%, Test 10%

In [11]:
y_array = np.array(y)

# First split: Separate out the training set (80% of original)
train_indices, temp_indices = train_test_split(
    np.arange(len(df_audio_dataset)), test_size=0.2, random_state=0, stratify=y_array)

# Second split: Separate out the validation and test sets (each will be 10% of original)
val_indices, test_indices = train_test_split(
    temp_indices, test_size=0.5, random_state=0, stratify=y_array[temp_indices])

# Check the shape of the indices for train, val, and test splits
len(train_indices), len(val_indices), len(test_indices)

(5453, 682, 682)

In [12]:
%store train_indices
%store val_indices
%store test_indices

Stored 'train_indices' (ndarray)
Stored 'val_indices' (ndarray)
Stored 'test_indices' (ndarray)


## Step 2: Create a Pipeline and Iterate Over Datasets and Models

In [13]:
# Define hyperparameters for each model
hyperparameters = {
    'KNN': {'n_neighbors': 12, 'weights': 'distance'},
    'DT': {'ccp_alpha': 3.89e-05, 'max_depth': 21, 'min_samples_leaf': 1, 'min_samples_split': 13},
    'LR': {'C': 5.15, 'max_iter': 2528, 'penalty': 'l2', 'solver': 'lbfgs'},
    'RF': {'bootstrap': False, 'max_depth': 64, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 6, 'n_estimators': 307},
    'BalancedRF': {'bootstrap': False, 'max_depth': 23, 'max_features': 'sqrt', 'min_samples_leaf': 8, 'min_samples_split': 17, 'n_estimators': 237},
    'RUSBoost': {'learning_rate': 0.1, 'n_estimators': 150},
    'Ada': {'algorithm': 'SAMME', 'learning_rate': 0.57, 'n_estimators': 307},
    'GP': {'kernel': 1**2 * RBF(length_scale=1)},
    'NN': {'activation': 'relu', 'alpha': 0.077, 'early_stopping': True, 'hidden_layer_sizes': (229, 255, 168), 'learning_rate': 'adaptive', 'max_iter': 11805, 'solver': 'adam', 'validation_fraction': 0.1},
    'XGBoost': {'colsample_bytree': 0.62, 'gamma': 0.48, 'learning_rate': 0.35, 'max_depth': 44, 'min_child_weight': 8, 'n_estimators': 480, 'subsample': 0.59,  'gpu_id': 0},
    'LightGBM': {'colsample_bytree': 0.80, 'learning_rate': 0.15, 'max_depth': 35, 'min_child_weight': 1, 'n_estimators': 276, 'num_leaves': 120, 'reg_alpha': 1.94, 'reg_lambda': 1.70, 'subsample': 0.86, 'verbose': -1},
    'SVM': {'C': 524.86, 'degree': 9, 'gamma': 0.029, 'kernel': 'rbf', 'probability': True},
    'GB': {'learning_rate': 0.23, 'max_depth': 49, 'min_samples_leaf': 10, 'min_samples_split': 17, 'n_estimators': 271, 'n_iter_no_change': 10, 'subsample': 0.73, 'tol': 0.0001},
    'HistGB': {'learning_rate': 0.23, 'max_iter': 100, 'max_depth': 49, 'min_samples_leaf': 10, 'n_iter_no_change': 10, 'tol': 0.0001},
}

# List of datasets
datasets = {
    'All Features': X_all,
    'Ablated 1': X_dropped_097,
    'Ablated 2': X_dropped_090,
    'Ablated 3': X_dropped_090_mi_filtered_005,
    'Ablated 4': X_dropped_090_mi_filtered_01,
}


# List of models
models = {
    'KNN': KNeighborsClassifier,
    'DT': DecisionTreeClassifier,
    'LR': LogisticRegression,
    'RF': RandomForestClassifier,
    'BalancedRF': BalancedRandomForestClassifier,
    'RUSBoost': RUSBoostClassifier,
    'Ada': AdaBoostClassifier,
    # 'GP': GaussianProcessClassifier,
    'NN': MLPClassifier,
    'XGBoost': XGBClassifier,
    'LightGBM': LGBMClassifier,
    'SVM': SVC,
    # 'GB': GradientBoostingClassifier,
    'HistGB': HistGradientBoostingClassifier
}

## Step 3: Model Training

In [14]:
# Initialize KFold with the number of splits you desire
kf = KFold(n_splits=5)
# Create a dictionary to store final results
final_results = defaultdict(list)
# Create a dictionary to store temporary cross-validation results for each dataset and model
cv_results = defaultdict(lambda: defaultdict(list))


# Iterate over datasets and models
for dataset_name, X in datasets.items():
    print(f"Processing dataset: {dataset_name}")
    # Split data
    X_train = X[train_indices]
    X_val = X[val_indices]
    X_test = X[test_indices]
  
    y_train = Y_encoded[train_indices]
    y_val = Y_encoded[val_indices]
    y_test = Y_encoded[test_indices]

    # Scaling
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)
    
    with open(os.path.join(scaler_path, f'StandardScaler_{dataset_name}.pkl'), 'wb') as file:
        pickle.dump(scaler, file)

    for model_name, Model in models.items():
        print(f"\tTraining model: {model_name}")
        # with wandb.init(project="LW-DED_audio-classification", name=f"{model_name}_{dataset_name}", reinit=True) as run:
            # Log model and hyperparameters
            # wandb.config.update({"Model": model_name, "Hyperparameters": hyperparameters[model_name]})
    
        # Initialize model with the best hyperparameters
        model = Model(**hyperparameters[model_name])

        # If the model is SVM, use only 20% of the training data
        if model_name == 'SVM':
            sample_indices = np.random.choice(len(X_train), int(0.2 * len(X_train)), replace=False)
            X_train_sampled = X_train[sample_indices]
            y_train_sampled = y_train[sample_indices]
            model.fit(X_train_sampled, y_train_sampled)
        else:
            # Train the model
            model.fit(X_train, y_train)

        # Save the model
        dump(model, os.path.join(model_path, f'{model_name}_{dataset_name}.joblib'))


        #####------   cross-validation on the validation dataset--------------
        fold_metrics = defaultdict(list)

        for train_idx, test_idx in kf.split(X_val):
            X_test_train, X_test_val = X_val[train_idx], X_val[test_idx]
            y_test_train, y_test_val = y_val[train_idx], y_val[test_idx]
            
            # Make predictions
            y_test_pred = model.predict(X_test_val)
            
            # Evaluate the model
            acc = accuracy_score(y_test_val, y_test_pred)
            f1 = f1_score(y_test_val, y_test_pred, average='weighted')
            precision = precision_score(y_test_val, y_test_pred, average='weighted')
            recall = recall_score(y_test_val, y_test_pred, average='weighted')
            
            # Optional: ROC-AUC can only be calculated for binary classification or multilabel indicator format
            try:
                roc_auc = roc_auc_score(y_test_val, model.predict_proba(X_test_val), multi_class='ovr', average='weighted')
                
            except:
                roc_auc = 'N/A'
            
            # Store the metrics for this fold
            fold_metrics['Accuracy'].append(acc)
            fold_metrics['F1 Score'].append(f1)
            fold_metrics['Precision'].append(precision)
            fold_metrics['Recall'].append(recall)
            fold_metrics['ROC-AUC'].append(roc_auc if roc_auc != 'N/A' else np.nan)
        
        # Calculate mean and std for each metric and print them
        for metric, values in fold_metrics.items():
            mean_val = np.nanmean(values)  # Handle 'N/A' values
            std_val = np.nanstd(values)    # Handle 'N/A' values
            
            print(f"\t\t{metric}: Mean = {mean_val}, Std = {std_val}")
            # Log the mean and std values to wandb
            # wandb.log({f"{metric}_mean": mean_val, f"{metric}_std": std_val})
            
            cv_results[dataset_name][f"{model_name}_{metric}_mean"] = mean_val
            cv_results[dataset_name][f"{model_name}_{metric}_std"] = std_val
             
            #####------   cross-validation on the validation dataset--------------
            
# Convert the temporary results to final format
for dataset_name, metrics in cv_results.items():
    final_results['Dataset'].append(dataset_name)
    for metric_name, value in metrics.items():
        final_results[metric_name].append(value)

# Convert results to a DataFrame for easier viewing
final_results_df = pd.DataFrame(final_results)

Processing dataset: All Features
	Training model: KNN
		Accuracy: Mean = 0.9384285100901673, Std = 0.008729594668824864
		F1 Score: Mean = 0.937679901515333, Std = 0.008644341069949532
		Precision: Mean = 0.9407192629301366, Std = 0.00652269582163597
		Recall: Mean = 0.9384285100901673, Std = 0.008729594668824864
		ROC-AUC: Mean = 0.9877177246821154, Std = 0.006425017951379703
	Training model: DT
		Accuracy: Mean = 0.9002361528553028, Std = 0.017413519839327315
		F1 Score: Mean = 0.9010709792496563, Std = 0.01731269173619315
		Precision: Mean = 0.9031479509537433, Std = 0.0168371191798254
		Recall: Mean = 0.9002361528553028, Std = 0.017413519839327315
		ROC-AUC: Mean = 0.9321969992360746, Std = 0.013590064720874503
	Training model: LR
		Accuracy: Mean = 0.9457492486045513, Std = 0.010992303218640165
		F1 Score: Mean = 0.9458694067024698, Std = 0.011777909804270877
		Precision: Mean = 0.9469629108865956, Std = 0.012582994761112349
		Recall: Mean = 0.9457492486045513, Std = 0.01099230321

  warn(
  warn(


		Accuracy: Mean = 0.9178939458995277, Std = 0.016226982172402784
		F1 Score: Mean = 0.9239637164409613, Std = 0.012913971829598862
		Precision: Mean = 0.9399382041039301, Std = 0.007256855057819846
		Recall: Mean = 0.9178939458995277, Std = 0.016226982172402784
		ROC-AUC: Mean = 0.9918471924841009, Std = 0.0021879960166243167
	Training model: RUSBoost
		Accuracy: Mean = 0.9047123228853586, Std = 0.009050202195213247
		F1 Score: Mean = 0.9116179456840282, Std = 0.00651269535783291
		Precision: Mean = 0.9311553837143629, Std = 0.00786814881322426
		Recall: Mean = 0.9047123228853586, Std = 0.009050202195213247
		ROC-AUC: Mean = 0.9837908308807523, Std = 0.007761858554775764
	Training model: Ada
		Accuracy: Mean = 0.9281236582224132, Std = 0.015049453978633281
		F1 Score: Mean = 0.9279691055195457, Std = 0.015075313704666674
		Precision: Mean = 0.9299670235612252, Std = 0.01422678405803844
		Recall: Mean = 0.9281236582224132, Std = 0.015049453978633281
		ROC-AUC: Mean = 0.9831122515437976

  warn(
  warn(


		Accuracy: Mean = 0.931097037355088, Std = 0.007354500215509612
		F1 Score: Mean = 0.9355488535030559, Std = 0.0057511250868261575
		Precision: Mean = 0.9484109420201776, Std = 0.006505908302679586
		Recall: Mean = 0.931097037355088, Std = 0.007354500215509612
		ROC-AUC: Mean = 0.9920118633071512, Std = 0.0021016236739619144
	Training model: RUSBoost
		Accuracy: Mean = 0.9032417346500644, Std = 0.006964383564512612
		F1 Score: Mean = 0.9110276058328362, Std = 0.007749641484064299
		Precision: Mean = 0.9334668387114098, Std = 0.01829789786923675
		Recall: Mean = 0.9032417346500644, Std = 0.006964383564512612
		ROC-AUC: Mean = 0.9790575111106146, Std = 0.009737315926572664
	Training model: Ada
		Accuracy: Mean = 0.9296049806784026, Std = 0.022083571483068318
		F1 Score: Mean = 0.9287282893459068, Std = 0.022624847937021086
		Precision: Mean = 0.9293249080295126, Std = 0.023443963037263003
		Recall: Mean = 0.9296049806784026, Std = 0.022083571483068318
		ROC-AUC: Mean = 0.982514233726274

  warn(
  warn(


		Accuracy: Mean = 0.9237548303993129, Std = 0.005853901129941093
		F1 Score: Mean = 0.9285592975970868, Std = 0.005811729442932693
		Precision: Mean = 0.9410192755650769, Std = 0.00827279038376343
		Recall: Mean = 0.9237548303993129, Std = 0.005853901129941093
		ROC-AUC: Mean = 0.990841019990866, Std = 0.0020405928904312738
	Training model: RUSBoost
		Accuracy: Mean = 0.8973164448261055, Std = 0.021980460337187924
		F1 Score: Mean = 0.905771201184925, Std = 0.017558818822074777
		Precision: Mean = 0.9277191064142128, Std = 0.01180848414881276
		Recall: Mean = 0.8973164448261055, Std = 0.021980460337187924
		ROC-AUC: Mean = 0.980523075253078, Std = 0.007652993351289365
	Training model: Ada
		Accuracy: Mean = 0.9281343924431086, Std = 0.01185001569899132
		F1 Score: Mean = 0.9267070205138289, Std = 0.011657590045662969
		Precision: Mean = 0.9274753908767673, Std = 0.01098418650113818
		Recall: Mean = 0.9281343924431086, Std = 0.01185001569899132
		ROC-AUC: Mean = 0.9814997495488769, Std

  warn(
  warn(


		Accuracy: Mean = 0.9208136539287247, Std = 0.005591859341205914
		F1 Score: Mean = 0.9259852373378555, Std = 0.004485828435359682
		Precision: Mean = 0.9390586434935523, Std = 0.007017867904853055
		Recall: Mean = 0.9208136539287247, Std = 0.005591859341205914
		ROC-AUC: Mean = 0.9902932723620055, Std = 0.002184383821330817
	Training model: RUSBoost
		Accuracy: Mean = 0.8943752683555173, Std = 0.022286501613271233
		F1 Score: Mean = 0.9037519470308648, Std = 0.01881958925727255
		Precision: Mean = 0.9294096025573978, Std = 0.015594027745958274
		Recall: Mean = 0.8943752683555173, Std = 0.022286501613271233
		ROC-AUC: Mean = 0.9793672063528931, Std = 0.00990639096849022
	Training model: Ada
		Accuracy: Mean = 0.929572778016316, Std = 0.021223440999975352
		F1 Score: Mean = 0.9289374322713583, Std = 0.020599088678541263
		Precision: Mean = 0.9293732280088272, Std = 0.019890008707579232
		Recall: Mean = 0.929572778016316, Std = 0.021223440999975352
		ROC-AUC: Mean = 0.980793515518117, S

  warn(
  warn(


		Accuracy: Mean = 0.9208243881494204, Std = 0.0028354195573609413
		F1 Score: Mean = 0.926160124257532, Std = 0.0033022247436882883
		Precision: Mean = 0.9397944318706042, Std = 0.007909299263847215
		Recall: Mean = 0.9208243881494204, Std = 0.0028354195573609413
		ROC-AUC: Mean = 0.9901424563932464, Std = 0.002062240950207673
	Training model: RUSBoost
		Accuracy: Mean = 0.9017067410905968, Std = 0.022770255194293577
		F1 Score: Mean = 0.9090780734647288, Std = 0.019248349372349052
		Precision: Mean = 0.9275511945132987, Std = 0.009594615913804535
		Recall: Mean = 0.9017067410905968, Std = 0.022770255194293577
		ROC-AUC: Mean = 0.9797339491968037, Std = 0.009720668685821011
	Training model: Ada
		Accuracy: Mean = 0.9310433662516102, Std = 0.019607469655946255
		F1 Score: Mean = 0.9312765394567117, Std = 0.018766688767584385
		Precision: Mean = 0.9333914805915413, Std = 0.016605501956776306
		Recall: Mean = 0.9310433662516102, Std = 0.019607469655946255
		ROC-AUC: Mean = 0.979574158690

In [19]:
%store datasets
%store le
%store Y_encoded

Stored 'datasets' (dict)
Stored 'le' (LabelEncoder)
Stored 'Y_encoded' (ndarray)
