# Modellierung der MPA-Sensordaten

In [2]:
!pip install librosa
!pip install catboost
!pip install optuna

Collecting librosa
  Downloading librosa-0.8.1-py3-none-any.whl (203 kB)
[K     |████████████████████████████████| 203 kB 7.4 MB/s eta 0:00:01
Collecting soundfile>=0.10.2
  Downloading SoundFile-0.10.3.post1-py2.py3-none-any.whl (21 kB)
Collecting audioread>=2.0.0
  Downloading audioread-2.1.9.tar.gz (377 kB)
[K     |████████████████████████████████| 377 kB 38.3 MB/s eta 0:00:01
Collecting resampy>=0.2.2
  Downloading resampy-0.2.2.tar.gz (323 kB)
[K     |████████████████████████████████| 323 kB 98.5 MB/s eta 0:00:01
Building wheels for collected packages: audioread, resampy
  Building wheel for audioread (setup.py) ... [?25ldone
[?25h  Created wheel for audioread: filename=audioread-2.1.9-py3-none-any.whl size=23154 sha256=5b8070368571885d10e7233fc52b8ef2f8230f94a25dedbf386466ee8e3814fa
  Stored in directory: /home/jovyan/.cache/pip/wheels/a2/a3/bd/ec1568ce7515115a11ab686d509ad302124c782af065de47ee
  Building wheel for resampy (setup.py) ... [?25ldone
[?25h  Created wheel for 

In [84]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle
import sys
sys.path.append('..')
import os
import yaml
import datetime as dt
from multiprocessing import Pool
from feature_extract import extract_highest_amplitude_features_with_mp, get_all_sensors_in_df, feature_extractor_wrapper
from plot import plot_residuals, plot_error_per_cat
from catboost import CatBoostRegressor
import optuna

from sklearn.metrics import r2_score, mean_absolute_percentage_error
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer

from IPython.display import HTML
pd.set_option("display.max_columns", 100)
import plotly.express as px

# Helper Functions

In [85]:
# Preprocessing functions
def add_outlier_feature_length(df: pd.DataFrame) -> pd.DataFrame:
    """Adds new Feature declaring outliers"""
    # Calculate prediction for outlier
    max_len_col = [col for col in df.columns if 'max_len' in col][0]
    length_transformed = np.log(df[max_len_col])
    length_transformed_std = (length_transformed - np.mean(length_transformed)) / np.std(length_transformed)
    outlier_prediction = length_transformed_std > 3
    
    # add to df
    df['outlier'] = outlier_prediction
    
    return df

def concat_all_studies(studies: list, as_dataframe: bool =True) -> pd.DataFrame:
    """Concatenates all optuna studies"""
    
    if as_dataframe:
        df = pd.concat([study.trials_dataframe() for study in studies], axis=0)
        df = df.sort_values(by='value', ascending=False)
        return df
    else:
        all_studies = studies[0]
        for s in studies[1:]:
            all_studies.add_trials(s.get_trials())
        return all_studies

def load_studies_from_folder(folder_name: str, device_name: str) -> list:
    """Loads studies from a given folder with Device Name"""
    # Load study
    files = [f for f in os.listdir(folder_name) if device_name in f]
    studies= []
    for i, file in enumerate(files):
        filename = folder_name + file 
        with open(filename, 'rb') as pkl_file:
            study_pkl = pickle.load(pkl_file)
            studies.append(study_pkl)
    
    return studies

def plot_parallel_cordinates(df: pd.DataFrame, objective_maximize=True, columns=None):
    """Plots Opt. History as Parallel Cordinates"""
    if columns:
        columns.append('number')
        columns.append('value')
        df = df[columns]
        
    cols = [col.split('_') for col in df.columns]
    df.columns = ['_'.join(col[1:])  if len(col) > 1 else col[0] for col in cols] 
    
    if objective_maximize:
        fig = px.parallel_coordinates(data_frame=df.drop('number', axis=1), color='value', color_continuous_scale='Blues', 
                                      height=500, width=1500)
    else:
        fig = px.parallel_coordinates(data_frame=df.drop('number', axis=1), color='value', color_continuous_scale='Blues_r', 
                                      height=500, width=1500)
    return fig

# Hyperparam Optimization CatBoost

In [171]:
def objective_r2_simple(trial):
    X_transformed = X_train.copy()
    
    # Pre-Processing
    ## Polynomial Features
    X_transformed = add_outlier_feature_length(X_transformed)
    
    ## Feature Transformation to normalize data
    transformer = PowerTransformer(standardize=True)
    X_transformed = transformer.fit_transform(X_train)
        
    param = {
        'iterations': trial.suggest_int('iterations', 50, 5000),
        'loss_function': trial.suggest_categorical('loss_function', ['RMSE', 'MAPE']),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 0.1),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-2, 1e0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.01, 1),
        'depth': trial.suggest_int('depth', 1, 10),
        'boosting_type': trial.suggest_categorical('boosting_type', ['Ordered', 'Plain']),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 2, 20),
        'one_hot_max_size': trial.suggest_int('one_hot_max_size', 2, 20), 
        'silent': True
    }
    # Conditional Hyper-Parameters
    if param['bootstrap_type'] == 'Bayesian':
        param['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 10)
    elif param['bootstrap_type'] == 'Bernoulli':
        param['subsample'] = trial.suggest_float('subsample', 0.1, 1)

    reg = CatBoostRegressor(**param, thread_count=10)
    scores = cross_val_score(estimator=reg, X=X_train,
                            y=y_train, cv=5, n_jobs=5)    
 
    return np.mean(scores)

In [173]:
def objective_r2_complex(trial):
    X_transformed = X_train.copy()
    
    # Pre-Processing
    ## Polynomial Features
    X_transformed = add_outlier_feature_length(X_transformed)
    
    ## Feature Transformation to normalize data
    transformer = PowerTransformer(standardize=True)
    X_transformed = transformer.fit_transform(X_train)
        
    param = {
        'iterations': trial.suggest_int('iterations', 50, 5000),
        'loss_function': trial.suggest_categorical('loss_function', ['RMSE', 'MAPE']),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 0.1),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-2, 1e0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.01, 1),
        'depth': trial.suggest_int('depth', 1, 10),
        'boosting_type': trial.suggest_categorical('boosting_type', ['Ordered', 'Plain']),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 2, 20),
        'one_hot_max_size': trial.suggest_int('one_hot_max_size', 2, 20), 
        'silent': True
    }
    # Conditional Hyper-Parameters
    if param['bootstrap_type'] == 'Bayesian':
        param['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 10)
    elif param['bootstrap_type'] == 'Bernoulli':
        param['subsample'] = trial.suggest_float('subsample', 0.1, 1)

    reg = CatBoostRegressor(**param, thread_count=10)
    scores = cross_val_score(estimator=reg, X=X_train,
                            y=y_train, cv=5, n_jobs=5)    
 
    return np.mean(scores)

In [175]:
# './data/data_mpa.txt', './data/data_spg.txt', './data/data_sps.txt'
config = dict(
    FILEPATHS = ['./data/archive/extract_mpa.csv', './data/archive/extract_spg.csv', 
                 './data/archive/extract_sps.csv'],
    TUNING_ITER = 2,
    N_TRIALS = 50,
    DROP_COLUMNS = ['velocity', 'start_time', 'packnr'],
    LOG_SCALE_TARGET = False,
    MODEL_NAME = 'CAT',
    SAVE_DIR = './results/final2/complex/' , # Needs to end with '/' like './results/final/'
    EXTRACT_MAX_FEATURES = True,
    EXTRACT_MEAN_FEATURES = True,
    DEBUG_RUN = False,
    DATE_FROM = 2020,  # selcts Dates >= DATE_FROM
)

try:
    save_dir = config['SAVE_DIR'].split('/')[1:]
    check_dir = None
    for sdir in save_dir:
        if check_dir:
            check_dir = check_dir + '/' + sdir
        else:
            check_dir = sdir
        try:
            os.mkdir(check_dir)
        except FileExistsError as fe:
            print(fe, 'Skipping iteration')
except:
    os.mkdir(config['SAVE_DIR'])

# Save Parametr-configs of file
if 'config.yaml' in os.listdir(config['SAVE_DIR']):
    print(Exception('Config already exists!'))
    if input('Want Overwrite Existing? (y/n)')=='y':
        with open(config['SAVE_DIR'] + 'config.yaml', 'w') as yaml_file:
            yaml.dump(config, yaml_file)
else:
    with open(config['SAVE_DIR'] + 'config.yaml', 'w') as yaml_file:
        yaml.dump(config, yaml_file)
        
for path in config['FILEPATHS']:
    
    device_name = path.split('_')[-1].split('.')[0]

    print(10*'=', f'Starting Study for {device_name}', 10*'=')
    # Read processed data
    if config['DEBUG_RUN']:
        data = pd.read_table(path, sep=' ', nrows=1000)
    else:
        data = pd.read_table(path, sep=' ')
        
    # Resample data
    data = data.sample(frac=1, random_state=42).reset_index(drop=True)        
    
    # Extract Features
    #data = feature_extractor_wrapper(df=data, extract_max_features=config['EXTRACT_MAX_FEATURES'],
    #                                 extract_mean_features=config['EXTRACT_MEAN_FEATURES'], n_processes=6)
    #Filtering data
    if config['DATE_FROM'] and not config['DEBUG_RUN']:
        data['start_time'] = pd.to_datetime(data['start_time'])
        data = data.loc[data['start_time'].dt.year >= config['DATE_FROM']]
            
    # Splitting of Data
    print('INFO | Split Data X, y ...')
    feature_cols = data.columns.to_list()
    feature_cols.remove('size_mm')
    for col in config['DROP_COLUMNS']:
        feature_cols.remove(col)
    
    # Split row wise and Target-wise
    X_train, y_train = data[feature_cols], data['size_mm']

    print('INFO | Train-Test Split ...')
    if config['LOG_SCALE_TARGET']:
        y_train = np.log(y_train)

    print('INFO | Tune Model ...')
    for i in range(config['TUNING_ITER']):
        if config['DEBUG_RUN']:
            study = optuna.create_study(direction="maximize")
            study.optimize(objective_r2_complex, n_trials=0, n_jobs=1, timeout=5)
            break
        else:
            study = optuna.create_study(direction="maximize")
            study.optimize(objective_r2_complex, n_trials=config['N_TRIALS'], n_jobs=1)
            # Save best params
            study_name = '_'.join([config['MODEL_NAME'], device_name, str(i)])

            with open(f'{config["SAVE_DIR"]}{study_name}.pkl', 'wb') as pkl_file:
                pickle.dump(study, pkl_file)
    if config['DEBUG_RUN']:
        break
print(30*'=', ' Process Finished ', 30*'=')

[Errno 17] File exists: 'results' Skipping iteration
[Errno 17] File exists: 'results/final2' Skipping iteration
[Errno 17] File exists: 'results/final2/complex' Skipping iteration
[Errno 17] File exists: 'results/final2/complex/' Skipping iteration
Config already exists!


Want Overwrite Existing? (y/n) y




[32m[I 2021-12-27 19:09:04,771][0m A new study created in memory with name: no-name-b5f3a5ba-7854-4fc6-87c4-3ce16de21367[0m


INFO | Split Data X, y ...
INFO | Train-Test Split ...
INFO | Tune Model ...


[32m[I 2021-12-27 19:09:35,631][0m Trial 0 finished with value: -0.12458112264255106 and parameters: {'iterations': 4880, 'loss_function': 'MAPE', 'learning_rate': 0.0002850586928781377, 'l2_leaf_reg': 0.03620228340222952, 'colsample_bylevel': 0.9790955215295143, 'depth': 5, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'min_data_in_leaf': 10, 'one_hot_max_size': 10, 'bagging_temperature': 0.7195670650986574}. Best is trial 0 with value: -0.12458112264255106.[0m
[32m[I 2021-12-27 19:09:42,599][0m Trial 1 finished with value: -0.4002340876735868 and parameters: {'iterations': 1215, 'loss_function': 'MAPE', 'learning_rate': 7.535753878823557e-05, 'l2_leaf_reg': 0.16653906028766025, 'colsample_bylevel': 0.6523418389683497, 'depth': 3, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'min_data_in_leaf': 4, 'one_hot_max_size': 8, 'subsample': 0.7694735724598339}. Best is trial 0 with value: -0.12458112264255106.[0m
[32m[I 2021-12-27 19:11:13,366][0m Trial 2 finishe



[32m[I 2021-12-27 21:03:26,833][0m A new study created in memory with name: no-name-b95ac217-bd9d-4201-946e-9057b2321280[0m


INFO | Split Data X, y ...
INFO | Train-Test Split ...
INFO | Tune Model ...


[32m[I 2021-12-27 21:03:51,543][0m Trial 0 finished with value: 0.45476443737227923 and parameters: {'iterations': 2190, 'loss_function': 'RMSE', 'learning_rate': 0.00022946280125423423, 'l2_leaf_reg': 0.49196968284507736, 'colsample_bylevel': 0.4154404115223317, 'depth': 8, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'min_data_in_leaf': 15, 'one_hot_max_size': 6, 'bagging_temperature': 4.812621889416755}. Best is trial 0 with value: 0.45476443737227923.[0m
[32m[I 2021-12-27 21:03:57,086][0m Trial 1 finished with value: -0.6657688257816231 and parameters: {'iterations': 1565, 'loss_function': 'MAPE', 'learning_rate': 0.00029874343563459147, 'l2_leaf_reg': 0.03451096938326208, 'colsample_bylevel': 0.3643737384630846, 'depth': 4, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'min_data_in_leaf': 2, 'one_hot_max_size': 14, 'bagging_temperature': 4.216836045132673}. Best is trial 0 with value: 0.45476443737227923.[0m
[32m[I 2021-12-27 21:04:11,299][0m Trial 2 f



[32m[I 2021-12-27 23:01:13,147][0m A new study created in memory with name: no-name-788b9e74-5d53-4bec-bac2-ba65d3f11f5b[0m


INFO | Split Data X, y ...
INFO | Train-Test Split ...
INFO | Tune Model ...


[32m[I 2021-12-27 23:01:46,489][0m Trial 0 finished with value: 0.8062781520169295 and parameters: {'iterations': 3296, 'loss_function': 'RMSE', 'learning_rate': 0.0017340802779383976, 'l2_leaf_reg': 0.14630867930403296, 'colsample_bylevel': 0.5025160124418568, 'depth': 7, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'min_data_in_leaf': 13, 'one_hot_max_size': 15, 'bagging_temperature': 0.10759932817057338}. Best is trial 0 with value: 0.8062781520169295.[0m
[32m[I 2021-12-27 23:01:54,485][0m Trial 1 finished with value: -0.5710435002102219 and parameters: {'iterations': 4305, 'loss_function': 'MAPE', 'learning_rate': 0.0002410240981519678, 'l2_leaf_reg': 0.26859073834910546, 'colsample_bylevel': 0.8266015483644044, 'depth': 1, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'min_data_in_leaf': 17, 'one_hot_max_size': 14, 'bagging_temperature': 6.059981433366564}. Best is trial 0 with value: 0.8062781520169295.[0m
[32m[I 2021-12-27 23:03:05,416][0m Trial 2 fi



# Investigate Hyperparameters

In [None]:
from IPython.display import HTML
pd.set_option("display.max_columns", 100)
import plotly.express as px
from plot import plot_residuals, plot_error_per_cat

In [None]:
folder_name ='./results/final/final/'

In [None]:
with open(folder_name + 'config.yaml', 'r') as yaml_file:
    configs = yaml.load(yaml_file)
configs

In [None]:
studies = load_studies_from_folder(folder_name=folder_name, device_name='mpa')
df = concat_all_studies_df(studies)
df = df.sort_values(by='value', ascending=False)

In [None]:
df.head(10)

In [None]:
fig = plot_parallel_cordinates(df, objective_maximize=True)
HTML(fig.to_html())

# Train Final Model (Complex)

## Create Holdout Set for each System

In [169]:
files = [f for f in os.listdir('data') if '.csv' in f]
files

['extract_mpa.csv']

In [170]:
for fp in files:
    data = pd.read_table('./data/'+fp, sep=' ')
    X, y = train_test_split(data, test_size=.1)
    X.to_csv('./data/split_'+ fp, index=False, sep=' ')
    y.to_csv('./data/holdout/test_'+ fp, index=False, sep=' ')

## Helper Funcs

In [176]:
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import PowerTransformer

In [318]:
class OutlierFeatureAdder(TransformerMixin, BaseEstimator):
    """Wrapper Class for Sklearn Pipeline"""
    def __init__(self):
        """Adds Outlier feature column"""
        super().__init__()
        self.mean = None
        self.std = None

    def fit(self, X: pd.DataFrame, y=None):
        _ = self.add_outlier_feature_length(X)

        return self

    def fit_transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        self.fit(X)
        X = self.transform(X)

        return X

    def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        X = X.copy()
        X.loc[:, ('outlier')] = self.add_outlier_feature_length(X)

        return X

        # Preprocessing functions

    def add_outlier_feature_length(self, df: pd.DataFrame) -> pd.DataFrame:
        """Adds new Feature declaring outliers"""
        # Calculate prediction for outlier
        max_len_col = [col for col in df.columns if 'max_len' in col][0]
        length_transformed = np.log(df[max_len_col])
        if not self.mean:
            self.mean = np.mean(length_transformed)
            self.std = np.std(length_transformed)

        length_transformed = (length_transformed - self.mean) / self.std
        outlier_prediction = length_transformed > 3

        return outlier_prediction.to_numpy()

In [181]:
with open('./results/final2/complex/config.yaml', 'r') as yaml_file:
    configs = yaml.load(yaml_file, Loader=yaml.FullLoader)
configs

{'DATE_FROM': 2020,
 'DEBUG_RUN': False,
 'DROP_COLUMNS': ['velocity', 'start_time', 'packnr'],
 'EXTRACT_MAX_FEATURES': True,
 'EXTRACT_MEAN_FEATURES': True,
 'FILEPATHS': ['./data/archive/extract_mpa.csv',
  './data/archive/extract_spg.csv',
  './data/archive/extract_sps.csv'],
 'LOG_SCALE_TARGET': False,
 'MODEL_NAME': 'CAT',
 'N_TRIALS': 50,
 'SAVE_DIR': './results/final2/complex/',
 'TUNING_ITER': 2}

In [426]:
def preprocess_data(data, datefrom=None):
    #Filtering data
    if datefrom:
        data['start_time'] = pd.to_datetime(data['start_time'])
        data = data.loc[data['start_time'].dt.year >= configs['DATE_FROM']]

    # Split row wise and Target-wise
    feature_columns = data.columns.to_list()
    drop_columns = ['start_time', 'velocity', 'packnr', 'size_mm']
    for col in drop_columns:
        feature_columns.remove(col)
    X, y = data[feature_columns], data['size_mm']
    
    return X, y

## MPA

In [427]:
# Load best study
studies = load_studies_from_folder(folder_name='./results/final2/complex/', device_name='mpa')
all_studies = concat_all_studies(studies=studies, as_dataframe=False)
df = concat_all_studies(studies=studies, as_dataframe=True)

In [276]:
df.head(2)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_bagging_temperature,params_boosting_type,params_bootstrap_type,params_colsample_bylevel,params_depth,params_iterations,params_l2_leaf_reg,params_learning_rate,params_loss_function,params_min_data_in_leaf,params_one_hot_max_size,params_subsample,state
83,83,0.813759,2021-12-27 20:42:21.189549,2021-12-27 20:44:20.907061,0 days 00:01:59.717512,,Plain,MVS,0.793724,9,4395,0.032752,0.012236,RMSE,8,14,,COMPLETE
33,33,0.813759,2021-12-27 20:42:21.189549,2021-12-27 20:44:20.907061,0 days 00:01:59.717512,,Plain,MVS,0.793724,9,4395,0.032752,0.012236,RMSE,8,14,,COMPLETE


In [428]:
train = pd.read_table('./data/train/split_extract_mpa.csv', sep=' ')
holdout = pd.read_table('./data/holdout/test_extract_mpa.csv', sep=' ')
all_data = pd.read_table('./data/archive/extract_mpa.csv', sep=' ')

In [429]:
X_train, y_train = preprocess_data(train, datefrom=configs['DATE_FROM'])
X_test, y_test = preprocess_data(holdout, datefrom=configs['DATE_FROM'])
X, y = preprocess_data(all_data, datefrom=configs['DATE_FROM'])

In [431]:
print(X.columns.to_list())

['max_centroid_frequency_M', 'max_centroid_frequency2_M', 'max_median_freq_M', 'max_flash_ind_M', 'max_cv_M', 'max_iqa_M', 'max_mab_M', 'max_imp_M', 'max_len_M', 'max_wavelet_mean_approx_M', 'max_wavelet_med_approx_M', 'max_wavelet_var_approx_M', 'max_wavelet_mean_coef_M', 'max_wavelet_med_coef_M', 'max_wavelet_var_coef_M', 'max_pack_sum_M', 'max_pack_kurtosis_M', 'max_pack_skew_M', 'max_zcr_M', 'max_rmse_M', 'max_spectral_flatness_M', 'max_spectral_rolloff_M', 'mean_wavelet_med_approx_M', 'mean_pack_kurtosis_M', 'mean_imp_M', 'mean_pack_sum_M', 'mean_len_M', 'mean_wavelet_var_approx_M', 'mean_spectral_rolloff_M', 'mean_cv_M', 'mean_wavelet_mean_approx_M', 'mean_mab_M', 'mean_rmse_M', 'mean_wavelet_mean_coef_M', 'mean_wavelet_med_coef_M', 'mean_wavelet_var_coef_M', 'mean_centroid_frequency_M', 'mean_zcr_M', 'mean_spectral_flatness_M', 'mean_flash_ind_M', 'mean_median_freq_M', 'mean_iqa_M', 'mean_pack_skew_M', 'mean_centroid_frequency2_M']


In [280]:
X.head(2)

Unnamed: 0,max_centroid_frequency_M,max_centroid_frequency2_M,max_median_freq_M,max_flash_ind_M,max_cv_M,max_iqa_M,max_mab_M,max_imp_M,max_len_M,max_wavelet_mean_approx_M,max_wavelet_med_approx_M,max_wavelet_var_approx_M,max_wavelet_mean_coef_M,max_wavelet_med_coef_M,max_wavelet_var_coef_M,max_pack_sum_M,max_pack_kurtosis_M,max_pack_skew_M,max_zcr_M,max_rmse_M,max_spectral_flatness_M,max_spectral_rolloff_M,mean_wavelet_med_approx_M,mean_pack_kurtosis_M,mean_imp_M,mean_pack_sum_M,mean_len_M,mean_wavelet_var_approx_M,mean_spectral_rolloff_M,mean_cv_M,mean_wavelet_mean_approx_M,mean_mab_M,mean_rmse_M,mean_wavelet_mean_coef_M,mean_wavelet_med_coef_M,mean_wavelet_var_coef_M,mean_centroid_frequency_M,mean_zcr_M,mean_spectral_flatness_M,mean_flash_ind_M,mean_median_freq_M,mean_iqa_M,mean_pack_skew_M,mean_centroid_frequency2_M
46123,3497.695364,3651.198231,3536.219986,0.001092,0.862842,0.001232,2.2195,6,50,0.010757,0.004717,0.060872,-0.048754,0.016504,0.429507,12.338051,9.746077,0.611091,0.4,0.496423,1.1e-05,9894.506836,0.000976,4.242255,1.5,3.335155,50.0,0.015228,10021.014404,1.07004,0.002651,0.575967,0.131418,-0.014385,0.004007,0.107506,3583.433065,0.36,1.7e-05,0.002151,3925.300634,0.000308,0.295696,4039.261582
46124,3199.658992,3221.798014,3199.297511,0.00153,1.0512,0.000913,1.717626,6,52,-0.000927,0.002873,0.136028,0.02348,-0.026114,0.214632,11.093694,6.678718,-1.344587,0.365385,0.419054,5e-06,8860.913086,0.001026,2.529704,1.5,2.892688,52.0,0.034012,9751.849365,0.781672,-0.000175,0.439087,0.10791,0.006889,-0.00577,0.053677,3052.273182,0.350962,2.4e-05,0.003199,3212.107735,0.000228,-0.260954,3390.56201


In [281]:
all_studies.best_trial.value

0.8137588385807424

In [282]:
params = all_studies.best_trial
model_params = params.params
model_params

{'iterations': 4395,
 'loss_function': 'RMSE',
 'learning_rate': 0.012236384160613935,
 'l2_leaf_reg': 0.03275174155986356,
 'colsample_bylevel': 0.7937237938482521,
 'depth': 9,
 'boosting_type': 'Plain',
 'bootstrap_type': 'MVS',
 'min_data_in_leaf': 8,
 'one_hot_max_size': 14}

In [283]:
catboost_mpa_pipe = Pipeline([('outlier_feature', OutlierFeatureAdder()),
                             ('power_transform', PowerTransformer()), 
                             ('cat_boost', CatBoostRegressor(**model_params, silent=True))])

In [284]:
scores = cross_val_score(estimator=catboost_mpa_pipe, X=X_train, y=y_train, cv=10, n_jobs=10)

In [285]:
print('All Scores: ', scores)

All Scores:  [0.81655299 0.82897528 0.82991611 0.80818425 0.79688837 0.80734319
 0.80823731 0.80959279 0.81670264 0.81544138]


In [286]:
print('Mean Score: ', np.mean(scores))

Mean Score:  0.8137834306214307


In [287]:
catboost_mpa_pipe.fit(X_train, y_train)

Pipeline(steps=[('outlier_feature', OutlierFeatureAdder()),
                ('power_transform', PowerTransformer()),
                ('cat_boost',
                 <catboost.core.CatBoostRegressor object at 0x7f4ad8836730>)])

In [288]:
y_pred = catboost_mpa_pipe.predict(X_test)

In [289]:
print('MAPE:', mean_absolute_percentage_error(y_test, y_pred))

MAPE: 0.248280260926552


In [290]:
print('R2-Score:', r2_score(y_test, y_pred))

R2-Score: 0.8035975791350025


In [291]:
pred_error = pd.DataFrame(data={'y_pred':y_pred, 'y_test':y_test})
pred_error = pd.concat([holdout[['start_time', 'velocity', 'packnr', 'size_mm']], pred_error], axis=1)
pred_error.to_csv('./data/error/mpa/complex.csv', sep=' ', index=False)

In [292]:
catboost_mpa_pipe.fit(X, y)

Pipeline(steps=[('outlier_feature', OutlierFeatureAdder()),
                ('power_transform', PowerTransformer()),
                ('cat_boost',
                 <catboost.core.CatBoostRegressor object at 0x7f4ad8836730>)])

In [293]:
catboost_mpa_pipe.score(X, y)

0.9523600178624564

In [294]:
with open('./models/mpa/mpa_catboost_complex.pkl', 'wb') as pkl_file:
    pickle.dump(obj=catboost_mpa_pipe, file=pkl_file)

## SPG

In [432]:
# Load best study
studies = load_studies_from_folder(folder_name='./results/final2/complex/', device_name='spg')
all_studies = concat_all_studies(studies=studies, as_dataframe=False)
df = concat_all_studies(studies=studies, as_dataframe=True)

In [433]:
df.head(2)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_bagging_temperature,params_boosting_type,params_bootstrap_type,params_colsample_bylevel,params_depth,params_iterations,params_l2_leaf_reg,params_learning_rate,params_loss_function,params_min_data_in_leaf,params_one_hot_max_size,params_subsample,state
32,32,0.795274,2021-12-27 21:28:30.906806,2021-12-27 21:29:33.931640,0 days 00:01:03.024834,,Plain,Bernoulli,0.234813,9,4838,0.039844,0.022214,RMSE,20,7,0.7078,COMPLETE
43,43,0.795083,2021-12-27 21:35:00.894589,2021-12-27 21:35:49.822653,0 days 00:00:48.928064,,Plain,Bernoulli,0.152274,9,4621,0.079772,0.008476,RMSE,19,8,0.514965,COMPLETE


In [434]:
train = pd.read_table('./data/train/split_extract_spg.csv', sep=' ')
holdout = pd.read_table('./data/holdout/test_extract_spg.csv', sep=' ')
all_data = pd.read_table('./data/archive/extract_spg.csv', sep=' ')

In [436]:
X_train, y_train = preprocess_data(train, datefrom=configs['DATE_FROM'])
X_test, y_test = preprocess_data(holdout, datefrom=configs['DATE_FROM'])
X, y = preprocess_data(all_data, datefrom=configs['DATE_FROM'])

In [437]:
print(X.columns.to_list())

['max_centroid_frequency_G', 'max_centroid_frequency2_G', 'max_median_freq_G', 'max_flash_ind_G', 'max_cv_G', 'max_iqa_G', 'max_mab_G', 'max_imp_G', 'max_len_G', 'max_wavelet_mean_approx_G', 'max_wavelet_med_approx_G', 'max_wavelet_var_approx_G', 'max_wavelet_mean_coef_G', 'max_wavelet_med_coef_G', 'max_wavelet_var_coef_G', 'max_pack_sum_G', 'max_pack_kurtosis_G', 'max_pack_skew_G', 'max_zcr_G', 'max_rmse_G', 'max_spectral_flatness_G', 'max_spectral_rolloff_G', 'mean_imp_G', 'mean_wavelet_var_coef_G', 'mean_rmse_G', 'mean_iqa_G', 'mean_wavelet_med_approx_G', 'mean_pack_sum_G', 'mean_spectral_rolloff_G', 'mean_zcr_G', 'mean_pack_kurtosis_G', 'mean_centroid_frequency_G', 'mean_median_freq_G', 'mean_len_G', 'mean_cv_G', 'mean_flash_ind_G', 'mean_spectral_flatness_G', 'mean_centroid_frequency2_G', 'mean_wavelet_med_coef_G', 'mean_pack_skew_G', 'mean_wavelet_var_approx_G', 'mean_wavelet_mean_coef_G', 'mean_mab_G', 'mean_wavelet_mean_approx_G']


In [325]:
X.head(2)

Unnamed: 0,max_centroid_frequency_G,max_centroid_frequency2_G,max_median_freq_G,max_flash_ind_G,max_cv_G,max_iqa_G,max_mab_G,max_imp_G,max_len_G,max_wavelet_mean_approx_G,max_wavelet_med_approx_G,max_wavelet_var_approx_G,max_wavelet_mean_coef_G,max_wavelet_med_coef_G,max_wavelet_var_coef_G,max_pack_sum_G,max_pack_kurtosis_G,max_pack_skew_G,max_zcr_G,max_rmse_G,max_spectral_flatness_G,max_spectral_rolloff_G,mean_imp_G,mean_wavelet_var_coef_G,mean_rmse_G,mean_iqa_G,mean_wavelet_med_approx_G,mean_pack_sum_G,mean_spectral_rolloff_G,mean_zcr_G,mean_pack_kurtosis_G,mean_centroid_frequency_G,mean_median_freq_G,mean_len_G,mean_cv_G,mean_flash_ind_G,mean_spectral_flatness_G,mean_centroid_frequency2_G,mean_wavelet_med_coef_G,mean_pack_skew_G,mean_wavelet_var_approx_G,mean_wavelet_mean_coef_G,mean_mab_G,mean_wavelet_mean_approx_G
30126,1966.146784,1977.787415,2052.54952,0.003117,1.118455,2.939599e-07,0.021001,0,48,-0.000291,5.8e-05,3e-06,5.8e-05,0.000232,1e-06,0.053677,0.057112,-0.021368,0.166667,0.001522,1e-05,5620.166016,0.0,6.4e-05,0.00729,1.492359e-07,-0.000756,0.256417,5738.598633,0.166667,0.377448,1707.673669,1595.989783,48.0,1.089652,0.003773,7e-06,1384.447053,0.000291,-0.007381,0.000108,5.8e-05,0.011508,-0.000591
30127,456.02908,323.956298,334.713115,0.012305,3.281048,1.906679e-06,0.024618,2,82,-0.002979,-0.003778,0.000451,-5.1e-05,0.0,5e-06,1.101781,-1.244783,0.279877,0.02439,0.015249,2e-06,1227.392578,1.0,3e-06,0.012867,1.404183e-06,-0.002965,0.904834,1227.392578,0.030488,-1.048765,498.960138,340.760367,82.0,3.099531,0.010059,2e-06,326.830427,0.000116,0.176797,0.000333,3.1e-05,0.022358,-0.002494


In [326]:
all_studies.best_trial.value

0.7952740153746174

In [327]:
params = all_studies.best_trial
model_params = params.params
model_params

{'iterations': 4838,
 'loss_function': 'RMSE',
 'learning_rate': 0.022213705233803352,
 'l2_leaf_reg': 0.03984439188320381,
 'colsample_bylevel': 0.23481282445179058,
 'depth': 9,
 'boosting_type': 'Plain',
 'bootstrap_type': 'Bernoulli',
 'min_data_in_leaf': 20,
 'one_hot_max_size': 7,
 'subsample': 0.7078003943966793}

In [328]:
catboost_pipe = Pipeline([('outlier_feature', OutlierFeatureAdder()),
                          ('power_transform', PowerTransformer()), 
                          ('cat_boost', CatBoostRegressor(**model_params, silent=True))])

In [329]:
scores = cross_val_score(estimator=catboost_pipe, X=X_train, y=y_train, cv=10, n_jobs=10)

In [330]:
print('All Scores: ', scores)

All Scores:  [0.78967431 0.79224348 0.79496507 0.80813139 0.77893316 0.78234629
 0.81100033 0.80284354 0.79951643 0.80340736]


In [331]:
print('Mean Score: ', np.mean(scores))

Mean Score:  0.7963061357784622


In [332]:
catboost_pipe.fit(X_train, y_train)

Pipeline(steps=[('outlier_feature', OutlierFeatureAdder()),
                ('power_transform', PowerTransformer()),
                ('cat_boost',
                 <catboost.core.CatBoostRegressor object at 0x7f4aad001c40>)])

In [333]:
y_pred = catboost_pipe.predict(X_test)

In [334]:
print('MAPE:', mean_absolute_percentage_error(y_test, y_pred))

MAPE: 0.20466053996270728


In [335]:
print('R2-Score:', r2_score(y_test, y_pred))

R2-Score: 0.7821742591467037


In [336]:
pred_error = pd.DataFrame(data={'y_pred':y_pred, 'y_test':y_test})
pred_error = pd.concat([holdout[['start_time', 'velocity', 'packnr', 'size_mm']], pred_error], axis=1)
pred_error.to_csv('./data/error/spg/complex.csv', sep=' ', index=False)

In [337]:
catboost_pipe.fit(X, y)

Pipeline(steps=[('outlier_feature', OutlierFeatureAdder()),
                ('power_transform', PowerTransformer()),
                ('cat_boost',
                 <catboost.core.CatBoostRegressor object at 0x7f4aad001c40>)])

In [338]:
catboost_pipe.score(X, y)

0.9879624987295088

In [340]:
with open('./models/spg/spg_catboost_complex.pkl', 'wb') as pkl_file:
    pickle.dump(obj=catboost_pipe, file=pkl_file)

## SPS (Achtung Features von Allen Sensoren extrahieren!)

In [438]:
# Load best study
studies = load_studies_from_folder(folder_name='./results/final2/complex/', device_name='sps')
all_studies = concat_all_studies(studies=studies, as_dataframe=False)
df = concat_all_studies(studies=studies, as_dataframe=True)

In [439]:
df.head(2)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_bagging_temperature,params_boosting_type,params_bootstrap_type,params_colsample_bylevel,params_depth,params_iterations,params_l2_leaf_reg,params_learning_rate,params_loss_function,params_min_data_in_leaf,params_one_hot_max_size,params_subsample,state
88,88,0.830486,2021-12-27 23:28:00.915552,2021-12-27 23:30:21.304239,0 days 00:02:20.388687,,Ordered,Bernoulli,0.765491,6,3726,0.050225,0.030044,RMSE,11,10,0.83545,COMPLETE
38,38,0.830486,2021-12-27 23:28:00.915552,2021-12-27 23:30:21.304239,0 days 00:02:20.388687,,Ordered,Bernoulli,0.765491,6,3726,0.050225,0.030044,RMSE,11,10,0.83545,COMPLETE


In [440]:
train = pd.read_table('./data/train/split_extract_sps.csv', sep=' ')
holdout = pd.read_table('./data/holdout/test_extract_sps.csv', sep=' ')
all_data = pd.read_table('./data/archive/extract_sps.csv', sep=' ')

In [441]:
X_train, y_train = preprocess_data(train, datefrom=configs['DATE_FROM'])
X_test, y_test = preprocess_data(holdout, datefrom=configs['DATE_FROM'])
X, y = preprocess_data(all_data, datefrom=configs['DATE_FROM'])

In [442]:
print(X.columns.to_list())

['max_centroid_frequency_M', 'max_centroid_frequency2_M', 'max_median_freq_M', 'max_flash_ind_M', 'max_cv_M', 'max_iqa_M', 'max_mab_M', 'max_imp_M', 'max_len_M', 'max_wavelet_mean_approx_M', 'max_wavelet_med_approx_M', 'max_wavelet_var_approx_M', 'max_wavelet_mean_coef_M', 'max_wavelet_med_coef_M', 'max_wavelet_var_coef_M', 'max_pack_sum_M', 'max_pack_kurtosis_M', 'max_pack_skew_M', 'max_zcr_M', 'max_rmse_M', 'max_spectral_flatness_M', 'max_spectral_rolloff_M', 'max_centroid_frequency_S', 'max_centroid_frequency2_S', 'max_median_freq_S', 'max_flash_ind_S', 'max_cv_S', 'max_iqa_S', 'max_mab_S', 'max_imp_S', 'max_len_S', 'max_wavelet_mean_approx_S', 'max_wavelet_med_approx_S', 'max_wavelet_var_approx_S', 'max_wavelet_mean_coef_S', 'max_wavelet_med_coef_S', 'max_wavelet_var_coef_S', 'max_pack_sum_S', 'max_pack_kurtosis_S', 'max_pack_skew_S', 'max_zcr_S', 'max_rmse_S', 'max_spectral_flatness_S', 'max_spectral_rolloff_S', 'max_centroid_frequency_G', 'max_centroid_frequency2_G', 'max_median_

In [346]:
X.head(2)

Unnamed: 0,max_centroid_frequency_M,max_centroid_frequency2_M,max_median_freq_M,max_flash_ind_M,max_cv_M,max_iqa_M,max_mab_M,max_imp_M,max_len_M,max_wavelet_mean_approx_M,max_wavelet_med_approx_M,max_wavelet_var_approx_M,max_wavelet_mean_coef_M,max_wavelet_med_coef_M,max_wavelet_var_coef_M,max_pack_sum_M,max_pack_kurtosis_M,max_pack_skew_M,max_zcr_M,max_rmse_M,max_spectral_flatness_M,max_spectral_rolloff_M,max_centroid_frequency_S,max_centroid_frequency2_S,max_median_freq_S,max_flash_ind_S,max_cv_S,max_iqa_S,max_mab_S,max_imp_S,max_len_S,max_wavelet_mean_approx_S,max_wavelet_med_approx_S,max_wavelet_var_approx_S,max_wavelet_mean_coef_S,max_wavelet_med_coef_S,max_wavelet_var_coef_S,max_pack_sum_S,max_pack_kurtosis_S,max_pack_skew_S,max_zcr_S,max_rmse_S,max_spectral_flatness_S,max_spectral_rolloff_S,max_centroid_frequency_G,max_centroid_frequency2_G,max_median_freq_G,max_flash_ind_G,max_cv_G,max_iqa_G,max_mab_G,max_imp_G,max_len_G,max_wavelet_mean_approx_G,max_wavelet_med_approx_G,max_wavelet_var_approx_G,max_wavelet_mean_coef_G,max_wavelet_med_coef_G,max_wavelet_var_coef_G,max_pack_sum_G,max_pack_kurtosis_G,max_pack_skew_G,max_zcr_G,max_rmse_G,max_spectral_flatness_G,max_spectral_rolloff_G,mean_pack_sum_G,mean_wavelet_var_coef_G,mean_cv_G,mean_mab_G,mean_imp_G,mean_wavelet_med_approx_G,mean_pack_kurtosis_G,mean_pack_skew_G,mean_centroid_frequency2_G,mean_centroid_frequency_G,mean_rmse_G,mean_len_G,mean_iqa_G,mean_wavelet_mean_coef_G,mean_wavelet_mean_approx_G,mean_spectral_flatness_G,mean_wavelet_var_approx_G,mean_median_freq_G,mean_flash_ind_G,mean_spectral_rolloff_G,mean_wavelet_med_coef_G,mean_zcr_G
0,7371.324388,2679.227894,6483.983503,0.008534,1.337432,2.586849e-07,0.030517,0,73,-0.013705,-0.021579,9.2e-05,-0.001458,0.0,2.6e-05,0.717141,-1.181921,0.282109,0.164384,0.012501,6.2e-05,7353.588867,6475.116681,3302.480303,5674.218237,0.004337,1.03801,0.034014,18,0,73,4.643972,4.242641,17.771366,-0.057333,0.0,1.442659,252.0,11.019919,2.625684,0.082192,4.532984,4.1e-05,6115.429688,8153.438439,7257.609334,7621.698593,0.00257,0.674025,1.16704e-07,0.045203,1,73,0.002785,0.002274,7.1e-05,3.9e-05,0.0,6.1e-05,0.325088,12.591499,1.557369,0.109589,0.008397,9.8e-05,6438.427734,0.29601,5.1e-05,0.664478,0.040871,1.0,0.001863,12.52513,1.410314,7318.446979,8181.575808,0.007619,73.0,9.708416e-08,3.15845e-05,0.002509,9.7e-05,5.9e-05,7662.46296,0.002521,6508.410645,0.0,0.109589
1,7086.113317,6697.399727,6743.588474,0.006784,2.258637,0.001837043,2.715982,10,139,-0.017571,-0.021579,0.914387,0.001541,0.0,0.242652,70.401904,1.816349,-0.028449,0.122302,0.763434,0.000129,4080.541992,7301.457307,5199.687184,6302.180373,0.014169,1.021832,0.540045,37,7,139,-2.676904,-4.242641,287.655612,-0.111117,0.0,51.694796,1408.0,0.384598,-0.006413,0.122302,13.089619,0.00185,6642.993164,3450.442567,2618.793204,2680.641145,0.007678,2.432687,2.90227e-07,0.03359,0,139,-0.001586,-0.00199,0.000174,-4e-06,-3.2e-05,6e-06,0.899509,1.929469,-0.553192,0.05036,0.009596,7.4e-05,2077.954102,0.553248,3e-06,2.301496,0.01876,0.0,-0.001958,0.996765,-0.057585,2153.075864,3831.391048,0.005672,139.0,1.499342e-07,-9.024142e-07,-0.001576,0.000126,8.9e-05,2559.758527,0.010828,3402.246094,-1.6e-05,0.035971


In [347]:
all_studies.best_trial.value

0.8304859554570726

In [348]:
params = all_studies.best_trial
model_params = params.params
model_params

{'iterations': 3726,
 'loss_function': 'RMSE',
 'learning_rate': 0.03004368080909306,
 'l2_leaf_reg': 0.05022531263702171,
 'colsample_bylevel': 0.7654913547542036,
 'depth': 6,
 'boosting_type': 'Ordered',
 'bootstrap_type': 'Bernoulli',
 'min_data_in_leaf': 11,
 'one_hot_max_size': 10,
 'subsample': 0.8354498857818983}

In [349]:
catboost_pipe = Pipeline([('outlier_feature', OutlierFeatureAdder()),
                          ('power_transform', PowerTransformer()), 
                          ('cat_boost', CatBoostRegressor(**model_params, silent=True))])

In [350]:
scores = cross_val_score(estimator=catboost_pipe, X=X_train, y=y_train, cv=10, n_jobs=10)

In [351]:
print('All Scores: ', scores)

All Scores:  [0.8486582  0.84226447 0.80210019 0.82610314 0.85023519 0.83072819
 0.81717284 0.82221892 0.82938714 0.8320408 ]


In [352]:
print('Mean Score: ', np.mean(scores))

Mean Score:  0.8300909087092909


In [353]:
catboost_pipe.fit(X_train, y_train)

Pipeline(steps=[('outlier_feature', OutlierFeatureAdder()),
                ('power_transform', PowerTransformer()),
                ('cat_boost',
                 <catboost.core.CatBoostRegressor object at 0x7f4aad9acb50>)])

In [354]:
y_pred = catboost_pipe.predict(X_test)

In [355]:
print('MAPE:', mean_absolute_percentage_error(y_test, y_pred))

MAPE: 0.28635788164312076


In [356]:
print('R2-Score:', r2_score(y_test, y_pred))

R2-Score: 0.820534495020491


In [358]:
pred_error = pd.DataFrame(data={'y_pred':y_pred, 'y_test':y_test})
pred_error = pd.concat([holdout[['start_time', 'velocity', 'packnr', 'size_mm']], pred_error],axis=1)
pred_error.to_csv('./data/error/sps/complex.csv', sep=' ', index=False)

In [359]:
catboost_pipe.fit(X, y)

Pipeline(steps=[('outlier_feature', OutlierFeatureAdder()),
                ('power_transform', PowerTransformer()),
                ('cat_boost',
                 <catboost.core.CatBoostRegressor object at 0x7f4aad9acb50>)])

In [360]:
catboost_pipe.score(X, y)

0.9526443373431462

In [361]:
with open('./models/sps/sps_catboost_complex.pkl', 'wb') as pkl_file:
    pickle.dump(obj=catboost_pipe, file=pkl_file)

# Train Final Model (Simple)

In [443]:
with open('./results/final/simple/config.yaml', 'r') as yaml_file:
    configs = yaml.load(yaml_file, Loader=yaml.FullLoader)
configs

{'DATE_FROM': 2020,
 'DEBUG_RUN': False,
 'DROP_COLUMNS': ['velocity', 'start_time', 'packnr'],
 'EXTRACT_MAX_FEATURES': True,
 'EXTRACT_MEAN_FEATURES': False,
 'FILEPATHS': ['./data/data_mpa.txt',
  './data/data_spg.txt',
  './data/data_sps.txt'],
 'LOG_SCALE_TARGET': False,
 'MODEL_NAME': 'CAT',
 'N_TRIALS': 100,
 'SAVE_DIR': './results/final/simple/',
 'TUNING_ITER': 3}

## MPA

In [444]:
# Load best study
studies = load_studies_from_folder(folder_name='./results/final2/simple/', device_name='mpa')
all_studies = concat_all_studies(studies=studies, as_dataframe=False)
df = concat_all_studies(studies=studies, as_dataframe=True)

In [445]:
df.head(2)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_bagging_temperature,params_boosting_type,params_bootstrap_type,params_colsample_bylevel,params_depth,params_iterations,params_l2_leaf_reg,params_learning_rate,params_loss_function,params_min_data_in_leaf,params_one_hot_max_size,params_subsample,state
91,91,0.757531,2021-12-27 15:49:35.406232,2021-12-27 15:50:02.830097,0 days 00:00:27.423865,,Plain,MVS,0.658465,10,2740,0.021644,0.013027,RMSE,4,19,,COMPLETE
191,191,0.757531,2021-12-27 15:49:35.406232,2021-12-27 15:50:02.830097,0 days 00:00:27.423865,,Plain,MVS,0.658465,10,2740,0.021644,0.013027,RMSE,4,19,,COMPLETE


In [446]:
train = pd.read_table('./data/train/split_data_max_mpa.csv', sep=' ')
holdout = pd.read_table('./data/holdout/test_data_max_mpa.csv', sep=' ')
all_data = pd.read_table('./data/archive/data_max_mpa.csv', sep=' ')

In [447]:
X_train, y_train = preprocess_data(train, datefrom=configs['DATE_FROM'])
X_test, y_test = preprocess_data(holdout, datefrom=configs['DATE_FROM'])
X, y = preprocess_data(all_data, datefrom=configs['DATE_FROM'])

In [448]:
print(X.columns.to_list())

['max_centroid_frequency_M', 'max_centroid_frequency2_M', 'max_median_freq_M', 'max_flash_ind_M', 'max_cv_M', 'max_iqa_M', 'max_mab_M', 'max_imp_M', 'max_len_M']


In [370]:
X.head(2)

Unnamed: 0,max_centroid_frequency_M,max_centroid_frequency2_M,max_median_freq_M,max_flash_ind_M,max_cv_M,max_iqa_M,max_mab_M,max_imp_M,max_len_M
46123,3497.695364,3651.198231,3536.219986,0.001092,0.862842,0.001232,2.2195,6,50
46124,3199.658992,3221.798014,3199.297511,0.00153,1.0512,0.000913,1.717626,6,52


In [371]:
all_studies.best_trial.value

0.7575312735925648

In [372]:
params = all_studies.best_trial
model_params = params.params
model_params

{'iterations': 2740,
 'loss_function': 'RMSE',
 'learning_rate': 0.013026901294162932,
 'l2_leaf_reg': 0.02164361803562735,
 'colsample_bylevel': 0.6584649538042444,
 'depth': 10,
 'boosting_type': 'Plain',
 'bootstrap_type': 'MVS',
 'min_data_in_leaf': 4,
 'one_hot_max_size': 19}

In [373]:
catboost_pipe = Pipeline([('outlier_feature', OutlierFeatureAdder()),
                             ('power_transform', PowerTransformer()), 
                             ('cat_boost', CatBoostRegressor(**model_params, silent=True))])

In [374]:
scores = cross_val_score(estimator=catboost_pipe, X=X_train, y=y_train, cv=10, n_jobs=10)

In [375]:
print('All Scores: ', scores)

All Scores:  [0.75745599 0.73846527 0.75788592 0.77146571 0.74663499 0.74516038
 0.75591134 0.75845298 0.75917339 0.76361361]


In [376]:
print('Mean Score: ', np.mean(scores))

Mean Score:  0.7554219569952801


In [377]:
catboost_pipe.fit(X_train, y_train)

Pipeline(steps=[('outlier_feature', OutlierFeatureAdder()),
                ('power_transform', PowerTransformer()),
                ('cat_boost',
                 <catboost.core.CatBoostRegressor object at 0x7f4aad001490>)])

In [378]:
y_pred = catboost_pipe.predict(X_test)

In [379]:
print('MAPE:', mean_absolute_percentage_error(y_test, y_pred))

MAPE: 0.27998366885011733


In [380]:
print('R2-Score:', r2_score(y_test, y_pred))

R2-Score: 0.7677266097688191


In [381]:
pred_error = pd.DataFrame(data={'y_pred':y_pred, 'y_test':y_test})
pred_error = pd.concat([holdout[['start_time', 'velocity', 'packnr', 'size_mm']], pred_error], axis=1)
pred_error.to_csv('./data/error/mpa/simple.csv', sep=' ', index=False)

In [382]:
catboost_pipe.fit(X, y)

Pipeline(steps=[('outlier_feature', OutlierFeatureAdder()),
                ('power_transform', PowerTransformer()),
                ('cat_boost',
                 <catboost.core.CatBoostRegressor object at 0x7f4aad001490>)])

In [383]:
catboost_pipe.score(X, y)

0.8730367653471209

In [384]:
with open('./models/mpa/mpa_catboost_simple.pkl', 'wb') as pkl_file:
    pickle.dump(obj=catboost_pipe, file=pkl_file)

## SPG

In [449]:
# Load best study
studies = load_studies_from_folder(folder_name='./results/final2/simple/', device_name='spg')
all_studies = concat_all_studies(studies=studies, as_dataframe=False)
df = concat_all_studies(studies=studies, as_dataframe=True)

In [450]:
df.head(2)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_bagging_temperature,params_boosting_type,params_bootstrap_type,params_colsample_bylevel,params_depth,params_iterations,params_l2_leaf_reg,params_learning_rate,params_loss_function,params_min_data_in_leaf,params_one_hot_max_size,params_subsample,state
90,90,0.77429,2021-12-27 16:29:33.834645,2021-12-27 16:29:46.076929,0 days 00:00:12.242284,,Plain,MVS,0.909189,8,2667,0.187837,0.010489,RMSE,12,4,,COMPLETE
163,163,0.774255,2021-12-27 16:42:53.778691,2021-12-27 16:43:07.590405,0 days 00:00:13.811714,,Plain,Bernoulli,0.699447,7,4648,0.775859,0.005046,RMSE,19,12,0.714922,COMPLETE


In [451]:
train = pd.read_table('./data/train/split_data_max_spg.csv', sep=' ')
holdout = pd.read_table('./data/holdout/test_data_max_spg.csv', sep=' ')
all_data = pd.read_table('./data/archive/data_max_spg.csv', sep=' ')

In [452]:
X_train, y_train = preprocess_data(train, datefrom=configs['DATE_FROM'])
X_test, y_test = preprocess_data(holdout, datefrom=configs['DATE_FROM'])
X, y = preprocess_data(all_data, datefrom=configs['DATE_FROM'])

In [453]:
print(X.columns.to_list())

['max_centroid_frequency_G', 'max_centroid_frequency2_G', 'max_median_freq_G', 'max_flash_ind_G', 'max_cv_G', 'max_iqa_G', 'max_mab_G', 'max_imp_G', 'max_len_G']


In [390]:
X.head(2)

Unnamed: 0,max_centroid_frequency_G,max_centroid_frequency2_G,max_median_freq_G,max_flash_ind_G,max_cv_G,max_iqa_G,max_mab_G,max_imp_G,max_len_G
30126,1966.146784,1977.787415,2052.54952,0.003117,1.118455,2.939599e-07,0.021001,0,48
30127,456.02908,323.956298,334.713115,0.012305,3.281048,1.906679e-06,0.024618,2,82


In [391]:
all_studies.best_trial.value

0.7742899023019836

In [392]:
params = all_studies.best_trial
model_params = params.params
model_params

{'iterations': 2667,
 'loss_function': 'RMSE',
 'learning_rate': 0.010488721255868626,
 'l2_leaf_reg': 0.18783728913290168,
 'colsample_bylevel': 0.9091893755627428,
 'depth': 8,
 'boosting_type': 'Plain',
 'bootstrap_type': 'MVS',
 'min_data_in_leaf': 12,
 'one_hot_max_size': 4}

In [393]:
catboost_pipe = Pipeline([('outlier_feature', OutlierFeatureAdder()),
                          ('power_transform', PowerTransformer()), 
                          ('cat_boost', CatBoostRegressor(**model_params, silent=True))])

In [394]:
scores = cross_val_score(estimator=catboost_pipe, X=X_train, y=y_train, cv=10, n_jobs=10)

In [395]:
print('All Scores: ', scores)

All Scores:  [0.76882783 0.76886545 0.76937802 0.78294534 0.78540828 0.79383241
 0.77764034 0.78665871 0.76285174 0.77435771]


In [396]:
print('Mean Score: ', np.mean(scores))

Mean Score:  0.7770765838709519


In [397]:
catboost_pipe.fit(X_train, y_train)

Pipeline(steps=[('outlier_feature', OutlierFeatureAdder()),
                ('power_transform', PowerTransformer()),
                ('cat_boost',
                 <catboost.core.CatBoostRegressor object at 0x7f4aacffea30>)])

In [398]:
y_pred = catboost_pipe.predict(X_test)

In [399]:
print('MAPE:', mean_absolute_percentage_error(y_test, y_pred))

MAPE: 0.22865376582712002


In [400]:
print('R2-Score:', r2_score(y_test, y_pred))

R2-Score: 0.7624840634531107


In [402]:
pred_error = pd.DataFrame(data={'y_pred':y_pred, 'y_test':y_test})
pred_error = pd.concat([holdout[['start_time', 'velocity', 'packnr', 'size_mm']], pred_error], axis=1)
pred_error.to_csv('./data/error/spg/simple.csv', sep=' ', index=False)

In [403]:
catboost_pipe.fit(X, y)

Pipeline(steps=[('outlier_feature', OutlierFeatureAdder()),
                ('power_transform', PowerTransformer()),
                ('cat_boost',
                 <catboost.core.CatBoostRegressor object at 0x7f4aacffea30>)])

In [404]:
catboost_pipe.score(X, y)

0.846373173445221

In [405]:
with open('./models/spg/spg_catboost_simple.pkl', 'wb') as pkl_file:
    pickle.dump(obj=catboost_pipe, file=pkl_file)

## SPS (Achtung Features von S und G auch extrahieren!)

In [456]:
# Load best study
studies = load_studies_from_folder(folder_name='./results/final2/simple/', device_name='sps')
all_studies = concat_all_studies(studies=studies, as_dataframe=False)
df = concat_all_studies(studies=studies, as_dataframe=True)

In [457]:
df.head(2)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_bagging_temperature,params_boosting_type,params_bootstrap_type,params_colsample_bylevel,params_depth,params_iterations,params_l2_leaf_reg,params_learning_rate,params_loss_function,params_min_data_in_leaf,params_one_hot_max_size,params_subsample,state
97,97,0.810945,2021-12-27 18:58:01.812742,2021-12-27 18:59:00.416322,0 days 00:00:58.603580,,Plain,MVS,0.497866,9,4439,0.104369,0.011973,RMSE,12,6,,COMPLETE
76,76,0.810823,2021-12-27 18:41:14.051321,2021-12-27 18:41:52.348461,0 days 00:00:38.297140,,Plain,MVS,0.573557,8,4988,0.108458,0.009632,RMSE,12,3,,COMPLETE


In [458]:
train = pd.read_table('./data/train/split_data_max_sps.csv', sep=' ')
holdout = pd.read_table('./data/holdout/test_data_max_sps.csv', sep=' ')
all_data = pd.read_table('./data/archive/data_max_sps.csv', sep=' ')

In [459]:
X_train, y_train = preprocess_data(train, datefrom=configs['DATE_FROM'])
X_test, y_test = preprocess_data(holdout, datefrom=configs['DATE_FROM'])
X, y = preprocess_data(all_data, datefrom=configs['DATE_FROM'])

In [460]:
print(X.columns.to_list())

['max_centroid_frequency_G', 'max_centroid_frequency2_G', 'max_median_freq_G', 'max_flash_ind_G', 'max_cv_G', 'max_iqa_G', 'max_mab_G', 'max_imp_G', 'max_len_G', 'max_centroid_frequency_M', 'max_centroid_frequency2_M', 'max_median_freq_M', 'max_flash_ind_M', 'max_cv_M', 'max_iqa_M', 'max_mab_M', 'max_imp_M', 'max_len_M', 'max_centroid_frequency_S', 'max_centroid_frequency2_S', 'max_median_freq_S', 'max_flash_ind_S', 'max_cv_S', 'max_iqa_S', 'max_mab_S', 'max_imp_S', 'max_len_S']


In [411]:
X.head(2)

Unnamed: 0,max_centroid_frequency_G,max_centroid_frequency2_G,max_median_freq_G,max_flash_ind_G,max_cv_G,max_iqa_G,max_mab_G,max_imp_G,max_len_G,max_centroid_frequency_M,max_centroid_frequency2_M,max_median_freq_M,max_flash_ind_M,max_cv_M,max_iqa_M,max_mab_M,max_imp_M,max_len_M,max_centroid_frequency_S,max_centroid_frequency2_S,max_median_freq_S,max_flash_ind_S,max_cv_S,max_iqa_S,max_mab_S,max_imp_S,max_len_S
0,8153.438439,7257.609334,7621.698593,0.00257,0.674025,1.16704e-07,0.045203,1,73,7371.324388,2679.227894,6483.983503,0.008534,1.337432,2.586849e-07,0.030517,0,73,6475.116681,3302.480303,5674.218237,0.004337,1.03801,0.034014,18,0,73
1,3450.442567,2618.793204,2680.641145,0.007678,2.432687,2.90227e-07,0.03359,0,139,7086.113317,6697.399727,6743.588474,0.006784,2.258637,0.001837043,2.715982,10,139,7301.457307,5199.687184,6302.180373,0.014169,1.021832,0.540045,37,7,139


In [412]:
all_studies.best_trial.value

0.8109448016485663

In [413]:
params = all_studies.best_trial
model_params = params.params
model_params

{'iterations': 4439,
 'loss_function': 'RMSE',
 'learning_rate': 0.011972943132649651,
 'l2_leaf_reg': 0.1043691311999729,
 'colsample_bylevel': 0.49786616572837195,
 'depth': 9,
 'boosting_type': 'Plain',
 'bootstrap_type': 'MVS',
 'min_data_in_leaf': 12,
 'one_hot_max_size': 6}

In [414]:
catboost_pipe = Pipeline([('outlier_feature', OutlierFeatureAdder()),
                             ('power_transform', PowerTransformer()), 
                             ('cat_boost', CatBoostRegressor(**model_params, silent=True))])

In [415]:
scores = cross_val_score(estimator=catboost_pipe, X=X_train, y=y_train, cv=10, n_jobs=10)

In [416]:
print('All Scores: ', scores)

All Scores:  [0.84535999 0.82200116 0.81797561 0.80947276 0.82106946 0.79485543
 0.81397613 0.81784603 0.79143244 0.77046446]


In [417]:
print('Mean Score: ', np.mean(scores))

Mean Score:  0.8104453479637781


In [418]:
catboost_pipe.fit(X_train, y_train)

Pipeline(steps=[('outlier_feature', OutlierFeatureAdder()),
                ('power_transform', PowerTransformer()),
                ('cat_boost',
                 <catboost.core.CatBoostRegressor object at 0x7f4aad9ab730>)])

In [419]:
y_pred = catboost_pipe.predict(X_test)

In [420]:
print('MAPE:', mean_absolute_percentage_error(y_test, y_pred))

MAPE: 0.2787549654141174


In [421]:
print('R2-Score:', r2_score(y_test, y_pred))

R2-Score: 0.806843929814534


In [422]:
pred_error = pd.DataFrame(data={'y_pred':y_pred, 'y_test':y_test})
pred_error = pd.concat([holdout[['start_time', 'velocity', 'packnr', 'size_mm']], pred_error], axis=1)
pred_error.to_csv('./data/error/sps/simple.csv', sep=' ', index=False)

In [423]:
catboost_pipe.fit(X, y)

Pipeline(steps=[('outlier_feature', OutlierFeatureAdder()),
                ('power_transform', PowerTransformer()),
                ('cat_boost',
                 <catboost.core.CatBoostRegressor object at 0x7f4aad9ab730>)])

In [424]:
catboost_pipe.score(X, y)

0.9873441228782358

In [425]:
with open('./models/sps/sps_catboost_simple.pkl', 'wb') as pkl_file:
    pickle.dump(obj=catboost_pipe, file=pkl_file)

# Check predictions

In [None]:
catboost_pipe.predict