In [10]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle
import yaml
import sys
sys.path.append('..')
import os
import yaml
import datetime as dt
from multiprocessing import Pool
from feature_extract import extract_highest_amplitude_features_with_mp, get_all_sensors_in_df, feature_extractor_wrapper
from plot import plot_residuals, plot_error_per_cat
from catboost import CatBoostRegressor
import optuna

from sklearn.metrics import r2_score, mean_absolute_percentage_error
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer

from IPython.display import HTML
pd.set_option("display.max_columns", 100)
import plotly.express as px

# Helper Functions

In [12]:
class OutlierFeatureAdder(TransformerMixin, BaseEstimator):
    """Sklearn-compatible transformer that appends a boolean ``outlier`` column.

    The first column whose name contains ``'max_len'`` is log-transformed and
    standardised with the mean / standard deviation learned during ``fit``.
    Rows whose standardised value is greater than 3 are flagged as outliers.
    """

    def __init__(self):
        """Create an unfitted transformer; ``mean`` and ``std`` are learned in ``fit``."""
        super().__init__()
        self.mean = None
        self.std = None

    def fit(self, X: pd.DataFrame, y=None):
        """Learn the mean and standard deviation of the log-length column of ``X``.

        Args:
            X: Frame containing at least one column with ``'max_len'`` in its name.
            y: Ignored; present for sklearn pipeline compatibility.

        Returns:
            The fitted transformer itself.
        """
        # Discard statistics from any earlier fit (e.g. the ones stored inside an
        # unpickled pipeline) so that re-fitting really uses the new data.
        self.mean = None
        self.std = None
        _ = self.add_outlier_feature_length(X)

        return self

    def fit_transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        """Fit on ``X`` and return a copy of ``X`` with the ``outlier`` column added."""
        self.fit(X)
        X = self.transform(X)

        return X

    def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        """Return a copy of ``X`` with an additional boolean ``outlier`` column."""
        X = X.copy()
        X.loc[:, 'outlier'] = self.add_outlier_feature_length(X)

        return X

        # Preprocessing functions

    def add_outlier_feature_length(self, df: pd.DataFrame) -> np.ndarray:
        """Compute the outlier flag for every row of ``df``.

        Args:
            df: Frame containing at least one column with ``'max_len'`` in its name.
                Only the first such column is used.

        Returns:
            Boolean numpy array, ``True`` where the standardised log-length is > 3.

        Raises:
            IndexError: If no column name contains ``'max_len'``.
        """
        max_len_columns = [col for col in df.columns if 'max_len' in col]
        if not max_len_columns:
            raise IndexError("no column containing 'max_len' found in the input frame")
        length_transformed = np.log(df[max_len_columns[0]])

        # Compare against None explicitly: a learned mean of exactly 0.0 is a valid
        # statistic and must not trigger a silent re-estimation on new data.
        if self.mean is None or self.std is None:
            self.mean = np.mean(length_transformed)
            self.std = np.std(length_transformed)

        length_transformed = (length_transformed - self.mean) / self.std
        outlier_prediction = length_transformed > 3

        return outlier_prediction.to_numpy()

# Train Models on 2021

In [30]:
# Read the notebook configuration; later cells look up config['_features'][...].
with open('config.yaml', 'r') as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

## Prediction Helper Functions
 


In [47]:
def _load_model(model_path):
    model_path = '../models/mpa/mpa_catboost_simple.pkl'  
    with open(model_path, 'rb') as pkl_file:
        model = pickle.load(pkl_file)
        print(model)
    
    return model
        
def _load_train_test(train_data_path, test_data_path):
    train_data = pd.read_csv(train_data_path, sep=' ')
    test_data = pd.read_csv(test_data_path, sep=' ')
    drop_columns = ['start_time', 'packnr', 'velocity', 'size_mm']
    X_train, y_train = train_data.drop(drop_columns, axis=1), train_data['size_mm']
    X_test, y_test = test_data.drop(drop_columns, axis=1), test_data['size_mm']
    print('Data Loaded')
    test_data = test_data[drop_columns]
    
    return X_train, X_test, y_train, y_test, test_data

def _check_features(X_train, X_test, features_from_yaml):
    assert all([True if feature in X_train.columns else False for feature in features_from_yaml])
    assert all([True if feature in X_test.columns else False for feature in features_from_yaml])
    print('Test successfull, Features are present.')
    
def prediction_wrapper(model_path, train_data_path, test_data_path, check_features, save_predictions_path):
    """Load a pickled model, fit it on the training file and save test predictions.

    Args:
        model_path: Path of the pickled model to load.
        train_data_path: Path of the space-separated training CSV.
        test_data_path: Path of the space-separated test CSV.
        check_features: Feature names that must be present in both feature frames.
        save_predictions_path: Destination CSV (space-separated, no index) holding
            the non-feature test columns plus a ``predictions`` column.

    Raises:
        AssertionError: If a required feature is missing from the data.
    """
    # Create the output directory before the (potentially long) fit so that a
    # missing folder cannot discard the results at the very last step.
    output_dir = os.path.dirname(save_predictions_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    model = _load_model(model_path=model_path)
    X_train, X_test, y_train, y_test, test_data = _load_train_test(train_data_path=train_data_path,
                                                                   test_data_path=test_data_path)
    _check_features(X_train, X_test, features_from_yaml=check_features)
    print('Fitting Model')
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    test_data['predictions'] = predictions
    test_data.to_csv(save_predictions_path, index=False, sep=' ')
    print('Process Finished')

## MPA Simple

In [48]:
# MPA, simple feature set: fit on the 2021 train file and save the test predictions.
prediction_wrapper(
    model_path='../models/mpa/mpa_catboost_simple.pkl',
    train_data_path='../data/compare2021/train/2021_data_max_mpa.csv',
    test_data_path='../data/compare2021/test/2021_data_max_mpa.csv',
    check_features=config['_features']['simple']['mpa'],
    save_predictions_path='./predictions/mpa_simple.csv',
)

Pipeline(steps=[('outlier_feature', OutlierFeatureAdder()),
                ('power_transform', PowerTransformer()),
                ('cat_boost',
                 <catboost.core.CatBoostRegressor object at 0x7fd864362b50>)])
Data Loaded
Test successfull, Features are present.
Process Finished


## MPA Complex
 


In [49]:
# MPA, complex feature set: fit on the 2021 train file and save the test predictions.
# NOTE: the output name 'mpa_comnplex.csv' is read back under that exact spelling by
# a later cell, so it is deliberately left unchanged here.
prediction_wrapper(
    model_path='../models/mpa/mpa_catboost_complex.pkl',
    train_data_path='../data/compare2021/train/2021_extract_mpa.csv',
    test_data_path='../data/compare2021/test/2021_extract_mpa.csv',
    check_features=config['_features']['complex']['mpa'],
    save_predictions_path='./predictions/mpa_comnplex.csv',
)

Pipeline(steps=[('outlier_feature', OutlierFeatureAdder()),
                ('power_transform', PowerTransformer()),
                ('cat_boost',
                 <catboost.core.CatBoostRegressor object at 0x7fd864362cd0>)])
Data Loaded
Test successfull, Features are present.
Process Finished


## SPG Simple
 


In [50]:
# SPG, simple feature set: fit on the 2021 train file and save the test predictions.
prediction_wrapper(
    model_path='../models/spg/spg_catboost_simple.pkl',
    train_data_path='../data/compare2021/train/2021_data_max_spg.csv',
    test_data_path='../data/compare2021/test/2021_data_max_spg.csv',
    check_features=config['_features']['simple']['spg'],
    save_predictions_path='./predictions/spg_simple.csv',
)

Pipeline(steps=[('outlier_feature', OutlierFeatureAdder()),
                ('power_transform', PowerTransformer()),
                ('cat_boost',
                 <catboost.core.CatBoostRegressor object at 0x7fd87d37cd90>)])
Data Loaded
Test successfull, Features are present.
Process Finished


## SPG Complex 


In [51]:
# SPG, complex feature set: fit on the 2021 train file and save the test predictions.
prediction_wrapper(
    model_path='../models/spg/spg_catboost_complex.pkl',
    train_data_path='../data/compare2021/train/2021_extract_spg.csv',
    test_data_path='../data/compare2021/test/2021_extract_spg.csv',
    check_features=config['_features']['complex']['spg'],
    save_predictions_path='./predictions/spg_complex.csv',
)

Pipeline(steps=[('outlier_feature', OutlierFeatureAdder()),
                ('power_transform', PowerTransformer()),
                ('cat_boost',
                 <catboost.core.CatBoostRegressor object at 0x7fd87d0bb7c0>)])
Data Loaded
Test successfull, Features are present.
Process Finished


## SPS Simple
 


In [52]:
# SPS, simple feature set: fit on the 2021 train file and save the test predictions.
prediction_wrapper(
    model_path='../models/sps/sps_catboost_simple.pkl',
    train_data_path='../data/compare2021/train/2021_data_max_sps.csv',
    test_data_path='../data/compare2021/test/2021_data_max_sps.csv',
    check_features=config['_features']['simple']['sps'],
    save_predictions_path='./predictions/sps_simple.csv',
)

Pipeline(steps=[('outlier_feature', OutlierFeatureAdder()),
                ('power_transform', PowerTransformer()),
                ('cat_boost',
                 <catboost.core.CatBoostRegressor object at 0x7fd864362a00>)])
Data Loaded
Test successfull, Features are present.
Process Finished


## SPS Complex
 


In [53]:
# SPS, complex feature set: fit on the 2021 train file and save the test predictions.
prediction_wrapper(
    model_path='../models/sps/sps_catboost_complex.pkl',
    train_data_path='../data/compare2021/train/2021_extract_sps.csv',
    test_data_path='../data/compare2021/test/2021_extract_sps.csv',
    check_features=config['_features']['complex']['sps'],
    save_predictions_path='./predictions/sps_complex.csv',
)

Pipeline(steps=[('outlier_feature', OutlierFeatureAdder()),
                ('power_transform', PowerTransformer()),
                ('cat_boost',
                 <catboost.core.CatBoostRegressor object at 0x7fd86c9448e0>)])
Data Loaded
Test successfull, Features are present.
Process Finished


In [55]:
# Display the saved MPA complex predictions (space-separated file) as a sanity check.
pd.read_csv(
    './predictions/mpa_comnplex.csv',
    sep=' ',
)

Unnamed: 0,start_time,packnr,velocity,size_mm,predictions
0,2021-06-23 15:06:18,117,2.5,28.1,20.691870
1,2021-06-23 14:16:00,13,2.5,17.4,18.059771
2,2021-06-24 13:53:02,31,3.2,17.4,21.632027
3,2021-06-22 16:29:53,39,1.8,17.4,15.945985
4,2021-06-22 15:58:27,64,1.8,12.3,12.329885
...,...,...,...,...,...
2162,2021-06-22 18:07:03,43,1.8,7.7,13.355368
2163,2021-06-23 14:58:50,41,2.5,28.1,31.858794
2164,2021-06-22 16:34:11,106,1.8,17.4,17.615521
2165,2021-06-24 15:57:26,66,3.2,71.3,96.208188


In [None]:
from IPython.display import HTML
pd.set_option("display.max_columns", 100)
import plotly.express as px
from plot import plot_residuals, plot_error_per_cat

In [None]:
# Folder holding the study results inspected in the cells below.
folder_name = './results/20211220_WV_ZCR_PV_RMSE_STAT/'

In [None]:
# Load the configuration stored alongside the study results and display it.
# An explicit Loader is required: yaml.load() without one raises TypeError on
# PyYAML >= 6.0. FullLoader matches the loader used for 'config.yaml' earlier.
with open(folder_name + 'config.yaml', 'r') as yaml_file:
    configs = yaml.load(yaml_file, Loader=yaml.FullLoader)
configs

In [None]:
# Load all 'mpa' studies from the results folder, merge them into one frame and
# order it by 'value', highest first.
# NOTE(review): load_studies_from_folder and concat_all_studies_df are not imported
# in any cell visible here — confirm where they come from before Restart & Run All.
df = (
    concat_all_studies_df(
        load_studies_from_folder(folder_name=folder_name, device_name='mpa')
    )
    .sort_values(by='value', ascending=False)
)

In [None]:
# Show the first ten rows of the sorted studies frame.
df.head(n=10)

In [None]:
# Build the parallel-coordinates figure for the studies frame and embed it as HTML.
# NOTE(review): plot_parallel_cordinates is not imported in any cell visible here
# (only plot_residuals and plot_error_per_cat are imported from `plot`) — confirm
# its origin before Restart & Run All.
fig = plot_parallel_cordinates(df, objective_maximize=True)
HTML(fig.to_html())