# Modellierung der MPA-Sensordaten

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle
import sys
sys.path.append('..')
import os
import yaml
import datetime as dt
from multiprocessing import Pool
from helper.feature_extract import extract_highest_amplitude_features_with_mp, get_all_sensors_in_df, feature_extractor_wrapper
from helper.plot import plot_residuals, plot_error_per_cat
from catboost import CatBoostRegressor
import optuna

from sklearn.metrics import r2_score, mean_absolute_percentage_error
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer

from IPython.display import HTML
pd.set_option("display.max_columns", 100)
import plotly.express as px

# Helper Functions

In [6]:
# Preprocessing functions
def add_outlier_feature_length(df: pd.DataFrame) -> pd.DataFrame:
    """Adds new Feature declaring outliers"""
    # Calculate prediction for outlier
    max_len_col = [col for col in df.columns if 'max_len' in col][0]
    length_transformed = np.log(df[max_len_col])
    length_transformed_std = (length_transformed - np.mean(length_transformed)) / np.std(length_transformed)
    outlier_prediction = length_transformed_std > 3
    
    # add to df
    df['outlier'] = outlier_prediction
    
    return df


def concat_all_studies_df(studies: list) -> pd.DataFrame:
    """Concatenates all optuna studies"""
    df = pd.concat([study.trials_dataframe() for study in studies], axis=0)
    df = df.sort_values(by='value', ascending=False)
    
    return df

def load_studies_from_folder(folder_name: str, device_name: str) -> list:
    """Loads studies from a given folder with Device Name"""
    # Load study
    files = [f for f in os.listdir(folder_name) if device_name in f]
    studies= []
    for i, file in enumerate(files):
        filename = folder_name + file 
        with open(filename, 'rb') as pkl_file:
            study_pkl = pickle.load(pkl_file)
            studies.append(study_pkl)
    
    return studies

def plot_parallel_cordinates(df: pd.DataFrame, objective_maximize=True, columns=None):
    """Plots Opt. History as Parallel Cordinates"""
    if columns:
        columns.append('number')
        columns.append('value')
        df = df[columns]
        
    cols = [col.split('_') for col in df.columns]
    df.columns = ['_'.join(col[1:])  if len(col) > 1 else col[0] for col in cols] 
    
    if objective_maximize:
        fig = px.parallel_coordinates(data_frame=df.drop('number', axis=1), color='value', color_continuous_scale='Blues', 
                                      height=500, width=1500)
    else:
        fig = px.parallel_coordinates(data_frame=df.drop('number', axis=1), color='value', color_continuous_scale='Blues_r', 
                                      height=500, width=1500)
    return fig

# Hyperparam Optimization CatBoost

In [7]:
def objective_r2(trial):
    X_transformed = X_train.copy()
    
    # Pre-Processing
    ## Polynomial Features
    add_outlier = trial.suggest_categorical('outlier_feature', [True, False])    
    if add_outlier:
        X_transformed = add_outlier_feature_length(X_transformed)
    
    ## Feature Transformation to normalize data
    apply_feature_transformation = trial.suggest_categorical('apply_feature_transformation', [True, False])
    if apply_feature_transformation:
        transformer = PowerTransformer(standardize=trial.suggest_categorical('pt_standardize', [True, False]))
        X_transformed = transformer.fit_transform(X_train)
        
    param = {
        'iterations': trial.suggest_int('iterations', 50, 5000),
        'loss_function': trial.suggest_categorical('loss_function', ['RMSE', 'MAPE']),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1.5),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-2, 1e0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.01, 1),
        'depth': trial.suggest_int('depth', 1, 10),
        'boosting_type': trial.suggest_categorical('boosting_type', ['Ordered', 'Plain']),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 2, 20),
        'one_hot_max_size': trial.suggest_int('one_hot_max_size', 2, 20), 
        'silent': True
    }
    # Conditional Hyper-Parameters
    if param['bootstrap_type'] == 'Bayesian':
        param['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 10)
    elif param['bootstrap_type'] == 'Bernoulli':
        param['subsample'] = trial.suggest_float('subsample', 0.1, 1)

    reg = CatBoostRegressor(**param, thread_count=6)
    score = cross_val_score(estimator=reg, X=X_train,
                            y=y_train, cv=10, n_jobs=4)    
 
    return np.mean(scores)

In [9]:
# './data/data_mpa.txt', './data/data_spg.txt', './data/data_sps.txt'
config = dict(
    FILEPATHS = ['../data/data_mpa.txt', '../data/data_spg.txt', 
                 '../data/data_sps.txt'],
    TUNING_ITER = 3,
    N_TRIALS = 100,
    DROP_COLUMNS = ['velocity', 'start_time', 'packnr'],
    LOG_SCALE_TARGET = False,
    MODEL_NAME = 'LGBM',
    SAVE_DIR = './results/final/' , # Needs to end with '/' like './results/final/'
    EXTRACT_MAX_FEATURES = True,
    EXTRACT_MEAN_FEATURES = True,
    DEBUG_RUN = True,
    DATE_FROM = 2020,  # selcts Dates >= DATE_FROM
)

try:
    save_dir = config['SAVE_DIR'].split('/')[1:]
    check_dir = None
    for sdir in save_dir:
        if check_dir:
            check_dir = check_dir + '/' + sdir
        else:
            check_dir = sdir
        try:
            os.mkdir(check_dir)
        except FileExistsError as fe:
            print(fe, 'Skipping iteration')
except:
    os.mkdir(config['SAVE_DIR'])

# Save Parametr-configs of file
if 'config.yaml' in os.listdir(config['SAVE_DIR']):
    print(Exception('Config already exists!'))
    if input('Want Overwrite Existing? (y/n)')=='y':
        with open(config['SAVE_DIR'] + 'config.yaml', 'w') as yaml_file:
            yaml.dump(config, yaml_file)
else:
    with open(config['SAVE_DIR'] + 'config.yaml', 'w') as yaml_file:
        yaml.dump(config, yaml_file)
        
for path in config['FILEPATHS']:
    
    device_name = path.split('_')[-1].split('.')[0]

    print(10*'=', f'Starting Study for {device_name}', 10*'=')
    # Read processed data
    if config['DEBUG_RUN']:
        data = pd.read_table(path, sep=' ', nrows=1000)
    else:
        data = pd.read_table(path, sep=' ')
        
    # Resample data
    data = data.sample(frac=1, random_state=42).reset_index(drop=True)        
    
    # Extract Features
    data = feature_extractor_wrapper(df=data, extract_max_features=config['EXTRACT_MAX_FEATURES'],
                                     extract_mean_features=config['EXTRACT_MEAN_FEATURES'])
    #Filtering data
    if config['DATE_FROM'] and not config['DEBUG_RUN']:
        data['start_time'] = pd.to_datetime(data['start_time'])
        data = data.loc[data['start_time'].dt.year >= config['DATE_FROM']]
            
    # Splitting of Data
    print('INFO | Split Data X, y ...')
    feature_cols = data.columns.to_list()
    feature_cols.remove('size_mm')
    for col in config['DROP_COLUMNS']:
        feature_cols.remove(col)
    
    # Split row wise and Target-wise
    data, holdout = train_test_split(data, test_size=.1)
    X_train, y_train = data[feature_cols], data['size_mm']

    print('INFO | Train-Test Split ...')
    if config['LOG_SCALE_TARGET']:
        y_train = np.log(y_train)

    print('INFO | Tune Model ...')
    for i in range(config['TUNING_ITER']):
        if config['DEBUG_RUN']:
            study = optuna.create_study(direction="maximize")
            study.optimize(objective_r2, n_trials=0, n_jobs=1, timeout=5)
            break
        else:
            study = optuna.create_study(direction="maximize")
            study.optimize(objective_r2, n_trials=config['N_TRIALS'], n_jobs=1)
            # Save best params
            study_name = '_'.join([config['MODEL_NAME'], device_name, str(i)])

            with open(f'{config["SAVE_DIR"]}{study_name}.pkl', 'wb') as pkl_file:
                pickle.dump(study, pkl_file)
    if config['DEBUG_RUN']:
        break
print(30*'=', ' Process Finished ', 30*'=')

[WinError 183] Eine Datei kann nicht erstellt werden, wenn sie bereits vorhanden ist: 'results' Skipping iteration
[WinError 183] Eine Datei kann nicht erstellt werden, wenn sie bereits vorhanden ist: 'results/final' Skipping iteration
[WinError 183] Eine Datei kann nicht erstellt werden, wenn sie bereits vorhanden ist: 'results/final/' Skipping iteration
Config already exists!


Want Overwrite Existing? (y/n) n


INFO || Extracting Mean Features
INFO || Extracting Max Features for types: ['M01', 'M02', 'M04', 'M03']


[32m[I 2021-12-21 16:41:39,742][0m A new study created in memory with name: no-name-e40059dc-eb98-42fa-bf73-d87ca66dcd49[0m


INFO | Split Data X, y ...
INFO | Train-Test Split ...
INFO | Tune Model ...


# Investigate Hyperparameters

In [None]:
from IPython.display import HTML
pd.set_option("display.max_columns", 100)
import plotly.express as px
from plot import plot_residuals, plot_error_per_cat

In [None]:
folder_name ='./results/20211220_WV_ZCR_PV_RMSE_STAT/'

In [None]:
with open(folder_name + 'config.yaml', 'r') as yaml_file:
    configs = yaml.load(yaml_file)
configs

In [None]:
df = load_studies_from_folder(folder_name=folder_name, device_name='mpa')
df = concat_all_studies_df(df)
df = df.sort_values(by='value', ascending=False)

In [None]:
df.head(10)

In [None]:
fig = plot_parallel_cordinates(df, objective_maximize=True)
HTML(fig.to_html())