## Weights & Biases
Zur Nachverfolgung und Analyse der Model-Performance

### Dataset
- **Train:** Modell wird trainiert & validiert; Modell & Pipeline werden serialisiert
- **Test:** Serialisierte Pipeline & serialisiertes Modell werden auf das Test-Set angewendet

In [1]:
# Notebook switches read by the cells below.
# True: execute the train cell (cross-validate, fit and serialize the model);
# False: execute the test cell, which loads the serialized model instead.
train_run = True
# True: skip logging the cross-validation means and sync a wandb regressor
# plot instead (only evaluated inside the train cell).
residuals_plot = False
# True: call charts.plot_feature_importance() for the fitted model.
plot_save_feature_importance = False

### Initialisieren

In [2]:
from typing import Final

import wandb
import pandas as pd
from joblib import load
from numpy import mean
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, max_error, mean_absolute_percentage_error
from sklearn.model_selection import RepeatedKFold

import src.package.consts as c
import src.package.importer as im
import src.package.importer_usages as imp_usg
import src.package.ml_helper as ml_helper
import src.package.shared as sh
import src.package.charts as charts

pd.set_option('display.max_rows', 500)  # to show all value_counts

# Model identifiers; `current_model` is stored in the wandb run config and
# passed to the feature-importance chart.
# NOTE(review): the train cell always builds a GradientBoostingRegressor, so
# changing `current_model` alone only changes the label — confirm intent.
MODEL_GRADIENT_BOOSTING: Final = 'GradientBoosting'
MODEL_LINEAR_REGRESSION: Final = 'LinearRegression'
current_model = MODEL_GRADIENT_BOOSTING

# wandb tags attached to a run by init_wandb().
TAG_TRAIN_SET: Final = 'train-set'
TAG_TEST_SET: Final = 'test-set'

# Options forwarded to imp_usg.extract_usage_details() by load_dataset();
# they are also encoded into the feature-importance chart label.
HIGHEST_ONLY: Final = True # requires LabelEncoder to be activated
INCLUDE_GARAGES: Final = True
COMBINE_GARAGES: Final = True
HO_MAX_FIELDS: Final = 1 # available for 'HIGHEST_ONLY' only – max possible = 4 (primary, secondary, tertiary and quaternary)
HO_PERCENTAGE_ONLY: Final = True # available for 'HIGHEST_ONLY' only – requires to deactivate LabelEncoder
# Passed as `loss=` to GradientBoostingRegressor in the train cell.
# NOTE(review): 'lad' was renamed to 'absolute_error' in scikit-learn 1.0 and
# removed in 1.2 — confirm the pinned scikit-learn version before upgrading.
LOSS_FUNCTION: Final = 'lad'

def load_dataset(residual_test_run: bool = False):
    """Load the data set and hand it to ``ml_helper.ml_dataset_full``.

    The usage details are extracted according to the module-level
    ``HIGHEST_ONLY`` / ``INCLUDE_GARAGES`` / ``COMBINE_GARAGES`` /
    ``HO_MAX_FIELDS`` / ``HO_PERCENTAGE_ONLY`` options, and the resulting new
    fields are used as additional features next to ``FIELD_AREA_MAIN_USAGE``.

    Args:
        residual_test_run: When true, take the "test" path even during a
            train run, i.e. keep outliers and reuse the serialized pipeline.

    Returns:
        Whatever ``ml_helper.ml_dataset_full`` returns; callers in this
        notebook unpack it as ``X, y``.
    """
    # Alternative source: 'train_set.csv' if train_run else 'test_set.csv'
    dataset_file = 'full_dataset.csv'
    frame = im.get_extended_dataset(f'../package/datasets/{dataset_file}')

    usage_options = dict(
        highest_only=HIGHEST_ONLY,
        include_garages=INCLUDE_GARAGES,
        combine_garages=COMBINE_GARAGES,
        max_fields=HO_MAX_FIELDS,
        percentages_only=HO_PERCENTAGE_ONLY,
    )
    frame, usage_fields = imp_usg.extract_usage_details(frame, **usage_options)

    # Alternative base feature: c.FIELD_AREA_TOTAL_FLOOR_416
    features = [c.FIELD_AREA_MAIN_USAGE, *usage_fields]

    # Alternative targets: c.FIELD_AREA_USAGE, c.FIELD_AREA_MAIN_USAGE
    target = c.FIELD_AREA_TOTAL_FLOOR_416

    use_stored_pipeline = residual_test_run or not train_run
    if use_stored_pipeline:
        # Reuse the transform pipeline that was serialized earlier.
        stored_pipeline = load(sh.export_directory_path('fitted_pipeline.joblib'))
        return ml_helper.ml_dataset_full(frame,
                                         field_to_predict=target,
                                         additional_features=features,
                                         fitted_pipeline=stored_pipeline)

    # Plain train run: drop outliers first; no pipeline is passed in.
    # NOTE(review): presumably ml_dataset_full then fits and exports a new
    # pipeline — confirm in ml_helper.
    frame = ml_helper.remove_outliers(frame, factor=2.3)
    return ml_helper.ml_dataset_full(frame,
                                     field_to_predict=target,
                                     additional_features=features)


def init_wandb(X, tag_set, model, related_run=None):
    """Log in to Weights & Biases and start a run describing this experiment.

    Args:
        X: Feature frame; its column names and row count go into the config.
        tag_set: Tag attached to the run (e.g. ``TAG_TRAIN_SET``).
        model: Model identifier stored in the run config.
        related_run: Optional name of a related run; only added to the config
            when it is not ``None``.
    """
    wandb.login()

    feature_names = ', '.join(X.columns.values)
    row_count = len(X.index)
    optional_entries = {} if related_run is None else {'related_run': related_run}

    run_config = {
        'model': model,
        'features': feature_names,
        'dataset-count': row_count,
        'description': 'New Dataset, Impute volume 416 (clustered), Removed outliers (2.3), Drop all, OneHotEncoding, Default Hyperparameter',
        **optional_entries,
    }

    print(run_config)
    wandb.init(project='Metriken Bauwesen',
               entity='devcore',
               config=run_config,
               tags=[tag_set])


### Train-Run

In [3]:
if train_run:
    # Train run: cross-validate the regressor, log the mean scores to wandb,
    # then fit on the full training data and serialize the model so the test
    # cell can load it.
    X, y = load_dataset()
    init_wandb(X, TAG_TRAIN_SET, current_model)

    regr = GradientBoostingRegressor(random_state=0,
                                     learning_rate=0.03551,
                                     subsample=0.528,
                                     n_estimators=1024,
                                     max_depth=8,
                                     min_samples_split=5,
                                     loss=LOSS_FUNCTION)

    scores_map = ml_helper.cross_validation(regr, X, y, cv=RepeatedKFold(n_splits=5, n_repeats=10, random_state=0))

    # log result of cross validation to wandb
    if not residuals_plot:
        for key, scores in scores_map.items():
            # timing entries are not model metrics
            if key in {'fit_time', 'score_time'}:
                continue
            metric = {f'{key}_mean': mean(scores)}
            wandb.log(metric)
            print(metric)

    # Fit exactly once on the full training data. The fitted model is shared
    # by the feature-importance plot, the serialization and the residual plot;
    # with the fixed random_state a second fit on the same data would only
    # repeat the same work.
    regr.fit(X, y)

    if plot_save_feature_importance:
        PO = '_PO' if HO_PERCENTAGE_ONLY else ''
        KOMB = '_KOMB' if COMBINE_GARAGES else ''
        HO = '_HO' if HIGHEST_ONLY else '_OHE'
        GRG = '_GRG' if INCLUDE_GARAGES else ''
        SET_TYPE = 'TRAIN'  # this cell only runs when train_run is true
        charts.plot_feature_importance(regr.feature_importances_,
                                       X.columns,
                                       current_model,
                                       save_label=f"{SET_TYPE}_{LOSS_FUNCTION}{HO}{GRG}{KOMB}_{HO_MAX_FIELDS}{PO}")

    # serialize model for test set
    sh.serialize_object(regr, 'fitted_model')

    # sync residual plot
    if residuals_plot:
        X_test, y_test = load_dataset(residual_test_run=True)
        wandb.sklearn.plot_regressor(regr, X, X_test, y, y_test)

Location: D:\Development\FHNW\ip5-metriken-bauwesen\export\cluster_means.joblib
Location: D:\Development\FHNW\ip5-metriken-bauwesen\export\fitted_pipeline.joblib
{'model': 'GradientBoosting', 'features': 'num_floors_underground, num_floors_overground, garage_combined, total_expenses, primary_percentage, volume_total_416, area_main_usage, x0_ANDERES, x0_BEHERBERGUNG, x0_BETRIEB, x0_BUERO, x0_GESUNDHEIT, x0_HALLEN, x0_KULTUS_KULTUR, x0_OFFENE_BAUTEN, x0_SCHULEN, x0_TECHNIK, x0_WOHNEN', 'dataset-count': 194, 'description': 'New Dataset, Impute volume 416 (clustered), Removed outliers (2.3), Drop all, OneHotEncoding, Default Hyperparameter'}
{'test_r2_mean': 0.935587805818986}
{'test_neg_mean_absolute_percentage_error_mean': -0.13113324185130057}
{'test_neg_root_mean_squared_error_mean': -1817.9294513911161}
{'test_neg_mean_absolute_error_mean': -812.3490011255444}
{'test_max_error_mean': -8638.913090441836}
Location: D:\Development\FHNW\ip5-metriken-bauwesen\export\fitted_model.joblib


### Test-Run

In [4]:
if not train_run:
    # Test run: load the serialized model, predict on the loaded data set and
    # log the error metrics to wandb. Error metrics are negated so that they
    # follow the same "higher is better" sign convention as the keys logged
    # by the train cell.
    X, y = load_dataset()
    init_wandb(X, TAG_TEST_SET, current_model, related_run='peach-universe-66')

    regr = load(sh.export_directory_path('fitted_model.joblib'))
    predictions = regr.predict(X)

    r2_score = regr.score(X, y)
    print("R^2 value: ", r2_score)
    wandb.log({'test_r2_mean': r2_score})

    # The MSE gets its own key; it was previously logged under the RMSE key.
    neg_mean_squared_error = -mean_squared_error(y, predictions, squared=True)
    print("neg_mean_squared_error", neg_mean_squared_error)
    wandb.log({'test_neg_mean_squared_error_mean': neg_mean_squared_error})

    # Log the RMSE value itself (previously the MSE value was logged again).
    # NOTE(review): the `squared` argument is deprecated in newer scikit-learn
    # releases — confirm the pinned version before upgrading.
    neg_root_mean_squared_error = -mean_squared_error(y, predictions, squared=False)
    print("neg_root_mean_squared_error", neg_root_mean_squared_error)
    wandb.log({'test_neg_root_mean_squared_error_mean': neg_root_mean_squared_error})

    neg_mean_absolute_error = -mean_absolute_error(y, predictions)
    print("neg_mean_absolute_error", neg_mean_absolute_error)
    wandb.log({'test_neg_mean_absolute_error_mean': neg_mean_absolute_error})

    neg_mape = -mean_absolute_percentage_error(y, predictions)
    print("neg_mean_absolute_percentage_error", neg_mape)
    wandb.log({'test_neg_mean_absolute_percentage_error_mean': neg_mape})

    neg_max_error = -max_error(y, predictions)
    print("neg_max_error", neg_max_error)
    wandb.log({'test_max_error_mean': neg_max_error})

    if plot_save_feature_importance:
        # NOTE(review): this refits the loaded model on the evaluation data,
        # so the chart shows importances of that refit, not of the serialized
        # model. It runs after all metrics were logged, so they are unaffected
        # — confirm that the refit is intended.
        regr = regr.fit(X, y)
        PO = '_PO' if HO_PERCENTAGE_ONLY else ''
        KOMB = '_KOMB' if COMBINE_GARAGES else ''
        HO = '_HO' if HIGHEST_ONLY else '_OHE'
        GRG = '_GRG' if INCLUDE_GARAGES else ''
        SET_TYPE = 'TEST'  # this cell only runs when train_run is false
        charts.plot_feature_importance(regr.feature_importances_,
                                       X.columns,
                                       current_model,
                                       save_label=f"{SET_TYPE}_{LOSS_FUNCTION}{HO}{GRG}{KOMB}_{HO_MAX_FIELDS}{PO}")
