## Weights & Biases
Zur Nachverfolgung und Analyse der Modell-Performance

### Dataset
- **Train:** Das Modell wird trainiert & validiert; Modell & Pipeline werden serialisiert.
- **Test:** Die serialisierte Pipeline & das serialisierte Modell werden auf das Test-Set angewendet.

In [1]:
# Notebook-wide switch: True loads train_set.csv and executes the train-run cell,
# False loads test_set.csv and executes the test-run cell.
train_run = True

### Initialisieren

In [2]:
from typing import Final

import pandas as pd
import wandb
from joblib import load
from numpy import mean
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

import src.package.importer as im
import src.package.importer_usages as imp_usg
import src.package.ml_helper as ml_helper
import src.package.shared as sh
from sklearn.metrics import mean_absolute_error, mean_squared_error, max_error, mean_absolute_percentage_error

pd.set_option('display.max_rows', 500)  # to show all value_counts

# Model names; `current_model` is the value handed to init_wandb, where it ends
# up as the 'model' entry of the run config (those calls are currently commented out).
MODEL_GRADIENT_BOOSTING: Final = 'GradientBoosting'
MODEL_LINEAR_REGRESSION: Final = 'LinearRegression'
current_model = MODEL_GRADIENT_BOOSTING

# Tags to tell train runs and test runs apart; passed to init_wandb as `tag_set`.
TAG_TRAIN_SET: Final = 'train-set'
TAG_TEST_SET: Final = 'test-set'

def load_dataset():
    """Load and prepare the train or test CSV, selected by the global ``train_run``.

    The raw frame is read with ``im.get_extended_dataset`` and then passed
    through the usage-detail and garage-detail extractors.

    * Train run: outliers are removed (factor 2.3) and
      ``ml_helper.hnf_dataset_full`` is called without a pipeline.
    * Test run: the pipeline stored as ``fitted_pipeline.joblib`` in the export
      directory is loaded and handed to ``ml_helper.hnf_dataset_full``.

    Returns:
        Whatever ``ml_helper.hnf_dataset_full`` returns; the notebook cells
        unpack it as ``X, y``.
    """
    file_name = 'test_set.csv'
    if train_run:
        file_name = 'train_set.csv'

    frame = imp_usg.extract_garage_details(
        imp_usg.extract_usage_details(
            im.get_extended_dataset(f'../package/datasets/{file_name}')
        )
    )

    if not train_run:
        # Reuse the already fitted transform pipeline for the test set
        # (presumably the one serialized during the train run).
        fitted = load(sh.export_directory_path('fitted_pipeline.joblib'))
        return ml_helper.hnf_dataset_full(frame, fitted_pipeline=fitted)

    trimmed = ml_helper.remove_outliers(frame, factor=2.3)
    return ml_helper.hnf_dataset_full(trimmed)

def init_wandb(X, tag_set, model):
    """Log in to Weights & Biases and start a run that describes this experiment.

    Args:
        X: Feature table; its column names (``X.columns.values``) and its row
            count (``len(X.index)``) are stored in the run config.
        tag_set: Tag attached to the run, used as the single entry of ``tags``.
        model: Model name stored under the ``model`` config key.
    """
    wandb.login()

    feature_names = ', '.join(X.columns.values)
    row_count = len(X.index)
    run_description = (
        'Impute volume 416 (clustered), Removed outliers (2.3), Drop all, '
        'OneHotEncoding, Default Hyperparameter'
    )

    config = {
        'model': model,
        'features': feature_names,
        'dataset-count': row_count,
        'description': run_description,
    }
    # Echo the config so it is also visible in the notebook output.
    print(config)

    wandb.init(
        project='Metriken Bauwesen',
        entity='devcore',
        config=config,
        tags=[tag_set],
    )

def wandb_plot_regressor(regr, X, y):
    """Fit ``regr`` on an 80/20 split of ``X``/``y`` and send W&B's regressor plots.

    Note that ``regr`` itself is fitted (in place) on the training part of the
    split; the split is reproducible because ``random_state`` is fixed to 2.

    Args:
        regr: Estimator exposing ``fit``.
        X: Feature data to split.
        y: Target data to split.
    """
    split = train_test_split(X, y, test_size=0.2, random_state=2)
    X_fit, X_holdout, y_fit, y_holdout = split

    regr.fit(X_fit, y_fit)

    wandb.sklearn.plot_regressor(regr, X_fit, X_holdout, y_fit, y_holdout)

### Train-Run

In [3]:
if train_run:
    X, y = load_dataset()
    # W&B tracking is switched off for now; uncomment to track this run.
    # init_wandb(X, TAG_TRAIN_SET, current_model)

    regr = GradientBoostingRegressor(random_state=0)
    scores_map = ml_helper.cross_validation(regr, X, y)

    # Report the mean of every cross-validation metric; the timing entries
    # ('fit_time', 'score_time') are not model metrics and are left out.
    metric_keys = [k for k in scores_map if k not in ('fit_time', 'score_time')]
    for key in metric_keys:
        scores = scores_map[key]
        # wandb.log({f'{key}_mean': mean(scores)})
        print({f'{key}_mean': mean(scores)})

    # wandb_plot_regressor(regr, X, y)

    # Fit on the complete train set and serialize the model so the test run
    # can load it.
    regr.fit(X, y)
    sh.serialize_object(regr, 'fitted_model')

Location: C:\Daten\Development\FHNW\ip5-metriken-bauwesen\export\fitted_pipeline.joblib
{'test_r2_mean': 0.9413018221158677}
{'test_neg_mean_absolute_percentage_error_mean': -0.17279546940067625}
{'test_neg_root_mean_squared_error_mean': -906.51337653121}
{'test_neg_mean_absolute_error_mean': -479.4254723749493}
{'test_max_error_mean': -4147.609003948322}
Location: C:\Daten\Development\FHNW\ip5-metriken-bauwesen\export\fitted_model.joblib


### Test-Run

In [4]:
if not train_run:
    # Evaluate the serialized model on the held-out test set.
    X, y = load_dataset()
    # W&B tracking is switched off for now; uncomment to track this run.
    # init_wandb(X, TAG_TEST_SET, current_model)

    # Model written by the train-run cell via sh.serialize_object(regr, 'fitted_model').
    regr = load(sh.export_directory_path('fitted_model.joblib'))

    predictions = regr.predict(X)
    r2_score = regr.score(X, y)
    print("R^2 value: ", r2_score)

    mse = mean_squared_error(y, predictions)
    print("mean_absolute_error", mean_absolute_error(y, predictions))
    print("mean_absolute_percentage_error", mean_absolute_percentage_error(y, predictions))
    print("mean_squared_error", mse)
    # The train run reports the *root* mean squared error
    # (neg_root_mean_squared_error); print the same quantity here so train and
    # test results can be compared directly. Taking the square root by hand
    # avoids depending on a particular scikit-learn version.
    print("root_mean_squared_error", mse ** 0.5)
    print("max_error", max_error(y, predictions))

    # wandb_plot_regressor(regr, X, y)

# charts.plot_feature_importance(regr.feature_importances_, X.columns, 'GRADIENT BOOSTING')