In [2]:
import os
import sys
from pathlib import Path

import numpy as np
import xgboost as xgb
from rich.console import Console
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, mean_absolute_percentage_error

ROOT_DIR = os.path.dirname(
os.path.dirname(os.path.abspath('')))


sys.path.append(os.path.join(ROOT_DIR, "code"))
from tools.data_loader import TestSet, TestSplit, data_loader
from tools.save import save_as_baseline, save_datasets, save_models
from tools.train import evaluate_models, print_test_samples, train_models, print_problematic_samples
from tools.utils import StructureEncoding, Target, check_xgboost_gpu

# Define global variables
DATA_DIR = os.path.join(ROOT_DIR, "data/")

DATA_PATH = os.path.join(DATA_DIR, "data.csv")


In [3]:
console = Console(record=True)
prompt_user = False

encoding = StructureEncoding.ATOMIC

console.log(f"[bold green]Started pipeline for {encoding}")
target = Target.DELTA_E
test_sets_cfg = [
    TestSet("Parameter gen.", size=0.1, split=TestSplit.ROW),
    TestSet("Structure gen.", size=0.1, split=TestSplit.STRUCTURE),
]

# Data Loading
X_train, y_train, test_sets = data_loader(
    target=target,
    encoding=encoding,
    data_path=DATA_PATH,
    test_sets_cfg=test_sets_cfg,
    console=console,
    remove_ref_rows=True,
)

In [31]:
base = 10
def magnitude(x):
    return int(np.ceil(np.log(x) / np.log(base)))

def magnitude_inv(x):
    return base**float(x-1)
    # for the regression transformation, taking the prediction minus 1 yields way better MAPE scores
    # I think that's because predicting a result with an order of magnitude superior to the actual value
    # yields an APE error of 10 (1000%) while predicting a result with an order of magnitude inferior to the actual value
    # yields an APE error of at most 1 (100%)

def magnitude_transform(a):
    return np.vectorize(magnitude)(a)

def magnitude_inv_transform(a):
    return np.vectorize(magnitude_inv)(a)

In [23]:
x = 1E-6
magnitude(x)

-5

In [24]:
magnitude_y_train = magnitude_transform(y_train)

In [25]:
print("max magnitude: ", np.max(magnitude_y_train))
print("min magnitude: ", np.min(magnitude_y_train))

max magnitude:  1
min magnitude:  -8


In [26]:
model = RandomForestClassifier()
model.fit(X_train, magnitude_y_train)

RandomForestClassifier()

In [32]:
for test_name, X_test, y_test in test_sets:
    magnitude_y_pred = model.predict(X_test)
    magnitude_y_test = magnitude_transform(y_test)
    score = accuracy_score(magnitude_y_test, magnitude_y_pred)
    mape = mean_absolute_percentage_error(y_test, magnitude_inv_transform(magnitude_y_pred))
    console.print(f"Accuracy {test_name}: {score}")
    console.print(f"MAPE {test_name}: {mape}")

In [33]:
class RegModel:
    def __init__(self, class_model):
        self.class_model = class_model

    def predict(self, X):
        return magnitude_inv_transform(self.class_model.predict(X))
reg_models = {"RandomForest": RegModel(model)}

In [34]:
evaluate_models(reg_models, X_train, y_train, test_sets, console)

In [35]:
print_test_samples(reg_models, test_sets, console)