In [32]:
import os
import sys
from pathlib import Path

import numpy as np
import xgboost as xgb
from rich.console import Console
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.metrics import accuracy_score

# Repository root: two directory levels above the notebook's working directory
# (os.path.abspath('') resolves to the current working directory).
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath('')))

# Expose the project's `code/` directory to the import system. The membership
# check keeps re-runs of this cell from piling duplicate entries onto sys.path.
CODE_DIR = os.path.join(ROOT_DIR, "code")
if CODE_DIR not in sys.path:
    sys.path.append(CODE_DIR)
from tools.data_loader import TestSet, TestSplit, data_loader
from tools.save import save_as_baseline, save_datasets, save_models
from tools.train import evaluate_models, print_test_samples, train_models, print_problematic_samples
from tools.utils import StructureEncoding, Target, check_xgboost_gpu

# Define global variables
# Data directory located directly under ROOT_DIR; the trailing slash of
# "data/" is preserved in the resulting string.
DATA_DIR = os.path.join(ROOT_DIR, "data/")

# CSV file handed to data_loader (as `data_path`) in the loading cell.
DATA_PATH = os.path.join(DATA_DIR, "data.csv")


In [7]:
# Console used for all logging; record=True makes rich keep a copy of what it prints.
console = Console(record=True)
# NOTE(review): prompt_user is not referenced in the cells visible here — confirm it is still needed.
prompt_user = False

# Structure encoding for this run, announced in the log before anything is loaded.
encoding = StructureEncoding.ATOMIC
console.log(f"[bold green]Started pipeline for {encoding}")

# Prediction target plus the two held-out test sets requested from the loader:
# one split by ROW and one split by STRUCTURE, each configured with size=0.1.
target = Target.DELTA_E
parameter_gen_set = TestSet("Parameter gen.", size=0.1, split=TestSplit.ROW)
structure_gen_set = TestSet("Structure gen.", size=0.1, split=TestSplit.STRUCTURE)
test_sets_cfg = [parameter_gen_set, structure_gen_set]

# Data Loading
X_train, y_train, test_sets = data_loader(
    data_path=DATA_PATH,
    target=target,
    encoding=encoding,
    test_sets_cfg=test_sets_cfg,
    remove_ref_rows=True,
    console=console,
)

Output()

In [22]:
def magnitude(x):
    """Return the base-10 order of magnitude of a positive number.

    Computes ``floor(log10(x))`` and converts it to a Python ``int``; for
    example ``1e-6`` maps to ``-6`` and ``250.0`` maps to ``2``.

    Args:
        x: A strictly positive, finite scalar.

    Returns:
        int: The exponent ``k`` such that ``10**k <= x < 10**(k + 1)``
        (up to floating-point rounding of ``log10``).

    Raises:
        OverflowError: If ``x`` is zero (``log10`` yields ``-inf``).
        ValueError: If ``x`` is negative or NaN (``log10`` yields NaN).
    """
    return int(np.floor(np.log10(x)))

def magnitude_transform(a):
    """Map positive values to their negated order of magnitude, element-wise.

    Applies :func:`magnitude` to every element of ``a`` and flips the sign,
    so ``1e-6`` becomes ``6`` and ``250.0`` becomes ``-2``.

    Args:
        a: Array-like (or scalar) of strictly positive, finite numbers.
            May be empty.

    Returns:
        numpy.ndarray: Integer array with the same shape as ``a``.

    Raises:
        OverflowError, ValueError: Propagated from :func:`magnitude` when an
            element is zero, negative or NaN.
    """
    # An explicit output type lets np.vectorize accept size-0 input (it raises
    # ValueError otherwise) and skips the extra probing call on the first
    # element that it would need to infer the result dtype.
    return -np.vectorize(magnitude, otypes=[int])(a)

In [23]:
# Quick sanity check: one millionth has order of magnitude -6.
x = 1e-6
magnitude(x)

-6

In [26]:
# Class labels for the classifier: the negated base-10 exponent of each
# training target, as produced by magnitude_transform.
magnitude_y_train = magnitude_transform(y_train)

In [27]:
# Report the range of magnitude labels present in the training set.
print("max magnitude: ", magnitude_y_train.max())
print("min magnitude: ", magnitude_y_train.min())

max magnitude:  9
min magnitude:  0


In [28]:
# Classify each sample into its magnitude bucket. A fixed random_state makes
# the fitted forest, and the accuracies reported below, reproducible across
# kernel restarts; 42 is an arbitrary seed.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, magnitude_y_train)

RandomForestClassifier()

In [33]:
# Score the classifier on every held-out test set: convert the true targets
# to magnitude labels, predict labels from the features, and log the accuracy.
for test_name, X_test, y_test in test_sets:
    magnitude_y_test = magnitude_transform(y_test)
    magnitude_y_pred = model.predict(X_test)
    score = accuracy_score(y_true=magnitude_y_test, y_pred=magnitude_y_pred)
    console.print(f"Accuracy {test_name}: {score}")