# PSyKE's demo for regression comparison

Some imports.

In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from os.path import exists
from psyke import Extractor
import time
from psyke.utils import Target

import warnings
warnings.simplefilter("ignore")

Loading black-box models

In [2]:
PATH = "../test/resources/"

# Dataset each predictor belongs to; parallel to `model_names` and `models`.
datasets = ["contingency", "contingency", "anticipate", "anticipate", "contingency", "anticipate"]

# File names of the pickled black-box predictors, one per experiment.
model_names = [
    "CONTINGENCY_no_input-memory_DecisionTree_MaxDepth10",
    "CONTINGENCY_no_input-time_DecisionTree_MaxDepth10",
    "ANTICIPATE_no_input-memory_DecisionTree_MaxDepth10",
    "ANTICIPATE_no_input-time_DecisionTree_MaxDepth10",
    "CONTINGENCY_input-cost_DecisionTree_MaxDepth15",
    "ANTICIPATE_input-cost_DecisionTree_MaxDepth15"
]


def load_predictor(dataset, name):
    """Unpickle the predictor stored at ``{PATH}predictors/{dataset}/{name}``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the object has been loaded.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(f"{PATH}predictors/{dataset}/{name}", 'rb') as file:
        return pickle.load(file)


# Loaded predictors, in the same order as `datasets` / `model_names`.
models = [load_predictor(dataset, name) for name, dataset in zip(model_names, datasets)]

Function to pre-process data

In [3]:
def process(dataset):
    """Build the summary feature table for ``dataset`` and save it as CSV.

    Reads ``{PATH}datasets/{dataset}_trainDataset.csv``, replaces the ``PV(kW)``
    and ``Load(kW)`` vectors of every row with their mean and standard
    deviation, copies the remaining columns of interest under shorter names and
    writes the result to ``{PATH}datasets/{dataset}.csv``.

    Parameters
    ----------
    dataset : str
        Dataset name used to build both the input and the output file name.

    Returns
    -------
    pandas.DataFrame
        Table with columns ``PV_mean``, ``PV_std``, ``Load_mean``, ``Load_std``,
        ``nScenarios``, ``cost``, ``time`` and ``memo``.
    """
    df = pd.read_csv(f"{PATH}datasets/{dataset}_trainDataset.csv")

    # Drop rows that merely repeat the header line inside the file.
    df = df[df['sol(keuro)'] != 'sol(keuro)']

    def parse_vector(entry):
        # Drop the first and last character (presumably the enclosing brackets
        # -- confirm against the raw CSV) and parse the whitespace-separated
        # numbers. `np.float_` was removed in NumPy 2.0, hence `dtype=float`.
        return np.array(entry[1:-1].split(), dtype=float)

    # Parsed into local Series instead of being written back onto the filtered
    # frame, which avoids chained-assignment on a possible view.
    pv = df['PV(kW)'].map(parse_vector)
    load = df['Load(kW)'].map(parse_vector)

    X = pd.DataFrame()

    X['PV_mean'] = pv.map(lambda values: values.mean())
    X['PV_std'] = pv.map(lambda values: values.std())
    X['Load_mean'] = load.map(lambda values: values.mean())
    X['Load_std'] = load.map(lambda values: values.std())
    X['nScenarios'] = df['nScenarios']
    X['cost'] = df['sol(keuro)']
    X['time'] = df['time(sec)']
    X['memo'] = df['memAvg(MB)']

    X.to_csv(f"{PATH}datasets/{dataset}.csv", index = False)

    return X

Experiment setting

In [4]:
# Summary-statistic columns shared by several of the settings below.
_stat_columns = ['PV_mean', 'PV_std', 'Load_mean', 'Load_std']

# Columns to drop for each experiment (parallel to `datasets` / `models`).
toRemove = [
    _stat_columns + [other_target, 'cost']
    for other_target in ('time', 'memo', 'time', 'memo')
] + [["time", "memo"], ["time", "memo"]]

# Input features of each experiment.
features = [
    ["nTraces"],
    ["nTraces"],
    ["nScenarios"],
    ["nScenarios"],
    _stat_columns + ['nTraces'],
    _stat_columns + ['nScenarios'],
]

# Output column predicted in each experiment.
targets = ["memo", "time"] * 2 + ["cost"] * 2

Computational time assessment for ORCHiD w.r.t. different numbers of input features and input instances.
Results are averaged over 100 executions.

In [5]:
# Index of the experiment to run; selects the matching entry of every settings list.
i = 0
rem, feat, target, dataset, model = toRemove[i], features[i], targets[i], datasets[i], models[i]

N_RUNS = 100  # timed extractions per configuration
INSTANCE_COUNTS = [100, 500, 1000, 2000, 4000, 7000, 10000]

print(dataset, target, len(feat), rem[:-2], rem[-2:])
print()
name = f"{PATH}datasets/{dataset}.csv"

# Build the pre-processed CSV only if it is not already on disk.
if not exists(name):
    process(dataset)

# From here on `dataset` is the loaded table rather than the dataset name.
# The last two entries of `rem` are dropped up front; the others are removed
# one per iteration of the loop below.
dataset = pd.read_csv(name).drop(rem[-2 :], axis = 1)

# r == -1 is the pass with all remaining columns; each later pass drops one more.
for r in range(-1, len(rem) - 2):
    if r >= 0:
        dataset = dataset.drop([rem[r]], axis = 1)

    train, test = train_test_split(dataset, test_size=0.1, random_state=10)
    # The last column is used as the regression output, all others as inputs.
    model.fit(train.iloc[:, :-1], train.iloc[:, -1])

    print(f"{len(dataset.columns) - 1} variables\n")
    for j in INSTANCE_COUNTS:
        res = []
        # A dedicated loop variable keeps the experiment index `i` intact
        # (the original inner loop reused `i`, leaving it at 99 afterwards).
        for run in range(N_RUNS):
            print(f"{j} instances: {run} runs", end="\r")
            # perf_counter is monotonic and high-resolution, unlike time.time().
            t0 = time.perf_counter()
            orchid = Extractor.orchid(model, depth=1, error_threshold=.8, output=Target.REGRESSION)
            # NOTE(review): iloc[:j] silently yields fewer than j rows when
            # `train` is shorter than j -- confirm the dataset sizes.
            _ = orchid.extract(train.iloc[:j, :])
            t1 = time.perf_counter()
            res.append(t1 - t0)
        res = np.array(res)
        print(f'{j} instances: {np.mean(res):.2f} +- {np.std(res):.2f} sec')
    print()

contingency memo 1 ['PV_mean', 'PV_std', 'Load_mean', 'Load_std'] ['time', 'cost']

5 variables

100 instances: 0.06 +- 0.03 sec
500 instances: 0.06 +- 0.01 sec
1000 instances: 0.09 +- 0.03 sec
2000 instances: 0.17 +- 0.03 sec
4000 instances: 0.36 +- 0.02 sec
7000 instances: 0.92 +- 0.08 sec
10000 instances: 1.68 +- 0.15 sec

4 variables

100 instances: 0.07 +- 0.01 sec
500 instances: 0.07 +- 0.01 sec
1000 instances: 0.09 +- 0.01 sec
2000 instances: 0.15 +- 0.02 sec
4000 instances: 0.37 +- 0.03 sec
7000 instances: 0.97 +- 0.07 sec
10000 instances: 1.76 +- 0.07 sec

3 variables

100 instances: 0.06 +- 0.01 sec
500 instances: 0.08 +- 0.01 sec
1000 instances: 0.09 +- 0.01 sec
2000 instances: 0.14 +- 0.02 sec
4000 instances: 0.33 +- 0.03 sec
7000 instances: 0.83 +- 0.07 sec
10000 instances: 1.75 +- 0.12 sec

2 variables

100 instances: 0.04 +- 0.01 sec
500 instances: 0.05 +- 0.01 sec
1000 instances: 0.08 +- 0.01 sec
2000 instances: 0.13 +- 0.01 sec
4000 instances: 0.32 +- 0.03 sec
7000 ins