In [2]:
import os 
os.chdir('..')

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
from tqdm import tqdm
import itertools 
from matplotlib import pyplot as plt
import pickle as pkl


from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import median_absolute_error


In [3]:
from src.elegant_fuzzy_genetic_algorithms.helpers.generalized_param_inferrer import GeneralizedParamInferencer
from src.elegant_fuzzy_genetic_algorithms.helpers.all_params_wrapper import AllEFGAParamsParallelWrapper
from src.elegant_fuzzy_genetic_algorithms.param_diff_simulation import simulation_param_diff
from src.common.fitness import (griewank, schwefel, ackley, rastrigin)
from src.common.approximation_helpers import generate_search_space, init_param_index
from src.common.scales import scales

In [4]:
gpi = AllEFGAParamsParallelWrapper(3, 3)

In [5]:
priorities = []

for fn in [griewank, schwefel, rastrigin, ackley]:
    fn_name = fn.__name__
    priorities_ = simulation_param_diff(N=100, epochs=200, fitness_fn=fn, population_scale=scales[fn_name][0], 
                      mutation_scale=scales[fn_name][1], seed=1)
    priorities.append(priorities_)

priorities_df = list(itertools.chain(*priorities))
priorities_df = pd.DataFrame(priorities_df)

100%|██████████| 200/200 [00:02<00:00, 69.61it/s] 
100%|██████████| 200/200 [00:01<00:00, 174.69it/s]
100%|██████████| 200/200 [00:00<00:00, 332.30it/s]
100%|██████████| 200/200 [00:00<00:00, 316.74it/s]


In [6]:
priorities_df

Unnamed: 0,xRate,mRate,subPopSize,best_fitness,avg_fitness,avg_fit_change
0,[0.669579],[0.02357091],[0.185216],0.100325,0.265879,0.037544
1,[0.66371304],[0.021830916],[0.1854116],0.100325,0.249000,0.015726
2,[0.6641028],[0.021830916],[0.185216],0.100325,0.254578,0.006794
3,[0.6647559],[0.021830916],[0.1854116],0.100325,0.252894,0.003249
4,[0.6650075],[0.021830916],[0.185216],0.100325,0.254069,0.002857
...,...,...,...,...,...,...
795,[0.85499007],[0.03799035],[0.2169291],0.929401,0.962536,0.000937
796,[0.8580492],[0.038121507],[0.2169291],0.929401,0.964902,0.000505
797,[0.8543547],[0.03775251],[0.21667424],0.929401,0.961737,0.001031
798,[0.85517293],[0.03799035],[0.2169291],0.929401,0.963227,0.000200


In [7]:
param_chg = np.vstack([np.diff(pd.DataFrame(priorities[i])[['xRate', 'mRate', 'subPopSize']].to_numpy(), axis=0) for i in range(len(priorities))])

In [8]:
keys = list(priorities[0][0].keys())[:3]

for i in range(len(keys)):
    param_stats = np.abs(param_chg[:, i])
    conf_int = st.t.interval(alpha=0.95, df=len(param_stats)-1, loc=np.mean(param_stats), scale=st.sem(param_stats)) 
    print(f'{keys[i]=}, {conf_int=}')

keys[i]='xRate', conf_int=(array([0.00104294]), array([0.00136852]))
keys[i]='mRate', conf_int=(array([5.99071666e-05]), array([9.05979811e-05]))
keys[i]='subPopSize', conf_int=(array([0.00011543]), array([0.00016387]))


The goal in this case is to obtain max error confidence interval higher range lower than lower interval of error

In [9]:
priorities_df.best_fitness.min(), priorities_df.best_fitness.max()

(0.10032480226915341, 0.9294013403865408)

In [10]:
priorities_df.avg_fitness.min(), priorities_df.avg_fitness.max()

(0.016547844920411454, 0.972623828244773)

In [11]:
priorities_df.avg_fit_change.min(), priorities_df.avg_fit_change.max()

(0.0, 0.03754400608188846)

In [12]:
search_space = generate_search_space((20, 20, 20), [(priorities_df.best_fitness.min(), priorities_df.best_fitness.max(),), 
                                           (priorities_df.avg_fitness.min(), priorities_df.avg_fitness.max()), 
                                           (priorities_df.avg_fit_change.min(), priorities_df.avg_fit_change.max())])


In [13]:
y = [gpi.infer(*search_space[i]) for i in tqdm(range(search_space.shape[0]))]

100%|██████████| 8000/8000 [00:28<00:00, 282.10it/s]


In [14]:
y_ml = pd.DataFrame(y)['xRate']

In [134]:
! pip install xgboost

Collecting xgboost
  Downloading xgboost-1.7.5-py3-none-manylinux2014_x86_64.whl (200.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.3/200.3 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-1.7.5


In [15]:
from sklearn.neural_network import MLPRegressor

In [16]:
from xgboost import XGBRegressor

In [17]:
mlpr = XGBRegressor(n_estimators=10000).fit(search_space, y_ml)

In [18]:
N = 2000
np.random.seed(1)
test_set = np.vstack([np.random.uniform(priorities_df.best_fitness.min(), priorities_df.best_fitness.max(), N), 
                      np.random.uniform(priorities_df.avg_fitness.min(), priorities_df.avg_fitness.max(), N), 
                    np.random.uniform(priorities_df.avg_fit_change.min(), priorities_df.avg_fit_change.max(), N), 

                      ]).T
test_set

array([[0.44606796, 0.56818251, 0.01347252],
       [0.69752894, 0.85348599, 0.02454445],
       [0.10041963, 0.59838264, 0.02127211],
       ...,
       [0.78740777, 0.08676222, 0.00098332],
       [0.12602448, 0.57082051, 0.03569789],
       [0.885235  , 0.71784585, 0.02139699]])

In [19]:
y_actual = [gpi.infer(*test_set[i]) for i in tqdm(range(test_set.shape[0]))]

100%|██████████| 2000/2000 [00:11<00:00, 177.70it/s]


In [21]:
mlpr.predict(test_set)[:]

array([0.7100048, 0.7100048, 0.691    , ..., 0.565378 , 0.691    ,
       0.7100048], dtype=float32)

In [22]:
param_index = init_param_index(search_space)

In [23]:
np.array(y)

array([{'xRate': array([0.5214198], dtype=float32), 'mRate': array([0.01017134], dtype=float32), 'subPopSize': array([0.19896418], dtype=float32)},
       {'xRate': array([0.5226926], dtype=float32), 'mRate': array([0.01017134], dtype=float32), 'subPopSize': array([0.19613487], dtype=float32)},
       {'xRate': array([0.5213062], dtype=float32), 'mRate': array([0.01017134], dtype=float32), 'subPopSize': array([0.19720678], dtype=float32)},
       ...,
       {'xRate': array([0.86451346], dtype=float32), 'mRate': array([0.0383985], dtype=float32), 'subPopSize': array([0.21667424], dtype=float32)},
       {'xRate': array([0.86451346], dtype=float32), 'mRate': array([0.0383985], dtype=float32), 'subPopSize': array([0.21667424], dtype=float32)},
       {'xRate': array([0.86451346], dtype=float32), 'mRate': array([0.0383985], dtype=float32), 'subPopSize': array([0.21667424], dtype=float32)}],
      dtype=object)

In [24]:
D, I = param_index.search(test_set, k=1)


In [25]:
y_pred = pd.DataFrame(list(itertools.chain(*np.array(y)[I].tolist())))

In [26]:
diff_df = y_pred - pd.DataFrame.from_records(y_actual)

In [27]:
for col in diff_df.columns:
    diff_df[col] = np.abs(diff_df[col])
    diff_arr = diff_df[col]
    conf_int = st.t.interval(alpha=0.95, df=len(diff_arr)-1, loc=np.mean(diff_arr), scale=st.sem(diff_arr)) 
    print(col, conf_int)

    

xRate (array([0.00625767]), array([0.00869486]))
mRate (array([0.00051845]), array([0.00069701]))
subPopSize (array([0.0007201]), array([0.00107408]))


In [28]:
for col in ['xRate', 'mRate', 'subPopSize']:
    X_train, X_test, y_train, y_test = train_test_split(priorities_df[['best_fitness', 'avg_fitness', 'avg_fit_change']], 
                                                        priorities_df[col])
    mlpr = XGBRegressor(tree_method="hist", max_depth=1).fit(X_train, y_train)
    y_pred = mlpr.predict(X_test)
    print(f'Rate per 1000: {median_absolute_error(y_pred, y_test) * 1000}')
    print(f'Bias: {median_absolute_error(y_pred, y_test)}')

Rate per 1000: 0.7379055023193359
Bias: 0.0007379055023193359
Rate per 1000: 0.0700727105140686
Bias: 7.00727105140686e-05
Rate per 1000: 0.13290345668792725
Bias: 0.00013290345668792725


Provided that most errors are centered around 0 (there's no systematic overstating or understating of some metric), as well as rates being less than <.15 per 1000, the approximation would be considered correct. Let's train a model and then use it for the future inference. 

In [29]:
for col in ['xRate', 'mRate', 'subPopSize']:
    mlpr = XGBRegressor(tree_method="hist",).fit(priorities_df[['best_fitness', 'avg_fitness', 'avg_fit_change']], 
                                                        priorities_df[col])
    
    with open(f'./indices/xgb_{col}.pkl', 'wb') as f:
        pkl.dump(mlpr, f)
    

In [36]:
search_space = generate_search_space((50, 50, 50), [(priorities_df.best_fitness.min(), priorities_df.best_fitness.max(),), 
                                           (priorities_df.avg_fitness.min(), priorities_df.avg_fitness.max()), 
                                           (priorities_df.avg_fit_change.min(), priorities_df.avg_fit_change.max())])

In [37]:
for col in ['xRate', 'mRate', 'subPopSize']:
    mlpr = XGBRegressor(tree_method="hist",).fit(priorities_df[['best_fitness', 'avg_fitness', 'avg_fit_change']], 
                                                        priorities_df[col])
    
    y = mlpr.predict(search_space)

    
    with open(f'./indices/xgb_{col}_y.pkl', 'wb') as f:
        pkl.dump(y, f)
    

In [38]:
search_space = init_param_index(search_space)
    

In [39]:
import faiss

In [40]:
faiss.write_index(search_space, './indices/params.index')

In [4]:
gpi  =GeneralizedParamInferencer(3)

In [5]:
gpi.infer(.21, .22, .2)

{'xRate': 0.654632827324478,
 'mRate': 0.021219402277039844,
 'subPopSize': 0.1871930958501653}

In [6]:
from src.elegant_fuzzy_genetic_algorithms.helpers.param_inference_approximation import ParamInferenceApprox

In [7]:
pia = ParamInferenceApprox()

TypeError: Wrong number or type of arguments for overloaded function 'new_IndexIVFFlat'.
  Possible C/C++ prototypes are:
    faiss::IndexIVFFlat::IndexIVFFlat(faiss::Index *,size_t,size_t,faiss::MetricType)
    faiss::IndexIVFFlat::IndexIVFFlat(faiss::Index *,size_t,size_t)
    faiss::IndexIVFFlat::IndexIVFFlat()


In [None]:
pia.infer(.21, .22, .2)

{'xRate': array([0.6537803], dtype=float32),
 'mRate': array([0.0218903], dtype=float32),
 'subPopSize': array([0.18691383], dtype=float32)}