In [1]:
import os 
os.chdir('/home/oleksandr/UT/Thesis/Implementations/')

In [10]:
from sklearn.model_selection import train_test_split
import faiss
import numpy as np
from scipy.special import softmax
from scipy import stats as st
from tqdm import tqdm
import pandas as pd

In [5]:
from src.common.approximation_helpers import generate_search_space
from src.elegant_fuzzy_genetic_algorithms.helpers.all_params_wrapper import AllEFGAParamsParallelWrapper
from src.elegant_fuzzy_genetic_algorithms.priority_diff_simulation import simulation_priorities
from src.common.fitness import griewank
from src.common.scales import scales

## Empirical difference between closest item priorities for N partitions

In [6]:
resulting_intervals = {}


for n_terms in [3, 5, 7]:
    history = simulation_priorities(N=50, epochs=100, n_terms_priority=n_terms, ndim=5, fitness_fn=griewank, mutation_scale=scales['griewank'][1], 
                                population_scale=scales['griewank'][0])
    
    # Sorting by priorities and finding difference between closest items
    diffs = np.abs(np.diff(np.sort(np.array(history), axis=1)[:, ::-1], axis=1).ravel())

    # Getting 95% confidence interval
    conf_interval = st.t.interval(alpha=0.95, df=len(diffs)-1, loc=np.mean(diffs), scale=st.sem(diffs))

    resulting_intervals[n_terms] = conf_interval
    

  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [00:09<00:00, 10.32it/s]
100%|██████████| 100/100 [00:09<00:00, 10.47it/s]
100%|██████████| 100/100 [00:09<00:00, 10.13it/s]


In [8]:
resulting_intervals_estimation_error = {}

for n_terms in tqdm([3, 5, 7]):
    X = generate_search_space(50)
    y = AllEFGAParamsParallelWrapper(n_terms_priority=n_terms, use_approx=False).infer_priority(X[:, 0], X[:, 1])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1, random_state=42)
    param_index = faiss.IndexFlatL2(X_train.shape[1])
    param_index.add(X_train)

    D, I = param_index.search(X_test, k=10)
    y_pred = (y_train[I] * softmax(D, axis=1)).sum(axis=1)
    errors = np.abs(y_pred.ravel() - y_test.ravel())
    error_confint = st.t.interval(alpha=0.95, df=len(errors)-1, loc=np.mean(errors), scale=st.sem(errors)) 
    resulting_intervals_estimation_error[n_terms] = error_confint

100%|██████████| 3/3 [00:05<00:00,  1.93s/it]


In [11]:
df = []

for n_terms in [3, 5, 7]:
    record = {
        'n_terms': n_terms, 
        'priority_difference_95%_confint': resulting_intervals[n_terms], 
        'priority_estimation_error_95%_confint': resulting_intervals_estimation_error[n_terms]
    }
    df.append(record)

df = pd.DataFrame(df)
df

Unnamed: 0,n_terms,priority_difference_95%_confint,priority_estimation_error_95%_confint
0,3,"(0.07264316891113803, 0.0834474455869724)","(0.004143535086433256, 0.005945887181003547)"
1,5,"(0.08426089359469648, 0.09712742831978542)","(0.004568798208505782, 0.0060947738291328305)"
2,7,"(0.09399059825815591, 0.10860154766821269)","(0.00483617588822278, 0.006179077698847528)"


In [12]:
df.to_csv('results/efga_proof_confints.csv')