In [1]:
import os
os.chdir('..')

In [2]:
import numpy as np
from scipy.spatial import distance_matrix
import scipy.stats as st
import faiss
from tqdm import tqdm


from src.common.scales import scales
from src.common.utils import generate_population, mutation, crossover
from src.common.fitness import (griewank, schwefel, ackley, rastrigin)
from src.elegant_fuzzy_genetic_algorithms.helpers.all_params_wrapper import AllEFGAParamsParallelWrapper
from src.elegant_fuzzy_genetic_algorithms.priority_diff_simulation import simulation_priorities
from src.common.approximation_helpers import (generate_search_space, init_param_index, estimate_by_index)
from src.gendered_selection.faster_fuzzy_logic.generalized_partition_inferrer import GeneralizedInferrer
from src.gendered_selection.age_diff_estimator import Simulation as AgeDiffSimulation
from src.gendered_selection.conf.gendered_selection_config import Config as GenderedSelectionConfig


np.random.seed(1)

Given that 90+% of time spent running fuzzy genetic algorithms is spent on using fuzzy logic, the idea is to make an approximation using nearest neighbors. 

## Priority estimation (EFGA)

### Getting the error confidence interval

In [20]:
priority_inferencer = AllEFGAParamsParallelWrapper(n_terms_params=3, n_terms_priority=7, use_approx=False)

In [21]:
c1_range = np.linspace(start=0, stop=1, num=70)
c2_range = np.linspace(start=0, stop=1, num=70)

In [22]:
params_combinations = np.array(np.meshgrid(c1_range, c2_range)).T.reshape(-1, 2)
priorities = priority_inferencer.infer_priority(c1=params_combinations[:, 0], c2=params_combinations[:, 1])

In [23]:
entries = np.random.uniform(0, 1, size=(200, 2))

In [24]:
params_ind = init_param_index(params_combinations, approx_inf=True)
D, I = params_ind.search(entries, 1)

In [18]:
priorities[I.ravel()]

array([[-7.32840033e-01],
       [ 2.39626746e-01],
       [-3.85676711e-01],
       [ 5.10860225e-01],
       [ 4.84343095e-01],
       [ 2.77913890e-01],
       [-4.95247867e-01],
       [-1.13012749e-01],
       [-2.71460925e-01],
       [-4.43611228e-01],
       [ 7.61964131e-01],
       [ 8.41058728e-01],
       [ 4.64187243e-01],
       [ 2.04538238e-01],
       [-1.56245434e-01],
       [ 1.31142563e-01],
       [ 7.61646792e-02],
       [-1.95521368e-01],
       [ 4.40670139e-02],
       [ 1.17566214e-01],
       [ 3.65601423e-01],
       [ 4.57940571e-01],
       [-7.95062769e-01],
       [-6.98840472e-01],
       [ 6.98525604e-01],
       [ 1.81633154e-01],
       [-8.32505681e-01],
       [-4.52743009e-01],
       [-3.20664231e-01],
       [ 5.80963199e-01],
       [ 9.85408271e-02],
       [ 6.02256435e-01],
       [-1.19599802e-01],
       [-2.44776694e-01],
       [ 1.39029247e-01],
       [-2.95636576e-01],
       [ 6.13288314e-02],
       [ 2.64518622e-02],
       [-6.1

In [28]:
priorities_est = priorities[I].ravel()
priorities_actual =  priority_inferencer.infer_priority(c1=entries[:, 0], c2=entries[:, 1])

In [30]:
diff = np.abs(priorities_est - priorities_actual)

In [32]:
conf_int = st.t.interval(alpha=0.95, df=len(diff)-1, loc=np.mean(diff), scale=st.sem(diff)) 
conf_int

(0.0050007552921728865, 0.006088810387731133)

We know the 95% confidence interval of the priority estimation error. If we prove that it's smaller than 95% of differences between the current priority and next best, then usage of this estimation is appropriate. 

### Estimating priority difference

In [33]:
priorities = []

for fn in [griewank, schwefel, rastrigin, ackley]:
    fn_name = fn.__name__
    priorities_ = simulation_priorities(N=100, epochs=200, fitness_fn=fn, population_scale=scales[fn_name][0], 
                      mutation_scale=scales[fn_name][1], seed=1)
    priorities.append(priorities_)

100%|██████████| 200/200 [00:22<00:00,  9.05it/s]
100%|██████████| 200/200 [00:24<00:00,  8.04it/s]
100%|██████████| 200/200 [00:24<00:00,  8.20it/s]
100%|██████████| 200/200 [00:20<00:00,  9.76it/s]


In [34]:
priorities_mtr = np.array(priorities)
priorities_mtr = np.sort(priorities_mtr, axis=1)
priority_difference_mtr = np.diff(priorities_mtr, axis=1)
priority_difference =  priority_difference_mtr.ravel()

In [35]:
conf_int_actual_diff = st.t.interval(alpha=0.95, df=len(priority_difference)-1, loc=np.mean(priority_difference), scale=st.sem(priority_difference)) 

In [36]:
conf_int_actual_diff, conf_int

((0.026625617917013247, 0.027913588650583764),
 (0.0050007552921728865, 0.006088810387731133))

In [37]:
first_second_diff = np.abs(np.diff(priority_difference_mtr[:, :2], axis=1).ravel())
conf_int_first_second = st.t.interval(alpha=0.95, df=len(first_second_diff)-1, loc=np.mean(first_second_diff), scale=st.sem(first_second_diff)) 

In [38]:
conf_int_first_second

(0.05903070832699521, 0.09724286275938558)

### Conclusion

Provided that the actual difference between first and second entry, as well as difference is much bigger than actual errror of approximation (even in the worst case the error is 10 times smaller), we can successfully use approximation techniques. 

## Partner age estimation (Gendered selection)

### Getting actual error for age estimation

In [40]:
params_combinations = generate_search_space(n_splits=200, ranges=[(0, 1), (0, 10)])
gi = GeneralizedInferrer(7)

In [41]:
y = [gi.infer_partner_age(*params_combinations[i, :]) for i in tqdm(range(params_combinations.shape[0]))]

100%|██████████| 40000/40000 [03:44<00:00, 178.01it/s]


In [42]:
y = np.array(y)

In [43]:
entries_test = np.hstack([np.random.uniform(0, 1, size=(200, 1)), np.random.uniform(0, 10, size=(200, 1))])
# age_est = y[np.argmin(distance_matrix(entries_test, params_combinations), axis=1)]
age_actual = np.array([gi.infer_partner_age(*entries_test[i, :]) for i in tqdm(range(entries_test.shape[0]))])

100%|██████████| 200/200 [00:01<00:00, 180.72it/s]


In [44]:
params_ind = init_param_index(params_combinations, approx_inf=True)
D, I = params_ind.search(entries_test, 1)
age_est = y[I.ravel()]

In [45]:
diff = np.abs(age_est - age_actual)
conf_int_age_error = st.t.interval(alpha=0.95, df=len(diff)-1, loc=np.mean(diff), scale=st.sem(diff)) 
conf_int_age_error

(0.0008273533625253398, 0.0011571999001171419)

### Estimating confidence intervals for age difference for females

In [46]:

ages = []

for fn in [griewank, schwefel, rastrigin, ackley]:
    fn_name = fn.__name__
    sim = AgeDiffSimulation(conf=GenderedSelectionConfig(100), fitness_fn=fn, mutation=mutation, crossover=crossover)
    ages_ = sim.run(n_epochs=200, n_partitions=7, population_scale=scales[fn_name][0], mutation_scale=scales[fn_name][1], seed=1)
    ages.append(ages_)

  0%|          | 1/200 [00:00<00:20,  9.84it/s]

[-249.35633071 -246.12872579 -245.56878051 -219.57428921 -219.41105259
 -218.75044051 -213.22218905 -210.92547902 -209.88358201 -201.62965166]


 67%|██████▋   | 134/200 [00:13<00:06,  9.79it/s]


Early stopping at epoch: 134, population died


  0%|          | 1/200 [00:00<00:20,  9.80it/s]

[-2587.49367996 -2495.30734206 -2439.01083313 -2421.61577749
 -2378.82107373 -2378.82107373 -2328.32286449 -2230.96814427
 -2222.00363909 -2194.21488081]


100%|██████████| 200/200 [00:18<00:00, 10.53it/s]
  0%|          | 1/200 [00:00<00:25,  7.91it/s]

[-164.04324218 -142.81374051 -124.27624217 -110.47992893 -109.60358413
 -108.34798061 -108.12620076 -107.1076225  -105.39232329 -101.18524974]


100%|██████████| 200/200 [00:29<00:00,  6.89it/s]
  0%|          | 0/200 [00:00<?, ?it/s]

[-22.237698   -22.2148927  -22.00886026 -21.99755589 -21.98829155
 -21.97395631 -21.96101749 -21.95221456 -21.94448626 -21.91396373]


 51%|█████     | 102/200 [00:09<00:09, 10.32it/s]

Early stopping at epoch: 102, population died





In [47]:
diffs_age = np.concatenate([np.concatenate([np.diff(np.sort(i)) for i in ages[j]]) for j in range(len(ages))])
conf_int_age_diff = st.t.interval(alpha=0.95, df=len(diffs_age)-1, loc=np.mean(diffs_age), scale=st.sem(diffs_age)) 
conf_int_age_diff

(0.002322242730583462, 0.0025186984783038373)

In [48]:
diffs_age = np.concatenate([np.concatenate([np.diff(np.sort(i)[:2]) for i in ages[j]]) for j in range(len(ages))])
conf_int_age_diff = st.t.interval(alpha=0.95, df=len(diffs_age)-1, loc=np.mean(diffs_age), scale=st.sem(diffs_age)) 
conf_int_age_diff

(0.01107746234098387, 0.023435579298824324)

### Conclusions

Given that upper range of 95% confidence interval for error is smaller than lower range of 95% range of age difference between adjacent female ages, we can conclude that error is small enought to not be noticeable in partner selection. 