In [None]:
import pandas as pd
import numpy as np

from src.data import plot_calibrate_result, get_predictions

from sklearn.linear_model import Ridge

from sko.GA import GA
from sko.DE import DE

import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Search-space settings shared by the cells below.
SKIP = 5         # `eval` multiplies each candidate index by this stride when naming columns
N_RECIPES = 330  # bound used to size the GA search range and the greedy selection loop

# Test / train tables read from data/raw.
df_test = pd.read_csv('data/raw/data_raw_10_test.csv')
df_train = pd.read_csv('data/raw/data_raw_10_train.csv')

# Alternative input files, kept for reference:
# df_test = pd.read_csv('data/array_size/test.csv')
# df_train = pd.read_csv('data/array_size/train.csv')

In [None]:
from sklearn.metrics import mean_squared_error


def eval(p):
    """Return the RMSE obtained with the feature columns selected by ``p``.

    Every entry of ``p`` is truncated with ``int`` and multiplied by ``SKIP``
    to form a column index ``k``; the columns ``r_k``, ``g_k`` and ``b_k``
    (in that order) become the features. A ``Ridge(alpha=0.02)`` model is
    fitted on ``df_train`` and scored on ``df_test`` against the three
    ``conc_*`` target columns.

    NOTE: the name shadows Python's built-in ``eval``; it is kept because
    other cells pass this function to the GA by that name.
    NOTE(review): the score used as the GA objective is computed on
    ``df_test`` - confirm that using the test split for selection is intended.
    """
    targets = ["conc_water", "conc_co2", "conc_nh3"]
    # One (r, g, b) column triple per selected index, using strided indices.
    features = [
        f"{channel}_{int(idx) * SKIP}"
        for idx in p
        for channel in ("r", "g", "b")
    ]

    train_x, train_y = df_train[features].values, df_train[targets].values
    test_x, test_y = df_test[features].values, df_test[targets].values

    ridge = Ridge(alpha=0.02)
    ridge.fit(train_x, train_y)
    return np.sqrt(mean_squared_error(test_y, ridge.predict(test_x)))

In [None]:
from multiprocessing import Pool

def ga_optimization(x):
    """Run one seeded GA search over ``eval`` and return its best objective value.

    ``x`` is indexed positionally: ``x[0]`` is the number of genes (``n_dim``)
    and ``x[1]`` is the value passed to ``np.random.seed`` before the search.
    Each gene is searched between 0 and ``N_RECIPES // SKIP`` with
    ``precision=1``. The best solution is printed and ``best_y[0]`` returned.
    """
    n_dim = x[0]
    random_state = x[1]
    np.random.seed(random_state)

    lower_bounds = [0] * n_dim
    upper_bounds = [N_RECIPES // SKIP] * n_dim
    solver = GA(
        func=eval,
        n_dim=n_dim,
        size_pop=50,
        max_iter=200,
        prob_mut=0.03,
        lb=lower_bounds,
        ub=upper_bounds,
        precision=1,
    )
    best_y = solver.run()[1]
    print(f'best_x: {solver.best_x} ; best_y: {best_y[0]} ;')
    return best_y[0]

In [None]:
# For every subset size 1..3, run `ga_optimization` N times in parallel
# (seeds 1..N) and collect the best RMSE reported by each run.
best_rmses = []    # one list per subset size, one entry per seed
best_rmses_x = []  # subset size of every run, flattened (x values for plotting)
best_rmses_y = []  # best RMSE of every run, flattened in the same order
N = 10             # runs per subset size; also the number of worker processes
for n_dim in range(1, 4):
    print(f"------------------- {n_dim} -------------------")
    items = [(n_dim, seed) for seed in range(1, N + 1)]
    pool = Pool(N)
    try:
        run_rmses = pool.map(ga_optimization, items)
    finally:
        # Shut the workers down even when a run raises, so a failed sweep
        # does not leave N idle processes behind. close() + join() lets the
        # workers exit normally rather than being terminated.
        pool.close()
        pool.join()
    best_rmses_x.extend([n_dim] * len(run_rmses))
    best_rmses_y.extend(run_rmses)
    best_rmses.append(list(run_rmses))

In [None]:
# Save the sweep results: one row per entry of `best_rmses`, one `rmse_<i>`
# column per run.
from pathlib import Path

rmse_csv = Path("results/array_size/best_rmses.csv")
# to_csv does not create missing folders, so make sure the target exists.
rmse_csv.parent.mkdir(parents=True, exist_ok=True)
pd.DataFrame(best_rmses, columns=[f"rmse_{i}" for i in range(N)]).to_csv(rmse_csv, index=False)

In [None]:
# Best RMSE of every GA run against the number of genes used in that run.
fig, ax = plt.subplots()
ax.scatter(best_rmses_x, best_rmses_y)
ax.set_xlabel("n_dim")
ax.set_ylabel("best RMSE")
ax.set_title("Best GA RMSE per run")
plt.show()

# Full GA

In [None]:
# Single large GA run selecting 10 indices at once.
n_dim = 10
# `eval` multiplies every gene by SKIP before building column names, so the
# search range is expressed in strided units, matching `ga_optimization`.
# Using N_RECIPES directly would address columns up to N_RECIPES * SKIP.
# NOTE(review): if the GA treats `ub` as inclusive, the largest gene maps to
# column index N_RECIPES - confirm that column exists in the data.
ga = GA(
    func=eval, n_dim=n_dim,
    size_pop=200, max_iter=1000, prob_mut=0.02,
    lb=[0] * n_dim, ub=[N_RECIPES // SKIP] * n_dim,
    precision=1
)
best_x, best_y = ga.run()
print('best_x:', best_x, '\n', 'best_y:', best_y)

In [None]:
# Convergence view of the full GA run, built from `ga.all_history_Y`.
Y_history = pd.DataFrame(ga.all_history_Y)
fig, ax = plt.subplots(nrows=2, ncols=1)
upper_ax, lower_ax = ax

# Upper panel: every recorded objective value, as red dots against the row index.
upper_ax.plot(Y_history.index, Y_history.values, '.', color='red')

# Lower panel: running minimum of the per-row minimum (best value seen so far).
running_best = Y_history.min(axis=1).cummin()
running_best.plot(kind='line', ax=lower_ax)
plt.show()

# Iterative

In [None]:
# Greedy forward selection: in each of 10 rounds, try adding every index that
# is not selected yet and keep the one whose `get_predictions` RMSE is lowest.
selected = []
for _round in range(10):
    # Reset the incumbent every round. Starting from +inf (instead of a fixed
    # threshold) and from best_i = None guarantees that a stale index from a
    # previous round is never appended again and that best_i is always bound.
    best_rmse = float("inf")
    best_i = None
    for i in range(N_RECIPES):
        if i in selected:
            continue
        candidate = selected + [i]
        _unused, rmse = get_predictions(candidate, model_type=Ridge, alpha=0.02, DF=df_train)
        if rmse < best_rmse:
            best_rmse = rmse
            best_i = i
    if best_i is None:
        # No candidate produced a comparable RMSE (e.g. all NaN, or nothing
        # left to try); stop instead of appending an invalid index.
        break
    selected.append(best_i)

In [None]:
# Display the indices chosen by the greedy selection loop, in the order they were added.
selected