In [1]:
import xgboost as xgb
import pickle
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Загрузим данные. Я их немножно подправил ещё до загрузки.
df = pd.read_excel("data/data_00.xlsx")

# Уберём ненужные столбцы и пустые значения
df = df.drop(columns=["Номер образца", "Адсорбент"]).dropna()

# Ещё один нюанс
df = df[df["Sme, м2/г"] != "-"]

# Сменим тип данных у таблицы
df["Sme, м2/г"] = df["Sme, м2/г"].astype(np.float64, copy=False)
df["m (соли), г"] = df["m (соли), г"].astype(np.float64, copy=False)
df["Vпр. (р-ля), мл"] = df["Vпр. (р-ля), мл"].astype(np.float64, copy=False)

# У нас есть 3 категориальных признака, которых мы закодируем числами
list_of_cats = ["Металл", "Лиганд", "Растворитель"]
cat2id = {cat:{v:i for i, v in enumerate(df[cat].drop_duplicates().values)} for cat in list_of_cats}
id2cat = {cat:{i:v for i, v in enumerate(df[cat].drop_duplicates().values)} for cat in list_of_cats}
for cat in list_of_cats:
    df[cat] = df[cat].apply(lambda x: cat2id[cat][x])
    
y_cols = ['W0, см3/г', 'SБЭТ, м2/г']
X_cols = ['Tрег, ᵒС\n', 'Металл', 'Лиганд', 'Растворитель', 'm (соли), г', 'm(кис-ты), г', 'Vсин. (р-ля), мл', 'Т.син., °С', 'Vпр. (р-ля), мл', 'Т суш., °С']

In [3]:
X = df[X_cols]
y1 = df[y_cols[0]]
y2 = df[y_cols[1]]

In [4]:
y = y1

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [6]:
model_1 = pickle.load(open("xgbregressor_01.pkl", "rb"))

In [7]:
scaler = pickle.load(open("scaler.pkl", "rb"))

In [8]:
model_2 = pickle.load(open("xgbregressor_02.pkl", "rb"))

In [9]:
model_1.predict(scaler.transform(X.values[0,:].reshape((1,10))))

array([0.5028835], dtype=float32)

In [10]:
model_2.predict(scaler.transform(X.values[0,:].reshape((1,10))))

array([1320.786], dtype=float32)

In [11]:
y1[0], y2[0]

(0.52, 1234.0)

In [12]:
X.values[0,:]

array([130.   ,   0.   ,   0.   ,   0.   ,   1.074,   0.491,  28.   ,
       130.   , 150.   , 130.   ])

# Постановка задачи

Требуется подобрать такие параметры, чтобы W0 и SБЭТ были минимальными.

#### Предельные значения для всех данных

In [13]:
max_X = X.max().values
min_X = X.min().values
max_y1, max_y2 = y1.max(), y2.max()
min_y1, min_y2 = y1.min(), y2.min()

Функция для генерации случайного значения

In [14]:
RANDOM_SEED = 42

In [15]:
import random

In [16]:
def generate_random_init():
    return [random.uniform(min_X[i], max_X[i]) for i in range(10)]

Общая ошибка, в которой каждому `y` отбодится равная доля

In [17]:
max_y1, max_y2

(1.07, 3036.4)

In [18]:
min_y1, min_y2

(0.1630870864808529, 416.9)

In [19]:
max_y1 - min_y1 # 100%

0.9069129135191472

In [20]:
true_y1 = .56
pred_y1 = .23

In [21]:
diff_y1 = abs(true_y1 - pred_y1)

In [22]:
50 * diff_y1 / (max_y1 - min_y1)

18.1935881097713

In [23]:
def equal_error(y1_pred, y2_pred, y1_true, y2_true):
    y1_part = abs(y1_true - y1_pred) / (max_y1 - min_y1)
    y2_part = abs(y2_true - y2_pred) / (max_y2 - min_y2)
    return 50 * y1_part + 50 * y2_part

In [24]:
equal_error(0.6, 1000, y1[0], y2[0])

8.877068055184797

In [25]:
equal_error(0.19, 1230, y1[0], y2[0])

18.269938558330182

In [26]:
max_y2 - min_y2

2619.5

In [27]:
y2_pred = (max_y2 - min_y2) * np.random.rand(len(y1),1) + min_y2

In [28]:
y1_pred = (max_y1 - min_y1) * np.random.rand(len(y1),1) + min_y1

In [29]:
y_pred = np.hstack((y1_pred, y2_pred))

In [30]:
y_pred.shape

(312, 2)

In [31]:
y_true = df[y_cols].values

In [32]:
def mean_equal_error(y_pred, y_true):
    return np.mean([equal_error(*y_pred[i], *y_true[i]) for i in range(len(y_true))])

In [33]:
mean_equal_error(y_pred, y_true)

32.75445775982288

# Генетический алгоритм

Надо получить такие значения входных параметром, чтобы на выходе получить такие числа:

In [34]:
Y_TRUE = [*y_true[0]]
Y_TRUE

[0.52, 1234.0]

## 1. Создадим популяцию

In [35]:
POPULATION = 10

In [36]:
X_population = np.array([generate_random_init() for i in range(POPULATION)])

In [37]:
y1_prediction = model_1.predict(scaler.transform(X_population))
y2_prediction = model_2.predict(scaler.transform(X_population))

In [38]:
y_pred_population = np.vstack((y1_prediction, y2_prediction)).T

## 2. Вычислим вероятность выбора каждой хромосомы

In [39]:
errors = [equal_error(*y_pred, *Y_TRUE) for y_pred in y_pred_population]

In [40]:
np.mean(errors)

17.2248394875733

In [41]:
reverse_coefs = [1/i for i in errors]

In [42]:
choice_coefs = [r/sum(reverse_coefs) for r in reverse_coefs]

In [43]:
thresh_proba = []
thresh = 0
for proba in choice_coefs:
    thresh += proba
    thresh_proba.append(thresh)

## 3. Выбираем родителей

In [44]:
def take_parent(thresh_proba):
    rand_th = random.random()
    parent = None
    for i, th in enumerate(thresh_proba):
        if th > rand_th:
            return i

In [45]:
parent_1, parent_2 = take_parent(thresh_proba), take_parent(thresh_proba)
parent_1, parent_2 = X_population[parent_1], X_population[parent_2]

## 4. Скрещивание

In [46]:
new_generation = []
for a in range(POPULATION):
    ancestor = []
    for i in range(len(parent_1)):
        parent_1, parent_2 = take_parent(thresh_proba), take_parent(thresh_proba)
        parent_1, parent_2 = X_population[parent_1], X_population[parent_2]
        ancestor.append(random.choice((parent_1[i], parent_2[i])))
    new_generation.append(ancestor)

In [47]:
X_population = np.array(new_generation)

## Теперь ол-тугезер

In [48]:
POPULATION = 100
Y_TRUE = [*y_true[0]]
N_GENERATIONS = 50

In [68]:
def create_first_population(POPULATION = 100):
    return np.array([generate_random_init() for i in range(POPULATION)])

In [50]:
X_population = create_first_population()

In [51]:
history = []

In [52]:
def evaluate_population(X_population):
    y1_prediction = model_1.predict(scaler.transform(X_population))
    y2_prediction = model_2.predict(scaler.transform(X_population))
    y_pred_population = np.vstack((y1_prediction, y2_prediction)).T
    errors = [equal_error(*y_pred, *Y_TRUE) for y_pred in y_pred_population]
    return np.mean(errors), min(errors), errors

In [53]:
e_mean, e_min, errors = evaluate_population(X_population)
print(f"Средняя ошибка: {e_mean}; Минимальная ошибка: {e_min}")
history.append((e_mean, e_min))

Средняя ошибка: 14.797456305222017; Минимальная ошибка: 4.161540898761777


In [54]:
P_MUTATION = .1

In [55]:
def evolve(X_population):
    
    _, _, errors = evaluate_population(X_population)
    reverse_coefs = [1/i for i in errors]
    choice_coefs = [r/sum(reverse_coefs) for r in reverse_coefs]
    
    # Thresh
    thresh_proba = []
    thresh = 0
    for proba in choice_coefs:
        thresh += proba
        thresh_proba.append(thresh)
    
    #Take parents
    parent_1, parent_2 = take_parent(thresh_proba), take_parent(thresh_proba)
    parent_1, parent_2 = X_population[parent_1], X_population[parent_2]
    
    new_generation = []
    for a in range(POPULATION):
        ancestor = []
        for i in range(len(parent_1)):
            parent_1, parent_2 = take_parent(thresh_proba), take_parent(thresh_proba)
            parent_1, parent_2 = X_population[parent_1], X_population[parent_2]
            ancestor.append(random.choice((parent_1[i], parent_2[i])))
        # Mutation
        #ancestor = [a if random.random() > P_MUTATION else a * (1 + (-P_MUTATION) ** random.randint(0,1)) for a in ancestor]
        new_generation.append(ancestor)
    X_population = np.array(new_generation)
    return X_population

In [56]:
X_population = create_first_population()

history = [(evaluate_population(X_population))]

In [57]:
import tqdm

In [58]:
for generation in tqdm.tqdm(range(N_GENERATIONS)):
    X_population = evolve(X_population)
    history.append(evaluate_population(X_population))

100%|██████████| 50/50 [00:00<00:00, 84.82it/s]


## Запишем в функцию

In [69]:
def generate_synth_pars(y_true, POPULATION = 100, N_GENERATIONS = 50):
    Y_TRUE = y_true
    X_population = create_first_population(POPULATION)
    history = [(evaluate_population(X_population))]
    for generation in range(N_GENERATIONS):
        X_population = evolve(X_population)
        history.append(evaluate_population(X_population))
    return X_population[np.argmin(history[-1][2])]

# Проверка всего решения

In [60]:
def restore_normalization(z_value, col):
    min_, max_ = normalization_weights[col]
    value = (z_value - 0.01) * (max_ - min_) + min_
    return value

def NSWE(true, pred):
    list_of_allowed_err = [10, 0, 1, 2, 3, 3, 5, 1, 10, 5]
    list_of_importancy = [(abs(10 - i) + 5) / 110 for i in list_of_allowed_err]
    return sum(abs(true - pred) * np.array(list_of_importancy))

def normalize(value, col):
    min_, max_ = normalization_weights[col]
    z_value = (value - min_) / (max_ - min_)
    return z_value + 0.01

In [71]:
# Загрузим данные. Я их немножно подправил ещё до загрузки.
df = pd.read_excel("data/data_00.xlsx")

# Уберём ненужные столбцы и пустые значения
df = df.drop(columns=["Номер образца", "Адсорбент"]).dropna()

# Ещё один нюанс
df = df[df["Sme, м2/г"] != "-"]

# Сменим тип данных у таблицы
df["Sme, м2/г"] = df["Sme, м2/г"].astype(np.float64, copy=False)
df["m (соли), г"] = df["m (соли), г"].astype(np.float64, copy=False)
df["Vпр. (р-ля), мл"] = df["Vпр. (р-ля), мл"].astype(np.float64, copy=False)

# У нас есть 3 категориальных признака, которых мы закодируем числами
list_of_cats = ["Металл", "Лиганд", "Растворитель"]
cat2id = {cat:{v:i for i, v in enumerate(df[cat].drop_duplicates().values)} for cat in list_of_cats}
id2cat = {cat:{i:v for i, v in enumerate(df[cat].drop_duplicates().values)} for cat in list_of_cats}
for cat in list_of_cats:
    df[cat] = df[cat].apply(lambda x: cat2id[cat][x])
    
y_cols = ['W0, см3/г', 'SБЭТ, м2/г']
X_cols = ['Tрег, ᵒС\n', 'Металл', 'Лиганд', 'Растворитель', 'm (соли), г', 'm(кис-ты), г', 'Vсин. (р-ля), мл', 'Т.син., °С', 'Vпр. (р-ля), мл', 'Т суш., °С']

cols = list(df.columns)
normalization_weights = {}
for col in cols:
    min_ = df.loc[:,col].min()
    max_ = df.loc[:,col].max()
    normalization_weights[col] = min_, max_
    
for col in cols:
    df.loc[:,col] = df.loc[:,col].apply(lambda value: normalize(value, col))
X = df[X_cols]
y = df[y_cols]

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=1)
X_generated = []
for y in tqdm.tqdm(y_test.values):
    X_generated.append(generate_synth_pars(y, POPULATION=1000, N_GENERATIONS=100))

gen = pd.DataFrame(X_generated)
gen.columns = X_test.columns

for col in cols:
    if col in gen.columns:
        gen.loc[:,col] = gen.loc[:,col].apply(lambda value: normalize(value, col))

trues = X_test.values
preds = gen.values

MNSWE = 0
for pred, true in zip(preds, trues):
    MNSWE += NSWE(true, pred)
MNSWE /= len(preds)
print(MNSWE)

100%|██████████| 78/78 [01:23<00:00,  1.08s/it]

0.380628160303491





Ошибка работы генетического алгоритма: 38%