<a href="https://colab.research.google.com/github/ArtyomKopan/GlowByte-Internship/blob/main/industry/Q_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from random import randint, uniform
from sklearn.metrics import mean_absolute_error

In [None]:
# Load the process dataset from a CSV file located in the working directory.
data = pd.read_csv('optimization_data1.csv')

In [None]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,DateTime,Ni1,S1,Fe3O41,Ni2,S2,Fe3O42,Ni3,S3,Fe3O43,...,ro2,ro3,ro4,ro_final,ust1,ust2,ust3,ust4,v_final,level_final
0,2021-01-01 00:00:00,5.45,22.39,1.5,3.87,17.52,1.7,8.62,13.21,5.0,...,1.6891,2.0167,1.86565,1.8684,0.0,0.0,44,97.0,179,69.2796
1,2021-01-01 00:01:00,5.45,22.39,1.5,3.87,17.52,1.7,8.62,13.21,5.0,...,1.69955,2.029,1.86145,1.8752,0.0,0.0,44,97.0,179,69.1331
2,2021-01-01 00:02:00,5.45,22.39,1.5,3.87,17.52,1.7,8.62,13.21,5.0,...,1.6998,2.0453,1.8521,1.8635,0.0,0.0,44,97.0,179,66.7399
3,2021-01-01 00:03:00,5.45,22.39,1.5,3.87,17.52,1.7,8.62,13.21,5.0,...,1.69845,2.0523,1.8442,1.8572,0.0,0.0,44,97.0,179,64.2247
4,2021-01-01 00:04:00,5.45,22.39,1.5,3.87,17.52,1.7,8.62,13.21,5.0,...,1.6947,2.06035,1.8395,1.8526,0.0,0.0,44,97.0,179,62.2955


In [None]:
# Columns that make up the agent's state vector, in the order they are
# discretized. Grouped below by column-name family for readability.
columns = [
    # columns suffixed 1..4
    'Ni1', 'S1', 'Fe3O41',
    'Ni2', 'S2', 'Fe3O42',
    'Ni3', 'S3', 'Fe3O43',
    'Ni4', 'S4', 'Fe3O44',
    # *_final composition columns (read by the reward function)
    'Ni_final', 'S_final', 'Fe3O4_final',
    # ro* columns
    'ro1', 'ro2', 'ro3', 'ro4', 'ro_final',
    # ust* columns (the setpoints the agent's actions adjust)
    'ust1', 'ust2', 'ust3', 'ust4',
    # remaining *_final columns
    'v_final', 'level_final',
]

Я попробовал использовать обучение с подкреплением (а именно Q-Learning), чтобы варьировать значения уставок, удерживая концентрацию и плотность химических веществ в нужном диапазоне. При этом я не стал выкидывать значения, которые не укладывались в нужные диапазоны, т.к. это может нарушить временную зависимость.

In [None]:
# Q-learning variant based on the algorithm description from Wikiconspects (Викиконспекты)

class QLearningAgent:
    def __init__(self,
                 n_states: int,
                 n_actions: int,
                 alpha: float,
                 gamma: float,
                 data: pd.DataFrame,
                 n_bins: int = 10
                ):
        self.n_states = n_states
        self.n_actions = n_actions
        self.alpha = alpha
        self.gamma = gamma
        self.Q = {}
        self.data = data
        self.total_reward = 0
        self.discrete_values = [-0.05, 0, 0.05]
        self.n_bins = n_bins
        self.bins = [np.linspace(self.data[c].min(), self.data[c].max(), self.n_bins) for c in columns]

    def discretize_state(self, state):
        return (np.digitize(s, b) - 1 for s, b in zip(state, self.bins))

    def T(self, s: int, a: int, j: int) -> int: # функция перехода между состояниями
        i1, i2, i3, i4 = np.unravel_index(int(a), (3, 3, 3, 3))
        delta_ust1 = self.discrete_values[i1]
        delta_ust2 = self.discrete_values[i2]
        delta_ust3 = self.discrete_values[i3]
        delta_ust4 = self.discrete_values[i4]
        current_item = data.iloc[j + 1].copy()
        current_item['ust1'] += delta_ust1
        current_item['ust2'] += delta_ust2
        current_item['ust3'] += delta_ust3
        current_item['ust4'] += delta_ust4
        s_next = self.discretize_state(current_item[columns].values)
        reward = self.R(current_item)
        return s_next, reward

    def R(self, item) -> float: # функция награды
        ro, Ni, S, Fe3O4 = item['ro_final'], item['Ni_final'], item['S_final'], item['Fe3O4_final']
        ro_reward = 1 if 1.85 <= ro <= 2.5 else -abs(ro - 2.175)
        Ni_reward = 1 if 7.275 <= Ni <= 7.725 else -abs(Ni - 7.5)
        S_reward = 1 if 24.735 <= S <= 26.265 else -abs(S - 25.5)
        Fe3O4_reward = 1 if 1.649 <= Fe3O4 <= 1.751 else -abs(Fe3O4 - 1.75)
        return ro_reward + Ni_reward + S_reward + Fe3O4_reward

    def train(self, epochs: int):
        rewards = []
        total_n_iters = 0
        for i in range(epochs):
            n_iters = 0
            iter_reward = 0
            start_state = randint(0, len(data) - 2)
            for j in range(start_state, len(data) - 1):
                n_iters += 1
                total_n_iters += 1
                s = self.discretize_state(data.iloc[start_state][columns].values)
                a = self.argmax(s)
                s_next, r = self.T(s, a, j)
                self.Q[(s_next, a)] = (1 - self.alpha) * self.Q.get((s_next, a), 0.0) + self.alpha * (r + self.gamma * self.maxQ(s_next))
                s = s_next
                self.total_reward += r
                iter_reward += r
            rewards.append(self.total_reward)
            mae_iter = abs(iter_reward) / n_iters
            mae_total = abs(self.total_reward) / total_n_iters
            print(f'Epoch {i + 1}: total reward = {self.total_reward}, MAE_iter = {mae_iter}, MAE_total = {mae_total}')

    def argmax(self, s) -> int:
        current_argmax = 0
        for a in range(self.n_actions):
            if self.Q.get((s, a), 0.0) > self.Q.get((s, current_argmax), 0.0):
                current_argmax = a
        return current_argmax

    def maxQ(self, s) -> float:
        current_max = -1e9
        for a in range(self.n_actions):
            current_max = max(current_max, self.Q.get((s, a), 0.0))
        return current_max

In [None]:
# Build the agent: 81 actions = 3 possible deltas for each of 4 setpoints,
# learning rate 0.1, discount factor 0.99; then run 10 training epochs.
agent = QLearningAgent(
    n_states=10 * len(columns),
    n_actions=81,
    alpha=0.1,
    gamma=0.99,
    data=data,
)
agent.train(epochs=10)

Epoch 1: total reward = 243826.42930122322, MAE_iter = 1.0830087737353233, MAE_total = 1.0830087737353233
Epoch 2: total reward = 261073.51290141587, MAE_iter = 0.8144637136379251, MAE_total = 1.059921534713479
Epoch 3: total reward = 563436.4442031507, MAE_iter = 0.9794019580773755, MAE_total = 1.0151349537744412
Epoch 4: total reward = 841916.5584050857, MAE_iter = 1.0073107194974846, MAE_total = 1.012533519029081
Epoch 5: total reward = 1183188.8195949977, MAE_iter = 0.7153564940732577, MAE_total = 0.9041908016477624
Epoch 6: total reward = 1235264.7336909117, MAE_iter = 0.9474032437640878, MAE_total = 0.9059327961662039
Epoch 7: total reward = 1552012.0449785332, MAE_iter = 0.9113719214546017, MAE_total = 0.9070375780522765
Epoch 8: total reward = 1625046.3633728374, MAE_iter = 0.9693191197936378, MAE_total = 0.909664426459137
Epoch 9: total reward = 1957172.6055607735, MAE_iter = 0.6619053679682744, MAE_total = 0.8553339618751242
Epoch 10: total reward = 1998513.9207575074, MAE_it

Можно сделать вывод, что алгоритм постепенно сходится и его точность улучшается. Однако для полной сходимости нужно больше итераций, а на обычном ноутбуке выполнить их затруднительно. Возможно, существуют библиотеки для RL, позволяющие ускорить вычисления на GPU; их стоит попробовать при более детальной оптимизации.