In [4]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler, LabelEncoder 
from sklearn.pipeline import Pipeline   
from preprocess_func_and_classes import BasePeakMarker, read_emg8
from sklearn.base import BaseEstimator, TransformerMixin
# Метрики
from sklearn.metrics import classification_report
# Для аннотаций
from typing import List, Any

In [39]:
# Работа с табличными данными

import pandas as pd
import numpy as np


# Визуализация

import plotly.express as px
import plotly.io as pio
pio.templates.default = 'plotly_dark'
pio.renderers.default = 'notebook'
from motorica.emg8.utils import fig_montage # кастомная функция визуализации


# Пайплайн

# константы
from motorica.emg8.constants import *
# чтение данных и разметка по фактическим жестам
from motorica.emg8.pipeline import read_emg8
from motorica.emg8.markers import BasePeakMarker, TransMarker
# создание экземпляра пайплайна
from motorica.emg8.pipeline import create_grad_logreg_pipeline


# Метрики
from sklearn.metrics import classification_report


# Для вывода докстрингов с форматированием markdown
from IPython.display import Markdown as md

# Работа с файлами
import os

# Для оценки скорости инференса
from time import time

# Сериализация модели (пайплайна)
import pickle

In [None]:
# ----------------------------------------------------------------------------------------------
# ПАРАМЕТРЫ ЧТЕНИЯ ИСХОДНЫХ ДАННЫХ И РАЗМЕТКИ

DATA_DIR = 'data/new'

N_OMG_CH = 16         # количество каналов OMG-датчиков
OMG_COL_PRFX = 'omg'  # префикс в названиях столбцов датафрейма, соответствующих OMG-датчикам
STATE_COL = 'state'   # столбец с названием жеста, соответствующего команде
CMD_COL = 'id'        # столбец с меткой команды на выполнение жеста
TS_COL = 'ts'         # столбец метки времени

NOGO_STATE = 'Neutral'      # статус, обозначающий нейтральный жест
BASELINE_STATE = 'Baseline' # доп. служебный статус в начале монтажа
FINISH_STATE   = 'Finish'   # доп. служебный статус в конце монтажа

# Список с названиями всех столбцов OMG
OMG_CH = [OMG_COL_PRFX + str(i) for i in range(N_OMG_CH)]

# Новые столбцы:
SYNC_COL = 'sample'   # порядковый номер размеченного жеста
GROUP_COL = 'group'   # новый группы (цикла протокола)
TARGET = 'act_label'  # таргет (метка фактически выполняемого жеста)

# ----------------------------------------------------------------------------------------------

In [7]:
# --------------------------------------------------------------------------------------------
# ВИЗУАЛИЗАЦИЯ ДАННЫХ

import plotly.express as px


def fig_montage(
        fig_data: pd.DataFrame,
        title: str = '', 
        width: int = 1200, 
        height: int = 700,
        mult_labels: int = 1_000_000,
        **extra_labels
        ):

    fig_data = fig_data.copy()

    for extra_label in extra_labels:
        fig_data[extra_label] = extra_labels[extra_label] * mult_labels
        
    fig = px.line(fig_data, width=width, height=height, title=title)
    fig.update_traces(line=dict(width=1))
    return fig

In [8]:
# Папка с файлами данных
DATA_DIR = 'data/new'

montages = sorted(filter(lambda f: f.endswith('.emg8'), os.listdir(DATA_DIR)))
montages
['2024-12-02_14-03-05.emg8',
 '2024-12-04_12-22-13.emg8',
 '2024-12-09_11-22-43.emg8']
montage = montages[-1]
gestures_raw = pd.read_csv(os.path.join(DATA_DIR, montage), sep=' ')
fig_montage(
    gestures_raw[OMG_CH], y_cmd=gestures_raw['id'], 
    title=f"<i>{montage}</i> – исходные данные"
).show()

```py 
# Функция для объединения данных из нескольких файлов
def combine_data_from_files(file_list, dir=DATA_DIR):
    combined_data = pd.DataFrame()
    for file in file_list:
        data = read_emg8(file, dir)
        combined_data = pd.concat([combined_data, data], ignore_index=True)
    return combined_data

# Список файлов для объединения
file_list = ['example1.emg8', 'example2.emg8', 'example3.emg8']  # Замените на ваши файлы

# Объединение данных из нескольких файлов
combined_data(data) = combine_data_from_files(file_list)
```

In [None]:
# Преобразование данных в последовательности
def create_sequences(X, y, timesteps):
    X = []
    y = []
    for i in range(len(X) - timesteps):
        X.append(X[i:i + timesteps, :-1])
        y.append(y[i + timesteps, -1])
    return np.array(X), np.array(y)

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, LayerNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import optuna
import time

# Функция для создания модели LSTM
def create_lstm_model(input_shape, n_units=64, dropout_rate=0.01, learning_rate=0.001):
    model = Sequential()
    model.add(LSTM(n_units, input_shape=input_shape, return_sequences=True))
    model.add(LayerNormalization())
    model.add(Dropout(dropout_rate))
    model.add(LSTM(n_units, return_sequences=False))
    model.add(LayerNormalization())
    model.add(Dense(y_train_encoded.shape[1], activation='relu'))
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
    return model

def optimize_lstm(X_train, y_train, X_val, y_val):
    def objective(trial):
        n_units = trial.suggest_int('n_units', 32, 128)
        dropout_rate = trial.suggest_float('dropout_rate', 0.01, 0.05)
        learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)
        model = create_lstm_model((X_train.shape[1], X_train.shape[2]), n_units, dropout_rate, learning_rate)
        early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
        model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=50, batch_size=64, callbacks=[early_stopping], verbose=0)
        val_loss, val_accuracy = model.evaluate(X_val, y_val, verbose=0)
        return val_accuracy

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=20)
    return study.best_params


IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html



In [11]:
def build_evaluate_model(timesteps, X_train_seq, y_train_encoded, X_test_seq, y_test_encoded):
    # Разделение данных на тренировочные и валидационные наборы
    train_size = int(len(X_train_seq) * 0.8)
    X_train, X_val = X_train_seq[:train_size], X_train_seq[train_size:]
    y_train, y_val = y_train_encoded[:train_size], y_train_encoded[train_size:]

    # Оптимизация гиперпараметров
    best_params = optimize_lstm(X_train, y_train, X_val, y_val)

    # Построение модели с оптимальными гиперпараметрами
    model = create_lstm_model((timesteps, X_train_seq.shape[2]), **best_params)
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    model.fit(X_train_seq, y_train_encoded, epochs=30, batch_size=64, validation_split=0.2, callbacks=[early_stopping], verbose=1)

    # Оценка на тестовых данных
    test_loss, test_accuracy = model.evaluate(X_test_seq, y_test_encoded, verbose=0)
    print(f"Test accuracy: {test_accuracy}")

    return model

In [None]:
# Пример использования
_, _, _, _, data = read_emg8(montage)
timesteps = 2  # Значение по умолчанию

# Кодирование меток
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(data[TARGET])
# Преобразование меток в one-hot encoding
y_sequences = to_categorical(y_sequences, num_classes=8)
# Создание последовательностей
X, y = create_sequences(np.hstack((data[OMG_CH].values, y_encoded.reshape(-1, 1))), timesteps)

# Разделение данных по временной оси
train_size = int(len(X) * 0.8)
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Убедимся, что метки имеют целочисленный тип
y_train = y_train.astype(int)
y_test = y_test.astype(int)

# Преобразование меток в one-hot encoding
#y_train_encoded = np.eye(len(label_encoder.classes_))[y_train]
#y_test_encoded = np.eye(len(label_encoder.classes_))[y_test]

In [20]:
from tensorflow.keras.utils import to_categorical

# Преобразование меток в one-hot кодировку
y_train_encoded = to_categorical(y_train)
y_test_encoded = to_categorical(y_test)

In [38]:
X_train

array([[[14614175., 12879374.,  8433556., ..., 11699413., 13505758.,
         12006664.],
        [14614702., 12880539.,  8419582., ..., 11702904., 13494134.,
         12011390.]],

       [[14614702., 12880539.,  8419582., ..., 11702904., 13494134.,
         12011390.],
        [14611442., 12874160.,  8414936., ..., 11694616., 13490237.,
         12005435.]],

       [[14611442., 12874160.,  8414936., ..., 11694616., 13490237.,
         12005435.],
        [14607506., 12859053.,  8409412., ..., 11690328., 13481402.,
         11996330.]],

       ...,

       [[14908660., 14268160.,  8906202., ...,  9530849., 12186936.,
         10661653.],
        [14907594., 14272252.,  8906464., ...,  9524150., 12172009.,
         10650993.]],

       [[14907594., 14272252.,  8906464., ...,  9524150., 12172009.,
         10650993.],
        [14902554., 14266725.,  8902855., ...,  9512562., 12160654.,
         10652869.]],

       [[14902554., 14266725.,  8902855., ...,  9512562., 12160654.,
        

In [24]:
y_train

array([0, 0, 0, ..., 0, 0, 0])

# Исправить кодировку

In [25]:
label_encoder.classes_

array([0, 1, 2, 3, 4, 5, 6, 7], dtype=int64)

In [37]:
y_train[::5]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
# Построение и оценка модели
model = build_evaluate_model(timesteps, X_train, y_train, X_test, y_test)

# Оценка модели на тестовых данных
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)


[I 2024-12-12 14:55:45,502] A new study created in memory with name: no-name-ae2def04-4eef-49da-9fb1-7f521a532a37

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.


Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.

[W 2024-12-12 14:55:45,814] Trial 0 failed with parameters: {'n_units': 61, 'dropout_rate': 0.157485558489788, 'learning_rate': 2.64403596586558e-05} because of the following error: ValueError('Arguments `target` and `output` must have the same rank (ndim). Received: target.shape=(None,), output.shape=(None, 8)').
Traceback (most recent call last):
  File "c:\Users\meleh\AppData\Local\Programs\Python\Python39\lib\site-packages\optuna\study\_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  Fi

ValueError: Arguments `target` and `output` must have the same rank (ndim). Received: target.shape=(None,), output.shape=(None, 8)

In [None]:
y_pred

array([[0.28854445, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.28854445, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.28854445, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.9227735 , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.9227735 , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.9227735 , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]], dtype=float32)

# 3. Оценка модели
print(classification_report(y_test, y_pred_classes, target_names=label_encoder.classes_, zero_division=0))

In [None]:
# print(classification_report(y_test, y_pred_classes, target_names=label_encoder.classes_, zero_division=0))

# 4. Симуляция инференса в реальном времени (не доделал)

In [None]:
def preprocessing(buffer, omg_sample, best_timesteps):
    buffer.append(omg_sample)
    
    # Функция для создания последовательностей
    def create_sequences_real(data, timesteps):
        X = []
        for i in range(len(data) - timesteps + 1):
            X.append(data[i:i + timesteps])
        return np.array(X)
    
    if len(buffer) >= best_timesteps:
        X_seq = create_sequences_real(np.array(buffer), best_timesteps)
        # Убедимся, что X_seq имеет правильную форму
        X_seq = X_seq[-1].reshape((1, best_timesteps, N_OMG_CH))
        return X_seq, buffer
    else:
        return None, buffer
    
    
# Функция для выполнения инференса (предсказания)
def inference(model, X_seq):
    y_pred_prob = model.predict(X_seq)
    #y_pred = np.argmax(y_pred_prob, axis=1)[0]  # Получаем скалярное значение
    return y_pred_prob

def postprocessing(y_pred_prob, GESTURES):
    y_pred = np.argmax(y_pred_prob, axis=1)[0]  # Получаем скалярное значение
    return y_pred

In [None]:
df_sim = data.iloc[3300:]
print(df_sim.shape)

(2200, 44)


In [None]:
# Основной цикл для обработки данных в реальном времени
TIMEOUT = 0.040
DEBUG = False

i = 0
ts_old = time.time()
ts_diff = 0

y_previous = None
y_dct = {
    'omg_sample': [],
    'sample_preprocessed': [],
    'y_predicted': [],
    'y_postprocessed': [],
}

buffer = []

while True:
    # [Data reading]
    ts_start = time.time()

    try:
        # [Sim data]
        if i < len(df_sim):
            sample = df_sim.values[i]
        else:
            break
        # [/Sim data]
        td_ts, omg_sample, _ = np.array_split(sample, [2, 2 + N_OMG_CH])

    except Exception as e:
        print(e)

    # [/Data Reading]

    # [Data preprocessing]
    X_seq, buffer = preprocessing(buffer, omg_sample.astype(float), 2)#best_timesteps)
    if X_seq is not None:
        sample_preprocessed = X_seq
    else:
        continue
    # [/Data preprocessing]

    # [Inference]
    y_predicted = inference(model, sample_preprocessed)
    # [/Inference]

    # [Inference Postprocessing]
    y_postprocessed = postprocessing(y_predicted, y_previous)
    # [/Inference Postprocessing]

    # [Commands composition]
    # NO COMMANDS FORMING IN SIMULATION
    # [/Commands composition]

    # [Commands sending]
    # NO COMMANDS SENDING IN SIMULATION
    # [/Commands sending]

    # [Data logging]
    y_dct['omg_sample'].append(omg_sample)
    y_dct['sample_preprocessed'].append(sample_preprocessed)
    y_dct['y_predicted'].append(y_predicted)
    y_dct['y_postprocessed'].append(y_postprocessed)
    # [/Data logging]

    y_previous = y_postprocessed

    if DEBUG:
        # Sanity check: Sizes of SAMPLE=65, OMG=16
        print(f'SAMPLE SIZE: {len(sample)}, OMG: {len(omg_sample)}', end='             \r')

    ts_diff = time.time() - ts_start
    # assert(ts_diff < TIMEOUT), 'Calculation cycle takes more than TIMEOUT, halting...'
    ts_old = ts_start
    i += 1

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19

In [None]:
for key, val in y_dct.items():
    # print(f"len({key}) = {len(y_dct[key])}")
    y_dct[key] = np.stack(val)
    print(f"{key}.shape = {y_dct[key].shape}")

omg_sample.shape = (2200, 16)
sample_preprocessed.shape = (2200, 1, 2, 16)
y_predicted.shape = (2200, 1, 8)
y_postprocessed.shape = (2200,)


In [None]:
y_predicted

array([[0.92277336, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ]], dtype=float32)

In [None]:
label_encoder.classes_

array([0, 1, 2, 3, 4, 5, 6, 7], dtype=int64)

In [None]:
print(classification_report(y_test, y_pred_classes, target_names=label_encoder.classes_, zero_division=0))

TypeError: object of type 'numpy.int64' has no len()

```py # Симуляция инференса в реальном времени на отложенной выборке
y_test_pred = []
comp_durations = []

# Предыдущее предсказание
prev_pred = None

for i in range(X_test.shape[0]):
    start_time = time.time()
    pred = model.predict(X_test[i].reshape(1, X_test.shape[1], X_test.shape[2]))
    pred_class = np.argmax(pred, axis=1)[0]

    # Если время предсказания превышает 39 мс, используем предыдущее предсказание
    comp_duration = (time.time() - start_time) * 1000
    if comp_duration > 39:
        if prev_pred is not None:
            y_test_pred.append(prev_pred)
        else:
            y_test_pred.append(pred_class)
    else:
        y_test_pred.append(pred_class)
        prev_pred = pred_class

    comp_durations.append(comp_duration)

print(f"Максимальное время: {np.round(np.max(comp_durations), 2)} мс")
print(f"Среднее время: {np.round(np.mean(comp_durations), 2)} мс")

# Визуализация результатов
#import plotly.express as px

#fig = px.scatter(
#    x=comp_durations * 1000, y=y_test_pred, color=y_test_pred, width=1200, height=500,
#    title='Время инференса при последовательных предсказаниях примеров тестовой выборки',
#    labels={'value': 'время, мс'}
#)
#fig.update_coloraxes(showscale=False)
#fig.show()

# print(classification_report(y_test, y_test_pred, target_names=label_encoder.classes_, zero_division=0))