In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
import optuna
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

In [13]:
# Define the function to read OMG data from a CSV file
def read_omg_csv(path_palm_data: str, 
                 n_omg_channels: int, 
                 n_acc_channels: int = 0, 
                 n_gyr_channels: int = 0, 
                 n_mag_channels: int = 0, 
                 n_enc_channels: int = 0,
                 button_ch: bool = True, 
                 sync_ch: bool = True, 
                 timestamp_ch: bool = True) -> pd.DataFrame:
    
    df_raw = pd.read_csv(path_palm_data, sep=' ', 
                         header=None, 
                         skipfooter=1, 
                         skiprows=1, 
                         engine='python')
    columns = np.arange(n_omg_channels).astype('str').tolist()
    
    for label, label_count in zip(['ACC', 'GYR', 'MAG', 'ENC'], 
                                  [n_acc_channels, n_gyr_channels, n_mag_channels, n_enc_channels]):
        columns = columns + ['{}{}'.format(label, i) for i in range(label_count)]
        
    if button_ch:
        columns = columns + ['BUTTON']
        
    if sync_ch:
        columns = columns + ['SYNC']
        
    if timestamp_ch:
        columns = columns + ['ts']
        
    df_raw.columns = columns
    
    return df_raw

In [14]:
def prepare_training_data(path_palm_data, path_protocol_data, path_meta_data, 
                          n_omg_channels=50, n_acc_channels=3, n_gyr_channels=3, 
                          n_mag_channels=0, n_enc_channels=6, 
                          standardize=True, normalize=True,
                          DO_REPLACE_TO_MOVING_AVERAGE=True, 
                          DO_CALCULATE_DERIVATIVE=True,
                          DO_SHIFT_GESTURE=True,
                          selected_channels='ALL'):
    """
    Подготовка данных для обучения и тестирования из файлов данных palm, protocol и meta.
    
    Аргументы:
    path_palm_data (str): Путь к файлу данных palm.
    path_protocol_data (str): Путь к файлу данных protocol.
    path_meta_data (str): Путь к файлу данных meta.
    n_omg_channels, n_acc_channels, и т.д. (int): Количество каналов сенсоров.
    standardize (bool): Если True, стандартизирует признаки.
    normalize (bool): Если True, нормализует признаки.
    DO_REPLACE_TO_MOVING_AVERAGE (bool): Если True, применяет скользящее среднее к данным OMG.
    DO_CALCULATE_DERIVATIVE (bool): Если True, вычисляет производные данных OMG.
    DO_SHIFT_GESTURE (bool): Если True, смещает целевой признак на максимальный скачок в данных.
    selected_channels (str): Выбор каналов данных ('OMG', 'ACC_GYR', 'ALL').
    
    Возвращает:
    tuple: Кортеж, содержащий данные для обучения и тестирования.
    """
    # Чтение данных OMG
    omg_data = read_omg_csv(path_palm_data, n_omg_channels, n_acc_channels, n_gyr_channels, 
                            n_mag_channels, n_enc_channels)
    
    # Чтение данных протокола и кодирование жестов
    gestures_protocol = pd.read_csv(path_protocol_data)
    le = LabelEncoder()
    gestures_protocol['gesture'] = le.fit_transform(
        gestures_protocol[[
            "Thumb", "Index", "Middle", "Ring", "Pinky",
            'Thumb_stretch', 'Index_stretch', 'Middle_stretch', 'Ring_stretch', 'Pinky_stretch'
        ]].apply(lambda row: str(tuple(row)), axis=1)
    )
    
    # Чтение метаинформации
    df_meta = pd.read_csv(path_meta_data)
    palm_file = path_palm_data.split('/')[-1]
    last_train_idx = df_meta[df_meta['montage'] == palm_file].to_dict(orient='records')[0]['last_train_idx']
    
    # Синхронизация меток жестов с данными OMG, используя канал SYNC
    y_cmd = np.array([gestures_protocol['gesture'].loc[s] for s in omg_data['SYNC'].values])
    
    # Подготовка названий признаков для данных OMG
    OMG_CH = [str(i) for i in range(n_omg_channels)]
    ACC_CH = ['ACC0', 'ACC1', 'ACC2']
    GYR_CH = ['GYR0', 'GYR1', 'GYR2']
    ALL_CH = OMG_CH + ACC_CH + GYR_CH

    # Выбор каналов в соответствии с параметром selected_channels
    if selected_channels == 'OMG':
        selected_features = OMG_CH
    elif selected_channels == 'ACC_GYR':
        selected_features = ACC_CH + GYR_CH
    else:
        selected_features = ALL_CH
    
    if DO_REPLACE_TO_MOVING_AVERAGE:
        # Замена на скользящее среднее
        for col in selected_features:
            omg_data[col] = omg_data[col].rolling(window=5).mean().bfill()
    
    if DO_CALCULATE_DERIVATIVE:
        # Вычисление производных данных
        OMG_DERIV = [f'{col}_deriv' for col in OMG_CH]
        for col in OMG_CH:
            omg_data[f'{col}_next'] = omg_data[col].shift(-1).ffill()
            omg_data[f'{col}_deriv'] = omg_data[f'{col}_next'] - omg_data[col]
        selected_features += OMG_DERIV

    if DO_SHIFT_GESTURE:
        # Смещение целевого признака
        id_max = 0
        cur_gesture = 0
        for i in range(y_cmd.shape[0]):
            if i < id_max:  # Пропускаем все значения до id_max
                continue
            prev_gesture = cur_gesture  # предыдущий жест
            cur_gesture = y_cmd[i]  # текущий жест
            if cur_gesture != prev_gesture:  # Если сменился жест
                id_max = omg_data[OMG_DERIV][i:i+35].abs().sum(axis=1).idxmax()  # Нахождение максимального скачка
                y_cmd[i:id_max] = prev_gesture  # Замена всех значений до id_max на предыдущий жест
    
    # Разделение данных на обучающие и тестовые наборы
    X_train = omg_data[selected_features].iloc[:last_train_idx].values
    y_train = y_cmd[:last_train_idx]
    X_test = omg_data[selected_features].iloc[last_train_idx:].values
    y_test = y_cmd[last_train_idx:]
    
    # Стандартизация и нормализация
    if standardize:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        
    if normalize:
        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
    
    return (X_train, y_train), (X_test, y_test)

path_palm_data = 'data/2023-05-31_17-14-41.palm'
path_protocol_data = 'data/2023-05-31_17-14-41.palm.protocol.csv'
path_meta_data = 'data/meta_information.csv'

(X_train, y_train), (X_test, y_test) = prepare_training_data(path_palm_data, path_protocol_data, path_meta_data, standardize=True, normalize=False)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((15679, 106), (15679,), (3889, 106), (3889,))

In [15]:
def build_and_train_model(X_train, y_train, X_test, y_test, epochs=100, batch_size=50):
    num_classes = len(np.unique(y_train))  # Determine the number of unique classes

    model = Sequential([
        Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
        BatchNormalization(),
        Dropout(0.5),
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])
    

    model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2, verbose=1)
    test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=1)

    return model, history, test_loss, test_accuracy

# Example usage:
model, history, test_loss, test_accuracy = build_and_train_model(X_train, y_train, X_test, y_test)
print("Test loss:", test_loss)
print("Test accuracy:", test_accuracy)

Epoch 1/100
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.4393 - loss: 1.7728 - val_accuracy: 0.8195 - val_loss: 0.5047
Epoch 2/100
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7979 - loss: 0.6538 - val_accuracy: 0.8785 - val_loss: 0.3265
Epoch 3/100
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8500 - loss: 0.4584 - val_accuracy: 0.9043 - val_loss: 0.2397
Epoch 4/100
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8750 - loss: 0.3696 - val_accuracy: 0.9225 - val_loss: 0.2024
Epoch 5/100
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8955 - loss: 0.3163 - val_accuracy: 0.9381 - val_loss: 0.1650
Epoch 6/100
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9119 - loss: 0.2501 - val_accuracy: 0.9436 - val_loss: 0.1447
Epoch 7/100
[1m251/25

In [16]:
def objective(trial):
    # Предлагаем параметры
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-1)
    dropout_rate = trial.suggest_uniform('dropout_rate', 0.0, 0.7)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    epochs = 100  # Можно также оптимизировать количество эпох

    # Создание модели
    model = Sequential([
        Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
        BatchNormalization(),
        Dropout(dropout_rate),
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(dropout_rate),
        Dense(len(np.unique(y_train)), activation='softmax')
    ])
    
    # Компиляция модели
    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    
    # Обучение модели
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2, verbose=0)
    
    # Оценка модели
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    return accuracy  # Максимизация точности

# Создание исследования
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)  # Можно изменить количество испытаний в зависимости от времени/ресурсов

print("Best trial:")
trial = study.best_trial
print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2024-05-05 14:10:02,127] A new study created in memory with name: no-name-8fd5d20a-e31c-47c0-845a-92be38e5dc17
[I 2024-05-05 14:10:19,755] Trial 0 finished with value: 0.9285163283348083 and parameters: {'learning_rate': 0.04361435912677133, 'dropout_rate': 0.6177066453749649, 'batch_size': 128}. Best is trial 0 with value: 0.9285163283348083.
[I 2024-05-05 14:11:00,256] Trial 1 finished with value: 0.9408588409423828 and parameters: {'learning_rate': 3.610914682869556e-05, 'dropout_rate': 0.1584580341477136, 'batch_size': 32}. Best is trial 1 with value: 0.9408588409423828.
[I 2024-05-05 14:11:41,350] Trial 2 finished with value: 0.9264592528343201 and parameters: {'learning_rate': 0.0041015267942916085, 'dropout_rate': 0.6053398055749268, 'batch_size': 32}. Best is trial 1 with value: 0.9408588409423828.
[I 2024-05-05 14:11:59,581] Trial 3 finished with value: 0.9341732859611511 and parameters: {'learning_rate': 0.011213552508349664, 'dropout_rate': 0.2900131770771293, 'batch_size

Best trial:
  Value: 0.9470300674438477
  Params: 
    learning_rate: 9.046882745965463e-05
    dropout_rate: 0.18391531398895708
    batch_size: 32


In [17]:
# Загрузка лучшей модели
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    BatchNormalization(),
    Dropout(trial.params['dropout_rate']),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(trial.params['dropout_rate']),
    Dense(len(np.unique(y_train)), activation='softmax')
])
model.compile(optimizer=Adam(learning_rate=trial.params['learning_rate']),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Повторное обучение лучшей модели
model.fit(X_train, y_train, epochs=100, batch_size=trial.params['batch_size'], verbose=0)

# Предсказания
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

# Отчёт о классификации
report = classification_report(y_test, y_pred_classes)
print(report)

[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 819us/step
              precision    recall  f1-score   support

           0       0.96      0.97      0.97      2608
           1       0.99      0.97      0.98       246
           2       0.94      0.99      0.96       279
           3       0.81      0.67      0.73       252
           4       0.86      0.97      0.91       246
           5       0.96      0.94      0.95       258

    accuracy                           0.95      3889
   macro avg       0.92      0.92      0.92      3889
weighted avg       0.95      0.95      0.95      3889

