## CUSTOMIZATION
---

In [0]:
import tensorflow as tf
tf.test.gpu_device_name()
# To enable the GPU: Runtime -> Change runtime type -> Hardware accelerator: GPU
# Expected output: '/device:GPU:0'

In [0]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
# Mount Google Drive at /content/gdrive (force_remount=True is passed to the mount call)

In [0]:
# Copy the dataset archive from Google Drive into the Colab working directory and unpack it
! cp /content/gdrive/'My Drive'/Mouse/dataset.zip . # note the trailing dot: the destination is the current directory
! unzip -q dataset
! rm dataset.zip
! ls

dataset  gdrive  sample_data


## LSTM
---

### Libraries
---

In [0]:
from tensorflow.keras.models import Sequential, Model, save_model, load_model
from tensorflow.keras.layers import (Dense, LSTM, BatchNormalization, Dropout,
                                     ReLU, Input)
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import logcosh, mean_squared_error
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger
from sklearn.preprocessing import (StandardScaler,
                                   QuantileTransformer,
                                   PowerTransformer)
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import glob
import os

### Global variables
---


In [0]:
# Experiment index; every artifact name and the Drive folder below are derived from it.
N_EXPERIMENT = 1

# Initial values only: N_FEATURES is reassigned from the loaded data further down.
N_FEATURES = 79
BATCH_SIZE = 8

# Artifact names, all prefixed with the experiment index.
MODEL_NAME = f'{N_EXPERIMENT}_mouse_model'
WEIGHT_NAME = f'{N_EXPERIMENT}_mouse_weight'
LOG_NAME = f'{N_EXPERIMENT}_training_log.csv'

# Per-experiment folder on the mounted Google Drive.
GDRIVE_PATH = f'./gdrive/My Drive/Mouse/{N_EXPERIMENT}'

# The second literal is deliberately NOT an f-string: its {epoch}/{loss}/{val_loss}
# placeholders must survive verbatim so the checkpoint callback can fill them in.
CHECKPOINT_PATH = (
    f'{GDRIVE_PATH}/{WEIGHT_NAME}'
    + '_{epoch:03d}_loss-{loss:.3f}_valloss-{val_loss:.3f}.h5'
)

### Load data
---

Options (dataset, user and session to load):

In [0]:
# Dataset folder, the legal (target) user and the session file used throughout the notebook.
DATASET, USERNAME, SESSION = 'BALABIT', 'user07', 'session_all'

#### experiments

In [0]:
def load_data(dataset: str,
              username: str,
              session: str,
              mode: str,
              legal: bool = True) -> np.ndarray:
    """Load a raw feature matrix from ``./dataset/<dataset>/<mode>_features``.

    Args:
        dataset: Dataset folder name under ``./dataset``.
        username: Name of the user directory (e.g. ``'user07'``).
        session: File name of the session CSV inside a user directory.
        mode: Prefix of the features folder (``'<mode>_features'``).
        legal: If True, read only ``username``'s session. If False, read the
            same session file from every *other* ``user*`` directory and
            stack the rows vertically.

    Returns:
        2-D array of the CSV values (the files are read without a header).

    Raises:
        FileNotFoundError: if ``legal`` is False and no other user directory
            is found (previously this silently returned ``None``).
    """
    features_root = f"./dataset/{dataset}/{mode}_features"

    if legal:
        path = f"{features_root}/{username}/{session}"
        return pd.read_csv(path, sep=',', header=None).values

    # Sort the directories: glob order is filesystem dependent, and the row
    # order of the stacked matrix should be reproducible between runs.
    chunks = []
    for path in sorted(glob.glob(f"{features_root}/user*")):
        if os.path.basename(path) == username:
            continue
        session_path = os.path.join(path, session)
        chunks.append(pd.read_csv(session_path, sep=',', header=None).values)

    if not chunks:
        raise FileNotFoundError(
            f"no user directories other than {username!r} under {features_root}")

    # Stack once instead of re-copying the accumulated matrix on every iteration.
    return np.vstack(chunks)

Train data:

In [0]:
# Raw training features of the legal user; the column count overrides N_FEATURES.
X = load_data(dataset=DATASET, username=USERNAME, session=SESSION, mode='train')
N_FEATURES = X.shape[1]
print(f"Train data shape: {X.shape}")

Train data shape: (3417, 79)


Validation data:

In [0]:
# The legal user's 'test' features are used as the validation set.
X_valid = load_data(dataset=DATASET, username=USERNAME, session=SESSION, mode='test')
print(f"Validation data shape: {X_valid.shape}")

Validation data shape: (3457, 79)


Data preprocessing:

In [0]:
# Fit the scaler on the training matrix only, then apply the same statistics
# to both the training and the validation matrices.
preprocessingFunction = StandardScaler().fit(X)
X = preprocessingFunction.transform(X)
X_valid = preprocessingFunction.transform(X_valid)

Check:

In [0]:
# Sanity check of the scaling: overall mean/variance of each split.
print(f"Train data: mean - {X.mean():.3f}, var - {X.var():.3f}")
# The second line reports X_valid, so it must be labelled as validation data.
print(f"Validation data: mean - {X_valid.mean():.3f}, var - {X_valid.var():.3f}")

Reshape for the LSTM (sliding windows of `timesteps` consecutive rows):

In [0]:
def temporalize(X, lookback):
    """Cut ``X`` into overlapping windows of ``lookback`` consecutive rows.

    Window ``i`` is ``X[i:i + lookback]``, so consecutive windows are shifted
    by one row.

    Args:
        X: Array whose first axis is the row (time) axis.
        lookback: Number of consecutive rows per window; must be >= 1.

    Returns:
        Array of shape ``(len(X) - lookback + 1, lookback, *X.shape[1:])``.
        The window axis is always kept (no squeezing), so ``lookback == 1``,
        a single window or a single feature column still yield the same rank.
        If ``X`` has fewer than ``lookback`` rows the first axis has length 0.

    Raises:
        ValueError: if ``lookback`` is smaller than 1.
    """
    if lookback < 1:
        raise ValueError(f"lookback must be >= 1, got {lookback}")

    X = np.asarray(X)
    n_windows = len(X) - lookback + 1
    if n_windows <= 0:
        # Not enough rows for even one window: return an empty, correctly shaped array.
        return np.empty((0, lookback) + X.shape[1:], dtype=X.dtype)

    return np.stack([X[start:start + lookback] for start in range(n_windows)])

In [0]:
# Window length: passed to temporalize() as `lookback` in the next cell.
timesteps = 3

In [0]:
# Sliding windows over the scaled training matrix.
X_lstm = temporalize(X, lookback=timesteps)
print(
    f"X_LSTM: (n_samples x timesteps x n_features)\n"
    f"                   {X_lstm.shape}"
)

X_LSTM: (n_samples x timesteps x n_features)
                   (3415, 3, 79)


Check:

In [0]:
# First and last five rows of X, restricted to the first three feature columns.
print(f"X:\n{X[:5, :3]}\n.\n.\n.\n{X[-5:, :3]}")

In [0]:
# First and last three windows of X_lstm (first three feature columns), for comparison with the rows of X.
print(f"X_lstm:\n{X_lstm[:3, :, :3]}\n.\n.\n.\n{X_lstm[-3:, :, :3]}")

#### final

In [0]:
# Window length handed to load_data(..., timesteps=timesteps) below.
timesteps = 3
# Fresh scaler instance; load_data() below reads it as a module-level global.
preprocessingFunction = StandardScaler()

In [0]:
def load_data(dataset: str,
              username: str,
              session: str,
              mode: str,
              legal: bool = True,
              timesteps: int = 3) -> tuple:
    """Load, scale and window a feature matrix for the LSTM.

    Args:
        dataset: Dataset folder name under ``./dataset``.
        username: Name of the legal user's directory (e.g. ``'user07'``).
        session: File name of the session CSV inside a user directory.
        mode: Prefix of the features folder (``'<mode>_features'``).
        legal: If True, read only ``username``'s session. If False, read the
            same session file from every *other* ``user*`` directory and
            stack the rows vertically.
        timesteps: Number of consecutive rows per window; must be >= 1.

    Returns:
        ``(y, X_lstm)`` -- note the order. ``X_lstm`` has shape
        ``(n_windows, timesteps, n_features)`` where window ``i`` is rows
        ``i .. i + timesteps - 1`` of the scaled matrix, and ``y`` holds the
        first ``n_windows`` scaled rows, i.e. ``y[i]`` is the first row of
        window ``i``.
        NOTE(review): confirm that the first row of the window (rather than
        the last or the next row) is the intended target.

    Raises:
        ValueError: if ``timesteps`` is smaller than 1.
        FileNotFoundError: if ``legal`` is False and no other user directory
            is found.

    Side effects:
        Uses the module-level ``preprocessingFunction`` scaler. It is fitted
        only on the legal user's training data (``legal`` and
        ``mode == 'train'``); every other call re-uses those statistics via
        ``transform``. Previously the scaler was re-fitted on every call, so
        validation and impostor data were each normalised with their own
        statistics instead of the training ones. Load the legal training data
        first.
    """
    if timesteps < 1:
        raise ValueError(f"timesteps must be >= 1, got {timesteps}")

    features_root = f"./dataset/{dataset}/{mode}_features"
    if legal:
        path = f"{features_root}/{username}/{session}"
        X = pd.read_csv(path, sep=',', header=None).values
    else:
        # Sorted for a reproducible row order; collected in a list and stacked
        # once instead of re-copying the accumulated matrix per directory.
        chunks = []
        for path in sorted(glob.glob(f"{features_root}/user*")):
            if os.path.basename(path) == username:
                continue
            session_path = os.path.join(path, session)
            chunks.append(pd.read_csv(session_path, sep=',', header=None).values)
        if not chunks:
            raise FileNotFoundError(
                f"no user directories other than {username!r} under {features_root}")
        X = np.vstack(chunks)

    # Only read (never rebound) here, so no `global` statement is required.
    if legal and mode == 'train':
        X = preprocessingFunction.fit_transform(X)
    else:
        X = preprocessingFunction.transform(X)

    n_windows = max(len(X) - timesteps + 1, 0)
    if n_windows:
        # np.stack keeps all three axes even for timesteps == 1 or one window.
        X_lstm = np.stack([X[start:start + timesteps] for start in range(n_windows)])
    else:
        X_lstm = np.empty((0, timesteps) + X.shape[1:], dtype=X.dtype)

    # X[:n_windows] equals the old X[:-(timesteps-1)] for timesteps > 1 and,
    # unlike it, is not empty when timesteps == 1.
    return X[:n_windows], X_lstm

In [0]:
# load_data returns (targets, windows) in that order.
# The legal user's training session is loaded first, then the 'test' session for validation.
y, X = load_data(
    DATASET, USERNAME, SESSION, mode='train',
    legal=True, timesteps=timesteps,
)
y_valid, X_valid = load_data(
    DATASET, USERNAME, SESSION, mode='test',
    legal=True, timesteps=timesteps,
)
# Feature count is the size of the last axis of the windowed array.
N_FEATURES = X.shape[-1]

### Model
---

In [0]:
def get_model(timesteps, n_features, lstm_units=(64, 32), dense_units=64,
              l2_factor=0.001): # "LSTM"
    """Build the (uncompiled) stacked-LSTM network.

    Architecture: Input(timesteps, n_features) -> LSTM (returns sequences)
    -> LSTM (returns last state) -> Dense with ReLU -> Dense(n_features),
    with L2 kernel regularisation on every layer except the output layer.

    Args:
        timesteps: Length of each input window.
        n_features: Number of features per timestep; also the output width.
        lstm_units: Sizes of the first and second LSTM layers.
        dense_units: Size of the hidden Dense layer.
        l2_factor: L2 kernel-regularisation factor shared by the regularised layers.

    Returns:
        A ``Sequential`` model; the caller is responsible for compiling it.
    """
    first_lstm_units, second_lstm_units = lstm_units

    model = Sequential()
    model.add(Input(shape=(timesteps, n_features)))
    model.add(LSTM(units=first_lstm_units, return_sequences=True,
                   kernel_regularizer=l2(l2_factor)))
    model.add(LSTM(units=second_lstm_units, return_sequences=False,
                   kernel_regularizer=l2(l2_factor)))
    # Use the built-in 'relu' activation name instead of a ReLU() layer
    # instance: same function, but it serialises as a plain string in the
    # layer config.
    model.add(Dense(units=dense_units, activation='relu',
                    kernel_regularizer=l2(l2_factor)))
    # Linear output layer with one unit per input feature.
    model.add(Dense(units=n_features))

    return model

In [0]:
# X is (n_samples, timesteps, n_features): axes 1 and 2 define the network input.
model = get_model(timesteps=X.shape[1], n_features=X.shape[2])
model.summary()

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_20 (LSTM)               (None, 3, 64)             36864     
_________________________________________________________________
lstm_21 (LSTM)               (None, 32)                12416     
_________________________________________________________________
dense_20 (Dense)             (None, 64)                2112      
_________________________________________________________________
dense_21 (Dense)             (None, 79)                5135      
Total params: 56,527
Trainable params: 56,527
Non-trainable params: 0
_________________________________________________________________


Compile:

In [0]:
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
model.compile(optimizer=Adam(learning_rate=0.001),
              loss=logcosh,
              metrics=[mean_squared_error]
              )

Callbacks:

In [0]:
# Make sure the per-experiment folder exists before the first checkpoint is
# written; saving an .h5 file into a missing directory can fail.
os.makedirs(os.path.dirname(CHECKPOINT_PATH), exist_ok=True)

# Keep only checkpoints that improve val_loss. Only the WEIGHTS are saved
# (save_weights_only=True), so the architecture must be rebuilt before loading.
checkpointer = ModelCheckpoint(filepath=CHECKPOINT_PATH, monitor='val_loss',
                               verbose=1, save_best_only=True,
                               save_weights_only=True, mode='min')
# Per-epoch metrics are appended to LOG_NAME (a path relative to the working directory).
logger = CSVLogger(LOG_NAME)

Fit:

In [0]:
# Train on the windows X against the targets y, validating on the legal user's
# 'test' session. shuffle=False keeps the samples in their original order.
model.fit(
    x=X,
    y=y,
    validation_data=(X_valid, y_valid),
    epochs=1000,
    batch_size=64,
    shuffle=False,
    callbacks=[logger, checkpointer],
    verbose=2,
)

### Evaluate
---

Load model

In [0]:
# The checkpoints were written with save_weights_only=True, so the .h5 file
# holds weights but no architecture and load_model() cannot rebuild a model
# from it. Recreate the network and load the weights into it instead.
model = get_model(*X.shape[1:])
model.load_weights("./gdrive/My Drive/Mouse/1/1_mouse_weight_952_loss-0.027_valloss-0.056.h5")

In [0]:
# legal=False: the same session file of every user except USERNAME, taken from the 'train' features.
y_test, X_test = load_data(
    DATASET, USERNAME, SESSION, mode='train',
    legal=False, timesteps=timesteps,
)

In [0]:
# Mean squared error over all samples and features for the other users' data.
y_test_pred = model.predict(X_test)
np.mean(np.square(y_test - y_test_pred))

In [0]:
# Mean squared error over all samples and features for the validation data.
y_valid_pred = model.predict(X_valid)
np.mean(np.square(y_valid - y_valid_pred))

In [0]:
# Mean squared error over all samples and features for the training data.
y_pred = model.predict(X)
np.mean(np.square(y - y_pred))