# Как обрабатывать пропущенные временные шаги в задачах прогнозирования последовательности с помощью Python
https://machinelearningmastery.com/handle-missing-timesteps-sequence-prediction-problems-python/

## Демонстрация проблемы последовательности

In [1]:
from random import random
from numpy import array
from pandas import concat
from pandas import DataFrame

# build a list of random floats, one per timestep
def generate_sequence(n_timesteps):
	"""Return a list of ``n_timesteps`` values, each drawn with ``random()``.

	``random()`` yields floats in the half-open interval [0.0, 1.0).
	"""
	samples = []
	for _ in range(n_timesteps):
		samples.append(random())
	return samples

# build (lag, current) input pairs and the matching targets
def generate_data(n_timesteps):
	"""Create a lagged two-column dataset from a fresh random sequence.

	Returns ``(X, y)`` where ``X`` has one row per timestep holding
	``[previous value, current value]`` and ``y`` is the first column of
	``X`` (the previous value). The first row has no predecessor, so its
	lag entry, and therefore its target, is NaN.
	"""
	series = DataFrame(array(generate_sequence(n_timesteps)))
	# shift(1) pushes every value down one row, leaving NaN in row 0
	lagged = concat([series.shift(1), series], axis=1)
	pairs = lagged.values
	# inputs are both columns; the target is the lagged column
	return pairs, pairs[:, 0]

# build one 10-step dataset
n_timesteps = 10
X, y = generate_data(n_timesteps)
# show each input pair next to its target (the first row contains NaN)
for pair, target in zip(X, y):
	print(pair, '=>', target)

[       nan 0.40531374] => nan
[0.40531374 0.02901154] => 0.4053137372794514
[0.02901154 0.80863035] => 0.029011540917747936
[0.80863035 0.83176467] => 0.8086303509380647
[0.83176467 0.45082904] => 0.8317646709920837
[0.45082904 0.74522144] => 0.45082903625450355
[0.74522144 0.64456485] => 0.7452214369304784
[0.64456485 0.59238968] => 0.6445648508186052
[0.59238968 0.82743259] => 0.5923896844090002
[0.82743259 0.37163253] => 0.8274325915552974


## Удалить отсутствующие данные последовательности

In [2]:
from random import random
from numpy import array
from pandas import concat
from pandas import DataFrame

# build a list of random floats, one per timestep
def generate_sequence(n_timesteps):
	"""Return a list of ``n_timesteps`` values, each drawn with ``random()``.

	``random()`` yields floats in the half-open interval [0.0, 1.0).
	"""
	samples = []
	for _ in range(n_timesteps):
		samples.append(random())
	return samples

# build (lag, current) input pairs, discarding incomplete rows
def generate_data(n_timesteps):
	"""Create a lagged two-column dataset with NaN rows removed.

	Returns ``(X, y)`` where each row of ``X`` is
	``[previous value, current value]`` and ``y`` is the first column of
	``X``. The first timestep has no predecessor and is dropped, so the
	result has ``n_timesteps - 1`` rows when ``n_timesteps >= 1``.
	"""
	series = DataFrame(array(generate_sequence(n_timesteps)))
	lagged = concat([series.shift(1), series], axis=1)
	# the shifted column is NaN in row 0; drop that row entirely
	complete = lagged.dropna()
	pairs = complete.values
	# inputs are both columns; the target is the lagged column
	return pairs, pairs[:, 0]

# build one dataset from a 10-step sequence
n_timesteps = 10
X, y = generate_data(n_timesteps)
# show each remaining input pair next to its target
for pair, target in zip(X, y):
	print(pair, '=>', target)

[0.60703897 0.46145566] => 0.6070389652880486
[0.46145566 0.66840817] => 0.46145565525111154
[0.66840817 0.06200417] => 0.668408166535919
[0.06200417 0.63484706] => 0.06200416849407375
[0.63484706 0.30086317] => 0.634847059222349
[0.30086317 0.98016782] => 0.3008631709157785
[0.98016782 0.9905226 ] => 0.9801678207477256
[0.9905226  0.07593411] => 0.9905226030519235
[0.07593411 0.47941179] => 0.07593410985452864


## Заменить отсутствующие данные последовательности

In [3]:
from random import random
from numpy import array
from pandas import concat
from pandas import DataFrame

# build a list of random floats, one per timestep
def generate_sequence(n_timesteps):
	"""Return a list of ``n_timesteps`` values, each drawn with ``random()``.

	``random()`` yields floats in the half-open interval [0.0, 1.0).
	"""
	samples = []
	for _ in range(n_timesteps):
		samples.append(random())
	return samples

# build (lag, current) input pairs with missing lags marked as -1
def generate_data(n_timesteps):
	"""Create a lagged two-column dataset with NaN replaced by -1.

	Returns ``(X, y)`` where each row of ``X`` is
	``[previous value, current value]`` and ``y`` is the second column of
	``X`` (the current value). The first row has no predecessor, so its
	lag entry is filled with -1.
	"""
	series = DataFrame(array(generate_sequence(n_timesteps)))
	lagged = concat([series.shift(1), series], axis=1)
	# mark the missing lag in row 0 with a sentinel outside [0, 1)
	filled = lagged.fillna(-1)
	pairs = filled.values
	# inputs are both columns; the target is the current-value column
	return pairs, pairs[:, 1]

# build one dataset from a 10-step sequence
n_timesteps = 10
X, y = generate_data(n_timesteps)
# show each input pair next to its target (row 0 carries the -1 marker)
for pair, target in zip(X, y):
	print(pair, '=>', target)

[-1.          0.35551706] => 0.35551705812576406
[0.35551706 0.08252663] => 0.08252663045139808
[0.08252663 0.4242715 ] => 0.424271501747155
[0.4242715 0.7377109] => 0.73771089867304
[0.7377109  0.52284893] => 0.5228489302195808
[0.52284893 0.36435033] => 0.36435033403852546
[0.36435033 0.02632615] => 0.02632615153395379
[0.02632615 0.5680358 ] => 0.5680357977683799
[0.5680358  0.85151198] => 0.851511981805571
[0.85151198 0.49000946] => 0.49000946238349363


## Обучение с отсутствующими значениями последовательности

### Изучение отсутствующих значений

In [5]:
from random import random
from numpy import array
from pandas import concat
from pandas import DataFrame
# import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

# build a list of random floats, one per timestep
def generate_sequence(n_timesteps):
	"""Return a list of ``n_timesteps`` values, each drawn with ``random()``.

	``random()`` yields floats in the half-open interval [0.0, 1.0).
	"""
	samples = []
	for _ in range(n_timesteps):
		samples.append(random())
	return samples

# build LSTM-shaped inputs and targets with missing lags marked as -1
def generate_data(n_timesteps):
	"""Create a lagged dataset reshaped for the LSTM defined in this cell.

	Returns ``(X, y)`` where ``X`` has shape ``(rows, 2, 1)``, i.e. each
	sample is the pair ``[previous value, current value]`` laid out as two
	steps of one feature, and ``y`` has shape ``(rows, 1)`` holding the
	current value. The missing lag of the first row is filled with -1.
	"""
	series = DataFrame(array(generate_sequence(n_timesteps)))
	# shift(1) leaves NaN in row 0; replace it with the -1 sentinel
	filled = concat([series.shift(1), series], axis=1).fillna(-1)
	pairs = filled.values
	n_rows = len(pairs)
	# inputs: both columns as (samples, 2, 1); targets: column 1 as (samples, 1)
	X = pairs.reshape(n_rows, 2, 1)
	y = pairs[:, 1].reshape(n_rows, 1)
	return X, y

n_timesteps = 10
# network: a 5-unit LSTM over (2, 1) inputs feeding a single-unit dense output
model = Sequential([
	LSTM(5, input_shape=(2, 1)),
	Dense(1),
])
model.compile(optimizer='adam', loss='mean_squared_error')
# train for 500 rounds, drawing a fresh random dataset each round
for _ in range(500):
	X, y = generate_data(n_timesteps)
	model.fit(X, y, batch_size=1, epochs=1, verbose=2)
# predict on a previously unseen dataset and compare against the targets
X, y = generate_data(n_timesteps)
yhat = model.predict(X)
for expected, predicted in zip(y[:, 0], yhat[:, 0]):
	print('Expected', expected, 'Predicted', predicted)

2021-10-26 00:17:27.091366: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-10-26 00:17:27.091681: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-10-26 00:17:27.093497: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
2021-10-26 00:17:27.449735: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2021-10-26 00:17:27.469583: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2594090000 Hz


10/10 - 3s - loss: 0.1975
10/10 - 0s - loss: 0.2079
10/10 - 0s - loss: 0.2507
10/10 - 0s - loss: 0.1923
10/10 - 0s - loss: 0.2327
10/10 - 0s - loss: 0.1874
10/10 - 0s - loss: 0.1816
10/10 - 0s - loss: 0.0817
10/10 - 0s - loss: 0.0870
10/10 - 0s - loss: 0.1186
10/10 - 0s - loss: 0.0696
10/10 - 0s - loss: 0.1323
10/10 - 0s - loss: 0.1436
10/10 - 0s - loss: 0.0710
10/10 - 0s - loss: 0.0416
10/10 - 0s - loss: 0.0975
10/10 - 0s - loss: 0.0985
10/10 - 0s - loss: 0.0925
10/10 - 0s - loss: 0.0720
10/10 - 0s - loss: 0.0416
10/10 - 0s - loss: 0.0370
10/10 - 0s - loss: 0.0478
10/10 - 0s - loss: 0.0971
10/10 - 0s - loss: 0.0362
10/10 - 0s - loss: 0.0438
10/10 - 0s - loss: 0.0542
10/10 - 0s - loss: 0.0569
10/10 - 0s - loss: 0.0569
10/10 - 0s - loss: 0.0531
10/10 - 0s - loss: 0.0758
10/10 - 0s - loss: 0.0895
10/10 - 0s - loss: 0.0316
10/10 - 0s - loss: 0.0479
10/10 - 0s - loss: 0.0496
10/10 - 0s - loss: 0.0691
10/10 - 0s - loss: 0.0345
10/10 - 0s - loss: 0.0647
10/10 - 0s - loss: 0.0350
10/10 - 0s -

### Маскирование отсутствующих значений

In [None]:
from tensorflow.keras.layers import Masking

n_timesteps = 10
# network: a Masking layer keyed on the -1 sentinel, then a 5-unit LSTM
# and a single-unit dense output
model = Sequential([
	Masking(mask_value=-1, input_shape=(2, 1)),
	LSTM(5),
	Dense(1),
])
model.compile(optimizer='adam', loss='mean_squared_error')
# train for 500 rounds, drawing a fresh random dataset each round
for _ in range(500):
	X, y = generate_data(n_timesteps)
	model.fit(X, y, batch_size=1, epochs=1, verbose=2)
# predict on a previously unseen dataset and compare against the targets
X, y = generate_data(n_timesteps)
yhat = model.predict(X)
for expected, predicted in zip(y[:, 0], yhat[:, 0]):
	print('Expected', expected, 'Predicted', predicted)