1. Попробуйте обучить нейронную сеть RNN/LSTM/GRU на любом другом датасете (любимый временной ряд, текст на русском или другом языке — как генератор или классификатор, либо прилагаемый набор airline-passengers — пассажиропоток авиалиний).

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras import activations
from tensorflow.keras import metrics

In [2]:
# Download the dataset from Google Drive and store it in the working directory as
# airline-passengers.csv, the file name the read_csv cell loads.
!wget 'https://drive.google.com/uc?export=download&id=1xwXieDVy1RKdfiJ6am_nNWp6XhAYdhNb' -O airline-passengers.csv

--2021-05-14 10:13:00--  https://drive.google.com/uc?export=download&id=1xwXieDVy1RKdfiJ6am_nNWp6XhAYdhNb
Resolving drive.google.com (drive.google.com)... 74.125.31.101, 74.125.31.138, 74.125.31.100, ...
Connecting to drive.google.com (drive.google.com)|74.125.31.101|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-08-c0-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/gimlmdgmli2o1dc6f0bfnkuus98j7cjf/1620987150000/14904333240138417226/*/1xwXieDVy1RKdfiJ6am_nNWp6XhAYdhNb?e=download [following]
--2021-05-14 10:13:00--  https://doc-08-c0-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/gimlmdgmli2o1dc6f0bfnkuus98j7cjf/1620987150000/14904333240138417226/*/1xwXieDVy1RKdfiJ6am_nNWp6XhAYdhNb?e=download
Resolving doc-08-c0-docs.googleusercontent.com (doc-08-c0-docs.googleusercontent.com)... 173.194.216.132, 2607:f8b0:400c:c12::84
Connecting to doc-08-c0-docs.googleusercontent.com (doc-08-c0

In [3]:
# Load the downloaded CSV; at this point 'Month' is still a 'YYYY-MM' string.
df = pd.read_csv('airline-passengers.csv')

In [4]:
# Preview the raw table before any parsing of the 'Month' column.
df.head()

Unnamed: 0,Month,Passengers
0,1949-01,112
1,1949-02,118
2,1949-03,132
3,1949-04,129
4,1949-05,121


In [5]:
def split_date(x):
    """Split a row's 'YYYY-MM' string into integer 'Year' and 'Month' fields.

    Written to be used with ``DataFrame.apply(split_date, axis=1)``.

    Args:
        x: A row-like mapping (``pandas.Series`` or ``dict``) whose 'Month'
            entry is a string of the form 'YYYY-MM'.

    Returns:
        A copy of ``x`` in which 'Year' holds the integer year and 'Month'
        holds the integer month number. The object passed in is left
        untouched.

    Raises:
        AttributeError: if ``x['Month']`` is not a string (e.g. already parsed).
        IndexError: if the string contains no '-' separator.
        ValueError: if either part is not a valid integer.
    """
    # Work on a copy: pandas does not support functions passed to apply()
    # mutating the object they receive.
    row = x.copy()

    date_parts = row['Month'].split('-')
    row['Year'] = int(date_parts[0])
    row['Month'] = int(date_parts[1])

    return row

In [6]:
# Parse 'YYYY-MM' into integer 'Year' / 'Month' columns, one row at a time.
# The 'Year' column only exists after this step has run, so it doubles as a guard:
# without it, re-running the cell would call `.split` on an already-integer 'Month'
# and raise AttributeError.
if 'Year' not in df.columns:
    df = df.apply(split_date, axis=1)

In [7]:
# Confirm the split: 'Month' now holds the month number and 'Year' is a new column.
df.head()

Unnamed: 0,Month,Passengers,Year
0,1,112,1949
1,2,118,1949
2,3,132,1949
3,4,129,1949
4,5,121,1949


In [8]:
# Check column dtypes and non-null counts after the date split.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144 entries, 0 to 143
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   Month       144 non-null    int64
 1   Passengers  144 non-null    int64
 2   Year        144 non-null    int64
dtypes: int64(3)
memory usage: 3.5 KB


In [9]:
# Model inputs: the two calendar columns produced by split_date.
features = ['Year', 'Month']
# Kept as a one-element list so that df[target] yields a single-column DataFrame.
target = ['Passengers']

In [10]:
# Feature frame (Year, Month) and single-column target frame (Passengers);
# the trailing tuple displays both in the cell output.
X, y = df[features], df[target]
(X, y)

(     Year  Month
 0    1949      1
 1    1949      2
 2    1949      3
 3    1949      4
 4    1949      5
 ..    ...    ...
 139  1960      8
 140  1960      9
 141  1960     10
 142  1960     11
 143  1960     12
 
 [144 rows x 2 columns],      Passengers
 0           112
 1           118
 2           132
 3           129
 4           121
 ..          ...
 139         606
 140         508
 141         461
 142         390
 143         432
 
 [144 rows x 1 columns])

In [11]:
# Single scaler instance: fitted on the training features and reused for the test features.
scaler = StandardScaler()

In [12]:
# Chronological hold-out: the rows are monthly observations in time order and are later
# cut into windows of consecutive rows. train_test_split shuffles by default, which would
# make those windows non-contiguous in time and interleave validation months with
# training months, so shuffling is disabled. The first 70% of the months are used for
# training and the last 30% for validation, and the split is deterministic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)

In [13]:
# Estimate mean/std on the training features only, then apply those same statistics
# to both splits. The targets are not scaled here.
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [49]:
def _make_windowed_dataset(data, targets, sequence_length=3, batch_size=5):
    """Build a batched dataset of sliding windows over consecutive rows.

    Thin wrapper around ``keras.preprocessing.timeseries_dataset_from_array`` so the
    training and validation datasets are guaranteed to share the same windowing
    settings.

    Args:
        data: 2-D array-like of features, one row per timestep.
        targets: Array-like of targets, one row per timestep of ``data``.
        sequence_length: Number of consecutive rows in each window.
        batch_size: Number of windows per batch.

    Returns:
        The dataset produced by ``timeseries_dataset_from_array``; each element is
        an ``(inputs, targets)`` pair of batches.
    """
    # NOTE(review): per the Keras documentation, targets[i] is paired with the window
    # that *starts* at row i, so each window is matched with the target of its first
    # row. If the intent is to predict the value at the end of the window, pass
    # targets[sequence_length - 1:] instead - confirm the intended alignment.
    return keras.preprocessing.timeseries_dataset_from_array(
        data,
        targets,
        sequence_length=sequence_length,
        sampling_rate=1,
        batch_size=batch_size,
    )


dataset_train = _make_windowed_dataset(X_train, y_train)
dataset_val = _make_windowed_dataset(X_test, y_test)

In [50]:
# Pull one batch to check tensor shapes. `inputs` stays in the namespace and is read
# by the model cell to derive the LSTM input_shape.
for inputs, targets in dataset_train.take(1):
    print(inputs.shape, targets.shape, sep='\n')

(5, 3, 2)
(5, 1)


In [79]:
# LSTM over each window -> kernel-size-1 convolution -> flatten -> funnel of Dense
# layers down to a single regression output.
# NOTE(review): every layer after the LSTM uses a linear activation, so the
# Conv1D/Dense stack composes into a single linear map of the LSTM outputs.
# Consider a non-linear activation on the hidden layers (keeping the last one
# linear for regression) - confirm whether the linear head is intentional.
model = models.Sequential([
    # Window length and feature count are taken from the sample batch inspected earlier.
    layers.LSTM(64,
                return_sequences=True,
                input_shape=(inputs.shape[1], inputs.shape[2])),
    layers.Conv1D(32, 1, activation='linear'),
    layers.Flatten(),
    *[layers.Dense(units, activation='linear') for units in (16, 8, 4, 2, 1)],
])

# Regression setup: optimise mean squared error, report mean absolute error.
model.compile(loss='mse', optimizer='adam', metrics=['mae'])
model.summary()

Model: "sequential_25"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_25 (LSTM)               (None, 3, 64)             17152     
_________________________________________________________________
conv1d_25 (Conv1D)           (None, 3, 32)             2080      
_________________________________________________________________
flatten_22 (Flatten)         (None, 96)                0         
_________________________________________________________________
dense_27 (Dense)             (None, 16)                1552      
_________________________________________________________________
dense_28 (Dense)             (None, 8)                 136       
_________________________________________________________________
dense_29 (Dense)             (None, 4)                 36        
_________________________________________________________________
dense_30 (Dense)             (None, 2)               

In [80]:
# Train for 30 epochs, evaluating on the validation windows after each epoch;
# the returned History object is kept in `history`.
history = model.fit(dataset_train, validation_data=dataset_val, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


2. Опишите, какой результат вы получили? Что помогло вам улучшить ее точность?

Учитывая объем датасета, я думаю, что результат неплохой. Улучшить метрики помогло увеличение таких гиперпараметров, как sequence_length (параметр препроцессинга датасета временных рядов), количество нейронов в слоях LSTM и Conv1D, количество слоёв Dense. Также на итоговые метрики положительно влияло уменьшение batch_size.