# Импорт необходимых библиотек

In [None]:
import numpy as np
import pandas as pd
import datetime
import random

import optuna
import pickle

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
import tensorflow_addons as tfa

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from keras.models import Model
from keras.layers import Input, Embedding, Flatten, LSTM, Dense, concatenate, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical

from keras.models import save_model

# Создание функций

## Создание мок-данных с датами

In [None]:
def generate_dates_for_client(sub_df, start_date):
    """Fill the ``date`` column of ``sub_df`` with mock timestamps, in place.

    Rows are visited in order. The first time a ``category`` value is met, a
    random timestamp in ``[start_date, start_date + 730 days)`` is drawn and
    remembered for that category. A later row whose category was already
    seen reuses the remembered timestamp with probability 0.5; otherwise a
    fresh timestamp is drawn and replaces the remembered one.

    Args:
        sub_df: DataFrame with a ``category`` column. Its ``date`` cells are
            written one row at a time through ``.loc``.
        start_date: ``datetime.datetime`` used as the lower bound.

    Returns:
        The same ``sub_df`` object that was passed in.
    """
    last_date_by_category = {}

    for row_label, row in sub_df.iterrows():
        key = row['category']

        # Short-circuit: `rand()` is consumed only for categories seen before.
        reuse_previous = key in last_date_by_category and np.random.rand() < 0.5

        if not reuse_previous:
            # Draw the day offset first, then the second offset.
            day_offset = np.random.randint(0, 365 * 2)
            second_offset = np.random.randint(0, 3600 * 24)
            last_date_by_category[key] = start_date + datetime.timedelta(
                days=day_offset,
                seconds=second_offset,
            )

        sub_df.loc[row_label, 'date'] = last_date_by_category[key]

    return sub_df

## Создание временных фичей

In [None]:
def add_time_features(df):
    """Add ``quarter``, ``month``, ``year`` and ``season`` columns to ``df``.

    The columns are derived from the datetime column ``date`` and written
    into ``df`` itself; the same object is returned.

    Season codes: 1 = winter (Dec-Feb), 2 = spring (Mar-May),
    3 = summer (Jun-Aug), 4 = autumn (Sep-Nov).
    """
    stamps = df['date'].dt
    months = stamps.month

    df['quarter'] = stamps.quarter
    df['month'] = months
    df['year'] = stamps.year
    # December maps to 0 via `% 12`, so it lands in the same bucket as Jan/Feb.
    df['season'] = (months % 12 + 3) // 3
    return df

## Создание датасетов для обучения модели

In [None]:
def create_dataset(data, look_back=1):
    """Turn a time-ordered matrix into supervised (window, next-row) pairs.

    Sample ``i`` consists of rows ``i .. i + look_back - 1`` as the input
    window and row ``i + look_back`` as the target.

    Args:
        data: array-like whose first axis is time (rows are time steps).
        look_back: number of consecutive rows in each input window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays. For 2-D input with ``f`` columns,
        ``X`` has shape ``(n, look_back, f)`` and ``y`` has shape ``(n, f)``,
        where ``n = max(len(data) - look_back, 0)``. When the series is too
        short for a single window the arrays are empty but keep those
        trailing dimensions, so callers can still read ``X.shape[1:]``.

    Raises:
        ValueError: if ``look_back`` is negative.
    """
    if look_back < 0:
        raise ValueError("look_back must be non-negative")

    data = np.asarray(data)
    row_shape = data.shape[1:]
    n_samples = len(data) - look_back

    if n_samples <= 0:
        # Not enough rows for even one (window, target) pair.
        empty_windows = np.empty((0, look_back) + row_shape, dtype=data.dtype)
        empty_targets = np.empty((0,) + row_shape, dtype=data.dtype)
        return empty_windows, empty_targets

    windows = np.stack([data[i:i + look_back] for i in range(n_samples)])
    # Copy so the targets do not alias the caller's array.
    targets = data[look_back:].copy()
    return windows, targets

## Целевая функция Optuna: подбор гиперпараметров LSTM

In [None]:
def lstm(trial):
    """Optuna objective: fit an LSTM on ``df_grouped`` and score its forecast of the last row.

    Reads the module-level ``df_grouped`` frame. Its last row is held out as
    the test target; all earlier rows form the training series. The scaler
    is fitted on the training rows only. The fitted scaler is attached to
    the trial as the ``"scaler"`` user attribute.

    Args:
        trial: Optuna trial used to sample ``look_back``, ``n_neurons``,
            ``learning_rate``, ``num_layers``, ``dropout_rate``, ``epochs``
            and ``batch_size``.

    Returns:
        Mean squared error between the held-out row and the de-normalised
        prediction, i.e. measured in the original (unscaled) units.

    Raises:
        optuna.TrialPruned: if the training series has too few rows to build
            a single window for the sampled ``look_back``.
    """
    # Every trial builds a brand-new model; reset Keras' global state so that
    # state left behind by earlier trials does not keep accumulating.
    tf.keras.backend.clear_session()

    look_back = trial.suggest_int('look_back', 1, 23)

    # Hold out the last row as the test target.
    test = df_grouped.iloc[-1].values.reshape(1, -1)
    train = df_grouped.iloc[:-1]

    # At least `look_back` input rows plus one target row are required;
    # skip this trial instead of failing the whole study.
    if len(train) <= look_back:
        raise optuna.TrialPruned(
            f"look_back={look_back} needs more than {len(train)} training rows"
        )

    # Scaling: statistics come from the training rows only.
    scaler = StandardScaler().fit(train.values)
    scaled_train = scaler.transform(train.values)

    X_train, y_train = create_dataset(scaled_train, look_back)
    # The most recent `look_back` training rows are the input for the forecast.
    X_test = scaled_train[-look_back:].reshape(1, look_back, -1)

    n_neurons = trial.suggest_int('n_neurons', 4, 128)
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True)
    num_layers = trial.suggest_int('num_layers', 1, 10)
    dropout_rate = trial.suggest_float('dropout_rate', 0, 0.5)
    epochs = trial.suggest_int('epochs', 5, 100)
    batch_size = trial.suggest_int('batch_size', 1, 256)

    model = build_and_train_lstm(X_train, y_train, n_neurons, Adam, learning_rate, num_layers, dropout_rate, epochs, batch_size)
    y_pred = model.predict(X_test)

    # Bring the forecast back to the original scale before scoring.
    y_pred = scaler.inverse_transform(y_pred)

    # NOTE(review): this stores a Python object as a user attribute; that is
    # fine for an in-memory study, but confirm before using persistent storage.
    trial.set_user_attr("scaler", scaler)

    return mean_squared_error(test, y_pred)


## Создание и обучение модели

In [None]:
def build_and_train_lstm(X_train, y_train, n_neurons, optimizer, learning_rate, num_layers, dropout_rate, epochs, batch_size):
    """Build a stacked-LSTM regressor, fit it on the given windows and return it.

    Args:
        X_train: 3-D array; ``shape[1]`` and ``shape[2]`` define the input layer.
        y_train: 2-D array; ``shape[1]`` sets the width of the output layer.
        n_neurons: number of units in every LSTM layer.
        optimizer: optimizer class (e.g. ``Adam``); it is instantiated with
            ``learning_rate``. Any value that is not a class falls back to
            Adam, which is what this function always used previously.
        learning_rate: learning rate handed to the optimizer.
        num_layers: number of stacked LSTM layers.
        dropout_rate: if greater than 0, a Dropout layer with this rate
            follows every LSTM layer.
        epochs: number of training epochs.
        batch_size: mini-batch size used by ``fit``.

    Returns:
        The fitted Keras ``Model``.
    """
    input_time_series = Input(shape=(X_train.shape[1], X_train.shape[2]), name='time_series_input')

    x_ts = input_time_series
    for i in range(num_layers):
        # Every LSTM except the last returns the full sequence so that the
        # next LSTM layer still receives a sequence as input.
        x_ts = LSTM(n_neurons, return_sequences=i < num_layers - 1)(x_ts)
        if dropout_rate > 0:
            x_ts = Dropout(dropout_rate)(x_ts)

    output = Dense(y_train.shape[1])(x_ts)

    model = Model(inputs=input_time_series, outputs=output)

    model.summary()

    # Honour the requested optimizer class instead of silently replacing it.
    if isinstance(optimizer, type):
        compiled_optimizer = optimizer(learning_rate=learning_rate)
    else:
        compiled_optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    model.compile(loss='mean_squared_error', optimizer=compiled_optimizer)
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=1)  # verbose=1 prints per-epoch training logs

    return model

# Обработка данных

Словарь соответствия тем (`topic`) и их числовых меток (`label`)

In [None]:
# Two parallel lists: product topic names and integer labels 0..9
# (presumably paired by position — verify against where the labels are used).
dictionary = {
    "topic": [
        'автозапчасти',
        'видеоигры',
        'напитки',
        'продукты питания',
        'закуски и приправы',
        'аквариум',
        'одежда',
        'уборка',
        'электроника',
        'образование',
    ],
    "label": list(range(10)),
}

## Загружаем датасет и обрабатываем данные

In [None]:
# Load the purchases sheet and drop the rows whose topic equals
# 'нет категории' (Russian for "no category").
df = pd.read_excel('final_df.xlsx', sheet_name='Sheet1')
df = df.loc[df['topic'].ne('нет категории')]

Создание мок-данных с датами

In [None]:
# Build mock "combined clients": sample distinct real clients, merge every
# `clients_per_group` of them under one synthetic client name, and give each
# purchase row a random timestamp within the two years before now.
n_groups = 100
clients_per_group = 24

unique_clients = df['client'].unique()
# Sampling without replacement: np.random.choice raises ValueError when there
# are fewer than n_groups * clients_per_group distinct clients.
selected_clients = np.random.choice(unique_clients, n_groups * clients_per_group, replace=False)

# One shared base timestamp (two years back), computed once so that every
# group is generated against exactly the same time window.
start_date = datetime.datetime.now() - datetime.timedelta(days=365*2)

# Per-client frames are collected here and concatenated once at the end.
all_subsets = []

for i in range(n_groups):
    # Slice out the real clients that make up the current combined client.
    clients_group = selected_clients[i*clients_per_group:(i+1)*clients_per_group]
    client_name = f"Объединенный_Клиент_{i + 1}"

    for client in clients_group:
        # Work on a copy so the source frame is not modified.
        client_subset = df[df['client'] == client].copy()
        client_subset['date'] = None
        client_subset = generate_dates_for_client(client_subset, start_date)
        client_subset['client'] = client_name
        all_subsets.append(client_subset)

# Merge all subsets into one frame with a fresh 0..n-1 index.
result_df = pd.concat(all_subsets, ignore_index=True)

In [None]:
# Show the 60 latest-dated rows of one combined client to inspect the generated dates.
display(
    result_df.loc[result_df['client'] == 'Объединенный_Клиент_2']
    .sort_values(by='date')
    .tail(60)
)

Unnamed: 0,Столбец1,sale,category,price,client,cleaned_sale,topic,Unnamed: 7,Категория,Встречаемость,Unnamed: 10,Unnamed: 11,Unnamed: 12,date
1135,259938,Напиток чайный травяной о'кей русский сбор...,1976872004,74,Объединенный_Клиент_2,напиток чайный травяной русский сбор,напитки,,,,,,,2023-07-23 22:26:53.350929
1406,295362,Вода природная питьевая сенежская газированная...,1976872004,38,Объединенный_Клиент_2,вода природный питьевой сенежский газированная л,напитки,,,,,,,2023-07-25 03:03:58.350929
1407,295363,Кофе carte noire intense absolu в зернах 800 г,1976872004,2,Объединенный_Клиент_2,кофе carte noire intense absolu в зернах г,напитки,,,,,,,2023-07-25 03:03:58.350929
799,206596,Оперативная память kingston fury beast black [...,1838685550,3899,Объединенный_Клиент_2,оперативный память kingston fury beast black г...,электроника,,,,,,,2023-07-25 05:16:08.350929
800,206597,Материнская плата asrock b660m pro rs,1838685550,8466,Объединенный_Клиент_2,материнский плата asrock m pro rs,электроника,,,,,,,2023-07-25 05:16:08.350929
1328,69197,Конфеты шоколадные essen 35 с морской солью и ...,1363498391,64,Объединенный_Клиент_2,конфета шоколадный essen с морской соль и кран...,продукты питания,,,,,,,2023-07-27 05:50:46.350929
1327,69196,Вафли акульчев венские с бананом 100 г,1363498391,59,Объединенный_Клиент_2,вафли акульчев венский с банан г,продукты питания,,,,,,,2023-07-27 05:50:46.350929
1056,158669,Мешок для стирки одежды atmosphere 40 х 30 см,1505938033,339,Объединенный_Клиент_2,мешок для стирка одежда atmosphere х см,уборка,,,,,,,2023-07-31 17:15:19.350929
1057,158670,Сменный флакон средства mr. muscle утренняя ро...,1505938033,189,Объединенный_Клиент_2,сменный флакон средство mr muscle утренний рос...,уборка,,,,,,,2023-07-31 17:15:19.350929
802,247922,Смесь орехов и кукурузы ем! best nuts super ba...,1964648565,229,Объединенный_Клиент_2,смесь орех и кукуруза ем best nuts super bar г,закуски и приправы,,,,,,,2023-08-02 01:32:45.350929


Даты создались корректно

Преобразование типа данных

In [None]:
# Convert the generated timestamps to pandas datetime dtype; the `.dt`
# accessor used when building the time features needs a datetime column.
result_df['date'] = pd.to_datetime(result_df['date'])

Сохраняем одного клиента для будущих тестов

In [None]:
# Hold out one combined client for later tests. `.copy()` makes the subset an
# independent frame, so adding columns to it afterwards does not raise
# pandas' SettingWithCopyWarning.
test_df = result_df[result_df['client']=='Объединенный_Клиент_1'].copy()

In [None]:
# Persist the held-out client to disk for later tests.
# NOTE(review): the row index is written as well (to_csv default), so reading
# the file back yields an extra unnamed column unless `index_col` is given —
# confirm this is intended.
test_df.to_csv('test_df.csv')

In [None]:
# Training set: every row of result_df whose index label is not in the held-out test set.
train_df = result_df.drop(index=result_df.index[result_df.index.isin(test_df.index)])

Добавление временных фичей в трейн и тест выборки

In [None]:
# Derive calendar features for both splits (train first, then test).
# add_time_features writes the columns into the frame it receives and returns
# that same frame, so new_train_df / new_test_df refer to train_df / test_df.
new_train_df, new_test_df = map(add_time_features, (train_df, test_df))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['quarter'] = df['date'].dt.quarter
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['month'] = df['date'].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = df['date'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_ind

## Агрегация данных

In [None]:
# Monthly spend matrix for the training clients: sum prices per
# (client, year, month, season, topic), then spread topics into columns with
# one row per (year, month, season); missing combinations become 0.
# NOTE(review): pivot_table is called with its default aggfunc (mean), so the
# per-client sums that share a (year, month, season, topic) cell are averaged —
# confirm that an average across clients is what is wanted.
df_grouped = (
    new_train_df
    .groupby(['client', 'year', 'month', 'season', 'topic'])
    .agg({'price': 'sum'})
    .reset_index()
    .pivot_table(index=['year', 'month', 'season'], columns='topic', values='price', fill_value=0)
    .reset_index()
)

In [None]:
# Same monthly spend matrix for the held-out client: sum prices per
# (client, year, month, season, topic), then spread topics into columns with
# one row per (year, month, season); missing combinations become 0.
df_grouped_test = (
    new_test_df
    .groupby(['client', 'year', 'month', 'season', 'topic'])
    .agg({'price': 'sum'})
    .reset_index()
    .pivot_table(index=['year', 'month', 'season'], columns='topic', values='price', fill_value=0)
    .reset_index()
)

# Запуск обучения

In [None]:
# Hyper-parameter search: run 20 Optuna trials that minimise the value
# returned by the `lstm` objective.
study = optuna.create_study(direction='minimize')
study.optimize(lstm, n_trials=20)

# Parameters of the trial with the lowest objective value.
best_params = study.best_params
print("Лучшие параметры: ", best_params)

[I 2023-09-27 22:39:58,701] A new study created in memory with name: no-name-661fe10b-ab08-4037-b42f-f68e6147d57d


Model: "model_299"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 time_series_input (InputLa  [(None, 10, 10)]          0         
 yer)                                                            
                                                                 
 lstm_1474 (LSTM)            (None, 10, 77)            27104     
                                                                 
 dropout_1474 (Dropout)      (None, 10, 77)            0         
                                                                 
 lstm_1475 (LSTM)            (None, 10, 77)            47740     
                                                                 
 dropout_1475 (Dropout)      (None, 10, 77)            0         
                                                                 
 lstm_1476 (LSTM)            (None, 10, 77)            47740     
                                                         

[I 2023-09-27 22:40:23,480] Trial 0 finished with value: 14872399.034937257 and parameters: {'look_back': 10, 'n_neurons': 77, 'learning_rate': 0.03203437321567369, 'num_layers': 9, 'dropout_rate': 0.3589718194965184, 'epochs': 100, 'batch_size': 207}. Best is trial 0 with value: 14872399.034937257.


Model: "model_300"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 time_series_input (InputLa  [(None, 12, 10)]          0         
 yer)                                                            
                                                                 
 lstm_1483 (LSTM)            (None, 12, 97)            41904     
                                                                 
 dropout_1483 (Dropout)      (None, 12, 97)            0         
                                                                 
 lstm_1484 (LSTM)            (None, 12, 97)            75660     
                                                                 
 dropout_1484 (Dropout)      (None, 12, 97)            0         
                                                                 
 lstm_1485 (LSTM)            (None, 12, 97)            75660     
                                                         

[I 2023-09-27 22:40:33,424] Trial 1 finished with value: 9053509.538612772 and parameters: {'look_back': 12, 'n_neurons': 97, 'learning_rate': 0.002591718149280472, 'num_layers': 5, 'dropout_rate': 0.34731515204354824, 'epochs': 13, 'batch_size': 131}. Best is trial 1 with value: 9053509.538612772.


Model: "model_301"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 time_series_input (InputLa  [(None, 19, 10)]          0         
 yer)                                                            
                                                                 
 lstm_1488 (LSTM)            (None, 19, 100)           44400     
                                                                 
 dropout_1488 (Dropout)      (None, 19, 100)           0         
                                                                 
 lstm_1489 (LSTM)            (None, 19, 100)           80400     
                                                                 
 dropout_1489 (Dropout)      (None, 19, 100)           0         
                                                                 
 lstm_1490 (LSTM)            (None, 19, 100)           80400     
                                                         

[I 2023-09-27 22:40:53,364] Trial 2 finished with value: 25202345.5293405 and parameters: {'look_back': 19, 'n_neurons': 100, 'learning_rate': 0.015614450853352871, 'num_layers': 7, 'dropout_rate': 0.4031867145890578, 'epochs': 75, 'batch_size': 255}. Best is trial 1 with value: 9053509.538612772.


Model: "model_302"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 time_series_input (InputLa  [(None, 7, 10)]           0         
 yer)                                                            
                                                                 
 lstm_1495 (LSTM)            (None, 7, 65)             19760     
                                                                 
 dropout_1495 (Dropout)      (None, 7, 65)             0         
                                                                 
 lstm_1496 (LSTM)            (None, 7, 65)             34060     
                                                                 
 dropout_1496 (Dropout)      (None, 7, 65)             0         
                                                                 
 lstm_1497 (LSTM)            (None, 65)                34060     
                                                         

[I 2023-09-27 22:40:59,581] Trial 3 finished with value: 20716534.975369375 and parameters: {'look_back': 7, 'n_neurons': 65, 'learning_rate': 0.0008201419454280225, 'num_layers': 3, 'dropout_rate': 0.09178848526183958, 'epochs': 21, 'batch_size': 209}. Best is trial 1 with value: 9053509.538612772.


Model: "model_303"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 time_series_input (InputLa  [(None, 9, 10)]           0         
 yer)                                                            
                                                                 
 lstm_1498 (LSTM)            (None, 9, 25)             3600      
                                                                 
 dropout_1498 (Dropout)      (None, 9, 25)             0         
                                                                 
 lstm_1499 (LSTM)            (None, 9, 25)             5100      
                                                                 
 dropout_1499 (Dropout)      (None, 9, 25)             0         
                                                                 
 lstm_1500 (LSTM)            (None, 9, 25)             5100      
                                                         

[I 2023-09-27 22:41:15,222] Trial 4 finished with value: 20297530.96594978 and parameters: {'look_back': 9, 'n_neurons': 25, 'learning_rate': 0.01495572941905382, 'num_layers': 7, 'dropout_rate': 0.4537622997783698, 'epochs': 11, 'batch_size': 94}. Best is trial 1 with value: 9053509.538612772.


Model: "model_304"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 time_series_input (InputLa  [(None, 14, 10)]          0         
 yer)                                                            
                                                                 
 lstm_1505 (LSTM)            (None, 14, 64)            19200     
                                                                 
 dropout_1505 (Dropout)      (None, 14, 64)            0         
                                                                 
 lstm_1506 (LSTM)            (None, 14, 64)            33024     
                                                                 
 dropout_1506 (Dropout)      (None, 14, 64)            0         
                                                                 
 lstm_1507 (LSTM)            (None, 14, 64)            33024     
                                                         

[I 2023-09-27 22:41:36,776] Trial 5 finished with value: 27948136.564549077 and parameters: {'look_back': 14, 'n_neurons': 64, 'learning_rate': 0.005717083645946583, 'num_layers': 8, 'dropout_rate': 0.12261776781354117, 'epochs': 84, 'batch_size': 198}. Best is trial 1 with value: 9053509.538612772.


Model: "model_305"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 time_series_input (InputLa  [(None, 11, 10)]          0         
 yer)                                                            
                                                                 
 lstm_1513 (LSTM)            (None, 11, 67)            20904     
                                                                 
 dropout_1513 (Dropout)      (None, 11, 67)            0         
                                                                 
 lstm_1514 (LSTM)            (None, 11, 67)            36180     
                                                                 
 dropout_1514 (Dropout)      (None, 11, 67)            0         
                                                                 
 lstm_1515 (LSTM)            (None, 11, 67)            36180     
                                                         

[I 2023-09-27 22:41:48,091] Trial 6 finished with value: 22141626.102261297 and parameters: {'look_back': 11, 'n_neurons': 67, 'learning_rate': 0.07643355782056707, 'num_layers': 5, 'dropout_rate': 0.24747293115178465, 'epochs': 40, 'batch_size': 25}. Best is trial 1 with value: 9053509.538612772.


Model: "model_306"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 time_series_input (InputLa  [(None, 6, 10)]           0         
 yer)                                                            
                                                                 
 lstm_1518 (LSTM)            (None, 84)                31920     
                                                                 
 dropout_1518 (Dropout)      (None, 84)                0         
                                                                 
 dense_561 (Dense)           (None, 10)                850       
                                                                 
Total params: 32770 (128.01 KB)
Trainable params: 32770 (128.01 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/87
Epoch 2/87
Epoch 3/87
Epoch 4/87
Epoch 5/87
Epoch 6/87
Epoch 7/87
Epoch 8/

[I 2023-09-27 22:41:51,429] Trial 7 finished with value: 63717277.9330281 and parameters: {'look_back': 6, 'n_neurons': 84, 'learning_rate': 0.01787298088553517, 'num_layers': 1, 'dropout_rate': 0.10171251204914056, 'epochs': 87, 'batch_size': 235}. Best is trial 1 with value: 9053509.538612772.


Model: "model_307"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 time_series_input (InputLa  [(None, 8, 10)]           0         
 yer)                                                            
                                                                 
 lstm_1519 (LSTM)            (None, 8, 44)             9680      
                                                                 
 dropout_1519 (Dropout)      (None, 8, 44)             0         
                                                                 
 lstm_1520 (LSTM)            (None, 8, 44)             15664     
                                                                 
 dropout_1520 (Dropout)      (None, 8, 44)             0         
                                                                 
 lstm_1521 (LSTM)            (None, 8, 44)             15664     
                                                         

[I 2023-09-27 22:42:03,566] Trial 8 finished with value: 6752254.256320087 and parameters: {'look_back': 8, 'n_neurons': 44, 'learning_rate': 0.08454072486381273, 'num_layers': 5, 'dropout_rate': 0.08953081803596408, 'epochs': 35, 'batch_size': 182}. Best is trial 8 with value: 6752254.256320087.


Model: "model_308"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 time_series_input (InputLa  [(None, 23, 10)]          0         
 yer)                                                            
                                                                 
 lstm_1524 (LSTM)            (None, 23, 16)            1728      
                                                                 
 dropout_1524 (Dropout)      (None, 23, 16)            0         
                                                                 
 lstm_1525 (LSTM)            (None, 23, 16)            2112      
                                                                 
 dropout_1525 (Dropout)      (None, 23, 16)            0         
                                                                 
 lstm_1526 (LSTM)            (None, 23, 16)            2112      
                                                         

[I 2023-09-27 22:42:13,252] Trial 9 finished with value: 12649845.502215488 and parameters: {'look_back': 23, 'n_neurons': 16, 'learning_rate': 0.00011859233873585027, 'num_layers': 5, 'dropout_rate': 0.3679545835321003, 'epochs': 5, 'batch_size': 192}. Best is trial 8 with value: 6752254.256320087.


Model: "model_309"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 time_series_input (InputLa  [(None, 1, 10)]           0         
 yer)                                                            
                                                                 
 lstm_1529 (LSTM)            (None, 1, 39)             7800      
                                                                 
 dropout_1529 (Dropout)      (None, 1, 39)             0         
                                                                 
 lstm_1530 (LSTM)            (None, 1, 39)             12324     
                                                                 
 dropout_1530 (Dropout)      (None, 1, 39)             0         
                                                                 
 lstm_1531 (LSTM)            (None, 39)                12324     
                                                         

[I 2023-09-27 22:42:19,839] Trial 10 finished with value: 92471214.94239423 and parameters: {'look_back': 1, 'n_neurons': 39, 'learning_rate': 0.09418314845109724, 'num_layers': 3, 'dropout_rate': 0.03853606287693345, 'epochs': 52, 'batch_size': 146}. Best is trial 8 with value: 6752254.256320087.


Model: "model_310"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 time_series_input (InputLa  [(None, 15, 10)]          0         
 yer)                                                            
                                                                 
 lstm_1532 (LSTM)            (None, 15, 122)           64904     
                                                                 
 dropout_1532 (Dropout)      (None, 15, 122)           0         
                                                                 
 lstm_1533 (LSTM)            (None, 15, 122)           119560    
                                                                 
 dropout_1533 (Dropout)      (None, 15, 122)           0         
                                                                 
 lstm_1534 (LSTM)            (None, 122)               119560    
                                                         

[I 2023-09-27 22:42:26,929] Trial 11 finished with value: 8092282.2565089045 and parameters: {'look_back': 15, 'n_neurons': 122, 'learning_rate': 0.002660993821739402, 'num_layers': 3, 'dropout_rate': 0.23686985666677332, 'epochs': 29, 'batch_size': 122}. Best is trial 8 with value: 6752254.256320087.


Model: "model_311"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 time_series_input (InputLa  [(None, 17, 10)]          0         
 yer)                                                            
                                                                 
 lstm_1535 (LSTM)            (None, 17, 123)           65928     
                                                                 
 dropout_1535 (Dropout)      (None, 17, 123)           0         
                                                                 
 lstm_1536 (LSTM)            (None, 17, 123)           121524    
                                                                 
 dropout_1536 (Dropout)      (None, 17, 123)           0         
                                                                 
 lstm_1537 (LSTM)            (None, 123)               121524    
                                                         

[I 2023-09-27 22:42:34,258] Trial 12 finished with value: 15951549.079834606 and parameters: {'look_back': 17, 'n_neurons': 123, 'learning_rate': 0.0029682266104154027, 'num_layers': 3, 'dropout_rate': 0.20851987368442693, 'epochs': 33, 'batch_size': 90}. Best is trial 8 with value: 6752254.256320087.


Model: "model_312"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 time_series_input (InputLa  [(None, 3, 10)]           0         
 yer)                                                            
                                                                 
 lstm_1538 (LSTM)            (None, 49)                11760     
                                                                 
 dropout_1538 (Dropout)      (None, 49)                0         
                                                                 
 dense_567 (Dense)           (None, 10)                500       
                                                                 
Total params: 12260 (47.89 KB)
Trainable params: 12260 (47.89 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32

[I 2023-09-27 22:42:37,166] Trial 13 finished with value: 22282164.133514687 and parameters: {'look_back': 3, 'n_neurons': 49, 'learning_rate': 0.001159859646931284, 'num_layers': 1, 'dropout_rate': 0.2044591387619531, 'epochs': 32, 'batch_size': 158}. Best is trial 8 with value: 6752254.256320087.


Model: "model_313"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 time_series_input (InputLa  [(None, 15, 10)]          0         
 yer)                                                            
                                                                 
 lstm_1539 (LSTM)            (None, 15, 119)           61880     
                                                                 
 dropout_1539 (Dropout)      (None, 15, 119)           0         
                                                                 
 lstm_1540 (LSTM)            (None, 15, 119)           113764    
                                                                 
 dropout_1540 (Dropout)      (None, 15, 119)           0         
                                                                 
 lstm_1541 (LSTM)            (None, 15, 119)           113764    
                                                         

[I 2023-09-27 22:42:47,665] Trial 14 finished with value: 12590444.929693002 and parameters: {'look_back': 15, 'n_neurons': 119, 'learning_rate': 0.007148616887495889, 'num_layers': 4, 'dropout_rate': 0.01884455517828154, 'epochs': 56, 'batch_size': 89}. Best is trial 8 with value: 6752254.256320087.


Model: "model_314"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 time_series_input (InputLa  [(None, 22, 10)]          0         
 yer)                                                            
                                                                 
 lstm_1543 (LSTM)            (None, 22, 4)             240       
                                                                 
 dropout_1543 (Dropout)      (None, 22, 4)             0         
                                                                 
 lstm_1544 (LSTM)            (None, 22, 4)             144       
                                                                 
 dropout_1544 (Dropout)      (None, 22, 4)             0         
                                                                 
 lstm_1545 (LSTM)            (None, 22, 4)             144       
                                                         

[I 2023-09-27 22:43:11,282] Trial 15 finished with value: 4735971.344863197 and parameters: {'look_back': 22, 'n_neurons': 4, 'learning_rate': 0.03760326378601691, 'num_layers': 10, 'dropout_rate': 0.1638169303427387, 'epochs': 51, 'batch_size': 38}. Best is trial 15 with value: 4735971.344863197.


Model: "model_315"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 time_series_input (InputLa  [(None, 21, 10)]          0         
 yer)                                                            
                                                                 
 lstm_1553 (LSTM)            (None, 21, 6)             408       
                                                                 
 dropout_1553 (Dropout)      (None, 21, 6)             0         
                                                                 
 lstm_1554 (LSTM)            (None, 21, 6)             312       
                                                                 
 dropout_1554 (Dropout)      (None, 21, 6)             0         
                                                                 
 lstm_1555 (LSTM)            (None, 21, 6)             312       
                                                         

[I 2023-09-27 22:43:32,302] Trial 16 finished with value: 28044201.328095533 and parameters: {'look_back': 21, 'n_neurons': 6, 'learning_rate': 0.05010283993791127, 'num_layers': 10, 'dropout_rate': 0.15038980169024718, 'epochs': 53, 'batch_size': 63}. Best is trial 15 with value: 4735971.344863197.


Model: "model_316"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 time_series_input (InputLa  [(None, 5, 10)]           0         
 yer)                                                            
                                                                 
 lstm_1563 (LSTM)            (None, 5, 38)             7448      
                                                                 
 dropout_1563 (Dropout)      (None, 5, 38)             0         
                                                                 
 lstm_1564 (LSTM)            (None, 5, 38)             11704     
                                                                 
 dropout_1564 (Dropout)      (None, 5, 38)             0         
                                                                 
 lstm_1565 (LSTM)            (None, 5, 38)             11704     
                                                         

[I 2023-09-27 22:43:54,878] Trial 17 finished with value: 16147608.376725424 and parameters: {'look_back': 5, 'n_neurons': 38, 'learning_rate': 0.0955407384092249, 'num_layers': 7, 'dropout_rate': 0.17085930138483174, 'epochs': 61, 'batch_size': 1}. Best is trial 15 with value: 4735971.344863197.


Model: "model_317"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 time_series_input (InputLa  [(None, 19, 10)]          0         
 yer)                                                            
                                                                 
 lstm_1570 (LSTM)            (None, 19, 24)            3360      
                                                                 
 dropout_1570 (Dropout)      (None, 19, 24)            0         
                                                                 
 lstm_1571 (LSTM)            (None, 19, 24)            4704      
                                                                 
 dropout_1571 (Dropout)      (None, 19, 24)            0         
                                                                 
 lstm_1572 (LSTM)            (None, 19, 24)            4704      
                                                         

[I 2023-09-27 22:44:17,755] Trial 18 finished with value: 16795740.592632275 and parameters: {'look_back': 19, 'n_neurons': 24, 'learning_rate': 0.03693577908914035, 'num_layers': 10, 'dropout_rate': 0.2966031847991124, 'epochs': 42, 'batch_size': 51}. Best is trial 15 with value: 4735971.344863197.


Model: "model_318"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 time_series_input (InputLa  [(None, 13, 10)]          0         
 yer)                                                            
                                                                 
 lstm_1580 (LSTM)            (None, 13, 4)             240       
                                                                 
 dropout_1580 (Dropout)      (None, 13, 4)             0         
                                                                 
 lstm_1581 (LSTM)            (None, 13, 4)             144       
                                                                 
 dropout_1581 (Dropout)      (None, 13, 4)             0         
                                                                 
 lstm_1582 (LSTM)            (None, 13, 4)             144       
                                                         

[I 2023-09-27 22:44:30,620] Trial 19 finished with value: 8623958.311775438 and parameters: {'look_back': 13, 'n_neurons': 4, 'learning_rate': 0.03861256844675997, 'num_layers': 6, 'dropout_rate': 0.05804846707841893, 'epochs': 66, 'batch_size': 164}. Best is trial 15 with value: 4735971.344863197.


Лучшие параметры:  {'look_back': 22, 'n_neurons': 4, 'learning_rate': 0.03760326378601691, 'num_layers': 10, 'dropout_rate': 0.1638169303427387, 'epochs': 51, 'batch_size': 38}


Сохранение лучших параметров

In [None]:
# Collect what the final evaluation needs from the finished Optuna study:
# the winning trial, its hyperparameters, the chosen look-back window and
# the scaler object that was attached to the trial via user_attrs.
best_trial = study.best_trial
best_params = study.best_params
best_look_back = best_params['look_back']
best_scaler = best_trial.user_attrs["scaler"]

In [None]:
# Display the last row of df_grouped_test; the test cell below holds this
# same row out as the single test sample.
df_grouped_test.iloc[-1]

topic
автозапчасти           863
аквариум                 0
видеоигры                0
закуски и приправы      49
напитки               1805
образование           2040
одежда                   0
продукты питания      1971
уборка                  89
электроника              0
Name: 24, dtype: int64

## Проверка на тесте

In [None]:
# Hold out the last row of df_grouped_test as the single test sample;
# every earlier row is used to fit the final model.
final_train = df_grouped_test.iloc[:-1]
final_test = df_grouped_test.iloc[-1].values.reshape(1, -1)

# Scaling: fit on the training rows only, then apply the same transform
# to both the training rows and the held-out row.
scaler = StandardScaler()
scaler.fit(final_train.values)
final_scaled_train = scaler.transform(final_train.values)
final_scaled_test = scaler.transform(final_test)

# Training samples come from create_dataset (defined elsewhere in the
# notebook; presumably it builds look-back windows -- TODO confirm).
# The test input is the last `best_look_back` scaled training rows and
# the test target is the scaled held-out row.
X_train, y_train = create_dataset(final_scaled_train, best_look_back)
X_test = final_scaled_train[-best_look_back:].reshape(1, best_look_back, -1)
y_test = final_scaled_test

# Build and train the model with the best hyperparameters on the whole
# training set.
tuned_kwargs = {
    name: best_params[name]
    for name in (
        'n_neurons',
        'learning_rate',
        'num_layers',
        'dropout_rate',
        'epochs',
        'batch_size',
    )
}
best_model = build_and_train_lstm(
    X_train,
    y_train,
    optimizer=Adam,
    **tuned_kwargs,
)

# Evaluate on the held-out row. Both y_test and predictions are still in
# scaled units here, so the reported MSE is in scaled units as well.
predictions = best_model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print(f"Mean Squared Error on Test Data: {mse}")

Model: "model_319"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 time_series_input (InputLa  [(None, 22, 10)]          0         
 yer)                                                            
                                                                 
 lstm_1586 (LSTM)            (None, 22, 4)             240       
                                                                 
 dropout_1586 (Dropout)      (None, 22, 4)             0         
                                                                 
 lstm_1587 (LSTM)            (None, 22, 4)             144       
                                                                 
 dropout_1587 (Dropout)      (None, 22, 4)             0         
                                                                 
 lstm_1588 (LSTM)            (None, 22, 4)             144       
                                                         

## Дешифровка данных

In [None]:
# Undo the standardization so the predictions are back in the original
# units; `scaler` is the StandardScaler fitted on final_train above.
predictions_original = scaler.inverse_transform(predictions)

In [None]:
# Print the held-out actual values next to the de-scaled forecast,
# one line per spending category.
print("\nСравнение реальных данных и прогноза:")

# Category labels, listed in the same order as the columns shown for
# df_grouped_test.iloc[-1].
topics = [
    'автозапчасти',
    'аквариум',
    'видеоигры',
    'закуски и приправы',
    'напитки',
    'образование',
    'одежда',
    'продукты питания',
    'уборка',
    'электроника',
]

# Both arrays hold a single sample, so work with their first (only) row.
actual_row = final_test[0]
forecast_row = predictions_original[0]

for pos, label in enumerate(topics):
    print(f"{label}: Реальные данные: {actual_row[pos]:.2f}, Прогноз: {forecast_row[pos]:.2f}")


Сравнение реальных данных и прогноза:
автозапчасти: Реальные данные: 863.00, Прогноз: -34.07
аквариум: Реальные данные: 0.00, Прогноз: 1911.49
видеоигры: Реальные данные: 0.00, Прогноз: 233.48
закуски и приправы: Реальные данные: 49.00, Прогноз: 808.51
напитки: Реальные данные: 1805.00, Прогноз: 1004.00
образование: Реальные данные: 2040.00, Прогноз: 1225.51
одежда: Реальные данные: 0.00, Прогноз: 348.28
продукты питания: Реальные данные: 1971.00, Прогноз: 1509.28
уборка: Реальные данные: 89.00, Прогноз: 731.31
электроника: Реальные данные: 0.00, Прогноз: 55939.65


В целом по некоторым позициям результаты расходятся ощутимо, но это следствие того, что искусственные данные собраны из трат разных людей. По сути, модель предсказывает траты в следующем месяце для обобщённого пользователя, составленного из двух десятков клиентов, а сравнение мы проводим с другим человеком. Однако сами числа в целом нормальные. Есть отрицательные значения, но это даже к лучшему: мы не будем выводить эти числа пользователю, а будем ранжировать по ним категории. В этом случае категория с отрицательным значением окажется в самом конце списка и почти не имеет шансов попасть в пятёрку.

# Сохранение модели и параметров

In [None]:
# Persist the trained model to "spendings.h5" in the working directory
# (presumably the Keras HDF5 format, judging by the extension -- TODO confirm).
# NOTE(review): the fitted `scaler` and `best_look_back` are not saved in
# this cell, yet both are used above to prepare inputs and de-scale
# predictions -- confirm they are persisted elsewhere if needed at inference.
best_model.save("spendings.h5")

In [None]:
# Display the look-back window length taken from the best Optuna trial.
best_look_back

22