In [1]:
import warnings
from collections import namedtuple
from pathlib import Path

import pandas as pd
from keras.layers import Conv1D, GRU, Dense, Dropout, Flatten, LSTM
from keras.models import Sequential
from scikeras.wrappers import KerasRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from keras.regularizers import l2

warnings.filterwarnings("ignore")

In [2]:
base_path = Path('dataset/dataset_versions')

datasets = {}
datasets_names = (
    'bfill_ffill',
    'linear_interpolation',
    'cubic_interpolation',
    'quadratic_interpolation',
    'polynomial_5_interpolation',
    'polynomial_7_interpolation',
    'polynomial_9_interpolation',
    'polynomial_11_interpolation',
)
for dataset_name in datasets_names:
    dataset = pd.read_excel(base_path / f'{dataset_name}_rescaled_dataset.xlsx')
    datasets[dataset_name] = dataset.iloc[:, 1:]

In [3]:
test_size = 0.2
seed = 7
target_feature_name = 'GDP per capita (current US$)'

SplittedDataset = namedtuple('SplittedDataset', ['name', 'x_train', 'y_train', 'x_test', 'y_test'])
splited_datasets = []

for dataset_name, dataset in datasets.items():
    model = dict()
    model['name'] = dataset_name
    data_x = dataset.drop([target_feature_name], axis=1)
    data_y = dataset[target_feature_name]
    model['x_train'], model['x_test'], model['y_train'], model['y_test'] = train_test_split(data_x, data_y, test_size=test_size, random_state=seed)
    splited_datasets.append(SplittedDataset(model['name'], model['x_train'],  model['y_train'], model['x_test'], model['y_test']))

In [4]:
import keras.backend as K
import autokeras as ak
from sklearn.metrics import mean_squared_error
import numpy as np

def r2_score(y_true, y_pred):
    SS_res =  K.sum(K.square(y_true - y_pred))
    SS_tot = K.sum(K.square(y_true - K.mean(y_true)))
    r2 = 1 - SS_res/(SS_tot + K.epsilon())
    return r2

# Кастомная функция потерь на основе r2
def r2_loss(y_true, y_pred):
    return -r2_score(y_true, y_pred)

dataset = splited_datasets[-1]
clf = ak.StructuredDataRegressor(max_trials=1, project_name=f'models_for_{dataset.name}', loss=r2_loss, metrics=[r2_score])
clf.fit(dataset.x_train, dataset.y_train)
test_predict = clf.predict(dataset.x_test)
rmse = np.sqrt(mean_squared_error(dataset.y_test, test_predict))
r2 = r2_score(dataset.y_test, test_predict)
print(r2, rmse)


Using TensorFlow backend
Reloading Tuner from .\models_for_polynomial_11_interpolation\tuner0.json
INFO:tensorflow:Assets written to: .\models_for_polynomial_11_interpolation\best_model\assets


ValueError: Unable to restore custom object of type _tf_keras_metric. Please make sure that any custom layers are included in the `custom_objects` arg when calling `load_model()` and make sure that all layers implement `get_config` and `from_config`.

In [19]:
import autokeras as ak
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

#for dataset in datasets:
# TODO написать функцию r2 метрик - https://keras.io/api/metrics/
#  и функцию r2 потерь - https://keras.io/api/losses/
# Как то проверить, прогнать датасеты
def custom_r2_loss(y_true: pd.DataFrame, y_pred: pd.DataFrame):
    r2 = r2_score(y_true.to_numpy(), y_pred.to_numpy())
    # Преобразование коэффициента детерминации в функцию потерь
    return r2

dataset = splited_datasets[-1]
clf = ak.StructuredDataRegressor(max_trials=1, project_name=f'models_for_{dataset.name}', loss=custom_r2_loss)
clf.fit(dataset.x_train, dataset.y_train)
test_predict = clf.predict(dataset.x_test)
rmse = np.sqrt(mean_squared_error(dataset.y_test, test_predict))
r2 = r2_score(dataset.y_test, test_predict)
print(r2, rmse)

Reloading Tuner from .\models_for_polynomial_11_interpolation\tuner0.json


AttributeError: in user code:

    File "C:\Users\Daniil_Alenushkin\Desktop\SUAI\Magistracy_09.04.04\predicting_a_country_economic_potential\env\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\Daniil_Alenushkin\AppData\Local\Temp\ipykernel_46516\1708412979.py", line 8, in custom_r2_loss  *
        r2 = r2_score(y_true.to_numpy(), y_pred.to_numpy())

    AttributeError: 'Tensor' object has no attribute 'to_numpy'


In [10]:
import tensorflow as tf
import autokeras as ak
# Загрузить модель из файла
loaded_model = tf.keras.models.load_model(
    r'structured_data_regressor\best_model',
     custom_objects={'MultiCategoryEncoding': ak.MultiCategoryEncoding}
)
dataset = splited_datasets[-1]
loaded_model.fit(dataset.x_train, dataset.y_train)
test_predict = loaded_model.predict(dataset.x_test)
rmse = np.sqrt(mean_squared_error(dataset.y_test, test_predict))
r2 = r2_score(dataset.y_test, test_predict)
print(r2, rmse)


"""

# Получить информацию о модели
loaded_model.summary()
# Получить список слоев модели
layers = loaded_model.layers

# Пройтись по каждому слою и вывести его конфигурацию
for layer in layers:
    print(layer.get_config())
    print()
"""

-2.300531591882382 0.1093615226404732


'\n\n# Получить информацию о модели\nloaded_model.summary()\n# Получить список слоев модели\nlayers = loaded_model.layers\n\n# Пройтись по каждому слою и вывести его конфигурацию\nfor layer in layers:\n    print(layer.get_config())\n    print()\n'

In [8]:
param_grid_1d_gru = {
    'filters': [8, 32],
    'kernel_size': [1, 3],
    'units': [16, 64],
    'optimizer' : ["adam", "rmsprop"],
    'dropout_rate': [0.1, 0.2, 0.3],
    'gru_act': ['tanh', 'relu'],
    'conv_act': ['tanh', 'relu']
}

In [24]:
from keras import backend as K

def r2_score(y_true, y_pred):
    SS_res =  K.sum(K.square(y_true - y_pred))
    SS_tot = K.sum(K.square(y_true - K.mean(y_true)))
    return 1 - SS_res/(SS_tot + K.epsilon())

def create_model(filters, kernel_size, units, dropout_rate, optimizer_model, gru_act, conv_act):
    input_shape = (dataset.x_train.shape[1], 1)
    model = Sequential()
    model.add(Conv1D(filters=filters, kernel_size=kernel_size, activation=conv_act, input_shape=input_shape))
    model.add(GRU(units, input_shape=input_shape, activation=gru_act))
    model.add(Dense(1024, activation='relu', kernel_regularizer=l2(0.01)))
    model.add(Dense(512, activation='relu', kernel_regularizer=l2(0.01)))  # L2 регуляризация с коэффициентом 0.01
    model.add(Dense(256, activation='relu', kernel_regularizer=l2(0.01)))  # L2 регуляризация с коэффициентом 0.01
    model.add(Dense(128, activation='relu', kernel_regularizer=l2(0.01)))  # L2 регуляризация с коэффициентом 0.01
    model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.01)))  # L2 регуляризация с коэффициентом 0.01
    model.add(Dropout(rate=dropout_rate))
    model.add(Dense(32, activation='relu', kernel_regularizer=l2(0.01)))  # L2 регуляризация с коэффициентом 0.01
    model.add(Flatten())
    model.add(Dense(1))
    model.compile(optimizer=optimizer_model, loss='mean_squared_error', metrics=[r2_score])
    return model

from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Conv1D, MaxPooling1D, Flatten
from keras.regularizers import l2
from keras.optimizers import Adam

def create_advanced_model():
    model = Sequential()
    input_shape = (dataset.x_train.shape[1], 1)
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=input_shape))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(LSTM(128, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(64))
    model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.01)))
    model.add(Dropout(0.2))
    model.add(Dense(32, activation='relu', kernel_regularizer=l2(0.01)))
    model.add(Dense(1))
    optimizer = Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer, loss='mse', metrics=[r2_score])
    return model

dataset = splited_datasets[-1]
"""
regressor = KerasRegressor(build_fn=create_model, filters=32, kernel_size=3, units=64, dropout_rate=0.2, optimizer_model='adam', gru_act='tanh', conv_act='relu', epochs=20, batch_size=32)
"""
regressor = KerasRegressor(build_fn=create_advanced_model(), epochs=20, batch_size=32)




regressor.fit(dataset.x_train, dataset.y_train, validation_data = (dataset.x_test, dataset.y_test))
print("значение", regressor.score(dataset.x_test, dataset.y_test))
print()

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
значение -2.543741464733455



In [9]:
def create_model(filters, kernel_size, units, dropout_rate, optimizer_model, gru_act, conv_act):
    input_shape = (dataset.get('X_train').shape[1], 1)
    model = Sequential()
    model.add(Conv1D(filters=filters, kernel_size=kernel_size, activation=conv_act, input_shape=input_shape))
    model.add(GRU(units, input_shape=input_shape, activation=gru_act))
    model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.01)))  # L2 регуляризация с коэффициентом 0.01
    model.add(Dropout(rate=dropout_rate))
    model.add(Dense(32, activation='relu', kernel_regularizer=l2(0.01)))  # L2 регуляризация с коэффициентом 0.01
    model.add(Flatten())
    model.add(Dense(1))
    model.compile(optimizer=optimizer_model, loss='mean_squared_error')
    return model


for dataset in datasets:
    regressor = KerasRegressor(build_fn=create_model, filters=32, kernel_size=3, units=64, dropout_rate=0.2, optimizer_model='adam', gru_act='tanh', conv_act='relu', epochs=3, batch_size=32)




    grid = GridSearchCV(estimator=regressor, param_grid=param_grid_1d_gru, n_jobs=7)
    grid_result = grid.fit(dataset.get('X_train'), dataset.get('Y_train'), validation_data = (dataset.get('X_test'), dataset.get('Y_test')))
    print(f'dataset: {dataset.get("name")}')
    print("Best hyper parameters: ", grid_result.best_params_)
    print("MSE:", grid.score(dataset.get('X_test'), dataset.get('Y_test')))
    print()

Epoch 1/3
Epoch 2/3
Epoch 3/3
dataset: only_percent_dataset
Best hyper parameters:  {'conv_act': 'relu', 'dropout_rate': 0.1, 'filters': 8, 'gru_act': 'relu', 'kernel_size': 3, 'optimizer': 'rmsprop', 'units': 64}
MSE: 0.32211055795821064

Epoch 1/3
Epoch 2/3
Epoch 3/3
dataset: original_data
Best hyper parameters:   {'conv_act': 'tanh', 'dropout_rate': 0.2, 'filters': 8, 'gru_act': 'relu', 'kernel_size': 3, 'optimizer': 'adam', 'units': 64}
MSE: 0.3549903756538161


In [18]:
param_grid_1d_lstm = {
    'filters': [8, 32],
    'kernel_size': [1, 3],
    'units': [16, 64],
    'optimizer' : ["adam", "rmsprop"],
    'dropout_rate': [0.1, 0.2, 0.3],
    'lstm_act': ['tanh', 'relu'],
    'conv_act': ['tanh', 'relu']
}

In [19]:
def create_model(filters, kernel_size, units, dropout_rate, optimizer_model, lstm_act, conv_act):
    input_shape = (dataset.get('X_train').shape[1], 1)
    model = Sequential()
    model.add(Conv1D(filters=filters, kernel_size=kernel_size, activation=conv_act, input_shape=input_shape))
    model.add(LSTM(units, activation=lstm_act, input_shape=input_shape))
    model.add(Dropout(rate=dropout_rate))
    model.add(Flatten())
    model.add(Dense(1))
    model.compile(optimizer=optimizer_model, loss='mean_squared_error')
    return model


for dataset in datasets:
    regressor = KerasRegressor(build_fn=create_model, filters=32, kernel_size=3, units=64, dropout_rate=0.2, optimizer_model='adam', lstm_act='tanh', conv_act='relu', epochs=3, batch_size=32)
    grid = GridSearchCV(estimator=regressor, param_grid=param_grid_1d_lstm, n_jobs=7)
    grid_result = grid.fit(dataset.get('X_train'), dataset.get('Y_train'), validation_data = (dataset.get('X_test'), dataset.get('Y_test')))
    print(f'dataset: {dataset.get("name")}')
    print("Best hyper parameters: ", grid_result.best_params_)
    print("MSE:", grid.score(dataset.get('X_test'), dataset.get('Y_test')))
    print()


Epoch 1/3
Epoch 2/3
Epoch 3/3
dataset: only_percent_dataset
Best hyper parameters: {'conv_act': 'tanh', 'dropout_rate': 0.3, 'filters': 32, 'kernel_size': 3, 'lstm_act': 'relu', 'optimizer': 'rmsprop', 'units': 64}
MSE: 0.3658389029515866

Epoch 1/3
Epoch 2/3
Epoch 3/3
dataset: original_data
Best hyper parameters: {'conv_act': 'tanh', 'dropout_rate': 0.1, 'filters': 32, 'kernel_size': 1, 'lstm_act': 'relu', 'optimizer': 'adam', 'units': 64}
MSE: 0.32444082352535886


In [24]:
param_grid_gru_lstm = {
    'filters': [8, 32],
    'kernel_size': [1, 3],
    'units': [16, 64],
    'units_gru': [16, 64],
    'optimizer' : ["adam", "rmsprop"],
    'dropout_rate': [0.1, 0.2, 0.3],
    'lstm_act': ['relu'],
    'conv_act': ['relu'],
    'gru_act': ['relu']
}

In [25]:
def create_model(filters, kernel_size, units, units_gru, dropout_rate, optimizer_model, lstm_act, gru_act, conv_act):
    input_shape = (dataset.get('X_train').shape[1], 1)
    model = Sequential()
    model.add(Conv1D(filters=filters, kernel_size=kernel_size, activation=conv_act, input_shape=input_shape))
    model.add(LSTM(units, activation=lstm_act, input_shape=input_shape, return_sequences=True))
    model.add(GRU(units_gru, input_shape=input_shape, activation=gru_act))
    model.add(Dropout(rate=dropout_rate))
    model.add(Flatten())
    model.add(Dense(1))
    model.compile(optimizer=optimizer_model, loss='mean_squared_error')
    return model

for dataset in datasets:
    regressor = KerasRegressor(build_fn=create_model, filters=32, kernel_size=3, units=64, units_gru=64, dropout_rate=0.2, optimizer_model='adam', lstm_act='tanh', conv_act='relu', gru_act='relu', epochs=3, batch_size=32)
    grid = GridSearchCV(estimator=regressor, param_grid=param_grid_gru_lstm, n_jobs=7)
    grid_result = grid.fit(dataset.get('X_train'), dataset.get('Y_train'), validation_data = (dataset.get('X_test'), dataset.get('Y_test')))
    print(f'dataset: {dataset.get("name")}')
    print("Best hyper parameters: ", grid_result.best_params_)
    print("MSE:", grid.score(dataset.get('X_test'), dataset.get('Y_test')))
    print()

Epoch 1/3
Epoch 2/3
Epoch 3/3
dataset: only_percent_dataset
Best hyper parameters: {'conv_act': 'relu', 'dropout_rate': 0.1, 'filters': 32, 'gru_act': 'relu', 'kernel_size': 3, 'lstm_act': 'relu', 'optimizer': 'rmsprop', 'units': 64, 'units_gru': 16}
MSE: 0.32369279000082507

Epoch 1/3
Epoch 2/3
Epoch 3/3
dataset: original_data
Best hyper parameters: {'conv_act': 'relu', 'dropout_rate': 0.2, 'filters': 32, 'gru_act': 'relu', 'kernel_size': 3, 'lstm_act': 'relu', 'optimizer': 'adam', 'units': 64, 'units_gru': 64}
MSE: 0.37896665971742893
