In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer

from sklearn.ensemble import BaggingRegressor, BaggingClassifier, RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier

from sklearn.metrics import mean_squared_error, f1_score


In [2]:
# Генерируем уникальный seed
my_code = "Pyslar"
seed_limit = 2 ** 32
my_seed = int.from_bytes(my_code.encode(), "little") % seed_limit

In [3]:
example_data = pd.read_csv('data.csv')
print(example_data.head())

     X1     X2     X3      X4   X5  X6   X7  X8     Y1     Y2
0  0.98  514.5  294.0  110.25  7.0   2  0.0   0  15.55  21.33
1  0.98  514.5  294.0  110.25  7.0   3  0.0   0  15.55  21.33
2  0.98  514.5  294.0  110.25  7.0   4  0.0   0  15.55  21.33
3  0.98  514.5  294.0  110.25  7.0   5  0.0   0  15.55  21.33
4  0.90  563.5  318.5  122.50  7.0   2  0.0   0  20.84  28.28


In [4]:
# Определим размер валидационной и тестовой выборок
val_test_size = round(0.2*len(example_data))
print(val_test_size)

154


In [5]:
# Создадим обучающую, валидационную и тестовую выборки
random_state = my_seed
train_val, test = train_test_split(example_data, test_size=val_test_size, random_state=random_state)
train, val = train_test_split(train_val, test_size=val_test_size, random_state=random_state)
print(len(train), len(val), len(test))

460 154 154


In [6]:
# Значения в числовых столбцах преобразуем к отрезку [0,1].
# Для настройки скалировщика используем только обучающую выборку.
num_columns = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'Y1', 'Y2']

ct = ColumnTransformer(transformers=[('numerical', MinMaxScaler(), num_columns)], remainder='passthrough')
ct.fit(train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('numerical', MinMaxScaler(),
                                 ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7',
                                  'X8', 'Y1', 'Y2'])])

In [7]:
# Преобразуем значения, тип данных приводим к DataFrame
sc_train = pd.DataFrame(ct.transform(train))
sc_test = pd.DataFrame(ct.transform(test))
sc_val = pd.DataFrame(ct.transform(val))

In [8]:
# Устанавливаем названия столбцов
column_names = num_columns
sc_train.columns = column_names
sc_test.columns = column_names
sc_val.columns = column_names

In [9]:
# Выбираем 4 числовых переменных, три их них будут предикторами, одна - зависимой переменной
n = 10
labels = random.sample(num_columns, n)

y_label = labels[0]
x_labels = labels[1:]

print(x_labels)
print(y_label)
x_labels = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8']
y_label = 'Y1'

print(x_labels)
print(y_label)

['Y2', 'Y1', 'X3', 'X4', 'X8', 'X6', 'X5', 'X7', 'X1']
X2
['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8']
Y1


In [10]:
# Отберем необходимые параметры
x_train = sc_train[x_labels]
x_test = sc_test[x_labels]
x_val = sc_val[x_labels]

y_train = sc_train[y_label]
y_test = sc_test[y_label]
y_val = sc_val[y_label]

In [11]:
print(x_train)
print(y_train)

           X1        X2        X3        X4   X5        X6     X7   X8
0    0.111111  0.833333  0.428571  1.000000  0.0  0.000000  0.625  0.6
1    0.000000  1.000000  0.714286  1.000000  0.0  0.666667  0.625  0.2
2    0.055556  0.916667  0.571429  1.000000  0.0  0.333333  1.000  0.2
3    0.333333  0.583333  0.000000  1.000000  0.0  0.333333  0.625  0.6
4    0.000000  1.000000  0.714286  1.000000  0.0  1.000000  0.625  1.0
..        ...       ...       ...       ...  ...       ...    ...  ...
455  0.194444  0.750000  0.285714  1.000000  0.0  0.333333  1.000  0.4
456  0.250000  0.666667  0.142857  1.000000  0.0  1.000000  0.625  1.0
457  0.333333  0.583333  0.000000  1.000000  0.0  0.000000  0.625  0.2
458  0.000000  1.000000  0.714286  1.000000  0.0  0.000000  0.625  0.8
459  0.388889  0.500000  1.000000  0.111111  1.0  1.000000  0.625  1.0

[460 rows x 8 columns]
0      0.187703
1      0.244854
2      0.360780
3      0.162243
4      0.211809
         ...   
455    0.234832
456    0.168

In [12]:
#Gradient Boosting
reg = GradientBoostingRegressor(random_state=0)
reg.fit(x_train, y_train)
reg.get_params()

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'ls',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': 0,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [55]:
score = reg.score(x_val, y_val)
print(score)

0.9973721675702463


In [56]:
# Создадим нейроннную сеть для решения задачи регрессии на базе библиотеки sklearn
reg = MLPRegressor(alpha=0.0, batch_size=16, epsilon=1e-07, max_iter=50)
reg.get_params()

{'activation': 'relu',
 'alpha': 0.0,
 'batch_size': 16,
 'beta_1': 0.9,
 'beta_2': 0.999,
 'early_stopping': False,
 'epsilon': 1e-07,
 'hidden_layer_sizes': (100,),
 'learning_rate': 'constant',
 'learning_rate_init': 0.001,
 'max_fun': 15000,
 'max_iter': 50,
 'momentum': 0.9,
 'n_iter_no_change': 10,
 'nesterovs_momentum': True,
 'power_t': 0.5,
 'random_state': None,
 'shuffle': True,
 'solver': 'adam',
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': False,
 'warm_start': False}

In [57]:
reg.fit(x_train, y_train)

MLPRegressor(alpha=0.0, batch_size=16, epsilon=1e-07, max_iter=50)

In [58]:
score = reg.score(x_val, y_val)
print(score)

0.9367744834139777
