Загружаем необходимые библиотеки

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.svm import SVR

Загружаем данные

In [2]:
pre_data = pd.read_csv('datasets/V1.csv')
data = pre_data.iloc [: , 1:]

Формируем 4 набора данных для исследований:

Получаем исходный набор данных:

In [3]:
original_sub_data = data.copy()
data_address = [i.split()[-2] for i in data['Address']]
data_email = [i.split('@')[-1] for i in data['Email']]
original_sub_data['Address'] = data_address
original_sub_data['Email'] = data_email
ordinal_encoder = OrdinalEncoder()
avatar_encoded = ordinal_encoder.fit_transform(original_sub_data['Avatar'].values.reshape(-1,1))
address_encoded = ordinal_encoder.fit_transform(original_sub_data['Address'].values.reshape(-1,1))
email_encoded = ordinal_encoder.fit_transform(original_sub_data['Email'].values.reshape(-1,1))
original_sub_data['Avatar'] = avatar_encoded
original_sub_data['Address'] = address_encoded
original_sub_data['Email'] = email_encoded
original_data = original_sub_data.drop('Time on Website', axis=1)
original_data

Unnamed: 0,Email,Address,Avatar,Avg. Session Length,Time on App,Length of Membership,Yearly Amount Spent
0,58.0,29.0,132.0,34.497268,12.655651,4.082621,587.951054
1,93.0,8.0,25.0,31.926272,11.109461,2.664034,392.204933
2,241.0,11.0,6.0,33.000915,11.330278,4.104543,487.547505
3,73.0,43.0,114.0,34.305557,13.717514,3.120179,581.852344
4,43.0,47.0,80.0,33.330673,12.795189,4.446308,599.406092
...,...,...,...,...,...,...,...
495,40.0,54.0,127.0,33.237660,13.566160,3.746573,573.847438
496,73.0,8.0,104.0,34.702529,11.695736,3.576526,529.049004
497,93.0,52.0,18.0,32.646777,11.499409,4.958264,551.620145
498,93.0,53.0,128.0,33.322501,12.391423,2.336485,456.469510


Получаем преобразованный набор исходных данных:

In [4]:
scaler = MinMaxScaler(feature_range=(0, 1))
original_rescaled_data = pd.DataFrame(scaler.fit_transform(original_data))
original_rescaled_data

Unnamed: 0,0,1,2,3,4,5,6
0,0.238683,0.475410,0.963504,0.751425,0.626620,0.573101,0.651040
1,0.382716,0.131148,0.182482,0.362306,0.393016,0.359869,0.266355
2,0.991770,0.180328,0.043796,0.524953,0.426378,0.576396,0.453725
3,0.300412,0.704918,0.832117,0.722409,0.787050,0.428434,0.639055
4,0.176955,0.770492,0.583942,0.574861,0.647702,0.627768,0.673552
...,...,...,...,...,...,...,...
495,0.164609,0.885246,0.927007,0.560784,0.764183,0.522589,0.623324
496,0.300412,0.131148,0.759124,0.782491,0.481592,0.497028,0.535285
497,0.382716,0.852459,0.131387,0.471354,0.451931,0.704722,0.579642
498,0.382716,0.868852,0.934307,0.573625,0.586699,0.310634,0.392650


Получаем исходный набор данных из существенных признаков:

In [5]:
essential_sub_data = data.iloc[:,4:8]
essential_data = essential_sub_data.drop('Time on Website', axis=1)
essential_data

Unnamed: 0,Time on App,Length of Membership,Yearly Amount Spent
0,12.655651,4.082621,587.951054
1,11.109461,2.664034,392.204933
2,11.330278,4.104543,487.547505
3,13.717514,3.120179,581.852344
4,12.795189,4.446308,599.406092
...,...,...,...
495,13.566160,3.746573,573.847438
496,11.695736,3.576526,529.049004
497,11.499409,4.958264,551.620145
498,12.391423,2.336485,456.469510


Получаем преобразованный набор из исходных данных из существенных признаков:

In [6]:
rescaled_essential_data = pd.DataFrame(scaler.fit_transform(essential_data))
rescaled_essential_data

Unnamed: 0,0,1,2
0,0.626620,0.573101,0.651040
1,0.393016,0.359869,0.266355
2,0.426378,0.576396,0.453725
3,0.787050,0.428434,0.639055
4,0.647702,0.627768,0.673552
...,...,...,...
495,0.764183,0.522589,0.623324
496,0.481592,0.497028,0.535285
497,0.451931,0.704722,0.579642
498,0.586699,0.310634,0.392650


Разделяем каждый набор данных на обучающую и тестовую выборки

In [7]:
test_size = 0.2
seed = 7

original_data_X = original_data.iloc[:, :6]
original_data_Y = original_data["Yearly Amount Spent"]

original_data_X_train, original_data_X_test, original_data_Y_train, original_data_Y_test = \
    train_test_split(original_data_X, original_data_Y, test_size=test_size, random_state=seed)


original_rescaled_data_X = original_rescaled_data.iloc[:, :6]
original_rescaled_data_Y = original_rescaled_data[6]

original_rescaled_data_X_train, original_rescaled_data_X_test, original_rescaled_data_Y_train, original_rescaled_data_Y_test = \
    train_test_split(original_rescaled_data_X, original_rescaled_data_Y, test_size=test_size, random_state=seed)


essential_data_X = essential_data.iloc[:, :2]
essential_data_Y = essential_data["Yearly Amount Spent"]

essential_data_X_train, essential_data_X_test, essential_data_Y_train, essential_data_Y_test = \
    train_test_split(essential_data_X, essential_data_Y, test_size=test_size, random_state=seed)


rescaled_essential_data_X = rescaled_essential_data.iloc[:, :2]
rescaled_essential_data_Y = rescaled_essential_data[2]

rescaled_essential_data_X_train, rescaled_essential_data_X_test, rescaled_essential_data_Y_train, rescaled_essential_data_Y_test = \
    train_test_split(rescaled_essential_data_X, rescaled_essential_data_Y, test_size=test_size, random_state=seed)

datasets = [{'X_train': original_data_X_train, 'X_test': original_data_X_test, 'Y_train': original_data_Y_train,
             'Y_test': original_data_Y_test, 'name': 'original_data'},
            {'X_train': original_rescaled_data_X_train, 'X_test': original_rescaled_data_X_test,
             'Y_train': original_rescaled_data_Y_train, 'Y_test': original_rescaled_data_Y_test, 'name': 'original_rescaled_data'},
            {'X_train': essential_data_X_train, 'X_test': essential_data_X_test, 'Y_train': essential_data_Y_train,
             'Y_test': essential_data_Y_test, 'name': 'essential_data'},
            {'X_train': rescaled_essential_data_X_train, 'X_test': rescaled_essential_data_X_test,
             'Y_train': rescaled_essential_data_Y_train, 'Y_test': rescaled_essential_data_Y_test, 'name': 'rescaled_essential_data'}]

Определим гиперпараметры для SVR

In [8]:
param_grid = {'C': [i for i in np.linspace(0.1, 10, num=10)],
              'kernel': ['linear', 'poly']}

Реализация SVR

In [9]:
for dataset in datasets:
    svr_model = SVR()
    grid_search = GridSearchCV(svr_model, param_grid)

    grid_search.fit(dataset.get('X_train'), dataset.get('Y_train'))

    best_params_for_data = grid_search.best_params_
    print(f"Best hyper parameters for {dataset.get('name')}: {best_params_for_data}")

    svr_model_train_data = SVR(C=best_params_for_data.get('C'),
                               kernel=best_params_for_data.get('kernel'))
    svr_model_train_data.fit(dataset.get('X_train'), dataset.get('Y_train'))
    train_Y_pred = svr_model_train_data.predict(dataset.get('X_train'))
    rmse_train = np.sqrt(mean_squared_error(dataset.get('Y_train'), train_Y_pred))
    r2_train = r2_score(dataset.get('Y_train'), train_Y_pred)
    print(f'RMSE for SVR for train {dataset.get("name")}: ', rmse_train)
    print(f'R2_score for SVR for train {dataset.get("name")}: ', r2_train)

    svr_model_test_data = SVR(C=best_params_for_data.get('C'),
                              kernel=best_params_for_data.get('kernel'))
    svr_model_test_data.fit(dataset.get('X_test'), dataset.get('Y_test'))
    test_Y_pred = svr_model_test_data.predict(dataset.get('X_test'))
    rmse_test = np.sqrt(mean_squared_error(dataset.get('Y_test'), test_Y_pred))
    r2_test = r2_score(dataset.get('Y_test'), test_Y_pred)
    print(f'RMSE for SVR for test {dataset.get("name")}: ', rmse_test)
    print(f'R2_score for SVR for test {dataset.get("name")}: ', r2_test)
    print()

Best hyper parameters for original_data: {'C': 10.0, 'kernel': 'linear'}
RMSE for SVR for train original_data:  9.836194670284913
R2_score for SVR for train original_data:  0.9846637719793999
RMSE for SVR for test original_data:  10.047885185149736
R2_score for SVR for test original_data:  0.9835824211779726

Best hyper parameters for original_rescaled_data: {'C': 1.2000000000000002, 'kernel': 'linear'}
RMSE for SVR for train original_rescaled_data:  0.03840738915841269
R2_score for SVR for train original_rescaled_data:  0.9394561776023781
RMSE for SVR for test original_rescaled_data:  0.050776377337436306
R2_score for SVR for test original_rescaled_data:  0.8914426717617145

Best hyper parameters for essential_data: {'C': 7.800000000000001, 'kernel': 'linear'}
RMSE for SVR for train essential_data:  27.301193423611483
R2_score for SVR for train essential_data:  0.8818516692637637
RMSE for SVR for test essential_data:  27.13958726122749
R2_score for SVR for test essential_data:  0.8802

Определим гиперпараметры для бэггинга

In [10]:
param_grid = {'n_estimators': [300],
              'max_samples': [i for i in range(49, 100, 25)],
              'estimator__C': [10, 1.2, 7.8],
              'estimator__kernel': ['linear']}

Реализация Бэггинга на основе SVR

In [11]:
for dataset in datasets:
    svr_model = SVR()
    bagging_model = BaggingRegressor(svr_model)
    grid_search = GridSearchCV(bagging_model, param_grid)
    grid_search.fit(dataset.get('X_train'), dataset.get('Y_train'))

    best_params_for_data = grid_search.best_params_
    print(f"Best hyper parameters for {dataset.get('name')}: {best_params_for_data}")
    svr_model_train_data = SVR(C=best_params_for_data.get('estimator__C'),
                               kernel=best_params_for_data.get('estimator__kernel'))

    bagging_model_train_data = BaggingRegressor(svr_model_train_data,
                                                n_estimators=best_params_for_data.get('n_estimators'),
                                                max_samples=best_params_for_data.get('max_samples'))
    bagging_model_train_data.fit(dataset.get('X_train'), dataset.get('Y_train'))
    train_Y_pred = bagging_model_train_data.predict(dataset.get('X_train'))
    rmse_train = np.sqrt(mean_squared_error(dataset.get('Y_train'), train_Y_pred))
    r2_train = r2_score(dataset.get('Y_train'), train_Y_pred)
    print(f'RMSE for Bagging based on SVR for train {dataset.get("name")}: ', rmse_train)
    print(f'R2_score Bagging based on for SVR for train {dataset.get("name")}: ', r2_train)

    svr_model_test_data = SVR(C=best_params_for_data.get('estimator__C'),
                              kernel=best_params_for_data.get('estimator__kernel'))
    bagging_model_test_data = BaggingRegressor(svr_model_test_data,
                                                n_estimators=best_params_for_data.get('n_estimators'),
                                                max_samples=best_params_for_data.get('max_samples'))
    bagging_model_test_data.fit(dataset.get('X_test'), dataset.get('Y_test'))
    test_Y_pred = bagging_model_test_data.predict(dataset.get('X_test'))
    rmse_test = np.sqrt(mean_squared_error(dataset.get('Y_test'), test_Y_pred))
    r2_test = r2_score(dataset.get('Y_test'), test_Y_pred)
    print(f'RMSE for Bagging based on SVR for test {dataset.get("name")}: ', rmse_test)
    print(f'R2_score for  Bagging based on SVR for test {dataset.get("name")}: ', r2_test)
    print()

Best hyper parameters for original_data: {'estimator__C': 10, 'estimator__kernel': 'linear', 'max_samples': 99, 'n_estimators': 300}
RMSE for Bagging based on SVR for train original_data:  9.871953146579088
R2_score Bagging based on for SVR for train original_data:  0.9845520627278531
RMSE for Bagging based on SVR for test original_data:  10.061185524952004
R2_score for  Bagging based on SVR for test original_data:  0.9835389286631412

Best hyper parameters for original_rescaled_data: {'estimator__C': 10, 'estimator__kernel': 'linear', 'max_samples': 99, 'n_estimators': 300}
RMSE for Bagging based on SVR for train original_rescaled_data:  0.04806581372840694
R2_score Bagging based on for SVR for train original_rescaled_data:  0.9051771809034665
RMSE for Bagging based on SVR for test original_rescaled_data:  0.052831704180885175
R2_score for  Bagging based on SVR for test original_rescaled_data:  0.8824764340814361

Best hyper parameters for essential_data: {'estimator__C': 10, 'estimat

Выводы: Наилучшей предиктивной моделью, найденной в ЛР№1 была полиномиальная регрессия 2 степени, на исходном наборе данных, которая имела на тестовом наборе данных R^2 равное 0.981921257183405. В Лабораторной работе №2 лучшей моделью стала SVC на исходном наборе данных с оценкой на тестовом наборе данных, с R^2 равной 0.9835824211779726, что лучше, чем у полиномиальной регрессии 2 степени, на исходном наборе данных из ЛР№1