# 1 Загрузка пакетов и библиотек

In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import BaggingRegressor
from sklearn.feature_selection import SelectKBest, mutual_info_regression, f_classif, chi2
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor


# 2 Загрузить данные из указанного источника

In [4]:
# Считываем данные по варианту
initial_data_1 = pd.read_csv('V6.csv')
# Отбрасываем первый столбец, который содержит индексы записей (за ненадобностью)
initial_data_2 = initial_data_1.iloc[:,1:]
# Удаляем столбец, в соответсвии с заданием
data = initial_data_2.drop(columns = ['total_UPDRS'], axis = 1) 

In [5]:
data.drop(columns = ['index','subject#','Jitter(Abs)','Jitter:PPQ5'], axis = 1, inplace=True) 

In [6]:
data

Unnamed: 0,age,sex,test_time,motor_UPDRS,Jitter(%),Jitter:RAP,Jitter:DDP,Shimmer,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA,NHR,HNR,RPDE,DFA,PPE
0,72,0,5.6431,28.199,0.00662,0.00401,0.01204,0.02565,0.230,0.01438,0.01309,0.01662,0.04314,0.014290,21.640,0.41888,0.54842,0.16006
1,72,0,12.6660,28.447,0.00300,0.00132,0.00395,0.02024,0.179,0.00994,0.01072,0.01689,0.02982,0.011112,27.183,0.43493,0.56477,0.10810
2,72,0,19.6810,28.695,0.00481,0.00205,0.00616,0.01675,0.181,0.00734,0.00844,0.01458,0.02202,0.020220,23.047,0.46222,0.54405,0.21014
3,72,0,25.6470,28.905,0.00528,0.00191,0.00573,0.02309,0.327,0.01106,0.01265,0.01963,0.03317,0.027837,24.445,0.48730,0.57794,0.33277
4,72,0,33.6420,29.187,0.00335,0.00093,0.00278,0.01703,0.176,0.00679,0.00929,0.01819,0.02036,0.011625,26.126,0.47188,0.56122,0.19361
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5870,61,0,142.7900,22.485,0.00406,0.00167,0.00500,0.01896,0.160,0.00973,0.01133,0.01549,0.02920,0.025137,22.369,0.64215,0.55314,0.21367
5871,61,0,149.8400,21.988,0.00297,0.00119,0.00358,0.02315,0.215,0.01052,0.01277,0.01904,0.03157,0.011927,22.886,0.52598,0.56518,0.12621
5872,61,0,156.8200,21.495,0.00349,0.00152,0.00456,0.02499,0.244,0.01371,0.01456,0.01877,0.04112,0.017701,25.065,0.47792,0.57888,0.14157
5873,61,0,163.7300,21.007,0.00281,0.00128,0.00383,0.01484,0.131,0.00693,0.00870,0.01307,0.02078,0.007984,24.422,0.56865,0.56327,0.14204


# 3 Формируем 4 набора данных для исследований:

## Формируем набор из существенных признаков

In [7]:
new_columns = data.iloc[:, :4]
lust_column = data['HNR']
data_essential_features = pd.concat([new_columns, lust_column], axis=1)
data_essential_features

Unnamed: 0,age,sex,test_time,motor_UPDRS,HNR
0,72,0,5.6431,28.199,21.640
1,72,0,12.6660,28.447,27.183
2,72,0,19.6810,28.695,23.047
3,72,0,25.6470,28.905,24.445
4,72,0,33.6420,29.187,26.126
...,...,...,...,...,...
5870,61,0,142.7900,22.485,22.369
5871,61,0,149.8400,21.988,22.886
5872,61,0,156.8200,21.495,25.065
5873,61,0,163.7300,21.007,24.422


## Преобразуем исходный набор

In [8]:
# Преобразовываем исходный набор
scaler = MinMaxScaler(feature_range=(0, 1))
rescaled_data = pd.DataFrame(scaler.fit_transform(data))
rescaled_data_essential_features = pd.DataFrame(scaler.fit_transform(data_essential_features))
rescaled_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,0.734694,0.0,0.045076,0.671862,0.058390,0.064324,0.064433,0.085062,0.098030,0.079287,0.067543,0.051764,0.079267,0.018723,0.551717,0.328638,0.097793,0.194544
1,0.734694,0.0,0.077034,0.679056,0.021884,0.017305,0.017303,0.064691,0.073522,0.051720,0.053186,0.052753,0.051699,0.014474,0.704771,0.348330,0.144300,0.121335
2,0.734694,0.0,0.108957,0.686250,0.040137,0.030065,0.030178,0.051549,0.074483,0.035577,0.039375,0.044291,0.035556,0.026651,0.590568,0.381812,0.085362,0.265104
3,0.734694,0.0,0.136105,0.692342,0.044877,0.027618,0.027673,0.075423,0.144642,0.058674,0.064878,0.062791,0.058632,0.036834,0.629169,0.412583,0.181761,0.437884
4,0.734694,0.0,0.172487,0.700522,0.025413,0.010488,0.010486,0.052604,0.072081,0.032162,0.044524,0.057515,0.032121,0.015160,0.675585,0.393664,0.134202,0.241814
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5870,0.510204,0.0,0.669173,0.506111,0.032574,0.023422,0.023420,0.059871,0.064392,0.050416,0.056882,0.047624,0.050416,0.033224,0.571847,0.602569,0.111219,0.270078
5871,0.510204,0.0,0.701255,0.491694,0.021581,0.015032,0.015147,0.075649,0.090822,0.055321,0.065605,0.060629,0.055321,0.015563,0.586122,0.460040,0.145466,0.146851
5872,0.510204,0.0,0.733018,0.477393,0.026825,0.020801,0.020856,0.082577,0.104757,0.075127,0.076448,0.059640,0.075086,0.023283,0.646289,0.401075,0.184435,0.168492
5873,0.510204,0.0,0.764462,0.463237,0.019968,0.016605,0.016604,0.044357,0.050457,0.033031,0.040950,0.038759,0.032990,0.010292,0.628534,0.512392,0.140033,0.169155


In [9]:
rescaled_data_essential_features

Unnamed: 0,0,1,2,3,4
0,0.734694,0.0,0.045076,0.671862,0.551717
1,0.734694,0.0,0.077034,0.679056,0.704771
2,0.734694,0.0,0.108957,0.686250,0.590568
3,0.734694,0.0,0.136105,0.692342,0.629169
4,0.734694,0.0,0.172487,0.700522,0.675585
...,...,...,...,...,...
5870,0.510204,0.0,0.669173,0.506111,0.571847
5871,0.510204,0.0,0.701255,0.491694,0.586122
5872,0.510204,0.0,0.733018,0.477393,0.646289
5873,0.510204,0.0,0.764462,0.463237,0.628534


## Разделяем каждый набор данных на обучающую и тестовую выборки

In [10]:
test_size = 0.2
seed = 7

data_X = data.drop('motor_UPDRS', axis=1)
data_Y = data['motor_UPDRS']

data_X_train, data_X_test, data_Y_train, data_Y_test =\
train_test_split(data_X, data_Y, test_size=test_size, random_state=seed)


data_essential_features_X = data_essential_features.drop('motor_UPDRS', axis=1)
data_essential_features_Y = data_essential_features['motor_UPDRS']

data_essential_features_X_train, data_essential_features_X_test, data_essential_features_Y_train, data_essential_features_Y_test =\
train_test_split(data_essential_features_X, data_essential_features_Y, test_size=test_size, random_state=seed)



In [11]:
test_size = 0.2
seed = 7

rescaled_data_X = rescaled_data.drop(3, axis=1)
rescaled_data_Y = rescaled_data[3]

rescaled_data_X_train, rescaled_data_X_test, rescaled_data_Y_train, rescaled_data_Y_test =\
train_test_split(rescaled_data_X, rescaled_data_Y, test_size=test_size, random_state=seed)

rescaled_data_essential_features_X = rescaled_data_essential_features.drop(3, axis=1)
rescaled_data_essential_features_Y = rescaled_data_essential_features[3]

rescaled_data_essential_features_X_train, rescaled_data_essential_features_X_test, rescaled_data_essential_features_Y_train, rescaled_data_essential_features_Y_test =\
train_test_split(rescaled_data_essential_features_X, rescaled_data_essential_features_Y, test_size=test_size, random_state=seed)

## Формируем датасет из наборов данных

In [12]:
datasets = [{'X_train': data_X_train, 'X_test': data_X_test, 'Y_train': data_Y_train,'Y_test': data_Y_test, 'name': 'original_data'},
            {'X_train': rescaled_data_X_train, 'X_test': rescaled_data_X_test,
             'Y_train': rescaled_data_Y_train, 'Y_test': rescaled_data_Y_test, 'name': 'original_rescaled_data'},
            {'X_train': data_essential_features_X_train, 'X_test': data_essential_features_X_test, 'Y_train': data_essential_features_Y_train,
             'Y_test': data_essential_features_Y_test, 'name': 'essential_data'},
            {'X_train': rescaled_data_essential_features_X_train, 'X_test': rescaled_data_essential_features_X_test,
             'Y_train': rescaled_data_essential_features_Y_train, 'Y_test': rescaled_data_essential_features_Y_test, 'name': 'rescaled_essential_data'}]

## Определим гиперпараметр для CART



In [22]:
param_grid = {'max_depth': [i for i in range(1, 32)], # максимальная глубина дерева
  'min_samples_split': [i for i in range(2, 10)], # минимальное число объектов в узле для разделения
  'min_samples_leaf': [i for i in range(1, 10)]} # минимальное число объектов в листе

## Реализация CART

In [23]:
for dataset in datasets:
    print(f'Analyse for {dataset.get("name")}')
    tree_reg = DecisionTreeRegressor()
    grid_search = GridSearchCV(tree_reg, param_grid)

    grid_search.fit(dataset.get('X_train'), dataset.get('Y_train'))

    best_params_for_data = grid_search.best_params_
    print(f"Best hyper parameters for {dataset.get('name')}: {best_params_for_data}")

    tree_reg_train_data = DecisionTreeRegressor(max_depth=best_params_for_data.get('max_depth'),
                                                min_samples_split=best_params_for_data.get('min_samples_split'),
                                                min_samples_leaf=best_params_for_data.get('min_samples_leaf'),
                                                random_state=42)
    tree_reg_train_data.fit(dataset.get('X_train'), dataset.get('Y_train'))
    train_Y_pred = tree_reg_train_data.predict(dataset.get('X_train'))
    rmse_train = np.sqrt(mean_squared_error(dataset.get('Y_train'), train_Y_pred))
    r2_train = r2_score(dataset.get('Y_train'), train_Y_pred)
    print(f'RMSE for CART for train {dataset.get("name")}: ', rmse_train)
    print(f'R2_score for CART for train {dataset.get("name")}: ', r2_train)

    tree_reg_test_data = DecisionTreeRegressor(max_depth=best_params_for_data.get('max_depth'),
                                                min_samples_split=best_params_for_data.get('min_samples_split'),
                                                min_samples_leaf=best_params_for_data.get('min_samples_leaf'),
                                                random_state=42)
    tree_reg_test_data.fit(dataset.get('X_test'), dataset.get('Y_test'))
    test_Y_pred = tree_reg_test_data.predict(dataset.get('X_test'))
    rmse_test = np.sqrt(mean_squared_error(dataset.get('Y_test'), test_Y_pred))
    r2_test = r2_score(dataset.get('Y_test'), test_Y_pred)
    print(f'RMSE for CART for test {dataset.get("name")}: ', rmse_test)
    print(f'R2_score for CART for test {dataset.get("name")}: ', r2_test)
    print()

Best hyper parameters for original_data: {'max_depth': 14, 'min_samples_leaf': 1, 'min_samples_split': 8}
RMSE for CART for train original_data:  1.0340249342527092
R2_score for CART for train original_data:  0.9838030666405236
RMSE for CART for test original_data:  1.2545230959707372
R2_score for CART for test original_data:  0.9762596440304483

Best hyper parameters for original_rescaled_data: {'max_depth': 14, 'min_samples_leaf': 1, 'min_samples_split': 9}
RMSE for CART for train original_rescaled_data:  0.030583890188743006
R2_score for CART for train original_rescaled_data:  0.9831607811393913
RMSE for CART for test original_rescaled_data:  0.037589455683661066
R2_score for CART for test original_rescaled_data:  0.9746704370838812

Best hyper parameters for essential_data: {'max_depth': 14, 'min_samples_leaf': 4, 'min_samples_split': 6}
RMSE for CART for train essential_data:  1.3045278385176664
R2_score for CART for train essential_data:  0.9742203227849202
RMSE for CART for test

## Реализация Бэггинга на основе CART


In [29]:
param_grid = {'n_estimators': [100],
              'max_samples': [0.5,0.75,0.99],
              'base_estimator__max_depth': [14,16]}


In [32]:
for dataset in datasets:
    tree_reg = DecisionTreeRegressor()
    bagging_model = BaggingRegressor(tree_reg)
    grid_search = GridSearchCV(bagging_model, param_grid)
    grid_search.fit(dataset.get('X_train'), dataset.get('Y_train'))

    best_params_for_data = grid_search.best_params_
    print(f"Best hyper parameters for {dataset.get('name')}: {best_params_for_data}")
    tree_reg_train_data = DecisionTreeRegressor(max_depth=best_params_for_data.get('max_depth'),
                                                random_state=42)

    bagging_model_train_data = BaggingRegressor(tree_reg_train_data,
                                                n_estimators=best_params_for_data.get('n_estimators'),
                                                max_samples=best_params_for_data.get('max_samples'))
    bagging_model_train_data.fit(dataset.get('X_train'), dataset.get('Y_train'))
    train_Y_pred = bagging_model_train_data.predict(dataset.get('X_train'))
    rmse_train = np.sqrt(mean_squared_error(dataset.get('Y_train'), train_Y_pred))
    r2_train = r2_score(dataset.get('Y_train'), train_Y_pred)
    print(f'RMSE for Bagging based on CART for train {dataset.get("name")}: ', rmse_train)
    print(f'R2_score Bagging based on for CART for train {dataset.get("name")}: ', r2_train)

    tree_reg_test_data = DecisionTreeRegressor(max_depth=best_params_for_data.get('max_depth'),
                                                random_state=42)

    bagging_model_test_data = BaggingRegressor(tree_reg_test_data,
                                                n_estimators=best_params_for_data.get('n_estimators'),
                                                max_samples=best_params_for_data.get('max_samples'))
    bagging_model_test_data.fit(dataset.get('X_test'), dataset.get('Y_test'))
    test_Y_pred = bagging_model_test_data.predict(dataset.get('X_test'))
    rmse_test = np.sqrt(mean_squared_error(dataset.get('Y_test'), test_Y_pred))
    r2_test = r2_score(dataset.get('Y_test'), test_Y_pred)
    print(f'RMSE for Bagging based on CART for test {dataset.get("name")}: ', rmse_test)
    print(f'R2_score Bagging based on for CART for test {dataset.get("name")}: ', r2_test)
    print()

Best hyper parameters for original_data: {'base_estimator__max_depth': 16, 'max_samples': 0.99, 'n_estimators': 100}
RMSE for Bagging based on CART for train original_data:  0.5916471729035842
R2_score Bagging based on for CART for train original_data:  0.9946973085191978
RMSE for Bagging based on CART for test original_data:  1.1447532632484791
R2_score Bagging based on for CART for test original_data:  0.9802324118239415

Best hyper parameters for original_rescaled_data: {'base_estimator__max_depth': 16, 'max_samples': 0.99, 'n_estimators': 100}
RMSE for Bagging based on CART for train original_rescaled_data:  0.017013810322940442
R2_score Bagging based on for CART for train original_rescaled_data:  0.9947887769308449
RMSE for Bagging based on CART for test original_rescaled_data:  0.033318858290326665
R2_score Bagging based on for CART for test original_rescaled_data:  0.9800989567004258

Best hyper parameters for essential_data: {'base_estimator__max_depth': 16, 'max_samples': 0.75

## Выводы: Наилучшей предиктивной моделью, найденной в ЛР№1 была полиномиальная регрессия 2 степени, на исходном наборе данных, которая имела на тестовом наборе данных R^2 равное 0.8225577487247453. В Лабораторной работе №2 лучшей моделью стала Бэггинг модель на основе CART  на масштабированном наборе данных из существенных признаков наборе данных с оценкой на тестовом наборе данных, с R^2 равной 0.9883437576693822, что лучше, чем у полиномиальной регрессии 2 степени, на исходном наборе данных из ЛР№1