# Задание

- Выберите набор данных (датасет) для решения задачи классификации или регресии.
- В случае необходимости проведите удаление или заполнение пропусков и кодирование категориальных признаков.
- С использованием метода train_test_split разделите выборку на обучающую и тестовую.
- Обучите следующие ансамблевые модели:
  - две модели группы бэггинга (бэггинг или случайный лес или сверхслучайные деревья);
  - AdaBoost;
  - градиентный бустинг.
- Оцените качество моделей с помощью одной из подходящих для задачи метрик.Сравните качество полученных моделей.

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(style="ticks")

In [2]:
data = pd.read_csv('data/Car_sales.csv')

In [3]:
data.shape, data.Price_in_thousands.shape

((157, 16), (157,))

In [4]:
data.dtypes

Manufacturer            object
Model                   object
Sales_in_thousands     float64
__year_resale_value    float64
Vehicle_type            object
Price_in_thousands     float64
Engine_size            float64
Horsepower             float64
Wheelbase              float64
Width                  float64
Length                 float64
Curb_weight            float64
Fuel_capacity          float64
Fuel_efficiency        float64
Latest_Launch           object
Power_perf_factor      float64
dtype: object

In [5]:
data.isnull().sum()

Manufacturer            0
Model                   0
Sales_in_thousands      0
__year_resale_value    36
Vehicle_type            0
Price_in_thousands      2
Engine_size             1
Horsepower              1
Wheelbase               1
Width                   1
Length                  1
Curb_weight             2
Fuel_capacity           1
Fuel_efficiency         3
Latest_Launch           0
Power_perf_factor       2
dtype: int64

In [6]:
data = data.dropna(axis=0, how='any')
(data.shape, data.Price_in_thousands.shape)

((117, 16), (117,))

In [7]:
data.head()


Unnamed: 0,Manufacturer,Model,Sales_in_thousands,__year_resale_value,Vehicle_type,Price_in_thousands,Engine_size,Horsepower,Wheelbase,Width,Length,Curb_weight,Fuel_capacity,Fuel_efficiency,Latest_Launch,Power_perf_factor
0,Acura,Integra,16.919,16.36,Passenger,21.5,1.8,140.0,101.2,67.3,172.4,2.639,13.2,28.0,2/2/2012,58.28015
1,Acura,TL,39.384,19.875,Passenger,28.4,3.2,225.0,108.1,70.3,192.9,3.517,17.2,25.0,6/3/2011,91.370778
3,Acura,RL,8.588,29.725,Passenger,42.0,3.5,210.0,114.6,71.4,196.6,3.85,18.0,22.0,3/10/2011,91.389779
4,Audi,A4,20.397,22.255,Passenger,23.99,1.8,150.0,102.6,68.2,178.0,2.998,16.4,27.0,10/8/2011,62.777639
5,Audi,A6,18.78,23.555,Passenger,33.95,2.8,200.0,108.7,76.1,192.0,3.561,18.5,22.0,8/9/2011,84.565105


### Кодирование категориальных признаков

In [8]:
Vehicle_type_dict = {'Passenger': 0, 'Car': 1}
# ['Acura', 'Audi', 'BMW', 'Buick', 'Cadillac', 'Chevrolet',
#        'Chrysler', 'Dodge', 'Ford', 'Honda', 'Hyundai', 'Infiniti',
#        'Jeep', 'Lexus', 'Lincoln', 'Mitsubishi', 'Mercury', 'Mercedes-B',
#        'Nissan', 'Oldsmobile', 'Plymouth', 'Pontiac', 'Porsche', 'Saturn',
#        'Toyota', 'Volkswagen']
Manufacturer_type_dict  = {
    'Acure': 0, 'Audi': 1, 'BMW': 2, 'Buick': 3, 'Cadillac': 4, 'Chevrolet': 5,
    'Chrysler': 6, 'Dodge': 7, 'Ford': 8, 'Honda': 9, 'Hyundai': 10, 'Infiniti': 11,
    'Jeep': 12, 'Lexus': 13, 'Lincoln': 14, 'Mitsubishi': 15, 'Mercury': 16, 'Mercedes-B': 17,
    'Nissan': 18, 'Oldsmobile': 19, 'Plymouth': 20, 'Pontiac': 21, 'Porsche': 22, 'Saturn': 23,
    'Toyota': 24, 'Volkswagen': 25
}
data['Vehicle_type'] = data['Vehicle_type'].map(Vehicle_type_dict)
data['Manufacturer'] = data['Manufacturer'].map(Manufacturer_type_dict)
data = data.dropna(axis=0, how='any')
df_encoded = data.drop(columns=[ 'Model', 'Vehicle_type', 'Latest_Launch', '__year_resale_value', 'Power_perf_factor'])
print(df_encoded)

     Manufacturer  Sales_in_thousands  Price_in_thousands  Engine_size  \
4             1.0              20.397               23.99          1.8   
5             1.0              18.780               33.95          2.8   
6             1.0               1.380               62.00          4.2   
8             2.0               9.231               33.40          2.8   
9             2.0              17.527               38.90          2.8   
..            ...                 ...                 ...          ...   
145          25.0               9.761               14.90          2.0   
146          25.0              83.721               16.70          2.0   
147          25.0              51.102               21.20          1.8   
148          25.0               9.569               19.99          2.0   
149          25.0               5.596               17.50          2.0   

     Horsepower  Wheelbase  Width  Length  Curb_weight  Fuel_capacity  \
4         150.0      102.6   68.2   17

### Разделим выборку на обучающую и тестовую

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X = df_encoded.drop("Price_in_thousands", axis=1)
y = df_encoded["Price_in_thousands"]

In [11]:
print(X.head(), "\n")
print(y.head())

   Manufacturer  Sales_in_thousands  Engine_size  Horsepower  Wheelbase  \
4           1.0              20.397          1.8       150.0      102.6   
5           1.0              18.780          2.8       200.0      108.7   
6           1.0               1.380          4.2       310.0      113.0   
8           2.0               9.231          2.8       193.0      107.3   
9           2.0              17.527          2.8       193.0      111.4   

   Width  Length  Curb_weight  Fuel_capacity  Fuel_efficiency  
4   68.2   178.0        2.998           16.4             27.0  
5   76.1   192.0        3.561           18.5             22.0  
6   74.0   198.2        3.902           23.7             21.0  
8   68.5   176.0        3.197           16.6             24.0  
9   70.9   188.0        3.472           18.5             25.0   

4    23.99
5    33.95
6    62.00
8    33.40
9    38.90
Name: Price_in_thousands, dtype: float64


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=3)

In [13]:
X_train.shape, y_train.shape,

((85, 10), (85,))

In [14]:
X_test.shape, y_test.shape

((29, 10), (29,))

### Обучение моделей

In [15]:
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [16]:
bagging_regressor = BaggingRegressor()
bagging_regressor.fit(X_train, y_train)

In [17]:
random_forest_regressor = RandomForestRegressor()
random_forest_regressor.fit(X_train, y_train)

In [18]:
ada_boost_regressor = AdaBoostRegressor()
ada_boost_regressor.fit(X_train, y_train)

In [19]:
gradient_boosting_regressor = GradientBoostingRegressor()
gradient_boosting_regressor.fit(X_train, y_train)

In [20]:
bagging_pred = bagging_regressor.predict(X_test)
print(bagging_pred)

[16.8956 21.071  33.7987 13.4503 24.376  30.0304 12.6235 12.5411 32.9025
 19.8799 24.9598 13.1346 22.3998 21.0453 16.4276 17.1535 27.4245 15.8397
 15.874  30.8293 13.0686 29.2753 12.3607 21.7208 30.8387 46.5579 12.787
 19.7837 22.1963]


In [21]:
random_forest_pred = random_forest_regressor.predict(X_test)
print(random_forest_pred)

[16.69354 22.74619 36.16719 13.4694  24.94294 30.38815 12.77639 13.20272
 35.09118 20.83878 23.52692 13.10867 20.64363 19.9387  16.56488 17.66882
 30.05602 15.5881  15.84767 28.84322 13.32526 31.27656 12.46306 21.26757
 32.08637 40.53307 13.31761 20.02596 24.27107]


In [22]:
ada_boost_pred = ada_boost_regressor.predict(X_test)
print(ada_boost_pred)

[17.62923333 24.50515789 30.53543182 15.76166667 26.85033333 30.58826
 14.70371429 15.76166667 30.77789189 22.88414286 22.88414286 15.546125
 19.79275    24.01328571 16.785      17.97455    27.51585714 15.76654545
 16.69475    29.312      15.546125   30.84836957 14.63847619 24.50515789
 30.77789189 41.38583333 15.76654545 22.96423333 25.16128   ]


In [23]:
gradient_boosting_pred = gradient_boosting_regressor.predict(X_test)
print(gradient_boosting_pred)

[15.56002633 23.73052451 31.46292604 13.27375264 25.77920554 34.46813165
 13.37134896 12.71624824 29.14734845 19.80144071 22.46142602 13.62039386
 19.26197384 20.06882514 15.80717878 16.97575516 30.50128499 15.21020583
 15.39881749 29.8618125  13.50166152 30.80957968 12.01164197 19.95012183
 31.99776275 45.84375493 13.63453732 19.20789889 22.59031797]


In [24]:
bagging_mse = mean_squared_error(y_test, bagging_pred)
random_forest_mse = mean_squared_error(y_test, random_forest_pred)
ada_boost_mse = mean_squared_error(y_test, ada_boost_pred)
gradient_boosting_mse = mean_squared_error(y_test, gradient_boosting_pred)

In [25]:
bagging_r2_score = r2_score(y_test, bagging_pred)
random_forest_r2_score = r2_score(y_test, random_forest_pred)
ada_boost_r2_score = r2_score(y_test, ada_boost_pred)
gradient_boosting_r2_score = r2_score(y_test, gradient_boosting_pred)

In [26]:
print("Bagging MSE:", bagging_mse)
print("Random Forest MSE:", random_forest_mse)
print("AdaBoost MSE:", ada_boost_mse)
print("Gradient Boosting MSE:", gradient_boosting_mse)

Bagging MSE: 33.99647942241378
Random Forest MSE: 28.96407585131039
AdaBoost MSE: 30.964442630744045
Gradient Boosting MSE: 23.285794115081714


In [27]:
print("Bagging r2_score:", bagging_r2_score)
print("Random Forest r2_score:", random_forest_r2_score)
print("AdaBoost r2_score:", ada_boost_r2_score)
print("Gradient Boosting r2_score:", gradient_boosting_r2_score)

Bagging r2_score: 0.6780053717970904
Random Forest r2_score: 0.7256693341947997
AdaBoost r2_score: 0.7067230383325132
Gradient Boosting r2_score: 0.7794506741321017


## Часть 2
### Задание

- Обучите следующие ансамблевые модели:
  - одну из моделей группы стекинга.
  - модель многослойного персептрона. По желанию, вместо библиотеки scikit-learn возможно использование библиотек TensorFlow, PyTorch или других аналогичных библиотек.
  - двумя методами на выбор из семейства МГУА (один из линейных методов COMBI / MULTI + один из нелинейных методов MIA / RIA) с использованием библиотеки gmdh.
- Оцените качество моделей с помощью одной из подходящих для задачи метрик. Сравните качество полученных моделей.

Стекинг

In [28]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import RandomForestRegressor

In [29]:
base_models = [
    ('linear_regression', LinearRegression()),
    ('gradient_boosting', GradientBoostingRegressor()),
    ('random_forest', RandomForestRegressor())
]

In [30]:
meta_model = LinearRegression()

In [31]:
stacking_regressor = StackingRegressor(estimators=base_models, final_estimator=meta_model)

In [32]:
stacking_regressor.fit(X_train, y_train)

In [33]:
stacking_pred = stacking_regressor.predict(X_test)

In [34]:
stacking_mse = mean_squared_error(y_test, stacking_pred)
print("Stacking Regressor MSE:", stacking_mse)

Stacking Regressor MSE: 15.732279756414684


In [35]:
stacking_r2_score = r2_score(y_test, stacking_pred)
print("Stacking Regressor R²:", stacking_r2_score)

Stacking Regressor R²: 0.8509931129041821


### Модель многослойного персептрона (MLP)

In [36]:
from sklearn.neural_network import MLPRegressor

In [37]:
mlp_regressor = MLPRegressor(hidden_layer_sizes=(100, 50), activation='relu', solver='adam', random_state=1)

In [38]:
mlp_regressor.fit(X_train, y_train)

In [39]:
mlp_pred = mlp_regressor.predict(X_test)

In [40]:
mlp_mse = mean_squared_error(y_test, mlp_pred)
print("MLP Regressor MSE:", mlp_mse)

MLP Regressor MSE: 155.19798239355117


In [41]:
mlp_r2_score = r2_score(y_test, mlp_pred)
print("MLP Regressor R²:", mlp_r2_score)

MLP Regressor R²: -0.469943873238422


### МГУА 

In [42]:
%pip install gmdh

Note: you may need to restart the kernel to use updated packages.


In [43]:
from gmdh import Combi, split_data

In [44]:
x_train, x_test, y_train, y_test = split_data(X, y)

In [45]:
model = Combi()
model.fit(x_train, y_train)

<gmdh.gmdh.Combi at 0x25ea95c0510>

In [46]:
y_predicted = model.predict(x_test)
print('y_predicted: ', y_predicted)
print('y_test: ', y_test)

y_predicted:  [22.18050841 26.62762119 26.29498026 28.41067366 29.85479786 46.55091166
 47.70098528 11.39934685 11.73740198 15.60725339 16.37241875 17.33882607
 31.41589655 17.65975329 17.21037421 18.09638718 25.19744899 38.0213576
 13.40164034 13.69305135 21.36831412 16.54900197 13.44345722]
y_test:  [19.72  25.31  21.665 23.755 41.43  71.02  74.97  10.685 12.535 14.29
 13.108 17.518 25.545 16.875 11.528 16.888 22.288 51.728 14.9   16.7
 21.2   19.99  17.5  ]


In [47]:
model.get_best_polynomial()

'y = - 0.0025*x2 + 0.1774*x4 - 0.1397*x5 - 1.0435*x6 + 5.5786*x8 + 0.591*x9 + 0.4558*x10 + 42.2523'

In [48]:
model_r2_score = r2_score(y_test, y_predicted)
print("model Regressor R²:", model_r2_score)

model Regressor R²: 0.7324569474007379


In [49]:
from gmdh import Mia

In [50]:
mia = Mia()
mia.fit(x_train, y_train)

<gmdh.gmdh.Mia at 0x25eaa14a290>

In [51]:
y_pred = mia.predict(x_test)
print('y_predicted: ', y_pred)
print('y_test: ', y_test)

y_predicted:  [22.09125067 25.74960019 24.95849007 27.05520906 31.43236737 45.89767052
 44.97737658 12.13105418 12.21366084 16.40958644 15.384474   16.53390194
 27.50182688 18.39826328 19.8506205  16.5131575  20.79266155 27.70754226
 14.44244703 14.71955064 19.3578045  15.56688444 14.4251835 ]
y_test:  [19.72  25.31  21.665 23.755 41.43  71.02  74.97  10.685 12.535 14.29
 13.108 17.518 25.545 16.875 11.528 16.888 22.288 51.728 14.9   16.7
 21.2   19.99  17.5  ]


In [52]:
mia.get_best_polynomial()

'f1_1 = - 0.3513*x4 + 38.5013*x8 + 0.121*x4*x8 + 0.0002*x4^2 - 8.3704*x8^2 - 27.4662\nf1_2 = 0.4981*x4 - 6.7845*x6 - 0.0048*x4*x6 + 7.08483e-05*x4^2 + 0.0488*x6^2 + 228.5285\n\ny = - 0.1017*f1_1 + 1.0184*f1_2 - 0.1201*f1_1*f1_2 + 0.0745*f1_1^2 + 0.0467*f1_2^2 + 0.8542'

In [53]:
mia_r2_score = r2_score(y_test, y_pred)
print("model Regressor R²:", mia_r2_score)

model Regressor R²: 0.6575424064468822
