## Лаборатораная работа №6 по курсу ТМО
## "Ансамбли моделей машинного обучения".


In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
%matplotlib inline

### 1. Загрузка и обработка набора данных
Для решения задачи регрессии выберем набор данных [Daily Temperature of Major Cities](https://www.kaggle.com/sudalairajkumar/daily-temperature-of-major-cities), который показывает среднесуточные значения температуры воздуха зафиксированы в крупных городах мира.

In [2]:
data = pd.read_csv('../datasets/city_temperature.csv')
data

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
0,Africa,Algeria,,Algiers,1,1,1995,64.2
1,Africa,Algeria,,Algiers,1,2,1995,49.4
2,Africa,Algeria,,Algiers,1,3,1995,48.8
3,Africa,Algeria,,Algiers,1,4,1995,46.4
4,Africa,Algeria,,Algiers,1,5,1995,47.9
...,...,...,...,...,...,...,...,...
2906322,North America,US,Additional Territories,San Juan Puerto Rico,7,27,2013,82.4
2906323,North America,US,Additional Territories,San Juan Puerto Rico,7,28,2013,81.6
2906324,North America,US,Additional Territories,San Juan Puerto Rico,7,29,2013,84.2
2906325,North America,US,Additional Territories,San Juan Puerto Rico,7,30,2013,83.8


Удаление пропусков в данных:

In [3]:
data.isnull().sum()

Region                  0
Country                 0
State             1450990
City                    0
Month                   0
Day                     0
Year                    0
AvgTemperature          0
dtype: int64

In [4]:
# удалим колонку state, тк штаты есть не во всех странах и нам этот прзнак не особо важен в понятийном смысле
data = data.dropna(axis=1, how='any')
data.shape

(2906327, 7)

Так как мы имеем очень много строк, то будем решать более узкую задачу и возьмем данные только за 2013 год:

In [5]:
data = data[data['Year']==2013]
data.shape

(111021, 7)

In [6]:
data = data.drop(['Year'], axis=1)

Кодирование категориальных признаков:

In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
#label encoding
le = LabelEncoder()
reg_enc_le = le.fit_transform(data['Region'])
country_enc_le = le.fit_transform(data['Country'])
city_enc_le = le.fit_transform(data['City'])

In [9]:
data_new = data.drop(['Region','Country','City','AvgTemperature'], axis=1)
data_new['Region'] = reg_enc_le
data_new['Country'] = country_enc_le
data_new['City'] = city_enc_le
data_new['AvgTemperature'] = data['AvgTemperature']
data_new

Unnamed: 0,Month,Day,Region,Country,City,AvgTemperature
6575,1,1,0,1,7,49.2
6576,1,2,0,1,7,51.7
6577,1,3,0,1,7,48.2
6578,1,4,0,1,7,49.2
6579,1,5,0,1,7,49.4
...,...,...,...,...,...,...
2906322,7,27,5,103,244,82.4
2906323,7,28,5,103,244,81.6
2906324,7,29,5,103,244,84.2
2906325,7,30,5,103,244,83.8


### 2. Разделение выборки на обучающую и тестовую

In [10]:
X = data_new[['Month','Day','Region', 'Country', 'City']]
y = data_new['AvgTemperature']
print(X.head(), "\n")
print(y.head())

      Month  Day  Region  Country  City
6575      1    1       0        1     7
6576      1    2       0        1     7
6577      1    3       0        1     7
6578      1    4       0        1     7
6579      1    5       0        1     7 

6575    49.2
6576    51.7
6577    48.2
6578    49.2
6579    49.4
Name: AvgTemperature, dtype: float64


Нормализация:

In [11]:
from sklearn.preprocessing import StandardScaler

In [12]:
# промасштабируем X
columns = X.columns
scaler = StandardScaler()
X = scaler.fit_transform(X)
pd.DataFrame(X, columns=columns).describe()

Unnamed: 0,Month,Day,Region,Country,City
count,111021.0,111021.0,111021.0,111021.0,111021.0
mean,6.9120810000000004e-18,5.0560590000000007e-17,2.048024e-16,-2.785313e-16,-1.454097e-16
std,1.000005,1.000005,1.000005,1.000005,1.000005
min,-1.600388,-1.673511,-2.092386,-2.25229,-1.726359
25%,-0.7300681,-0.8777078,-0.4659704,-0.8074637,-0.8622675
50%,0.1402518,0.03178103,0.6183069,0.7240526,0.001824264
75%,1.010572,0.8275838,0.6183069,0.7240526,0.865916
max,1.590785,1.737073,1.160446,1.013018,1.706965


In [13]:
# разделим выборку
temp_X_train, temp_X_test, temp_y_train, temp_y_test = train_test_split(X, y, test_size=0.2, random_state=1)

### 3. Обучение моделей

**3.1 Случаный лес**

Попробуем случайный лес с гиперпараметром n=100 и максимальной глубиной 25.

In [14]:
ran_100 = RandomForestRegressor(n_estimators=100,max_depth=25)
ran_100.fit(temp_X_train, temp_y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=25, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [15]:
print("MAE:", mean_absolute_error(temp_y_test, ran_100.predict(temp_X_test)))
print("MSE:", mean_squared_error(temp_y_test, ran_100.predict(temp_X_test)))

MAE: 4.595490006014878
MSE: 125.00996049459796


Попробуем случайный лес с гиперпараметром n=200 и максимальной глубиной 10.

In [17]:
ran_200 = RandomForestRegressor(n_estimators=200,max_depth=10)
ran_200.fit(temp_X_train, temp_y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=10, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=200, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [18]:
print("MAE:", mean_absolute_error(temp_y_test, ran_200.predict(temp_X_test)))
print("MSE:", mean_squared_error(temp_y_test, ran_200.predict(temp_X_test)))

MAE: 8.599517955964654
MSE: 199.89646039220298


**3.2 Градиентный бустинг**

In [19]:
# гиперпараметр равен 100
gr_100 = GradientBoostingRegressor(n_estimators=100)
gr_100.fit(temp_X_train, temp_y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [20]:
print("MAE:", mean_absolute_error(temp_y_test, gr_100.predict(temp_X_test)))
print("MSE:", mean_squared_error(temp_y_test, gr_100.predict(temp_X_test)))

MAE: 9.914969464605694
MSE: 255.68970623453973


In [22]:
# гиперпараметр равен 200
gr_200 = GradientBoostingRegressor(n_estimators=200)
gr_200.fit(temp_X_train, temp_y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=200,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [23]:
print("MAE:", mean_absolute_error(temp_y_test, gr_200.predict(temp_X_test)))
print("MSE:", mean_squared_error(temp_y_test, gr_200.predict(temp_X_test)))

MAE: 8.925352011497793
MSE: 223.92218233105484


### Сравнение качества полученных моделей

Лучшим методом стал случайный лес с гиперпараметром 100. Метрики этого метода MSE=125,0 и MAE=4,6.

Градиентный бустинг показал себя несколько хуже. При гиперпараметре n_estimators=200 метрики этого метода MSE=223,0 и MAE=4,6.