In [20]:
import os
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV, train_test_split

In [21]:
df = pd.read_csv(str(Path(os.path.abspath(os.curdir)).parent) + '/data/total_data.csv')

# df = df[df.Price < 2000000]

df = pd.get_dummies(df, columns=['Sex', 'Color', 'Breed'])

train, test = train_test_split(df, test_size=0.2, random_state=42)

corr_matrix = train.corr()
print('Correlation coefficients of survival from other features:\n',
      corr_matrix['Price'].sort_values(ascending=False)[1:])

x_train = train.drop('Price', axis=1)

y_train = train['Price']

Correlation coefficients of survival from other features:
 Height                            0.274127
Breed_Андалузская                 0.159828
Breed_Фризская                    0.135605
Breed_Бельгийская теплокровная    0.113083
Breed_Ганноверская                0.112583
                                    ...   
Sex_Жеребец                      -0.068191
Breed_Русский рысак              -0.073403
Color_Пегой                      -0.100633
Breed_Помесь пони                -0.104134
Breed_Шетлендский пони           -0.130832
Name: Price, Length: 73, dtype: float64


In [22]:
features = list(x_train)
rfr = RandomForestRegressor(random_state=42)

random_search_params = {'bootstrap': [True, False],
                        'criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
                        'max_depth': list(np.arange(1, 50)),
                        'max_features': list(np.arange(1, len(features) + 1)),
                        'min_samples_leaf': list(np.arange(1, 10)),
                        'min_samples_split': list(np.arange(2, 10)),
                        'min_weight_fraction_leaf': list(np.arange(0.0, 0.6, 0.1)),
                        'n_estimators': list(np.arange(1, 300, 2))}

random_search = RandomizedSearchCV(estimator=rfr,
                                   param_distributions=random_search_params,
                                   n_iter=2,
                                   n_jobs=-1,
                                   refit=True,
                                   cv=StratifiedKFold(5),
                                   random_state=42)

random_search.fit(x_train, y_train)

predictions = random_search.predict(x_train)
random_search_rmse = np.sqrt(mean_squared_error(y_train, predictions))
print('RMSE случайного поиска: ', random_search_rmse)



RMSE случайного поиска:  521325.7269113714


In [23]:
random_search_parameters = pd.DataFrame(random_search.cv_results_).sort_values('rank_test_score').reset_index(
    drop=True)
random_search_parameters = random_search_parameters.drop('params', axis=1)
random_search_parameters.head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_min_weight_fraction_leaf,param_min_samples_split,param_min_samples_leaf,param_max_features,param_max_depth,param_criterion,param_bootstrap,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.135473,0.01717,0.008001,0.001094,85,0.0,9,6,5,47,friedman_mse,False,0.123578,0.184874,0.117575,0.114176,0.146384,0.137317,0.026298,1
1,1.836626,0.031594,0.007204,0.000982,139,0.1,8,9,18,25,absolute_error,False,0.017844,0.030362,0.007083,-0.020811,0.015598,0.010015,0.017119,2


In [24]:
top_models = random_search_parameters.head(3)
param_bootstrap = [top_models['param_bootstrap'].value_counts().idxmax()]
param_criterion = [top_models['param_criterion'].value_counts().idxmax()]
param_max_depth = [x for x in set(top_models['param_max_depth'])]
param_max_features = [x for x in set(top_models['param_max_features'])]
param_min_samples_leaf = [x for x in set(top_models['param_min_samples_leaf'])]
param_min_samples_split = [x for x in set(top_models['param_min_samples_split'])]
param_min_weight_fraction_leaf = [x for x in set(top_models['param_min_weight_fraction_leaf'])]
param_n_estimators = [x for x in set(top_models['param_n_estimators'])]

params_grid = {'bootstrap': param_bootstrap,
               'criterion': param_criterion,
               'max_depth': param_max_depth,
               'max_features': param_max_features,
               'min_samples_leaf': param_min_samples_leaf,
               'min_samples_split': param_min_samples_split,
               'min_weight_fraction_leaf': param_min_weight_fraction_leaf,
               'n_estimators': param_n_estimators
               }

params_grid

{'bootstrap': [False],
 'criterion': ['friedman_mse'],
 'max_depth': [25, 47],
 'max_features': [18, 5],
 'min_samples_leaf': [9, 6],
 'min_samples_split': [8, 9],
 'min_weight_fraction_leaf': [0.0, 0.1],
 'n_estimators': [139, 85]}

In [25]:
rfr = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(estimator=rfr,
                           param_grid=params_grid,
                           n_jobs=-1,
                           refit=True,
                           cv=StratifiedKFold(5)
                           )
grid_search.fit(x_train, y_train)

predictions = grid_search.predict(x_train)
grid_search_rmse = np.sqrt(mean_squared_error(y_train, predictions))
print('RMSE решётчатого поиска: ', grid_search_rmse)



RMSE решётчатого поиска:  489913.3019714717


In [26]:
best_model = grid_search.best_estimator_
train_predictions = best_model.predict(x_train)
model_rmse = np.sqrt(mean_squared_error(y_train, train_predictions))
print('RMSE модели: ', model_rmse)
print('The best hyperparameters for DecisionTreeRegressor found using grid search:\n',
      best_model.get_params())

# feature_importance = grid_search.best_estimator_.feature_importances_
# print('\nThe importance of each feature of model:',
#           *sorted(zip(feature_importance, features), reverse=True), sep='\n')

RMSE модели:  489913.3019714717
The best hyperparameters for DecisionTreeRegressor found using grid search:
 {'bootstrap': False, 'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'max_depth': 47, 'max_features': 18, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 9, 'min_samples_split': 8, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 85, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}
