In [146]:
import os
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV, train_test_split

In [147]:
df = pd.read_csv(str(Path(os.path.abspath(os.curdir)).parent) + '/data/total_data.csv')

df = df[df.Price < 1300000]

scaling_params = ['Age', 'Height']
features = df[scaling_params]
scaler = MinMaxScaler()
features = scaler.fit_transform(features.values)
df[scaling_params] = features

df = pd.get_dummies(df, columns=['Sex', 'Color', 'Breed'])

train, test = train_test_split(df, test_size=0.2, random_state=42)

corr_matrix = train.corr()
print('Correlation coefficients of survival from other features:\n',
      corr_matrix['Price'].sort_values(ascending=False)[1:])

x_train = train.drop('Price', axis=1)
y_train = train['Price']

x_test = test.drop('Price', axis=1)
y_test = test['Price']

Correlation coefficients of survival from other features:
 Height                      0.344395
Breed_Ганноверская          0.230652
Breed_Тракененская          0.138509
Breed_Русская спортивная    0.108535
Breed_Андалузская           0.082814
                              ...   
Breed_Помесь пони          -0.098204
Color_Пегой                -0.107989
Breed_Аппалуза             -0.109158
Breed_Русский рысак        -0.114462
Breed_Шетлендский пони     -0.212490
Name: Price, Length: 73, dtype: float64


In [174]:
features = list(x_train)
rfr = RandomForestRegressor(
    # bootstrap=True,
    # ccp_alpha= 0,
    # criterion= 'squared_error',
    # max_depth= None,
    # max_features= 1,
    # max_leaf_nodes=None,
    # max_samples=None,
    # min_impurity_decrease=0,
    # min_samples_leaf= 1,
    # min_samples_split= 2,
    # min_weight_fraction_leaf= 0,
    # n_estimators= 100,
    # oob_score=False,
    random_state=42
)

random_search_params = {
    # 'bootstrap': [True, False],
    # 'ccp_alpha': list(np.arange(0, 100, 1)),
    'criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
    'max_depth': list(np.arange(5, 40, 2)),
    'max_features': list(np.arange(1, len(features) + 1)),
    # 'max_leaf_nodes': list(np.arange(2, 10)),
    # 'max_samples': list(np.arange(1, 20)),
    'min_impurity_decrease':list(np.arange(1, 9)),
    'min_samples_leaf': list(np.arange(1, 6)),
    'min_samples_split': list(np.arange(2, 10)),
    'min_weight_fraction_leaf': list(np.arange(0.0, 0.6, 0.1)),
    'n_estimators': list(np.arange(1, 1000, 10)),
    'oob_score': [True, False]
}

random_search = RandomizedSearchCV(
    estimator=rfr,
    param_distributions=random_search_params,
    n_iter=50,
    n_jobs=-2,
    refit=True,
    cv=StratifiedKFold(5),
    random_state=42
)

random_search.fit(x_train, y_train)

predictions = random_search.predict(x_test)
random_search_rmse = np.sqrt(mean_squared_error(y_test, predictions))
print('RMSE случайного поиска: ', random_search_rmse)

OverflowError: Python int too large to convert to C long

In [149]:
random_search_parameters = pd.DataFrame(random_search.cv_results_).sort_values('rank_test_score').reset_index(
    drop=True)
random_search_parameters = random_search_parameters.drop('params', axis=1)
random_search_parameters.head(20)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_oob_score,param_n_estimators,param_min_weight_fraction_leaf,param_min_samples_split,param_max_samples,param_max_features,param_max_depth,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,3.381694,0.093231,0.082332,0.003979,True,955,0.0,4,9,47,9,0.101559,0.098467,0.134002,0.109919,0.15041,0.118872,0.020099,1
1,2.0064,0.041215,0.050215,0.002994,True,583,0.1,2,9,50,9,0.10341,0.094986,0.129437,0.110033,0.155962,0.118766,0.021795,2
2,0.595684,0.002631,0.02702,0.002299,False,299,0.0,2,9,30,30,0.094938,0.091386,0.128043,0.122152,0.135408,0.114386,0.017866,3
3,1.251862,0.020114,0.051813,0.001167,False,615,0.1,2,8,49,31,0.099523,0.093034,0.12684,0.114719,0.131012,0.113026,0.014827,4
4,0.869958,0.008114,0.03381,0.003658,False,415,0.1,6,8,58,21,0.092495,0.082546,0.112893,0.094328,0.144847,0.105422,0.022017,5
5,1.228217,0.106913,0.031808,0.001834,True,329,0.1,3,6,73,17,0.083325,0.086134,0.131108,0.095297,0.127417,0.104656,0.020511,6
6,2.69663,0.148376,0.06692,0.007082,True,801,0.1,8,9,57,20,0.089419,0.087905,0.116509,0.092754,0.13317,0.103951,0.017919,7
7,1.357199,0.038312,0.038412,0.006019,True,423,0.0,2,7,54,31,0.087142,0.07159,0.116683,0.103066,0.134786,0.102654,0.022068,8
8,0.838761,0.02629,0.023806,0.00172,True,277,0.0,2,9,14,18,0.081938,0.078617,0.101294,0.116483,0.132789,0.102224,0.020529,9
9,1.013607,0.027025,0.041404,0.003321,False,577,0.3,7,8,73,23,0.081793,0.065794,0.121536,0.094789,0.112721,0.095327,0.020224,10


In [152]:
top_models = random_search_parameters.head(5)
param_bootstrap = list(set([True] + [top_models['param_bootstrap'].value_counts().idxmax()]))
param_ccp_alpha = list(set([0] + [x for x in set(top_models['ccp_alpha'])]))
param_criterion = list(set(['squared_error'] + [top_models['param_criterion'].value_counts().idxmax()]))
param_max_depth = list(set([None] + [x for x in set(top_models['param_max_depth'])]))
param_max_features = list(set([1] + [x for x in set(top_models['param_max_features'])]))
param_max_leaf_nodes = [None] + [x for x in set(top_models['param_max_leaf_nodes'])]
param_max_samples = [None] + [x for x in set(top_models['param_max_samples'])]
param_min_impurity_decrease = list(set([0] + [x for x in set(top_models['param_min_impurity_decrease'])]))
param_min_samples_leaf = list(set([1] + [x for x in set(top_models['param_min_samples_leaf'])]))
param_min_samples_split = list(set([2] + [x for x in set(top_models['param_min_samples_split'])]))
param_min_weight_fraction_leaf = list(set([0] + [x for x in set(top_models['param_min_weight_fraction_leaf'])]))
param_n_estimators = list(set([100] + [x for x in set(top_models['param_n_estimators'])]))
param_oob_score = list(set([False] + [x for x in set(top_models['param_oob_score'])]))

params_grid = {
    'bootstrap': param_bootstrap,
    'ccp_alpha': param_ccp_alpha,
    'criterion': param_criterion,
    'max_depth': param_max_depth,
    'max_features': param_max_features,
    'max_leaf_nodes': param_max_leaf_nodes,
    'max_samples': param_max_samples,
    'min_impurity_decrease': param_min_impurity_decrease,
    'min_samples_leaf': param_min_samples_leaf,
    'min_samples_split': param_min_samples_split,
    'min_weight_fraction_leaf': param_min_weight_fraction_leaf,
    'n_estimators': param_n_estimators,
    'oob_score': param_oob_score
}

params_grid

{'max_depth': [None, 9, 21, 30, 31],
 'max_features': [1, 47, 49, 50, 58, 30],
 'n_estimators': [100, 583, 615, 299, 955, 415],
 'oob_score': [False, True]}

In [153]:
rfr = RandomForestRegressor(
    # bootstrap=True,
    # ccp_alpha=0,
    # criterion='squared_error',
    # max_depth=None,
    # max_features=1,
    # max_leaf_nodes=None,
    # max_samples=None,
    # min_impurity_decrease=0,
    # min_samples_leaf=1,
    # min_samples_split=2,
    # min_weight_fraction_leaf=0,
    # n_estimators=100,
    # oob_score=False,
    random_state=42
)
grid_search = GridSearchCV(
    estimator=rfr,
    param_grid=params_grid,
    n_jobs=-2,
    refit=True,
    cv=StratifiedKFold(5)
)
grid_search.fit(x_train, y_train)

predictions = grid_search.predict(x_test)
grid_search_rmse = np.sqrt(mean_squared_error(y_test, predictions))
print('RMSE решётчатого поиска: ', grid_search_rmse)



RMSE решётчатого поиска:  231486.14753604244


In [154]:
best_model = grid_search.best_estimator_
train_predictions = best_model.predict(x_test)
model_rmse = np.sqrt(mean_squared_error(y_test, train_predictions))
print('RMSE модели: ', model_rmse)
print('The best hyperparameters for DecisionTreeRegressor found using grid search:\n',
      best_model.get_params())

# feature_importance = grid_search.best_estimator_.feature_importances_
# print('\nThe importance of each feature of model:',
#           *sorted(zip(feature_importance, features), reverse=True), sep='\n')

RMSE модели:  231486.14753604244
The best hyperparameters for DecisionTreeRegressor found using grid search:
 {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 9, 'max_features': 30, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 299, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}
