In [11]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import numpy as np
import pandas as pd

In [12]:
df = pd.read_excel(r'../data/result_prepared_df.xlsx', index_col=0)
df.head()

Unnamed: 0,ЖК,index(),Регион,Населенный пункт,Группа компаний,ID проекта,Класс недвижимости,Статус,"Продано квартир, руб","Продано нежилых, шт",...,Месяц_Май 2022,Месяц_Март 2021,Месяц_Март 2022,Месяц_Ноябрь 2021,Месяц_Октябрь 2021,Месяц_Сентябрь 2021,Месяц_Февраль 2021,Месяц_Февраль 2022,Месяц_Январь 2021,Месяц_Январь 2022
0,Восход 40376,258,Приморский край,Артем,СЗВОСХОД,11228,-1.341444,-0.321288,-0.889197,-0.287707,...,-0.262202,-0.1675,-0.290803,-0.240275,-0.219108,-0.22154,-0.164399,-0.276767,-0.161245,-0.253605
1,Восход 40376,261,Приморский край,Артем,СЗВОСХОД,11228,-1.341444,-0.321288,-0.824694,-0.287707,...,-0.262202,-0.1675,-0.290803,4.161902,-0.219108,-0.22154,-0.164399,-0.276767,-0.161245,-0.253605
2,Восход 40376,262,Приморский край,Артем,СЗВОСХОД,11228,-1.341444,-0.321288,-0.849961,-0.287707,...,-0.262202,-0.1675,-0.290803,-0.240275,-0.219108,-0.22154,-0.164399,-0.276767,-0.161245,-0.253605
3,Восход 40376,263,Приморский край,Артем,СЗВОСХОД,11228,-1.341444,-0.321288,-0.810597,-0.287707,...,-0.262202,-0.1675,-0.290803,-0.240275,-0.219108,-0.22154,-0.164399,-0.276767,-0.161245,3.943144
4,Восход 40376,264,Приморский край,Артем,СЗВОСХОД,11228,-1.341444,-0.321288,-0.868588,-0.287707,...,-0.262202,-0.1675,-0.290803,-0.240275,-0.219108,-0.22154,-0.164399,3.613142,-0.161245,-0.253605


# Дерево решений

In [13]:
x = df.drop(['Таргет'], axis=1)
y = df['Таргет']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, shuffle=True)

x_train_numeric = x_train.select_dtypes(include=[np.number])
x_test_numeric = x_test.select_dtypes(include=[np.number])

# размерность обучающей
print(x_train_numeric.shape, y_train.shape)
 
# и тестовой выборки
print(x_test_numeric.shape, y_test.shape)

(718, 155) (718,)
(308, 155) (308,)


In [14]:
mae_list = []
clf = DecisionTreeRegressor(random_state=42)

for i in range(50):    
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, shuffle=True)

    x_train_numeric = x_train.select_dtypes(include=[np.number])
    x_test_numeric = x_test.select_dtypes(include=[np.number])

    clf.fit(x_train_numeric, y_train)

    y_pred = clf.predict(x_test_numeric)

    mae_list.append(mean_absolute_error(y_test, y_pred))

np.mean(mae_list), np.std(mae_list)


(np.float64(15631.114392624677), np.float64(1079.1622945253996))

In [15]:
clf.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'random_state': 42,
 'splitter': 'best'}

# Случайный лес

In [16]:
rf_mae_list = []
base_rf = RandomForestRegressor(random_state=42)

for i in range(10):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, shuffle=True)

    x_train_numeric = x_train.select_dtypes(include=[np.number])
    x_test_numeric = x_test.select_dtypes(include=[np.number])

    base_rf.fit(x_train_numeric, y_train)

    y_base_rf_pred = base_rf.predict(x_test_numeric)

    rf_mae_list.append(mean_absolute_error(y_test, y_base_rf_pred))

np.mean(rf_mae_list), np.std(rf_mae_list)


(np.float64(12154.718203592303), np.float64(614.5552771220268))

In [17]:
param_grid = {
   'n_estimators': list(range(500, 701, 100)),
   'max_features': ['sqrt'],
   'min_samples_leaf': list(range(1, 3))
}

rf = RandomForestRegressor(random_state=42)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, shuffle=True)

x_train_numeric = x_train.select_dtypes(include=[np.number])
x_test_numeric = x_test.select_dtypes(include=[np.number])

grid_search_rf = GridSearchCV(
   estimator=rf,
   param_grid=param_grid,
   scoring='neg_mean_absolute_error',
   verbose=1,
   n_jobs=-1
)

grid_search_rf.fit(x_train_numeric, y_train)

# Выводим лучшие параметры
best_params = grid_search_rf.best_params_
best_params

Fitting 5 folds for each of 6 candidates, totalling 30 fits


{'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 700}

In [18]:
rf_tuned_mae_list = []
rf_tuned = RandomForestRegressor(**best_params, random_state=42)

for i in range(10):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, shuffle=True)

    x_train_numeric = x_train.select_dtypes(include=[np.number])
    x_test_numeric = x_test.select_dtypes(include=[np.number])

    rf_tuned.fit(x_train_numeric, y_train)

    y_rf_tuned_pred = rf_tuned.predict(x_test_numeric)

    rf_tuned_mae_list.append(mean_absolute_error(y_test, y_rf_tuned_pred))

np.mean(rf_tuned_mae_list), np.std(rf_tuned_mae_list)

(np.float64(13221.791957887031), np.float64(619.5416742337795))