In [1]:
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib.ticker import FuncFormatter
from sklearn.metrics import mean_squared_error, root_mean_squared_error

import numpy as np
import optuna

In [14]:
df = pd.read_csv('./data/data_after_aliona_pipline.csv')
df['set_type'] = df['set_type'].astype('string')
df = df.select_dtypes(exclude=['object'])
df = df.set_index('ID')

In [15]:
# Для обучения и предсказание на тесте 
X_train_val = df[df['set_type']!='test']
y_train_val = X_train_val['Price']
X_train_val = X_train_val.drop(['set_type','Price'], axis=1)

train = df[df['set_type']=='train']
val = df[df['set_type']=='val']
test = df[df['set_type']=='test']

train = train.drop('set_type', axis=1)
val = val.drop('set_type', axis=1)
test = test.drop('set_type', axis=1)

y_train = train['Price']
X_train = train.drop('Price', axis=1)

y_val = val['Price']
X_val= val.drop('Price', axis=1)

y_test = test['Price']
X_test = test.drop('Price', axis=1)

plot_df = X_val[['Количество комнат', 'Площадь общая', 'Год постройки']]
plot_df = pd.merge(plot_df, y_val, left_index=True, right_index=True)


In [4]:
def error0(y_true,y_pred,size_x,sizy_y):

    def millions_formatter(x, pos):
        return '%1.1fтыс.' % (x * 1e-3)

    error = y_true - y_pred
    
    plt.figure(figsize=(size_x, sizy_y))
    plt.scatter(y_pred, error, color='blue', edgecolor='k')
    plt.axhline(y=0, color='red', linestyle='--') # Линия, показывающая отсутствие ошибки
    plt.xlabel('Предсказанные значения')
    plt.ylabel('Остатки')
    plt.title('График остатков')
    plt.gca().yaxis.set_major_formatter(FuncFormatter(millions_formatter))
    plt.gca().xaxis.set_major_formatter(FuncFormatter(millions_formatter))
    plt.show()

def tripal_print(pred_train, pred_val):

    def error(y_true, y_pred):
        mse = mean_squared_error(y_true, y_pred)
        rmse = root_mean_squared_error(y_true, y_pred)
        return mse, rmse
    
    print(f'Train:      mse= {error(y_train, pred_train)[0]:.2f}, rmse={error(y_train, pred_train)[1]:.2f}')
    print(f'Validation: mse= {error(y_val, pred_val)[0]:.2f}, rmse={error(y_val, pred_val)[1]:.2f}')

def long_pred(y_pred):
    plot_df['pred'] =  y_pred #plot_df['Price'] -
    plot_df.groupby('Площадь общая')['Price'].mean().plot(figsize=(18, 3))
    plot_df.groupby('Площадь общая')['pred'].mean().plot(figsize=(18, 3), title='pred by Площадь общая')
    plt.legend()
    plt.show()
    plot_df.groupby('Количество комнат')['Price'].mean().plot(figsize=(18, 3))
    plot_df.groupby('Количество комнат')['pred'].mean().plot(figsize=(18, 3), title='pred by Количество комнат')
    plt.legend()
    plt.show()
    plot_df.groupby('Год постройки')['Price'].mean().plot(figsize=(18, 3))
    plot_df.groupby('Год постройки')['pred'].mean().plot(figsize=(18, 3), title='pred by Год постройки')
    plt.legend()
    plt.show()

In [5]:
correlation_matrix = X_train.corr(method='kendall')
X_train.shape

(13740, 383)

In [6]:
threshold = 0.75

# Находим индексы и названия колонок для признаков, которые следует исключить
cols_to_drop = set()
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > threshold:  # Если корреляция выше порога
            colname = correlation_matrix.columns[i]  # Получаем название колонки
            cols_to_drop.add(colname)

# Удаление сильно коррелирующих признаков
X_train = X_train.drop(columns=cols_to_drop)
X_val = X_val.drop(columns=cols_to_drop)
X_train.shape

(13740, 366)

In [7]:
param = {'n_estimators': 1000,
         'min_samples_split': 2,
         'min_samples_leaf': 1,
         'n_jobs': -1,
         'random_state': 777,
         }

Validation: mse= 968128994.45, rmse=31114.77
'n_estimators': 1000, 'min_samples_split': 2, 'min_samples_leaf': 1,

In [8]:
regressor = RandomForestRegressor(**param)
regressor.fit(X_train, y_train)
predictions_train = regressor.predict(X_train)
predictions_val = regressor.predict(X_val)

tripal_print(predictions_train, predictions_val)
error0(predictions_val, y_val, 8, 4)
long_pred(predictions_val)

KeyboardInterrupt: 

In [None]:
regressor = RandomForestRegressor(**param)
regressor.fit(X_train_val, y_train_val)

pred_test = regressor.predict(X_test)


In [16]:
y_val

6        182000.0
7        180000.0
10        39000.0
13       149700.0
16        38000.0
           ...   
21105     65500.0
21111     70900.0
21122     20600.0
21127     45000.0
21128     84900.0
Name: Price, Length: 4226, dtype: float64

In [29]:
name_model = 'random_forest'
submission_df_val = pd.DataFrame()
submission_df_val['ID'] = y_val.index
submission_df_val[f'pred_val_{name_model}'] = predictions_val
submission_df_val.to_csv(f'./result_after_aliona_pipline/val/prediction_val_{name_model}.csv',index=False)


submission_df_test = pd.DataFrame()
submission_df_test['ID'] = y_test.index
submission_df_test[f'pred_test_{name_model}'] = pred_test
submission_df_test.to_csv(f'./result_after_aliona_pipline/test/prediction_test_{name_model}.csv',index=False)
submission_df_test

Unnamed: 0,ID,pred_test_random_forest
0,18,142004.819
1,19,114413.061
2,21,184621.734
3,24,147719.365
4,28,100840.142
...,...,...
3158,21110,37996.990
3159,21117,24128.739
3160,21118,104410.129
3161,21124,48801.540
