In [1]:
# base imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Important for the pipeline setup: ask scikit-learn transformers to return
# pandas DataFrames from transform/fit_transform.
# NOTE(review): the pipeline built in this notebook contains only a model
# step, so this setting appears to have no visible effect here — confirm.
import sklearn
sklearn.set_config(transform_output="pandas")

# Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline

# for model learning
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score

# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
import category_encoders as ce

# Models
from sklearn.ensemble import RandomForestRegressor

# Metrics
from sklearn.metrics import mean_squared_log_error, mean_squared_error, mean_absolute_error, r2_score

# notebook settings: raise pandas display limits to 100 columns / 100 rows
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [2]:
# Load the training table (the file name suggests it was preprocessed
# upstream — TODO confirm where and how).
df = pd.read_csv('../train_prepared_1st.csv')

In [5]:
# Inspect the loaded frame (a bare last expression is rendered as cell output).
df

Unnamed: 0,MSZoning,Exterior1st,Exterior2nd,KitchenQual,Functional,SaleType,BldgType,HouseStyle,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Heating,HeatingQC,FireplaceQu,GarageType,GarageQual,GarageCond,Fence,SaleCondition,Utilities,Electrical,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,RoofStyle,RoofMatl,MasVnrType,GarageFinish,PavedDrive,PoolQC,MiscFeature,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,BsmtFullBath,BsmtHalfBath,GarageCars,GarageArea,MasVnrArea,LotFrontage,GarageYrBlt,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,Street_Grvl,Street_Pave,Alley_Absence,Alley_Grvl,Alley_Pave,CentralAir_N,CentralAir_Y,Id,SalePrice
0,0.393191,1.097101,1.129034,0.579156,0.249890,-0.268332,0.339821,1.325041,0.933597,0.302765,1.097914,0.381578,0.164226,-0.508241,1.492421,0.293401,0.110108,0.963091,-0.955493,0.540480,0.32207,0.298704,0.450863,-0.214778,0.018512,0.300452,-0.738356,-0.075338,-0.335787,-0.222372,0.334246,0.300895,0.028316,-0.491536,-0.072556,0.713848,0.482011,0.317812,-0.042156,0.186455,0.580907,-0.293130,-0.934863,-0.444328,1.087023,-0.249895,0.306528,0.348900,0.525202,-0.191815,0.998954,0.067331,-0.217879,0.646183,-0.507284,1.046258,0.896833,-0.773861,1.207379,-0.101197,0.413547,0.781366,1.232599,0.169927,-0.207698,0.986849,-0.924311,-0.740760,0.200006,-0.359601,-0.103331,-0.285935,-0.06315,-0.089592,-1.552184,0.157646,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1,208500.0
1,0.393191,-1.095788,-1.085498,-0.766638,0.249890,-0.268332,0.339821,-0.237108,-0.694228,0.302765,-0.803606,0.381578,0.164226,2.516713,-0.549169,0.293401,0.110108,0.963091,0.678737,0.540480,0.32207,0.298704,0.450863,-0.214778,0.018512,0.300452,-0.738356,-0.075338,-0.228952,-0.222372,0.328248,-3.031345,0.028316,-0.491536,-0.072556,-0.705908,0.482011,0.317812,-0.042156,0.186455,1.178112,-0.293130,-0.629896,0.477111,-0.819679,3.822419,0.306528,-0.059792,-0.572250,0.511940,-0.086940,-0.873616,-0.072044,-0.063185,2.188279,0.154764,-0.395604,0.261075,-0.785025,-0.101197,-0.471891,0.781366,-0.756321,0.169927,-0.207698,-0.287758,0.623632,1.614879,-0.702843,-0.359601,-0.103331,-0.285935,-0.06315,-0.089592,-0.446925,-0.602962,0.0,1.0,1.0,0.0,0.0,0.0,1.0,2,181500.0
2,0.393191,1.097101,1.129034,0.579156,0.249890,-0.268332,0.339821,1.325041,0.933597,0.302765,1.097914,0.381578,0.164226,0.383646,1.492421,0.293401,0.110108,0.963091,0.678737,0.540480,0.32207,0.298704,0.450863,-0.214778,0.018512,0.300452,1.213697,-0.075338,-0.335787,-0.222372,0.334246,0.300895,0.028316,-0.491536,-0.072556,0.713848,0.482011,0.317812,-0.042156,0.186455,0.097873,-0.293130,-0.288516,-0.299076,1.087023,-0.249895,0.306528,0.627553,0.334828,-0.051064,0.918517,0.067331,0.137197,0.646183,-0.507284,0.980221,0.848965,-0.610718,1.235375,-0.101197,0.563755,0.781366,1.232599,0.169927,-0.207698,-0.287758,0.623632,-0.740760,-0.081209,-0.359601,-0.103331,-0.285935,-0.06315,-0.089592,1.026753,0.157646,0.0,1.0,1.0,0.0,0.0,0.0,1.0,3,223500.0
3,0.393191,-1.081485,-0.595336,0.579156,0.249890,-0.268332,0.339821,1.325041,-0.694228,0.302765,-1.245160,-0.751066,1.963481,-0.508241,-0.549169,0.293401,0.110108,-0.703473,1.202261,-1.211765,0.32207,0.298704,0.450863,-1.222534,0.018512,0.300452,1.213697,-0.075338,0.086770,-0.222372,0.545860,0.300895,0.028316,-0.491536,-0.072556,-0.705908,-0.876383,0.317812,-0.042156,0.186455,-0.494941,-0.293130,-0.047275,-0.671283,1.087023,-0.249895,1.619961,0.785457,-0.572250,-0.426400,0.797862,0.302568,-0.078385,0.646183,-0.507284,-1.859351,-0.682812,-0.506205,0.978742,-0.101197,0.427382,-1.027363,-0.756321,0.169927,-0.207698,0.349546,0.623632,-0.740760,-0.184815,3.874967,-0.103331,-0.285935,-0.06315,-0.089592,-1.552184,-1.363569,0.0,1.0,1.0,0.0,0.0,0.0,1.0,4,140000.0
4,0.393191,1.097101,1.129034,0.579156,0.249890,-0.268332,0.339821,1.325041,0.933597,0.302765,1.097914,0.381578,0.164226,0.839008,1.492421,0.293401,0.110108,0.963091,0.678737,0.540480,0.32207,0.298704,0.450863,-0.214778,0.018512,0.300452,1.213697,-0.075338,-0.228952,-0.222372,2.575031,0.300895,0.028316,-0.491536,-0.072556,0.713848,0.482011,0.317812,-0.042156,0.186455,0.468931,-0.293130,-0.161068,0.211573,1.087023,-0.249895,1.619961,1.686437,1.387486,0.699608,0.878299,0.067331,0.518903,1.355551,-0.507284,0.947203,0.753229,-0.037170,1.671651,-0.101197,1.378042,0.781366,1.232599,1.385655,-0.207698,1.624153,0.623632,0.776967,0.540424,-0.359601,-0.103331,-0.285935,-0.06315,-0.089592,2.132012,0.157646,0.0,1.0,1.0,0.0,0.0,0.0,1.0,5,250000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0.393191,1.097101,1.129034,-0.766638,0.249890,-0.268332,0.339821,1.325041,-0.694228,0.302765,1.097914,0.381578,0.164226,-0.508241,-0.297636,0.293401,0.110108,0.963091,0.678737,0.540480,0.32207,0.298704,0.450863,-0.214778,0.018512,0.300452,-0.738356,-0.075338,-0.335787,-0.222372,0.238583,0.300895,0.028316,-0.491536,-0.072556,-0.705908,0.482011,0.317812,-0.042156,0.186455,-0.969192,-0.293130,0.892658,-0.224181,-0.819679,-0.249895,0.306528,-0.059792,-0.572250,-0.332566,0.838081,0.067331,-0.285470,-0.063185,-0.507284,0.914184,0.753229,-0.526598,0.834095,-0.101197,0.289033,0.781366,1.232599,0.169927,-0.207698,0.349546,0.623632,-0.740760,-0.110811,-0.359601,-0.103331,-0.285935,-0.06315,-0.089592,0.658334,-0.602962,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1456,175000.0
1456,0.393191,-0.191467,-0.458128,-0.766638,-4.102904,-0.268332,0.339821,-0.237108,-0.694228,0.302765,-0.803606,0.381578,0.164226,-0.508241,-0.549169,-1.308805,0.110108,-1.119600,0.678737,0.540480,0.32207,0.298704,-2.214307,-0.214778,0.018512,0.300452,-0.738356,-0.075338,-0.335787,-0.222372,0.167685,0.300895,0.028316,-0.491536,-0.072556,2.524919,-0.876383,0.317812,-0.042156,0.186455,0.765338,0.670525,0.064243,1.112586,1.087023,-0.249895,0.306528,0.125977,0.094060,0.746525,-0.006503,-0.873616,0.381311,-0.063185,0.391237,0.220801,0.178812,2.328397,-0.785025,-0.101197,1.130989,0.781366,-0.756321,0.169927,-0.207698,0.349546,2.171575,2.018026,-0.702843,-0.359601,-0.103331,-0.285935,-0.06315,-0.089592,-1.552184,1.678860,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1457,210000.0
1457,0.393191,1.681209,1.635368,0.579156,0.249890,-0.268332,0.339821,1.325041,3.328475,-1.288859,-0.093776,-0.751066,1.963481,-0.508241,1.492421,0.293401,0.110108,0.963091,1.202261,0.540480,0.32207,0.298704,-0.141221,-0.214778,0.018512,0.300452,-0.738356,-0.075338,-0.335787,-0.222372,0.545860,0.300895,0.028316,-0.491536,-0.072556,-0.705908,0.482011,0.317812,-0.042156,-5.424125,-0.365400,-0.293130,0.719693,0.227460,-0.819679,-0.249895,-1.006906,-1.025792,-0.572250,-0.144898,-1.494580,0.302568,-0.142806,0.646183,3.086800,-1.000876,1.040437,0.072441,1.902620,-0.101197,1.658694,0.781366,-0.756321,1.385655,-0.207698,1.624153,2.171575,-0.740760,0.185205,-0.359601,-0.103331,-0.285935,-0.06315,4.317209,-0.446925,1.678860,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1458,266500.0
1458,0.393191,-1.095788,-1.085498,0.579156,0.249890,-0.268332,0.339821,-0.237108,-0.694228,0.302765,-0.803606,-0.751066,0.164226,0.383646,1.492421,-1.308805,0.110108,-0.703473,-0.955493,0.540480,0.32207,0.298704,0.450863,-0.214778,0.018512,-3.461904,-0.738356,-0.075338,-0.335787,-0.222372,-0.635035,0.300895,0.028316,2.066829,-0.072556,-0.705908,-0.876383,0.317812,-0.042156,0.186455,-0.861608,5.790313,-1.276243,0.059513,1.087023,-0.249895,-1.006906,-1.081523,-0.572250,-0.051064,-1.132615,-0.873616,-0.057207,-0.772552,0.391237,-0.703711,0.561757,-0.207960,-0.785025,-0.101197,-0.835553,-1.027363,-0.756321,-1.045801,-0.207698,-0.925062,-0.924311,2.152408,-0.702843,1.384045,-0.103331,-0.285935,-0.06315,-0.089592,-0.815344,1.678860,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1459,142125.0


In [6]:
# Features are every column except the target.
# NOTE(review): only 'SalePrice' is dropped, so the 'Id' column shown in the
# frame above stays in X and is used as a model feature — confirm whether
# that is intended.
X = df.drop(columns='SalePrice')

# The model is trained on the natural log of the sale price.
log_y = np.log(df['SalePrice'])

# Hold out 20% of the rows for validation; the fixed seed makes the split
# reproducible.
X_train, X_valid, log_y_train, log_y_valid = train_test_split(
    X, log_y, test_size=0.2, random_state=52
)

In [7]:
# Sanity check: feature-matrix and target shapes of the training split.
X_train.shape, log_y_train.shape

((1168, 84), (1168,))

In [8]:
# Sanity check: feature-matrix and target shapes of the validation split.
X_valid.shape, log_y_valid.shape

((292, 84), (292,))

In [9]:
# Random forest regressor with a fixed seed so repeated fits are reproducible.
random_forest = RandomForestRegressor(random_state=52)

# Single-step pipeline: the estimator is registered under the name 'model'.
ml_pipeline_rf = Pipeline(steps=[('model', random_forest)])

# Render the pipeline as the cell output.
ml_pipeline_rf

In [10]:
# Fit the pipeline on the training split; the target is log(SalePrice).
ml_pipeline_rf.fit(X_train, log_y_train)

In [13]:
# Predictions in log space: the pipeline was fitted on log(SalePrice).
y_train_pred_log = ml_pipeline_rf.predict(X_train)
y_valid_pred_log = ml_pipeline_rf.predict(X_valid)

# Back-transform targets and predictions to the original price scale.
y_train_pred = np.exp(y_train_pred_log)
y_valid_pred = np.exp(y_valid_pred_log)
y_valid = np.exp(log_y_valid)
y_train = np.exp(log_y_train)


def _train_valid_scores(metric):
    """Evaluate ``metric(y_true, y_pred)`` on both splits.

    Uses the back-transformed (price-scale) targets and predictions defined
    above and returns a ``(train_score, valid_score)`` tuple.
    """
    return metric(y_train, y_train_pred), metric(y_valid, y_valid_pred)


def _print_scores(label, train_score, valid_score, separator=True):
    """Print one train/validation score pair, optionally followed by a divider."""
    print(f'Train {label}:', train_score)
    print(f'Validation {label}:', valid_score)
    if separator:
        print("\n" + "="*30 + "\n")  # divider for readability


# Mean squared log error and its root (RMSLE).
# No negativity guard is needed: every array passed in is the result of
# np.exp, which never returns a negative value, so the former
# "negative values" branch could never run.
train_msle, valid_msle = _train_valid_scores(mean_squared_log_error)
_print_scores('MSLE', train_msle, valid_msle)
_print_scores('RMSLE', np.sqrt(train_msle), np.sqrt(valid_msle))

# Additional metrics for a fuller picture of model quality.

# Mean squared error (MSE) and its root (RMSE).
train_mse, valid_mse = _train_valid_scores(mean_squared_error)
_print_scores('MSE', train_mse, valid_mse)
_print_scores('RMSE', np.sqrt(train_mse), np.sqrt(valid_mse))

# Mean absolute error (MAE).
train_mae, valid_mae = _train_valid_scores(mean_absolute_error)
_print_scores('MAE', train_mae, valid_mae)

# Coefficient of determination (R^2); last block, so no trailing divider.
train_r2, valid_r2 = _train_valid_scores(r2_score)
_print_scores('R^2', train_r2, valid_r2, separator=False)

Train MSLE: 0.0026695486772365675
Validation MSLE: 0.026836090652662674


Train RMSLE: 0.051667675361260136
Validation RMSLE: 0.16381724772643041


Train MSE: 121163440.37772581
Validation MSE: 1027248083.9142243


Train RMSE: 11007.426601060113
Validation RMSE: 32050.71112961808


Train MAE: 6408.331588095298
Validation MAE: 19381.737313214002


Train R^2: 0.9794424412327709
Validation R^2: 0.8703769584410196


In [14]:
# Load the sample submission; it is reused below as the frame that receives
# the model's predictions.
answer = pd.read_csv('../sample_submission.csv')

In [15]:
# Inspect the sample submission as loaded.
answer

Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.683570
3,1464,179317.477511
4,1465,150730.079977
...,...,...
1454,2915,167081.220949
1455,2916,164788.778231
1456,2917,219222.423400
1457,2918,184924.279659


In [16]:
# Load the test features that the fitted pipeline will score
# (the file name suggests it was preprocessed upstream — TODO confirm).
test_df = pd.read_csv('../test_prepared_1st.csv')

In [18]:
# Predictions in log space (the pipeline was fitted on a log-transformed target)
y_test_pred_log = ml_pipeline_rf.predict(test_df)

# Back to the original price scale
y_test_pred = np.exp(y_test_pred_log)

In [19]:
# Overwrite the SalePrice column of the sample submission with the model's
# predictions. NOTE(review): this assumes the rows of test_df are in the same
# order as the Ids in `answer` — TODO confirm.
answer['SalePrice'] = y_test_pred

In [24]:
# Inspect the submission frame after SalePrice has been replaced.
answer

Unnamed: 0,Id,SalePrice
0,1461,127797.196845
1,1462,152804.170675
2,1463,188133.564151
3,1464,185044.338157
4,1465,193344.338595
...,...,...
1454,2915,86234.311572
1455,2916,87011.201847
1456,2917,148564.421429
1457,2918,111150.643564


In [21]:
# Write the submission; index=False keeps the DataFrame index out of the CSV.
answer.to_csv('../submission.csv', index=False)

In [22]:
# Read the written file back to verify its contents.
check = pd.read_csv('../submission.csv')

In [23]:
# Inspect the reloaded submission file.
check

Unnamed: 0,Id,SalePrice
0,1461,127797.196845
1,1462,152804.170675
2,1463,188133.564151
3,1464,185044.338157
4,1465,193344.338595
...,...,...
1454,2915,86234.311572
1455,2916,87011.201847
1456,2917,148564.421429
1457,2918,111150.643564
