In [39]:
import pandas as pd
from pycaret.regression import setup, compare_models, blend_models, finalize_model, plot_model, predict_model, add_metric, evaluate_model,tune_model
import random

# Run tag; used as the stem of the submission CSV file name (f'{VERSION}.csv').
VERSION = "merge_baseline_blend_id_v5"

In [40]:
def preprocess(df):
    """Return a copy of ``df`` with calendar features derived from ``timestamp``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``timestamp`` column that is already datetime-typed
        (the ``.dt`` accessor is used on it).

    Returns
    -------
    pandas.DataFrame
        A new frame with every column of ``df`` plus ``year`` and ``month``
        (added after the existing columns, or overwritten if already present).
        The frame passed in is left unchanged.
    """
    timestamps = df["timestamp"]
    # ``assign`` builds a new frame instead of writing into the caller's
    # object, so the input stays untouched and re-running a cell is safe.
    return df.assign(year=timestamps.dt.year, month=timestamps.dt.month)

In [41]:
# Main tables: parse "timestamp" while reading, then order the rows by it.
main_train_data = pd.read_csv(
    "HW_train_main_data.csv", parse_dates=["timestamp"]
).sort_values(by=["timestamp"])
main_test_data = pd.read_csv(
    "HW_test_main_data.csv", parse_dates=["timestamp"]
).sort_values(by=["timestamp"])

# Additional tables are read with pandas defaults (no date parsing, no sorting).
add_train_data, add_test_data = (
    pd.read_csv(csv_name)
    for csv_name in ("HW_train_additional_data.csv", "HW_test_additional_data.csv")
)

In [42]:
# Join each main table with its additional table on the shared "id" column
# (pandas' default join type), then pass the merged frame through preprocess().
main_train_data = preprocess(pd.merge(main_train_data, add_train_data, on="id"))
main_test_data = preprocess(pd.merge(main_test_data, add_test_data, on="id"))

In [43]:
# Initialise the PyCaret regression experiment on the merged training frame.
# session_id pins the random seed so that the train/hold-out split, the CV
# folds and the model fits are repeatable on "Restart & Run All". 1245 is the
# session id shown in this notebook's recorded setup output.
setup(data=main_train_data,
      target="price",
      fold=5,
      session_id=1245)

# Rank the candidate regressors by cross-validated RMSE and keep the top three.
best_models = compare_models(sort='RMSE', n_select=3)

# Combine the three selected models into a single blended estimator.
blender = blend_models(estimator_list=best_models)

# Tune the blender against RMSE, the same metric used to select its members
# (tune_model would otherwise optimise its default metric, R2).
tuned_blender = tune_model(blender, optimize='RMSE')

# Score the unseen test frame; predictions land in the "prediction_label" column.
pred = predict_model(tuned_blender, data=main_test_data)

Unnamed: 0,Description,Value
0,Session id,1245
1,Target,price
2,Target type,Regression
3,Original data shape,"(29000, 33)"
4,Transformed data shape,"(29000, 35)"
5,Transformed train set shape,"(20300, 35)"
6,Transformed test set shape,"(8700, 35)"
7,Numeric features,31
8,Date features,1
9,Rows with missing values,71.7%


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,1531756.0276,8394106580468.467,2891598.4595,0.6755,0.4763,0.4448,1.138
lightgbm,Light Gradient Boosting Machine,1586627.3135,9025853723681.574,2998820.3928,0.6513,0.4809,0.4532,0.074
xgboost,Extreme Gradient Boosting,1623730.025,9169738504601.6,3024452.2,0.6441,0.4842,0.4534,0.41
et,Extra Trees Regressor,1578644.2467,9244513852650.893,3034591.4975,0.6423,0.4788,0.4402,0.726
rf,Random Forest Regressor,1592975.2248,9269723161201.154,3038909.8224,0.6415,0.48,0.4469,1.452
gbr,Gradient Boosting Regressor,1676559.0935,9307995235283.994,3044823.8174,0.6406,0.4888,0.469,0.68
dt,Decision Tree Regressor,2268940.7494,18855268282878.31,4335397.8892,0.2657,0.6931,0.5252,0.062
ada,AdaBoost Regressor,3320378.4468,20074765493694.15,4436483.7193,0.2298,0.6717,0.8878,0.302
omp,Orthogonal Matching Pursuit,2969435.7738,25259989836878.63,5016854.741,0.026,0.6234,0.6727,0.022
dummy,Dummy Regressor,3068313.85,25915659439308.8,5082820.0,-0.0003,0.6319,0.6842,0.022


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1506821.9772,7665924655393.143,2768740.6262,0.6877,0.4546,0.4113
1,1536077.688,9947299981901.072,3153934.0484,0.6285,0.4876,0.4538
2,1595164.9073,9167811316003.906,3027839.3808,0.6801,0.4863,0.4758
3,1542836.0164,8506963829570.679,2916669.9898,0.7013,0.4671,0.4295
4,1512954.8085,7180829511072.636,2679706.9823,0.6594,0.4862,0.4545
Mean,1538771.0795,8493765858788.286,2909378.2055,0.6714,0.4764,0.445
Std,31277.0423,997636789926.1407,171126.6081,0.0254,0.0133,0.0223


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1501470.8321,7535019361011.674,2744998.9729,0.693,0.4547,0.4111
1,1525717.581,9896638745212.889,3145892.3607,0.6304,0.4863,0.4519
2,1578864.5311,8981613963365.734,2996934.0939,0.6866,0.4852,0.4722
3,1529542.6675,8348695098224.316,2889410.8566,0.7069,0.4669,0.4291
4,1498024.1893,6990225249410.435,2643903.4115,0.6685,0.4858,0.4546
Mean,1526723.9602,8350438483445.011,2884227.9391,0.6771,0.4758,0.4438
Std,28944.7556,1029971762973.0428,177954.1475,0.0264,0.0128,0.0213


Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [44]:
# Display the frame returned by predict_model (test rows plus "prediction_label").
pred

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,office_num,green_part,prom_part,cafe_count,church_facilities,mosque,leisure_facilities,year,month,prediction_label
0,63119,2011-10-08,34,19.0,8.0,,,,,,...,1,2.750000,0.04,0,0,0,0,2011,10,4.353046e+06
1,53004,2011-10-14,35,20.0,11.0,,,,,,...,24,8.030000,0.00,1,0,0,0,2011,10,5.418495e+06
2,78624,2011-10-23,44,28.0,2.0,,,,,,...,5,50.290001,0.00,1,0,0,1,2011,10,5.021054e+06
3,35365,2011-10-28,41,20.0,12.0,,,,,,...,5,17.290001,0.00,1,1,0,0,2011,10,6.212918e+06
4,25243,2011-10-30,77,45.0,11.0,,,,,,...,2,13.470000,13.22,3,0,0,2,2011,10,9.326027e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,22160,2015-07-05,31,1.0,4.0,17.0,1.0,2015.0,2.0,1.0,...,141,12.240000,0.00,71,15,0,0,2015,7,3.858371e+06
996,62292,2015-07-09,58,29.0,13.0,14.0,1.0,2001.0,2.0,11.0,...,0,2.630000,0.00,2,0,0,0,2015,7,1.065153e+07
997,94819,2015-07-13,48,1.0,8.0,1.0,1.0,2016.0,1.0,1.0,...,6,3.470000,56.84,2,1,0,0,2015,7,9.314626e+06
998,63348,2015-07-17,86,,12.0,17.0,1.0,,3.0,1.0,...,0,15.290000,2.83,1,0,0,0,2015,7,9.894186e+06


In [45]:
# Order the rows by id, expose the model output as "predicted_price", and
# move the old index into a regular column via reset_index().
pred = (
    pred
    .sort_values(by=["id"])
    .rename(columns={"prediction_label": "predicted_price"})
    .reset_index()
)

# Only the id / prediction pair is written to the submission file.
pred[['id', "predicted_price"]].to_csv(f'{VERSION}.csv', index=False)
