In [11]:
from tools.preprocess import *
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, StackingRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold, cross_val_score
from dataprep.eda import plot, plot_missing, plot_correlation, create_report


pd.options.display.max_rows=140
pd.options.display.max_columns=140

In [12]:
preprocessing_parameters = {
    'drop_columns':["exposition","energy_performance_category","ghg_category"],
    'frequency_encoding':['city'],
    'quantile_encoding':[],
    'knn_imputation':["size", "land_size","energy_performance_value","ghg_value", "nb_rooms","nb_bathrooms", "nb_bedrooms"],
    'feature_engineering':False,
    'add_geo':True,
    'geodata':["index","num_train_station","num_gas_station", "num_night_club", "num_transit_station"],
    'feature_transformation':False,
    'hot_encoding':True,
    'standard_scaling':False,
    'robust_scaling':True,
    'rower_scaling':False,
    'target_transformation':True,
}

In [13]:
X_train_0, Y_train_0, X_test_0, X_test_ids = load_data('data/')

In [14]:
X_train_1,Y_train_1, X_test_1 = preprocess(X_train_0, Y_train_0, X_test_0,preprocessing_parameters)

In [16]:
X_train_1.corrwith(Y_train_1).sort_values()

property_type_terrain             -0.292496
property_type_terrain à bâtir     -0.135557
property_type_parking             -0.105659
num_gas_station                   -0.069435
property_type_viager              -0.063466
property_type_divers              -0.060409
ghg_value                         -0.039270
energy_performance_value          -0.037223
property_type_chambre             -0.011059
upper_floors                      -0.004130
property_type_ferme               -0.001397
property_type_maison              -0.001264
property_type_chalet               0.002726
property_type_péniche              0.003347
property_type_hôtel                0.004316
num_transit_station                0.005032
property_type_gîte                 0.006035
property_type_atelier              0.015708
property_type_moulin               0.015860
property_type_hôtel particulier    0.016022
property_type_loft                 0.019508
num_train_station                  0.021388
property_type_duplex            

In [7]:
#baseline model
baseline_model=XGBRegressor()
baseline_model.fit(X_train_1, Y_train_1)
kf = KFold(n_splits=10)
baseline_result = cross_val_score(baseline_model, X_train_1, Y_train_1, scoring="neg_mean_absolute_error", cv=kf)
mean_baseline_result = -np.mean(baseline_result)
mean_baseline_result
"""
0.2868198047384043 standard_scaling
"""

'\n0.2868198047384043 standard_scaling\n'

In [11]:
baseline_predictions= np.exp(baseline_model.predict(X_test_1))

final_predictions=pd.Series(baseline_predictions,name="price")
final_predictions.head()
final_submission = pd.concat([X_test_ids, final_predictions], axis=1)
final_submission.to_csv("data/final_submission_57.csv", index=False, header=True)
print("Finished submitting")

Finished submitting


In [19]:
xgb_params={'tree_method': 'auto',
 'learning_rate': 0.22502761577232197,
 'max_depth': 7,
 'min_child_weight': 2.8420515790248526,
 'subsample': 0.975,
 'colsample_bytree': 0.75,
 'colsample_bylevel': 0.925,
 'gamma': 0.001531824642793309,
 'reg_lambda': 0.010527402680638263,
 'reg_alpha': 1.2647545029756599e-08}

etr_params={
}

cat_params={
    'learning_rate': 0.17447854966066342,
    'objective': 'RMSE',
    'colsample_bylevel': 0.09999300796304286,
    'depth': 12,
    'boosting_type': 'Plain',
    'bootstrap_type': 'Bernoulli',
    'subsample': 0.9709524520446949
}

In [18]:
tuned_models = {
    "xgboost":XGBRegressor(**xgb_params),
    "etr":ExtraTreesRegressor(),
    "cat":CatBoostRegressor(**cat_params, verbose=False),
}

for model_name, model in tuned_models.items():
    model.fit(X_train_1, Y_train_1)
    print(f"Finished Training {model_name}.")


Finished Training xgboost.
Finished Training etr.
Finished Training cat.


In [20]:
final_predictions_sum= (
    6/20*np.exp(tuned_models["xgboost"].predict(X_test_1)) +
    4/20*np.exp(tuned_models["etr"].predict(X_test_1)) + 
    10/20*np.exp(tuned_models["cat"].predict(X_test_1)) 
    )

final_predictions=pd.Series(final_predictions_sum,name="price")
final_predictions.head()
final_submission = pd.concat([X_test_ids, final_predictions], axis=1)
final_submission.to_csv("data/final_submission_80.csv", index=False, header=True)
print("Finished submitting")

Finished submitting
