In [3]:
from tools.preprocess import *
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, StackingRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor, plot_importance
import xgboost as xgb
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold, cross_val_score
from dataprep.eda import plot, plot_missing, plot_correlation, create_report 
from tools.selector import *
import shap
pd.options.display.max_rows=400
pd.options.display.max_columns=400
import optuna 
from optuna.pruners import MedianPruner, PatientPruner
from optuna.visualization import plot_optimization_history, plot_param_importances

In [4]:
# Switches consumed by `preprocess` (from tools.preprocess). Keys are grouped
# by theme below; insertion order is the same as before.
preprocessing_parameters = dict(
    # --- column removal and categorical encodings ---
    drop_columns=["energy_performance_category", "ghg_category"],
    frequency_encoding=["city"],
    quantile_encoding=[],
    label_encoding=[],
    target_encoding=[],
    # --- missing-value handling ---
    constant_imputation_floor=False,
    constant_land_size=False,
    constant_energy_performance_value=True,
    constant_ghg_value=True,
    constant_imputation_bedrooms=False,
    constant_imputation_exposition=False,
    iter_imputation=False,
    mean_imputation=False,
    mini_imputation=False,
    # --- geographic features ---
    add_polar_coordinates=True,
    add_polar_rotation=True,
    add_geo_pca=True,
    add_geopopulation=False,
    add_geopopulation_2=False,
    add_distance_to_city_center=False,  # only True when add_geopopulation is True
    add_geo=True,
    geodata=[
        "index",
        "num_train_station",
        "num_night_club",
        "num_gas_station",
        "user_ratings_total",
    ],
    # --- image-derived features ---
    add_classification_quality=True,
    classification_threshold=0,
    images_features=[],
    # --- feature engineering, scaling and target handling ---
    feature_transformation=False,
    features_interactions=False,
    hot_encoding=True,
    standard_scaling=False,
    robust_scaling=True,
    power_scaling=False,
    target_transformation=True,
)



In [5]:
# Read the raw inputs from the local data directory. The four returned values
# are presumably train features, train target, test features and test ids
# (inferred from the names — confirm against tools.preprocess.load_data).
(X_train_0, Y_train_0, X_test_0, X_test_ids) = load_data('data/')

# Hyperparameter sets for the three boosted models, unpacked in the order
# XGBoost, LightGBM, CatBoost.
(xgb_params, lgb_params, cat_params) = load_hyperparameters()

In [6]:
# Run the preprocessing step on the raw frames using the switches defined in
# `preprocessing_parameters`; the `_1` suffix marks the preprocessed stage.
X_train_1, Y_train_1, X_test_1 = preprocess(
    X_train_0,
    Y_train_0,
    X_test_0,
    preprocessing_parameters,
)

In [7]:
# Correlation of every preprocessed feature with the target, sorted ascending
# (most negative first). Left as the last expression so the cell displays it.
(
    X_train_1
    .corrwith(Y_train_1)
    .sort_values()
)

property_type_terrain             -0.292496
property_type_terrain à bâtir     -0.135557
property_type_parking             -0.105659
image_quality_sum_Bedroom         -0.104044
image_quality_count_Bedroom       -0.088309
num_night_club                    -0.074100
property_type_viager              -0.063466
image_quality_mean_Backyard       -0.061076
property_type_divers              -0.060409
image_quality_mean_Bedroom        -0.056274
image_quality_sum_Backyard        -0.052199
image_quality_count_Backyard      -0.037178
ghg_value                         -0.033964
num_train_station                 -0.021388
exposition_Nord-Ouest             -0.019726
image_quality_mean_Bathroom       -0.017071
energy_performance_value          -0.012092
property_type_chambre             -0.011059
image_quality_mean_Frontyard      -0.006045
upper_floors                      -0.004130
property_type_ferme               -0.001397
property_type_maison              -0.001264
exposition_Nord                 

In [8]:
# LGB Optimized
# Baseline: tuned LightGBM scored with 10-fold cross-validated MAE.
# The string below is the experiment log of scores recorded for earlier
# imputation settings.

"""
0.24475231717752646 Default (No imputation)
0.24499656814827206 Exposition
0.24540434465290145 Floor

"""
baseline_model = LGBMRegressor(**lgb_params)
# NOTE(review): cross_val_score fits clones of the estimator, so this fit does
# not influence the CV score; it only leaves `baseline_model` fitted on the
# full training set.
baseline_model.fit(X_train_1, Y_train_1)

# Unshuffled, contiguous 10-fold split.
kf = KFold(n_splits=10)
baseline_result = cross_val_score(
    baseline_model,
    X_train_1,
    Y_train_1,
    cv=kf,
    scoring="neg_mean_absolute_error",
)

# The scorer returns negated MAE, so flip the sign to report a positive error.
mean_baseline_result = -np.mean(baseline_result)
mean_baseline_result



0.25045640208470465

## Inference


In [9]:
# One tuned regressor per boosting library, keyed by a short name that the
# blending cell uses to look the models up.
tuned_models = dict(
    xgboost=XGBRegressor(**xgb_params),
    lgb=LGBMRegressor(**lgb_params),
    cat=CatBoostRegressor(verbose=False, **cat_params),
)

# Fit every model on the full preprocessed training set, reporting progress.
for model_name, model in tuned_models.items():
    model.fit(X_train_1, Y_train_1)
    print(f"Finished Training {model_name}.")


Finished Training xgboost.
Finished Training lgb.
Finished Training cat.


In [10]:
# Blend the three tuned models and write the submission file.
#
# Blend ratios for (xgboost, lgb, cat). They were previously divided by a
# hard-coded 40 although they total 38, so the weights summed to 0.95 and every
# blended price was scaled down by 5%. Normalising by the actual total makes
# the blend a true weighted average.
# NOTE(review): if the 5% shrinkage was intentional, apply it as an explicit,
# named factor rather than through the weights.
raw_weights = [4, 14, 20]
weights = [w / sum(raw_weights) for w in raw_weights]
blend_order = ("xgboost", "lgb", "cat")

# Each model's prediction is exponentiated before blending, so the average is
# taken on the exponentiated scale.
# NOTE(review): np.exp presumably inverts a log transform applied to the target
# by `preprocess` ('target_transformation': True) — confirm it is log and not
# log1p.
final_predictions_sum = sum(
    weight * np.exp(tuned_models[name].predict(X_test_1))
    for weight, name in zip(weights, blend_order)
)

# pd.concat(axis=1) aligns on index labels. Reusing the ids' index keeps each
# price on the same row as its id even when X_test_ids does not carry a default
# 0..n-1 index, and raises immediately if the lengths disagree.
final_predictions = pd.Series(
    final_predictions_sum,
    index=X_test_ids.index,
    name="price",
)
final_submission = pd.concat([X_test_ids, final_predictions], axis=1)
final_submission.to_csv("data/final_submission_145.csv", index=False, header=True)
print("Finished submitting")


Finished submitting
