In [1]:
from tools.preprocess import *
from sklearn.neighbors import *
from sklearn.ensemble import *
from catboost import CatBoostRegressor
import catboost as cat
from xgboost import XGBRegressor, plot_importance
import xgboost as xgb
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold, cross_val_score
from tools.selector import *
from sklearn.metrics import r2_score
from clearml import Task
from rich.progress import *
from rich.console import Console
import time
# Let wide/long frames render in full (up to 400 rows and 400 columns).
pd.set_option("display.max_rows", 400)
pd.set_option("display.max_columns", 400)

In [2]:
import yaml
from yaml.loader import SafeLoader

# Parse the preprocessing configuration with PyYAML's safe loader
# (yaml.safe_load(stream) is shorthand for yaml.load(stream, Loader=SafeLoader)).
with open("preprocess.yaml", "r") as config_file:
    preprocessing_parameters = yaml.safe_load(config_file)

Output()

In [3]:
import random
import string

# Word pools for the randomly generated, human-readable run name.
letters = string.digits
buzzwords = ["khrba", "dar", "hmza", "mdrasa", "ard"]
morewords = ["kbira", "sghira", "zwina", "khayba", "jdida", "9dima"]

# One random draw per pool, joined as "<digit>-<buzzword>-<moreword>".
task_name = "-".join(
    [random.choice(letters), random.choice(buzzwords), random.choice(morewords)]
)

In [4]:
# NOTE(review): `Task` is imported again here although the first cell already
# imports it; the later `from rich.progress import *` may shadow clearml's
# `Task`, so confirm before removing this line.
from clearml import Task

# Console used by the training helpers to log their CV metrics.
console = Console()

# Create the ClearML task for this run, grab its logger, and attach the
# preprocessing configuration under its own section name.
task = Task.create(project_name="real-estate", task_name=task_name)
clearml_logger = task.get_logger()
task.connect(preprocessing_parameters, "preprocessing_parameters")


In [5]:
# Raw train/test tables, plus the test ids that are later joined to the
# predictions in the submission file.
X_train_0, Y_train_0, X_test_0, X_test_ids = load_data("data/tabular/", False)

# Parameter sets for the three boosters (XGBoost, LightGBM, CatBoost).
xgb_params, lgb_params, cat_params = load_hyperparameters()

# Apply the YAML-configured preprocessing to obtain the frames used for
# training and inference.
X_train_1, Y_train_1, X_test_1 = preprocess(
    X_train_0,
    Y_train_0,
    X_test_0,
    preprocessing_parameters,
)




In [5]:
#X_train_1.drop(inplace=True, columns="id_annonce")

## Training & eval

In [6]:

from sklearn.metrics import mean_absolute_error, mean_squared_error

# Booster parameter dictionaries (reloaded so this cell can be run on its own).
xgb_params, lgb_params, cat_params = load_hyperparameters()

# Cross-validation settings shared by the train_* helpers.
N_FOLD = 10
EARLY_STOP = 250  # only train_cat passes this on (early_stopping_rounds)

# Boosting rounds per library.
# XGB_ITERATIONS = 2000
# LGB_ITERATIONS = 2000
# CAT_ITERATIONS = 3000
XGB_ITERATIONS = 1000
LGB_ITERATIONS = 1000
CAT_ITERATIONS = 1000

# Attach every parameter set to the ClearML task under its own section.
for _section, _section_params in (
    ("xgb_params", xgb_params),
    ("lgb_params", lgb_params),
    ("cat_params", cat_params),
):
    task.connect(_section_params, name=_section)

# Record the iteration budgets as single-value scalars.
for _scalar_name, _scalar_value in (
    ("XGB_ITERATIONS", XGB_ITERATIONS),
    ("LGB_ITERATIONS", LGB_ITERATIONS),
    ("CAT_ITERATIONS", CAT_ITERATIONS),
):
    clearml_logger.report_single_value(_scalar_name, _scalar_value)

In [7]:
# Columns used as predictors by the train_* helpers (everything after the first two).
X_train_1.columns[2:].tolist()

['approximate_longitude',
 'postal_code',
 'size',
 'floor',
 'land_size',
 'energy_performance_value',
 'ghg_value',
 'nb_rooms',
 'nb_bedrooms',
 'nb_bathrooms',
 'nb_parking_places',
 'nb_boxes',
 'nb_photos',
 'has_a_balcony',
 'nb_terraces',
 'has_a_cellar',
 'has_a_garage',
 'has_air_conditioning',
 'last_floor',
 'upper_floors',
 'city',
 'radius',
 'angle',
 'rot_10_x',
 'rot_10_y',
 'rot_30_x',
 'rot_30_y',
 'rot_45_x',
 'rot_45_y',
 'rot_60_x',
 'rot_60_y',
 'rot_70_x',
 'rot_70_y',
 'rot_90_x',
 'rot_90_y',
 'rot_145_x',
 'rot_145_y',
 'rot_180_x',
 'rot_180_y',
 'geo_pca_x',
 'geo_pca_y',
 'num_train_station',
 'num_night_club',
 'num_gas_station',
 'user_ratings_total',
 'Has_pool',
 'Has_luxury',
 'Count_bathroom',
 'Count_bedroom',
 'Count_living',
 'Count_villa',
 'property_type_appartement',
 'property_type_atelier',
 'property_type_chalet',
 'property_type_chambre',
 'property_type_château',
 'property_type_divers',
 'property_type_duplex',
 'property_type_ferme',
 'p

In [8]:
def _fold_progress_bar():
    """Create the rich progress display shown while iterating over CV folds."""
    return Progress(
        TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
        BarColumn(),
        MofNCompleteColumn(),
        TextColumn("•"),
        TimeElapsedColumn(),
        TextColumn("•"),
        TimeRemainingColumn(),
    )


def _cross_validate(X_train, y_train, X_test, label, fit_predict):
    """Run shuffled K-fold CV and average the per-fold test predictions.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames. The first two columns are skipped; every remaining
        column of ``X_train`` is used as a predictor and must exist in
        ``X_test`` as well.
    y_train : pandas object exposing ``.values``
        Training targets, row-aligned with ``X_train``.
    label : str
        Short model name shown in the progress bar description.
    fit_predict : callable
        ``fit_predict(train_x, train_y, valid_x, valid_y, test_x)`` must train
        one model and return ``(valid_predictions, test_predictions)``.

    Returns
    -------
    tuple
        ``(final_predictions, R2, mae, mse, rmse)`` where ``final_predictions``
        is the mean of the fold models' test predictions and the metrics are
        computed on the out-of-fold predictions.
    """
    nfold = N_FOLD
    splitter = KFold(n_splits=nfold, shuffle=True, random_state=2019)

    # Take the predictor list from the frame that was passed in, not from a
    # notebook global, so the helpers work for any compatible frame.
    predictors = X_train.columns.values.tolist()[2:]
    train_features = X_train[predictors].values
    test_features = X_test[predictors].values
    targets = y_train.values

    oof = np.zeros(len(X_train))
    final_predictions = np.zeros(len(X_test))

    with _fold_progress_bar() as progress:
        # The bar total follows the real number of folds. A single pass over
        # the splitter guarantees each fold contributes exactly once to the
        # averaged test predictions, whatever N_FOLD is.
        fold_task = progress.add_task(f"[red]Training {label}", total=nfold)
        for train_index, valid_index in splitter.split(X_train, targets):
            valid_pred, test_pred = fit_predict(
                train_features[train_index],
                targets[train_index],
                train_features[valid_index],
                targets[valid_index],
                test_features,
            )
            oof[valid_index] = valid_pred
            final_predictions += test_pred / nfold
            progress.update(fold_task, advance=1)

    R2 = r2_score(targets, oof)
    mae = mean_absolute_error(targets, oof)
    mse = mean_squared_error(targets, oof)
    rmse = np.sqrt(mse)
    console.log(f"R2 = {R2:<0.4f} --- MAE={mae:<0.4f} ---MSE={mse:<0.4f} --- RMSE ={rmse:<0.4f} | CV ={N_FOLD}" )
    return final_predictions, R2, mae, mse, rmse


def train_xgb(X_train,y_train,X_test):
    """Cross-validate XGBoost (native API) and predict the test set.

    Uses the notebook-level ``xgb_params`` and ``XGB_ITERATIONS``.
    Returns ``(final_predictions, R2, mae, mse, rmse)``.
    """
    def _fit_predict(train_x, train_y, valid_x, valid_y, test_x):
        xg_train = xgb.DMatrix(train_x, train_y)
        xg_valid = xgb.DMatrix(valid_x, valid_y)
        booster = xgb.train(
            xgb_params,
            xg_train,
            XGB_ITERATIONS,
            evals=[(xg_train, "train"), (xg_valid, "eval")],
            verbose_eval=False,
        )
        return (
            booster.predict(xgb.DMatrix(valid_x)),
            booster.predict(xgb.DMatrix(test_x)),
        )

    return _cross_validate(X_train, y_train, X_test, "XGB", _fit_predict)


def train_lgb(X_train,y_train,X_test):
    """Cross-validate LightGBM (native API) and predict the test set.

    Uses the notebook-level ``lgb_params`` and ``LGB_ITERATIONS``.
    Returns ``(final_predictions, R2, mae, mse, rmse)``.

    NOTE(review): ``lgb`` is not imported explicitly in the visible cells;
    presumably it is provided by one of the star imports -- confirm.
    """
    def _fit_predict(train_x, train_y, valid_x, valid_y, test_x):
        d_train = lgb.Dataset(train_x, train_y)
        d_valid = lgb.Dataset(valid_x, valid_y)
        booster = lgb.train(
            lgb_params,
            d_train,
            num_boost_round=LGB_ITERATIONS,
            valid_sets=d_valid,
            callbacks=[lgb.log_evaluation(period=0)],
        )
        return booster.predict(valid_x), booster.predict(test_x)

    return _cross_validate(X_train, y_train, X_test, "LGB", _fit_predict)


def train_cat(X_train,y_train, X_test):
    """Cross-validate CatBoost and predict the test set.

    Uses the notebook-level ``cat_params``, ``CAT_ITERATIONS`` and
    ``EARLY_STOP`` (passed as ``early_stopping_rounds``).
    Returns ``(final_predictions, R2, mae, mse, rmse)``.
    """
    def _fit_predict(train_x, train_y, valid_x, valid_y, test_x):
        d_train = cat.Pool(train_x, train_y)
        d_valid = cat.Pool(valid_x, valid_y)
        model = cat.train(
            d_train,
            cat_params,
            num_boost_round=CAT_ITERATIONS,
            eval_set=d_valid,
            early_stopping_rounds=EARLY_STOP,
            verbose_eval=0,
        )
        return model.predict(valid_x), model.predict(test_x)

    return _cross_validate(X_train, y_train, X_test, "CAT", _fit_predict)

In [21]:
# train_xgb returns (test predictions, R2, MAE, MSE, RMSE). Unpack the tuple so
# that xgb_preds holds only the prediction array that is blended later.
xgb_preds, xgb_r2, xgb_mae, xgb_mse, xgb_rmse = train_xgb(X_train_1, Y_train_1, X_test_1)
# R2 = 0.8096 --- MAE=0.2454 ---MSE=0.1246 --- RMSE =0.3529 using GPU hist 10 fold
# R2 = 0.8113 --- MAE=0.2433 ---MSE=0.1234 --- RMSE =0.3513 CV25


Output()

In [9]:
# train_lgb returns (test predictions, R2, MAE, MSE, RMSE). Unpack the tuple so
# that lgb_preds holds only the prediction array that is blended later.
lgb_preds, lgb_r2, lgb_mae, lgb_mse, lgb_rmse = train_lgb(X_train_1, Y_train_1, X_test_1)
# R2 = 0.8252 --- MAE=0.2310 ---MSE=0.1144 --- RMSE =0.3382 CV10

Output()

In [None]:
# train_cat returns (test predictions, R2, MAE, MSE, RMSE). Unpack the tuple so
# that cat_preds holds only the prediction array that is blended later.
cat_preds, cat_r2, cat_mae, cat_mse, cat_rmse = train_cat(X_train_1, Y_train_1, X_test_1)
# R2 = 0.8188 --- MAE=0.2386 ---MSE=0.1185 --- RMSE =0.3443 CV 10 


Output()

## Inference


In [None]:
# Refit one scikit-learn-style regressor per library on the full training set.
# NOTE(review): these fits use every column of X_train_1, whereas the CV
# helpers above skip the first two columns -- confirm this is intended.
tuned_models = dict(
    xgboost=XGBRegressor(**xgb_params),
    lgb=LGBMRegressor(**lgb_params),
    cat=CatBoostRegressor(**cat_params, verbose=False),
)

for model_name in tuned_models:
    tuned_models[model_name].fit(X_train_1, Y_train_1)
    print(f"Finished Training {model_name}.")


In [None]:
# weights = [5/40,15/40,20/40] 22,840577560550404

# Blend weights for (xgboost, lightgbm, catboost); they must sum to 1.
weights = [5/20, 9/20, 6/20]
assert abs(sum(weights) - 1.0) < 1e-9, "blend weights must sum to 1"


def _prediction_array(cv_result):
    """Return the test-prediction array from a ``train_*`` result.

    The ``train_*`` helpers return ``(final_predictions, R2, mae, mse, rmse)``.
    Passing that tuple straight to ``np.exp`` does not work, so take its first
    element; a bare array is accepted unchanged.
    """
    if isinstance(cv_result, tuple):
        cv_result = cv_result[0]
    return np.asarray(cv_result)


# Weighted average of the exponentiated CV-averaged predictions.
# NOTE(review): np.exp presumably maps log-price predictions back to prices;
# confirm against the target transform applied in preprocessing.
final_predictions_sum = (
    weights[0] * np.exp(_prediction_array(xgb_preds)) +
    weights[1] * np.exp(_prediction_array(lgb_preds)) +
    weights[2] * np.exp(_prediction_array(cat_preds))
)

final_predictions = pd.Series(final_predictions_sum, name="price")

# pd.concat(axis=1) aligns on the index, so reset the ids to a positional index
# to pair each id with the prediction in the same row.
final_submission = pd.concat(
    [X_test_ids.reset_index(drop=True), final_predictions], axis=1
)
final_submission["id_annonce"] = final_submission["id_annonce"].astype(np.int32)
final_submission.to_csv("data/final_submission_167.csv", index=False, header=True)
print("Finished submitting")


Finished submitting
