In [None]:
import pandas as pd
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV

In [None]:
# Never truncate the column list when a DataFrame is rendered.
pd.options.display.max_columns = None

In [None]:
# Load the preprocessed training split: one CSV for the features, one for the target.
X_train = pd.read_csv(filepath_or_buffer="split_data/train_features_preprocessed.csv")
y_train = pd.read_csv(filepath_or_buffer="split_data/train_target_preprocessed.csv")

In [None]:
# Preview the first ten feature rows to sanity-check the load.
display(X_train.head(n=10))

In [None]:
# Preview the first ten target rows (rendered as the cell's output).
y_train.head(n=10)

In [None]:
# Select the modelling features: every column whose name starts with one of the
# two prefixes below, plus an explicit list of individually named columns.
cols = [
    name
    for name in X_train.columns
    if name.startswith(('subtype_regrouped_', 'province_'))
    or name in (
        'area_imputed',
        'bedrooms',
        'median_price_advertiser_bin',
        'advertiser_count',
        'energy_value_imputed',
        'MS_P_50_median_imputed',
        'new_building',
        'foto_amount',
    )
]

# Narrow the training frame to those columns and show the result.
X_train = X_train[cols]
display(X_train)

In [None]:
# Wrap features and target in XGBoost's native matrix type for xgb.cv / xgb.train.
# NOTE(review): y_train is a DataFrame read straight from CSV; this assumes it
# holds exactly one target column -- confirm against the preprocessing step.
dtrain = xgb.DMatrix(data=X_train, label=y_train)

In [None]:
# --- Baseline parameters for the native XGBoost API ---
# NOTE(review): `params` is assigned again in a later cell before any training
# call in this notebook reads it, so these values look like an early draft.
params = {
    # Learning task
    'objective': 'reg:squarederror',  # squared-error regression
    'eval_metric': 'rmse',            # report root mean squared error
    # Tree complexity and step size
    'max_depth': 10,                  # deeper trees fit more but overfit more easily
    'learning_rate': 0.01,            # small shrinkage applied to every boosting round
    # Row / column subsampling
    'subsample': 0.8,                 # each tree is grown on 80% of the rows
    'colsample_bytree': 0.8,          # each tree is grown on 80% of the columns
    # Regularization
    'lambda': 1,                      # L2 penalty
    'alpha': 0,                       # L1 penalty (switched off)
    # Runtime
    'n_jobs': -1,                     # use all available cores
}


In [None]:
# Base estimator for the search: squared-error objective, MAE as eval metric.
xgb_model = XGBRegressor(eval_metric='mae', objective='reg:squarederror')

# Hyperparameter grid; every combination is evaluated (3^5 = 243 candidates).
# n_estimators is deliberately not searched here, so the estimator's default applies.
param_grid = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.05, 0.1],
    subsample=[0.7, 0.8, 1.0],
    colsample_bytree=[0.7, 0.8, 1.0],
    min_child_weight=[1, 3, 5],
)

# Exhaustive 5-fold search scored by (negated) mean absolute error.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
    verbose=1,
)
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)

In [None]:
# --- Parameters used for cross-validation and the final model (native API) ---
# NOTE(review): the tree/sampling values all appear in the grid searched above;
# presumably they were copied from its printed best parameters -- confirm.
params = {
    # Learning task
    'objective': 'reg:squarederror',  # squared-error regression
    'eval_metric': 'mae',             # report mean absolute error
    # Tree complexity and step size
    'max_depth': 7,                   # deeper trees fit more but overfit more easily
    'learning_rate': 0.1,             # shrinkage applied to every boosting round
    # Row / column subsampling
    'subsample': 0.7,                 # each tree is grown on 70% of the rows
    'colsample_bytree': 0.7,          # each tree is grown on 70% of the columns
    'min_child_weight': 1,            # minimum summed instance weight required in a child
    # Regularization
    'lambda': 1,                      # L2 penalty
    'alpha': 0,                       # L1 penalty (switched off)
    # Runtime
    'n_jobs': -1,                     # use all available cores
}


In [None]:
# --- 5-fold cross-validation to find how many boosting rounds are useful ---
cv_results = xgb.cv(
    params,
    dtrain,
    nfold=5,                   # number of CV folds
    stratified=False,          # plain (non-stratified) folds; this is also the default
    num_boost_round=1000,      # upper bound on boosting rounds
    early_stopping_rounds=50,  # stop once the test metric has not improved for 50 rounds
    verbose_eval=50,           # log progress every 50 rounds
)

In [None]:
# --- Get the best number of boosting rounds from cross-validation ---
# `cv_results` holds one row per boosting round under a 0-based index, so
# idxmin() gives the *position* of the lowest mean test MAE. `num_boost_round`
# is a *count* of rounds, hence the +1: without it the final model would be one
# tree short of the CV optimum (and would have zero trees if round 0 were best).
# int(...) turns the pandas/numpy index label into a plain Python integer.
best_num_boost_round = int(cv_results['test-mae-mean'].idxmin()) + 1

In [None]:
# --- Fit the final booster on the full training matrix ---
# The round count comes from the cross-validation step above.
model = xgb.train(
    params,
    dtrain,
    num_boost_round=best_num_boost_round,
)

In [None]:
# --- Optional: feature importance by split count ---
import matplotlib.pyplot as plt

# 'weight' ranks features by how often they are used to split; show the top 10.
xgb.plot_importance(model, max_num_features=10, importance_type='weight')
plt.show()

In [None]:
# Feature importance by 'gain' (average improvement from splits on the feature),
# limited to the top 20 features.
xgb.plot_importance(model, max_num_features=20, importance_type='gain')
plt.show()