In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import joblib

In [2]:
# ===== 1. Load data =====
df = pd.read_csv("data/HousePrice_processed.csv")

# Columns that stay in the DataFrame but are not used as training features
keep_cols = ["Title", "Description"]

# Target column
target_col = "Amount"

# Categorical features: every object-dtype column except the kept columns
# and the target.
categorical_cols = [
    col
    for col in df.select_dtypes(include=["object"]).columns.tolist()
    if col != target_col and col not in keep_cols
]

In [3]:
# ===== 2. Encode categorical features =====
# Each categorical column is cast to str and replaced in `df` by the integer
# codes of its own LabelEncoder; the fitted encoder is stored in `encoders`
# under the column name.
encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    text_values = df[col].astype(str)
    df[col] = le.fit_transform(text_values)
    encoders[col] = le

In [4]:
# ===== 3. Separate features / target =====
# The target is `target_col`; the features are every remaining column except
# the ones listed in `keep_cols`.
y = df[target_col]
X = df.drop(columns=[target_col, *keep_cols])

In [5]:
# ===== 4. Train/Test Split =====
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# ===== 5. Train and evaluate candidate regressors =====
# Every model is fitted on the training split and scored on the held-out
# test split with R-squared, MAE and RMSE.
models = {
    'Random Forest': RandomForestRegressor(random_state=42, n_jobs=-1, n_estimators=200, max_depth=None),
    # Note: this entry is scikit-learn's histogram-based gradient boosting.
    'Gradient Boosting': HistGradientBoostingRegressor(random_state=42),
    # `missing_values` is not an XGBoost parameter (XGBoost warned that it was
    # "not used"), so it is no longer passed. The actual keyword is `missing`,
    # whose default already treats NaN as the missing-value marker.
    'XGBoost': XGBRegressor(random_state=42, n_jobs=-1, n_estimators=200, max_depth=None)
}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluate the model on the held-out test split
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    # Square root of the MSE, i.e. RMSE (same units as the target)
    rmse = mean_squared_error(y_test, y_pred) ** 0.5

    print(f'{model_name}:')
    print(f'R-squared: {r2:.2f}')
    print(f'Mean Absolute Error (MAE): {mae:.2f}')
    # Label corrected from "(MSE)": the printed value is the root of the MSE.
    print(f'Root Mean Squared Error (RMSE): {rmse:.2f}')
    print('----------------------------------------')

Random Forest:
R-squared: 0.97
Mean Absolute Error (MAE): 310626.94
Root Mean Squared Error (MSE): 2393135.51
----------------------------------------
Gradient Boosting:
R-squared: 0.60
Mean Absolute Error (MAE): 2255247.70
Root Mean Squared Error (MSE): 8966204.81
----------------------------------------


Parameters: { "missing_values" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost:
R-squared: 0.97
Mean Absolute Error (MAE): 415563.37
Root Mean Squared Error (MSE): 2628112.72
----------------------------------------
