In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
import pickle

# 1. Load Dataset
dataset_path = 'daegu_cleaned.csv'
df = pd.read_csv(dataset_path)

# Feature Engineering: Creating price per square meter
df['price_per_sqm'] = df['SalePrice'] / df['Size(sqf)']

# Define columns
numerical_features = [
    'Size(sqf)', 'YearBuilt', 'N_Parkinglot(Basement)', 
    'N_FacilitiesNearBy(ETC)', 'N_FacilitiesNearBy(PublicOffice)', 
    'N_SchoolNearBy(University)', 'price_per_sqm'
]

categorical_features = ['HallwayType', 'TimeToSubway', 'SubwayStation']

# Define features and target
X = df[numerical_features + categorical_features]
y = df['SalePrice']

# 2. Split Dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('ordencoder', OrdinalEncoder(), ['TimeToSubway']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['HallwayType', 'SubwayStation'])
    ]
)

# 4. Gradient Boosting Pipeline
gb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', GradientBoostingRegressor(learning_rate=0.1, n_estimators=200, random_state=42))
])

# 5. Train Gradient Boosting Model
gb_pipeline.fit(X_train, y_train)

# 6. Evaluate Gradient Boosting Model
y_pred = gb_pipeline.predict(X_test)
gb_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
gb_r2 = r2_score(y_test, y_pred)
gb_mae = mean_absolute_error(y_test, y_pred)

print(f"Gradient Boosting RMSE: {gb_rmse:.2f}")
print(f"Gradient Boosting R²: {gb_r2:.4f}")
print(f"Gradient Boosting MAE: {gb_mae:.2f}")

# 7. Save the Gradient Boosting Model
with open('gradient_boosting_model.pkl', 'wb') as file:
    pickle.dump(gb_pipeline, file)

# 8. Train and Save XGBoost Model
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(
        learning_rate=0.1, n_estimators=200, max_depth=5, subsample=0.8, random_state=42
    ))
])

xgb_pipeline.fit(X_train, y_train)
y_pred_xgb = xgb_pipeline.predict(X_test)
xgb_rmse = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
xgb_r2 = r2_score(y_test, y_pred_xgb)
xgb_mae = mean_absolute_error(y_test, y_pred_xgb)

print(f"XGBoost RMSE: {xgb_rmse:.2f}")
print(f"XGBoost R²: {xgb_r2:.4f}")
print(f"XGBoost MAE: {xgb_mae:.2f}")

with open('xgboost_model.pkl', 'wb') as file:
    pickle.dump(xgb_pipeline, file)

# Select the better model based on RMSE
if xgb_rmse < gb_rmse:
    best_model = xgb_pipeline
    model_name = 'XGBoost'
else:
    best_model = gb_pipeline
    model_name = 'Gradient Boosting'

print(f"The best model is {model_name} with RMSE: {min(xgb_rmse, gb_rmse):.2f}")

# Save the best model
with open('best_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)


Gradient Boosting RMSE: 5943.13
Gradient Boosting R²: 0.9966
Gradient Boosting MAE: 4568.52
XGBoost RMSE: 4257.73
XGBoost R²: 0.9983
XGBoost MAE: 2789.31
The best model is XGBoost with RMSE: 4257.73
