In [1]:
!pip install xgboost lightgbm



In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, BayesianRidge, HuberRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import joblib

# Load and preprocess data
def load_and_preprocess_data(file_path):
    df = pd.read_excel(file_path)
    df = df.drop(columns=['No', 'NRP', 'Nama', 'Link Data Rumah'])
    df = df.replace(['-', '--'], np.nan)
    df['harga rumah'] = df['harga rumah'].apply(lambda x: float(re.sub('[,.]', '', str(x))))
    df['keamanan'] = df['keamanan (ada/tidak)'].map({'ada': 1, 'tidak': 0})
    df['taman'] = df['taman (ada/tidak)'].map({'ada': 1, 'tidak': 0})
    df = pd.get_dummies(df, columns=['Kabupaten/Kota', 'kecamatan', 'kelurahan'], drop_first=True)
    df = df.drop(columns=['keamanan (ada/tidak)', 'taman (ada/tidak)'])
    X = df.drop(columns=['harga rumah'])
    y = df['harga rumah']
    return X, y

def create_preprocessing_pipeline(X):
    numeric_features = X.select_dtypes(include=[np.number]).columns
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
        ]
    )
    return preprocessor

# Train models
def train_models(X, y):
    models = {
        "Linear Regression": LinearRegression(),
        "Decision Tree": DecisionTreeRegressor(random_state=42),
        "Support Vector Regression": SVR(kernel='rbf'),
        "Gradient Boosting": GradientBoostingRegressor(random_state=42),
        "XGBoost": xgb.XGBRegressor(random_state=42, verbosity=0),
        "Random Forest": RandomForestRegressor(random_state=42),
        "AdaBoost": AdaBoostRegressor(random_state=42),
        "KNN Regressor": KNeighborsRegressor(),
        "Ridge Regression": Ridge(),
        "Lasso Regression": Lasso(alpha=1.0, max_iter=100000),
        "ElasticNet Regression": ElasticNet(alpha=1.0, l1_ratio=0.5, max_iter=100000),
        "Bayesian Ridge": BayesianRidge(),
        "Huber Regressor": HuberRegressor(max_iter=100000),
        "Extra Trees Regressor": ExtraTreesRegressor(random_state=42),
        "LightGBM": lgb.LGBMRegressor(random_state=42, verbose=-1)
    }
    
    model_errors = {}
    
    print(f'{"Model":30s} Mean Absolute Error')
    print('-' * 50)
    for model_name, model in models.items():
        model.fit(X, y)
        y_pred = model.predict(X)
        mae = mean_absolute_error(y, y_pred)
        model_errors[model_name] = mae
        print(f'{model_name:30s} {mae:.2f}')
        joblib.dump(model, f'{model_name.lower().replace(" ", "_")}_house_price_model.pkl')
    
    # Find the best model
    best_model_name = min(model_errors, key=model_errors.get)
    best_model_mae = model_errors[best_model_name]
    
    return best_model_name, best_model_mae

# Make predictions
def predict_house_prices(input_data, preprocessor):
    model_paths = [
        'linear_regression_house_price_model.pkl',
        'decision_tree_house_price_model.pkl',
        'support_vector_regression_house_price_model.pkl',
        'gradient_boosting_house_price_model.pkl',
        'xgboost_house_price_model.pkl',
        'random_forest_house_price_model.pkl',
        'adaboost_house_price_model.pkl',
        'knn_regressor_house_price_model.pkl',
        'ridge_regression_house_price_model.pkl',
        'lasso_regression_house_price_model.pkl',
        'elasticnet_regression_house_price_model.pkl',
        'bayesian_ridge_house_price_model.pkl',
        'huber_regressor_house_price_model.pkl',
        'extra_trees_regressor_house_price_model.pkl',
        'lightgbm_house_price_model.pkl'
    ]
    
    predictions = {}
    
    for model_path in model_paths:
        model = joblib.load(model_path)
        if not isinstance(input_data, pd.DataFrame):
            input_data = pd.DataFrame([input_data])
        input_df = preprocessor.transform(input_data)
        predicted_price = model.predict(input_df)[0]
        model_name = model_path.replace('_house_price_model.pkl', '').replace('_', ' ').title()
        predictions[model_name] = f"Rp {predicted_price:,.2f}"
    
    return predictions

In [3]:
### contoh penggunaan
file_path = r"C:\Users\jonat\Downloads\datarumah.xlsx"
X, y = load_and_preprocess_data(file_path)
preprocessor = create_preprocessing_pipeline(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [4]:
best_model_name, best_model_mae = train_models(X_train, y_train)

Model                          Mean Absolute Error
--------------------------------------------------
Linear Regression              69205120916.89
Decision Tree                  140114.17
Support Vector Regression      46164120443.72
Gradient Boosting              52956032798.80
XGBoost                        10210504625.20
Random Forest                  23063774973.16
AdaBoost                       199063433735.76
KNN Regressor                  55675675578.62
Ridge Regression               69205072133.58
Lasso Regression               69205120916.79
ElasticNet Regression          69258422052.75
Bayesian Ridge                 70547949192.44
Huber Regressor                46269192567.72
Extra Trees Regressor          141670.99
LightGBM                       35646396056.12


In [5]:
input_data = {
    'jumlah kamar tidur': 12,
    'jumlah kamar mandi': 5,
    'luas tanah (m2)': 200,
    'luas bangunan (m2)': 90,
    'carport (mobil)': 3,
    'pasokan listrik (watt)': 7000,
    'Kabupaten/Kota_Jakarta Timur': 1,
    'kecamatan_Pengadegan': 1,
    'kelurahan_Pancoran': 1,
    'keamanan': 1,  # Assuming 'ada' is encoded as 1
    'taman': 1,  # Assuming 'ada' is encoded as 1
    'jarak dengan rumah sakit terdekat (km)': 0.9,
    'jarak dengan sekolah terdekat (km)': 12,
    'jarak dengan tol terdekat (km)': 12
}

In [6]:
### melakukan one hot encoding pada data kategorikal
input_data_df = pd.DataFrame([input_data])

In [7]:
### melakukan prediksi harga rumah terhadap setiap model
predictions = predict_house_prices(input_data_df, preprocessor)

In [8]:
### melakukan print model terbaik dan harga prediksinya
print(f'\nBest Model: {best_model_name}')
print(f'Best Model Prediction: {predictions[best_model_name]}')



Best Model: Decision Tree
Best Model Prediction: Rp 220,000,000,000.00


In [9]:
# melakukan print tebakan harga oleh model lain
print("\nOther Models:")
for model_name, predicted_price in sorted(predictions.items(), key=lambda x: x[0]):
    if model_name != best_model_name:
        print(f'{model_name:30s} {predicted_price}')



Other Models:
Adaboost                       Rp 167,148,040,540.54
Bayesian Ridge                 Rp 49,302,812,558.38
Elasticnet Regression          Rp 67,954,617,141.77
Extra Trees Regressor          Rp 150,599,800,000.00
Gradient Boosting              Rp 149,074,708,781.92
Huber Regressor                Rp 7,574,143,281.83
Knn Regressor                  Rp 158,340,000,000.00
Lasso Regression               Rp 71,126,706,551.37
Lightgbm                       Rp 119,029,637,637.58
Linear Regression              Rp 71,126,706,546.65
Random Forest                  Rp 317,423,300,000.00
Ridge Regression               Rp 71,130,183,877.09
Support Vector Regression      Rp 6,000,000,020.03
Xgboost                        Rp 133,474,615,296.00
