In [2]:
!pip install xgboost lightgbm

Collecting lightgbm
  Downloading lightgbm-4.4.0-py3-none-win_amd64.whl.metadata (19 kB)
Downloading lightgbm-4.4.0-py3-none-win_amd64.whl (1.4 MB)
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
   - -------------------------------------- 0.0/1.4 MB 991.0 kB/s eta 0:00:02
   ------- -------------------------------- 0.3/1.4 MB 3.2 MB/s eta 0:00:01
   ------------ --------------------------- 0.5/1.4 MB 3.5 MB/s eta 0:00:01
   ----------------- ---------------------- 0.6/1.4 MB 3.7 MB/s eta 0:00:01
   ----------------------- ---------------- 0.8/1.4 MB 3.8 MB/s eta 0:00:01
   ----------------------------- ---------- 1.1/1.4 MB 4.0 MB/s eta 0:00:01
   ---------------------------------- ----- 1.3/1.4 MB 4.0 MB/s eta 0:00:01
   ---------------------------------------- 1.4/1.4 MB 4.0 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.4.0


In [24]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, BayesianRidge, HuberRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import joblib

# Load and preprocess data
def load_and_preprocess_data(file_path):
    df = pd.read_excel(file_path)
    df = df.drop(columns=['No', 'NRP', 'Nama', 'Link Data Rumah'])
    df = df.replace(['-', '--'], np.nan)
    df['harga rumah'] = df['harga rumah'].apply(lambda x: float(re.sub('[,.]', '', str(x))))
    df['keamanan'] = df['keamanan (ada/tidak)'].map({'ada': 1, 'tidak': 0})
    df['taman'] = df['taman (ada/tidak)'].map({'ada': 1, 'tidak': 0})
    df = pd.get_dummies(df, columns=['Kabupaten/Kota', 'kecamatan', 'kelurahan'], drop_first=True)
    df = df.drop(columns=['keamanan (ada/tidak)', 'taman (ada/tidak)'])
    X = df.drop(columns=['harga rumah'])
    y = df['harga rumah']
    return X, y

def create_preprocessing_pipeline(X):
    numeric_features = X.select_dtypes(include=[np.number]).columns
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
        ]
    )
    return preprocessor

# Train models
def train_models(X, y):
    models = {
        "Linear Regression": LinearRegression(),
        "Decision Tree": DecisionTreeRegressor(random_state=42),
        "Support Vector Regression": SVR(kernel='rbf'),
        "Gradient Boosting": GradientBoostingRegressor(random_state=42),
        "XGBoost": xgb.XGBRegressor(random_state=42, verbosity=0),
        "Random Forest": RandomForestRegressor(random_state=42),
        "AdaBoost": AdaBoostRegressor(random_state=42),
        "KNN Regressor": KNeighborsRegressor(),
        "Ridge Regression": Ridge(),
        "Lasso Regression": Lasso(alpha=1.0, max_iter=100000),
        "ElasticNet Regression": ElasticNet(alpha=1.0, l1_ratio=0.5, max_iter=100000),
        "Bayesian Ridge": BayesianRidge(),
        "Huber Regressor": HuberRegressor(max_iter=100000),
        "Extra Trees Regressor": ExtraTreesRegressor(random_state=42),
        "LightGBM": lgb.LGBMRegressor(random_state=42, verbose=-1)
    }
    
    model_errors = {}
    
    print(f'{"Model":30s} Mean Absolute Error')
    print('-' * 50)
    for model_name, model in models.items():
        model.fit(X, y)
        y_pred = model.predict(X)
        mae = mean_absolute_error(y, y_pred)
        model_errors[model_name] = mae
        print(f'{model_name:30s} {mae:.2f}')
        joblib.dump(model, f'{model_name.lower().replace(" ", "_")}_house_price_model.pkl')
    
    # Find the best model
    best_model_name = min(model_errors, key=model_errors.get)
    best_model_mae = model_errors[best_model_name]
    
    return best_model_name, best_model_mae

# Make predictions
def predict_house_prices(input_data, preprocessor):
    model_paths = [
        'linear_regression_house_price_model.pkl',
        'decision_tree_house_price_model.pkl',
        'support_vector_regression_house_price_model.pkl',
        'gradient_boosting_house_price_model.pkl',
        'xgboost_house_price_model.pkl',
        'random_forest_house_price_model.pkl',
        'adaboost_house_price_model.pkl',
        'knn_regressor_house_price_model.pkl',
        'ridge_regression_house_price_model.pkl',
        'lasso_regression_house_price_model.pkl',
        'elasticnet_regression_house_price_model.pkl',
        'bayesian_ridge_house_price_model.pkl',
        'huber_regressor_house_price_model.pkl',
        'extra_trees_regressor_house_price_model.pkl',
        'lightgbm_house_price_model.pkl'
    ]
    
    predictions = {}
    
    for model_path in model_paths:
        model = joblib.load(model_path)
        if not isinstance(input_data, pd.DataFrame):
            input_data = pd.DataFrame([input_data])
        input_df = preprocessor.transform(input_data)
        
        # Debugging: Print transformed input data
        print(f'Transformed input data for model {model_path}: {input_df}')
        
        predicted_price = model.predict(input_df)[0]
        model_name = model_path.replace('_house_price_model.pkl', '').replace('_', ' ').title()
        predictions[model_name] = f"Rp {predicted_price:,.2f}"
    
    return predictions

# Example usage
file_path = r"C:\Users\jieju\Downloads\datarumah.xlsx"
X, y = load_and_preprocess_data(file_path)
preprocessor = create_preprocessing_pipeline(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

best_model_name, best_model_mae = train_models(X_train, y_train)

input_data = {
    'jumlah kamar tidur': 13,
    'jumlah kamar mandi': 8,
    'luas tanah (m2)': 240,
    'luas bangunan (m2)': 180,
    'carport (mobil)': 7,
    'pasokan listrik (watt)': 10000,
    'Kabupaten/Kota_Jakarta Barat': 1,
    'kecamatan_Kebon Jeruk': 1,
    'kelurahan_Kebon Jeruk': 1,
    'keamanan': 1,  # Assuming 'ada' is encoded as 1
    'taman': 1,  # Assuming 'ada' is encoded as 1
    'jarak dengan rumah sakit terdekat (km)': 2,
    'jarak dengan sekolah terdekat (km)': 2,
    'jarak dengan tol terdekat (km)': 2
}

# One-hot encode categorical variables
input_data_df = pd.DataFrame([input_data])

# Align input_data_df with the training data columns
input_data_df = input_data_df.reindex(columns=X.columns, fill_value=0)

# Predict house prices from all models
predictions = predict_house_prices(input_data_df, preprocessor)

# Print the best model and its prediction
print(f'\nBest Model: {best_model_name}')
print(f'Best Model Prediction: {predictions[best_model_name]}')

# Print other model predictions
print("\nOther Models:")
for model_name, predicted_price in sorted(predictions.items(), key=lambda x: x[0]):
    if model_name != best_model_name:
        print(f'{model_name:30s} {predicted_price}')


Model                          Mean Absolute Error
--------------------------------------------------
Linear Regression              69205120916.89
Decision Tree                  140114.17
Support Vector Regression      46164120443.72
Gradient Boosting              52956032798.80
XGBoost                        10210504625.20
Random Forest                  23063774973.16
AdaBoost                       199063433735.76
KNN Regressor                  55675675578.62
Ridge Regression               69205072133.58
Lasso Regression               69205120916.79
ElasticNet Regression          69258422052.75
Bayesian Ridge                 70547949192.44
Huber Regressor                46269192567.72
Extra Trees Regressor          141670.99
LightGBM                       35646396056.12
Transformed input data for model linear_regression_house_price_model.pkl: [[ 3.49772179  2.06366758 -0.18014726 -0.42629152  2.54008113  0.31436042
  -0.02790054  0.01652566  1.76649689  0.83274017  0.8565558 ]]
Trans