In [1]:
import os

import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Load the dataset.
# The CSV location can be overridden with the MODEL_DATA_PATH environment
# variable so the notebook is not tied to one machine's directory layout.
# When the variable is unset, the original path is used.
DATA_PATH = os.environ.get(
    'MODEL_DATA_PATH',
    'C:/Users/HomePC/Desktop/Personal/ihouserentpredictor/model_data.csv',
)
raw_df = pd.read_csv(DATA_PATH)

# Drop duplicate rows. The untouched frame stays available as `raw_df`, so
# this step can be re-run or inspected without reloading the file.
# To preview the data, put `df.head()` as the last expression of its own cell;
# a bare expression in the middle of a cell is never rendered.
df = raw_df.drop_duplicates()

# Define features and target
features = ['LOCATION', 'BEDROOMS', 'BATHROOMS', 'TOILETS', 'HOUSE_TYPE']
target = 'PRICE'

# X holds only the model inputs, y the value to predict.
X = df[features]
y = df[target]

# Define categorical and numerical columns
categorical_cols = ['LOCATION', 'HOUSE_TYPE']
numerical_cols = ['BEDROOMS', 'BATHROOMS', 'TOILETS']

# Preprocessing pipeline.
# - Categorical columns are one-hot encoded; handle_unknown='ignore' keeps
#   prediction from failing on a category that was not present at fit time.
# - Numerical columns are forwarded unchanged through an explicit
#   'passthrough' entry, so `numerical_cols` decides what reaches the model
#   instead of an implicit "everything else" remainder.
# The output column order (categorical first, then numerical) is the same as
# with remainder='passthrough' for the five feature columns used here.
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', 'passthrough', numerical_cols),
], remainder='drop')

# Full pipeline with LGBM model: preprocessing and the regressor are fitted
# and applied together, so raw feature frames can be passed to fit/predict.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LGBMRegressor(n_estimators=100, random_state=42))
])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the pipeline on the training rows, then predict the held-out rows.
# Pipeline.fit returns the pipeline itself, so the two calls can be chained.
y_pred = model_pipeline.fit(X_train, y_train).predict(X_test)

# Evaluate on the held-out rows: MAE and RMSE are in the units of the target,
# R² is unitless.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:,.2f}")
print(f"RMSE: {rmse:,.2f}")
print(f"R² Score: {r2:.4f}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001344 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 210
[LightGBM] [Info] Number of data points in the train set: 56380, number of used features: 93
[LightGBM] [Info] Start training from score 2332226.434995
MAE: 822,607.06
RMSE: 1,640,503.11
R² Score: 0.6956


In [2]:
import joblib

# Persist the fitted pipeline (preprocessing + model) with joblib. The file
# name is relative, so it is written to the current working directory.
joblib.dump(value=model_pipeline, filename='ipredictor_model.pkl')

print("Model saved as ipredictor_model.pkl")

Model saved as ipredictor_model.pkl
