In [2]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load data
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
df = pd.read_csv('C:\\House_Price_Prediction\\data\\Bangladesh_property_prices.csv')

# Drop columns that are not used for modelling
# ("Unnamed: *" are presumably leftover index columns from an earlier CSV export -- confirm).
df = df.drop(columns=["Unnamed: 0.1", "Unnamed: 0", "Location"])

# Handle missing values: median-impute bed/bath counts, then drop any row that
# still has a missing value in any remaining column.
df = df.fillna({
    "No. Beds": df["No. Beds"].median(),
    "No. Baths": df["No. Baths"].median()
}).dropna()

# Feature Engineering
# Zero denominators are mapped to NaN first so the ratios become NaN instead of
# +/-inf, which would otherwise poison any later statistics or model input.
# NOTE: none of the four engineered columns below is part of `final_df`;
# `Price_per_sqft` and `Log_Price` are derived from the target and must not be
# used as model features.
df['Price_per_sqft'] = df['Price'] / df['Area'].replace(0, np.nan)
df['Beds_Baths_Ratio'] = df['No. Beds'] / df['No. Baths'].replace(0, np.nan)
bins = [0, 1000, 2000, 3000, float('inf')]
labels = ['Small', 'Medium', 'Large', 'Very Large']
# right=False -> left-closed intervals: [0, 1000), [1000, 2000), [2000, 3000), [3000, inf)
df['Area_Category'] = pd.cut(df['Area'], bins=bins, labels=labels, right=False)
df['Log_Price'] = np.log1p(df['Price'])

# Label Encoding
# Each categorical column gets an integer-coded "<col>_n" twin. The fitted
# encoders are kept in `label_encoders` (keyed by the original column name) so
# the same mapping can be re-applied to new data. The encoders are fitted on the
# full frame, i.e. before any train/test split.
label_encoders = {}
for col in ['Type', 'Region', 'Sub-region', 'Area_Category']:
    le = LabelEncoder()
    df[col + '_n'] = le.fit_transform(df[col])
    label_encoders[col] = le

# Final dataset: target plus the raw numeric and integer-encoded categorical features
final_df = df[['Price', 'No. Beds', 'No. Baths', 'Area', 'Type_n', 'Region_n', 'Sub-region_n']]

# Features and target
X = final_df.drop("Price", axis=1)
y = final_df["Price"]

# Split dataset: 80/20 hold-out with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build preprocessing and model pipeline
# Numeric columns are median-imputed and standardised; every other column
# (the integer-encoded categoricals) is passed through untouched.
numeric_features = ['No. Beds', 'No. Baths', 'Area']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)
    ],
    remainder='passthrough'  # Leave encoded categorical values as-is
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

print(pipeline)

# Train model
pipeline.fit(X_train, y_train)

# Persist the fitted pipeline.
# NOTE(review): absolute, machine-specific path -- consider a configurable models directory.
model_path = 'C:\\House_Price_Prediction\\src\\models\\random_forest_pipeline.joblib'
# Create the target directory up front so a missing folder does not throw away
# the training run with a FileNotFoundError at dump time.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(pipeline, model_path)
print(f"Trained pipeline saved to: {model_path}")

# The pipeline consumes the integer-coded "*_n" columns, so the fitted label
# encoders are required to turn raw 'Type' / 'Region' / 'Sub-region' values into
# model inputs at inference time. Save them next to the model; without them the
# saved pipeline cannot be applied to raw data in a fresh session.
encoders_path = 'C:\\House_Price_Prediction\\src\\models\\label_encoders.joblib'
joblib.dump(label_encoders, encoders_path)
print(f"Label encoders saved to: {encoders_path}")


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['No. Beds', 'No. Baths',
                                                   'Area'])])),
                ('regressor', RandomForestRegressor(random_state=42))])
Trained pipeline saved to: C:\House_Price_Prediction\src\models\random_forest_pipeline.joblib


In [3]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Evaluate the model on the held-out test split.
# The pipeline is already fitted by the training cell above, so it is not refit
# here: with the fixed random_state a second fit would only repeat the same work.
y_pred = pipeline.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Evaluation Metrics:")
print(f"MAE: {mae:.2f}")
print(f"MSE: {mse:.2f}")
# RMSE is in the same unit as the target (price), unlike the squared MSE.
print(f"RMSE: {np.sqrt(mse):.2f}")
print(f"R^2 Score: {r2:.4f}")


Evaluation Metrics:
MAE: 1550900.36
MSE: 15619632405293.62
R^2 Score: 0.8296
