In [2]:
import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical computations
import matplotlib.pyplot as plt  # For data visualization
import seaborn as sns  # For advanced data visualization
import plotly.express as px


from sklearn.model_selection import train_test_split  # For splitting data into training and testing sets
from sklearn.preprocessing import StandardScaler  # For feature scaling
from sklearn.impute import SimpleImputer  # For handling missing values
from sklearn.pipeline import Pipeline  # For creating machine learning pipelines

# Core machine learning algorithms for regression
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Evaluation metrics
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.preprocessing import LabelEncoder
from scipy.stats import zscore
import joblib

In [3]:
df = pd.read_csv('C:\\House_Price_Prediction\\data\\Bangladesh_property_prices.csv')
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Location,Price,Type,No. Beds,No. Baths,Area,Latitude,Longitude,Region,Sub-region
0,0,0.0,"Sector 10, Uttara, Dhaka",7500000,Apartment,3.0,3.0,1300.0,23.86846,90.3928,Uttara,Sector 10
1,1,1.0,"Section 11, Mirpur, Dhaka",7280000,Apartment,4.0,4.0,1456.0,23.81223,90.35967,Mirpur,Section 11
2,2,2.0,"Chowdhuripara, Khilgaon, Dhaka",13000000,Apartment,3.0,3.0,1550.0,23.75349,90.42469,Khilgaon,Chowdhuripara
3,3,3.0,"Road No 4, Banani, Dhaka",37000000,Apartment,3.0,3.0,2669.0,23.78855,90.40081,Banani,Road No 4
4,4,4.0,"South Banasree Project, Banasree, Dhaka",3600000,Apartment,2.0,2.0,835.0,23.76354,90.4318,Banasree,South Banasree Project


In [4]:
# Droping features
df.drop(columns=["Unnamed: 0.1", "Unnamed: 0", "Location"], inplace=True)

In [5]:
# Set the display format to show whole numbers
pd.set_option('display.float_format', lambda x: '%.06f' % x)

In [6]:
df.shape

(4704, 9)

In [7]:
# Display basic information about the dataset
print("Original Data Information:")
print(df.info())

Original Data Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4704 entries, 0 to 4703
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Price       4704 non-null   int64  
 1   Type        4704 non-null   object 
 2   No. Beds    4500 non-null   float64
 3   No. Baths   4500 non-null   float64
 4   Area        4704 non-null   float64
 5   Latitude    4704 non-null   float64
 6   Longitude   4704 non-null   float64
 7   Region      4704 non-null   object 
 8   Sub-region  4680 non-null   object 
dtypes: float64(5), int64(1), object(3)
memory usage: 330.9+ KB
None


In [8]:
df.isnull().sum()

Price           0
Type            0
No. Beds      204
No. Baths     204
Area            0
Latitude        0
Longitude       0
Region          0
Sub-region     24
dtype: int64

In [9]:
df = df.fillna({
    "No. Beds": df["No. Beds"].median(),
    "No. Baths": df["No. Baths"].median()
})

In [10]:
# Handling missing values
df.dropna(inplace=True)  # Drop rows with missing values

In [11]:
# Display information after handling missing values and duplicates
print("\nData Information after Handling Missing Values and Duplicates:")
print(df.info())


Data Information after Handling Missing Values and Duplicates:
<class 'pandas.core.frame.DataFrame'>
Index: 4680 entries, 0 to 4703
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Price       4680 non-null   int64  
 1   Type        4680 non-null   object 
 2   No. Beds    4680 non-null   float64
 3   No. Baths   4680 non-null   float64
 4   Area        4680 non-null   float64
 5   Latitude    4680 non-null   float64
 6   Longitude   4680 non-null   float64
 7   Region      4680 non-null   object 
 8   Sub-region  4680 non-null   object 
dtypes: float64(5), int64(1), object(3)
memory usage: 365.6+ KB
None


In [12]:
# Outliers using Standard Deviation Method
mean = df[['Price', 'No. Beds', 'No. Baths', 'Area']].mean()
std_dev = df[['Price', 'No. Beds', 'No. Baths', 'Area']].std()
threshold = 3  # Adjust based on your preference
outliers = (df[['Price', 'No. Beds', 'No. Baths', 'Area']] - mean).abs() > threshold * std_dev

In [13]:
# Outliers using Box Plot (IQR) Method
Q1 = df[['Price', 'No. Beds', 'No. Baths', 'Area']].quantile(0.25)
Q3 = df[['Price', 'No. Beds', 'No. Baths', 'Area']].quantile(0.75)
IQR = Q3 - Q1
outliers = (df[['Price', 'No. Beds', 'No. Baths', 'Area']] < Q1 - 1.5 * IQR) | (df[['Price', 'No. Beds', 'No. Baths', 'Area']] > Q3 + 1.5 * IQR)

In [14]:
# Outliers using Z-Score Method:
z_scores = (df[['Price', 'No. Beds', 'No. Baths', 'Area']] - df[['Price', 'No. Beds', 'No. Baths', 'Area']].mean()) / df[['Price', 'No. Beds', 'No. Baths', 'Area']].std()
threshold = 3  # Adjust based on your preference
outliers = z_scores.abs() > threshold

In [15]:
# Price per Square Foot
df['Price_per_sqft'] = df['Price'] / df['Area']

In [16]:
# Interaction Features
df['Beds_Baths_Ratio'] = df['No. Beds'] / df['No. Baths']

In [17]:
# Binning - Convert Continuous Variable to Categorical
bins = [0, 1000, 2000, 3000, float('inf')]
labels = ['Small', 'Medium', 'Large', 'Very Large']
df['Area_Category'] = pd.cut(df['Area'], bins=bins, labels=labels, right=False)

In [18]:
# Log Transformation for Skewed Variables
df['Log_Price'] = np.log1p(df['Price'])

In [19]:
import os
from sklearn.preprocessing import LabelEncoder
import joblib

# Ensure the encoders directory exists
os.makedirs('encoders', exist_ok=True)

# Dictionary mapping original columns to encoded columns and encoder names
encoding_tasks = {
    'Region': ('Region_n', 'region_encoder.joblib'),
    'Type': ('Type_n', 'type_encoder.joblib'),
    'Sub-region': ('Sub-region_n', 'subregion_encoder.joblib'),
    'Area_Category': ('Area_Category', 'area_category_encoder.joblib')
}

# Process each column
for original_col, (encoded_col, encoder_filename) in encoding_tasks.items():
    try:
        # Initialize and fit the encoder
        encoder = LabelEncoder()
        df[encoded_col] = encoder.fit_transform(df[original_col])
        
        # Save the encoder
        joblib.dump(encoder, f'encoders/{encoder_filename}')
        print(f"Successfully encoded {original_col} and saved encoder to encoders/{encoder_filename}")
        
    except Exception as e:
        print(f"Error processing {original_col}: {str(e)}")

Successfully encoded Region and saved encoder to encoders/region_encoder.joblib
Successfully encoded Type and saved encoder to encoders/type_encoder.joblib
Successfully encoded Sub-region and saved encoder to encoders/subregion_encoder.joblib
Successfully encoded Area_Category and saved encoder to encoders/area_category_encoder.joblib


In [20]:
df.sample(5)

Unnamed: 0,Price,Type,No. Beds,No. Baths,Area,Latitude,Longitude,Region,Sub-region,Price_per_sqft,Beds_Baths_Ratio,Area_Category,Log_Price,Region_n,Type_n,Sub-region_n
2790,6100000,Apartment,3.0,3.0,1220.0,23.76427,90.36547,Mohammadpur,PC Culture Housing,5000.0,1.0,1,15.623799,43,0,276
2693,6000000,Apartment,3.0,3.0,1440.0,23.81467,90.37306,Mirpur,Kallyanpur,4166.666667,1.0,1,15.60727,40,0,163
4285,8200000,Apartment,3.0,3.0,1250.0,23.82566,90.37001,Mirpur,Pallabi,6560.0,1.0,1,15.919645,40,0,278
3244,6000000,Apartment,3.0,3.0,1200.0,23.81223,90.35967,Mirpur,Ahmed Nagar,5000.0,1.0,1,15.60727,40,0,11
2050,6500000,Apartment,3.0,3.0,1800.0,23.7121,90.4598,Demra,Matuail,3611.111111,1.0,1,15.687313,14,0,215


In [21]:
new_df = df[['Price', 'No. Beds', 'No. Baths', 'Area', 'Type_n', 'Region_n', 'Sub-region_n']]

In [22]:
new_df

Unnamed: 0,Price,No. Beds,No. Baths,Area,Type_n,Region_n,Sub-region_n
0,7500000,3.000000,3.000000,1300.000000,0,66,350
1,7280000,4.000000,4.000000,1456.000000,0,40,342
2,13000000,3.000000,3.000000,1550.000000,0,31,75
3,37000000,3.000000,3.000000,2669.000000,0,4,320
4,3600000,2.000000,2.000000,835.000000,0,6,406
...,...,...,...,...,...,...,...
4699,4950000,3.000000,2.000000,1100.000000,0,40,223
4700,4950000,3.000000,2.000000,1100.000000,0,40,223
4701,4950000,3.000000,2.000000,1100.000000,0,40,223
4702,4950000,3.000000,2.000000,1100.000000,0,40,223


In [23]:
df=new_df.copy()

In [24]:
# Define features (X) and target variable (y)
X = df.drop("Price", axis=1)  # Features
y = df["Price"]  # Target variable

# Perform train-test split
test_size = 0.2  # You can adjust the test size based on your preference
random_state = 42  # Set a random state for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

# Display the shape of the resulting sets
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

Training set shape: (3744, 6) (3744,)
Testing set shape: (936, 6) (936,)


In [25]:

import os 

# Combine features and target for each set
train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

# Save to CSV files
#train_data.to_csv('train_data.csv', index=False)
#test_data.to_csv('test_data.csv', index=False)

print("Training data saved to: train_data.csv")
print("Testing data saved to: test_data.csv")
print(f"Train shape: {train_data.shape}, Test shape: {test_data.shape}")

save_path = "C:\\House_Price_Prediction\\data"
os.makedirs(save_path, exist_ok=True)
train_data.to_csv(f'{save_path}/train.csv', index=False)
test_data.to_csv(f'{save_path}/test.csv', index=False)


Training data saved to: train_data.csv
Testing data saved to: test_data.csv
Train shape: (3744, 7), Test shape: (936, 7)


In [26]:
import os

# Create the 'processed' directory if it doesn't exist
processed_dir = '../data/processed/'
os.makedirs(processed_dir, exist_ok=True)

# Save the processed data to 'data/processed/'
processed_data_path = os.path.join(processed_dir, 'processed_data.csv')
df.to_csv(processed_data_path, index=False)

In [27]:
# Import necessary libraries
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR


In [28]:
# 1. Linear Regression Model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
linear_predictions = linear_model.predict(X_test)

In [29]:
# Evaluate Linear Regression model
linear_mae = mean_absolute_error(y_test, linear_predictions)
linear_mse = mean_squared_error(y_test, linear_predictions)
linear_r2 = r2_score(y_test, linear_predictions)

In [30]:
print("Linear Regression Model:")
print(f"Mean Absolute Error: {linear_mae}")
print(f"Mean Squared Error: {linear_mse}")
print(f"R-squared: {linear_r2}\n")

Linear Regression Model:
Mean Absolute Error: 3442509.7832359225
Mean Squared Error: 38339350324441.82
R-squared: 0.5817791564940172



In [31]:
# 2. Decision Tree Model
decision_tree_model = DecisionTreeRegressor(random_state=random_state)
decision_tree_model.fit(X_train, y_train)
dt_predictions = decision_tree_model.predict(X_test)

In [32]:
# Evaluate Decision Tree model
dt_mae = mean_absolute_error(y_test, dt_predictions)
dt_mse = mean_squared_error(y_test, dt_predictions)
dt_r2 = r2_score(y_test, dt_predictions)

In [33]:
print("Decision Tree Model:")
print(f"Mean Absolute Error: {dt_mae}")
print(f"Mean Squared Error: {dt_mse}")
print(f"R-squared: {dt_r2}\n")

Decision Tree Model:
Mean Absolute Error: 1910511.684115801
Mean Squared Error: 28082623193487.41
R-squared: 0.6936636051353859



In [34]:
# 3. Random Forest Model
random_forest_model = RandomForestRegressor(random_state=random_state)
random_forest_model.fit(X_train, y_train)
rf_predictions = random_forest_model.predict(X_test)

In [35]:
# Evaluate Random Forest model
rf_mae = mean_absolute_error(y_test, rf_predictions)
rf_mse = mean_squared_error(y_test, rf_predictions)
rf_r2 = r2_score(y_test, rf_predictions)

In [36]:
print("Random Forest Model:")
print(f"Mean Absolute Error: {rf_mae}")
print(f"Mean Squared Error: {rf_mse}")
print(f"R-squared: {rf_r2}")

Random Forest Model:
Mean Absolute Error: 1550345.7269322567
Mean Squared Error: 15613324938029.072
R-squared: 0.8296836573844426


In [37]:
# 4. K-Nearest Neighbors (KNN) Model
knn_model = KNeighborsRegressor()
knn_model.fit(X_train, y_train)
knn_predictions = knn_model.predict(X_test)

In [38]:
# Evaluate KNN model
knn_mae = mean_absolute_error(y_test, knn_predictions)
knn_mse = mean_squared_error(y_test, knn_predictions)
knn_r2 = r2_score(y_test, knn_predictions)

In [39]:
print("K-Nearest Neighbors (KNN) Model:")
print(f"Mean Absolute Error: {knn_mae}")
print(f"Mean Squared Error: {knn_mse}")
print(f"R-squared: {knn_r2}\n")

K-Nearest Neighbors (KNN) Model:
Mean Absolute Error: 1941915.666239316
Mean Squared Error: 20735168527090.316
R-squared: 0.7738125555531392



In [40]:
# 5. Support Vector Machine (SVM) Model
svm_model = SVR()
svm_model.fit(X_train, y_train)
svm_predictions = svm_model.predict(X_test)

In [41]:
# Evaluate SVM model
svm_mae = mean_absolute_error(y_test, svm_predictions)
svm_mse = mean_squared_error(y_test, svm_predictions)
svm_r2 = r2_score(y_test, svm_predictions)

In [42]:
print("Support Vector Machine (SVM) Model:")
print(f"Mean Absolute Error: {svm_mae}")
print(f"Mean Squared Error: {svm_mse}")
print(f"R-squared: {svm_r2}\n")

Support Vector Machine (SVM) Model:
Mean Absolute Error: 5049360.640682706
Mean Squared Error: 100335839800140.11
R-squared: -0.09450314624512823



In [43]:
# Create a DataFrame to store evaluation metrics
metrics_df = pd.DataFrame(columns=['Model', 'Mean Absolute Error', 'Mean Squared Error', 'R-squared'])

# Function to add metrics for a model to the DataFrame

def add_metrics(model_name, mae, mse, r2):
    global metrics_df
    new_row = {'Model': model_name, 'Mean Absolute Error': mae, 'Mean Squared Error': mse, 'R-squared': r2}
    metrics_df = pd.concat([metrics_df, pd.DataFrame([new_row])], ignore_index=True)  # Use concat instead of append

# ... (Rest of your code)


# Add metrics for each model
add_metrics('Linear Regression', linear_mae, linear_mse, linear_r2)
add_metrics('Decision Tree', dt_mae, dt_mse, dt_r2)
add_metrics('Random Forest', rf_mae, rf_mse, rf_r2)
add_metrics('K-Nearest Neighbors (KNN)', knn_mae, knn_mse, knn_r2)
add_metrics('Support Vector Machine (SVM)', svm_mae, svm_mse, svm_r2)

# Display the metrics DataFrame
metrics_df

  metrics_df = pd.concat([metrics_df, pd.DataFrame([new_row])], ignore_index=True)  # Use concat instead of append


Unnamed: 0,Model,Mean Absolute Error,Mean Squared Error,R-squared
0,Linear Regression,3442509.783236,38339350324441.82,0.581779
1,Decision Tree,1910511.684116,28082623193487.406,0.693664
2,Random Forest,1550345.726932,15613324938029.072,0.829684
3,K-Nearest Neighbors (KNN),1941915.666239,20735168527090.32,0.773813
4,Support Vector Machine (SVM),5049360.640683,100335839800140.1,-0.094503


In [44]:
import joblib


os.makedirs("models", exist_ok=True)

# Saving Models on models folder
# List of models
models = [
    ('Linear Regression', linear_model),
    ('Decision Tree', decision_tree_model),
    ('Random Forest', random_forest_model),
    ('K-Nearest Neighbors (KNN)', knn_model),
    ('Support Vector Machine (SVM)', svm_model),
    
    # Add other models if needed
]

import os
from pathlib import Path

# Create a proper path (using raw string and Path)
model_dir = Path(r"C:\\House_Price_Prediction\\src\\models")
model_dir.mkdir(parents=True, exist_ok=True)  # Create directory if it doesn't exist

for model_name, model in models:
    model_path = model_dir / f"{model_name.replace(' ', '_')}.joblib"
    joblib.dump(model, model_path)
    print(f"{model_name} model saved to: {model_path}")



Linear Regression model saved to: C:\House_Price_Prediction\src\models\Linear_Regression.joblib
Decision Tree model saved to: C:\House_Price_Prediction\src\models\Decision_Tree.joblib
Random Forest model saved to: C:\House_Price_Prediction\src\models\Random_Forest.joblib
K-Nearest Neighbors (KNN) model saved to: C:\House_Price_Prediction\src\models\K-Nearest_Neighbors_(KNN).joblib
Support Vector Machine (SVM) model saved to: C:\House_Price_Prediction\src\models\Support_Vector_Machine_(SVM).joblib


In [45]:
# Import necessary libraries
import joblib

# List of models and their corresponding paths
models_info = [
    ('Linear Regression', 'C:\\House_Price_Prediction\\src\\models\\Linear_Regression.joblib'),
    ('Decision Tree', 'C:\\House_Price_Prediction\\src\\models\\Decision_Tree.joblib'),
    ('Random Forest', 'C:\\House_Price_Prediction\\src\\models\\Random_Forest.joblib'),
    ('K-Nearest Neighbors (KNN)', 'C:\\House_Price_Prediction\\src\\models\\K-Nearest_Neighbors_(KNN).joblib'),
    ('Support Vector Machine (SVM)', 'C:\\House_Price_Prediction\\src\\models\\Support_Vector_Machine_(SVM).joblib'),
  
    # Add other models if needed
]

# Load each model
loaded_models = {}

for model_name, model_path in models_info:
    loaded_model = joblib.load(model_path)
    loaded_models[model_name] = loaded_model
    print(f'{model_name} model loaded from: {model_path}')

# Now, you can use the loaded_models dictionary to access each model as needed
# For example, loaded_models['Linear Regression'].predict(X_test)

Linear Regression model loaded from: C:\House_Price_Prediction\src\models\Linear_Regression.joblib
Decision Tree model loaded from: C:\House_Price_Prediction\src\models\Decision_Tree.joblib
Random Forest model loaded from: C:\House_Price_Prediction\src\models\Random_Forest.joblib
K-Nearest Neighbors (KNN) model loaded from: C:\House_Price_Prediction\src\models\K-Nearest_Neighbors_(KNN).joblib
Support Vector Machine (SVM) model loaded from: C:\House_Price_Prediction\src\models\Support_Vector_Machine_(SVM).joblib


In [46]:
# Load the model
loaded_model = joblib.load('C:\\House_Price_Prediction\\src\\models\\Random_Forest.joblib')
loaded_model

In [47]:
# Provide new data for prediction
new_data = pd.DataFrame({
    'No. Beds': [2.0],
    'No. Baths': [2.0],
    'Area': [1000.0],
    'Type_n': [1],
    'Region_n': [3],
    'Sub-region_n': [432]
})

In [48]:
import pandas as pd

test_data = pd.DataFrame({
    'No. Beds': [2.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 2.0, 3.0, 2.0],
    'No. Baths': [2.0, 3.0, 2.0, 3.0, 3.0, 2.0, 4.0, 2.0, 3.0, 2.0],
    'Area': [735.0, 3640.0, 1110.0, 1450.0, 1066.0, 1185.0, 1406.0, 650.0, 1250.0, 620.0],
    'Type_n': [0, 3, 0, 0, 0, 0, 0, 0, 0, 0],
    'Region_n': [3, 1, 31, 20, 12, 40, 15, 40, 13, 35],
    'Sub-region_n': [432, 60, 125, 297, 209, 284, 310, 342, 113, 134]
})

test_data

Unnamed: 0,No. Beds,No. Baths,Area,Type_n,Region_n,Sub-region_n
0,2.0,2.0,735.0,0,3,432
1,3.0,3.0,3640.0,3,1,60
2,3.0,2.0,1110.0,0,31,125
3,3.0,3.0,1450.0,0,20,297
4,3.0,3.0,1066.0,0,12,209
5,3.0,2.0,1185.0,0,40,284
6,3.0,4.0,1406.0,0,15,310
7,2.0,2.0,650.0,0,40,342
8,3.0,3.0,1250.0,0,13,113
9,2.0,2.0,620.0,0,35,134


In [49]:
# Make predictions on the new data
predictions = loaded_model.predict(new_data)
print(f'Predicted Price for the new data: {predictions[0]}')

Predicted Price for the new data: 4341671.666666667


In [50]:
print(new_df['No. Beds'].max())
print(new_df['No. Beds'].min())
print(new_df['No. Baths'].max())
print(new_df['No. Baths'].min())
print(new_df['Area'].max())
print(new_df['Area'].min())
print(new_df['Type_n'].max())
print(new_df['Type_n'].min())
print(new_df['Region_n'].max())
print(new_df['Region_n'].min())
print(new_df['Sub-region_n'].max())
print(new_df['Sub-region_n'].min())

46.0
1.0
10.0
1.0
43344.0
320.0
3
0
67
0
456
0
