In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error


In [2]:
data = pd.read_csv('Data/HousePricePredictor.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3479 entries, 0 to 3478
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Area        3479 non-null   object 
 1   Room        3479 non-null   int64  
 2   Parking     3479 non-null   bool   
 3   Warehouse   3479 non-null   bool   
 4   Elevator    3479 non-null   bool   
 5   Address     3456 non-null   object 
 6   Price       3479 non-null   float64
 7   Price(USD)  3479 non-null   float64
dtypes: bool(3), float64(2), int64(1), object(2)
memory usage: 146.2+ KB


In [4]:
data.describe()

Unnamed: 0,Room,Price,Price(USD)
count,3479.0,3479.0,3479.0
mean,2.079908,5359023000.0,178634.1
std,0.758275,8099935000.0,269997.8
min,0.0,3600000.0,120.0
25%,2.0,1418250000.0,47275.0
50%,2.0,2900000000.0,96666.67
75%,2.0,6000000000.0,200000.0
max,5.0,92400000000.0,3080000.0


In [5]:
(data == 0).sum()

Area            0
Room           10
Parking       529
Warehouse     297
Elevator      740
Address         0
Price           0
Price(USD)      0
dtype: int64

In [6]:
data.isnull().sum()

Area           0
Room           0
Parking        0
Warehouse      0
Elevator       0
Address       23
Price          0
Price(USD)     0
dtype: int64

In [7]:
data = data.dropna(subset=['Address'])

In [8]:
data.isnull().sum()

Area          0
Room          0
Parking       0
Warehouse     0
Elevator      0
Address       0
Price         0
Price(USD)    0
dtype: int64

In [9]:
data

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
0,63,1,True,True,True,Shahran,1.850000e+09,61666.67
1,60,1,True,True,True,Shahran,1.850000e+09,61666.67
2,79,2,True,True,True,Pardis,5.500000e+08,18333.33
3,95,2,True,True,True,Shahrake Qods,9.025000e+08,30083.33
4,123,2,True,True,True,Shahrake Gharb,7.000000e+09,233333.33
...,...,...,...,...,...,...,...,...
3474,86,2,True,True,True,Southern Janatabad,3.500000e+09,116666.67
3475,83,2,True,True,True,Niavaran,6.800000e+09,226666.67
3476,75,2,False,False,False,Parand,3.650000e+08,12166.67
3477,105,2,True,True,True,Dorous,5.600000e+09,186666.67


In [10]:
unique_addresses = data['Address'].unique()
print(len(unique_addresses))

192


In [11]:
columns_to_convert = ['Parking', 'Warehouse', 'Elevator']
data[columns_to_convert] = data[columns_to_convert].astype(int)


In [12]:
data

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
0,63,1,1,1,1,Shahran,1.850000e+09,61666.67
1,60,1,1,1,1,Shahran,1.850000e+09,61666.67
2,79,2,1,1,1,Pardis,5.500000e+08,18333.33
3,95,2,1,1,1,Shahrake Qods,9.025000e+08,30083.33
4,123,2,1,1,1,Shahrake Gharb,7.000000e+09,233333.33
...,...,...,...,...,...,...,...,...
3474,86,2,1,1,1,Southern Janatabad,3.500000e+09,116666.67
3475,83,2,1,1,1,Niavaran,6.800000e+09,226666.67
3476,75,2,0,0,0,Parand,3.650000e+08,12166.67
3477,105,2,1,1,1,Dorous,5.600000e+09,186666.67


In [13]:
data = pd.get_dummies(data, columns=['Address'])

In [14]:
data

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Price,Price(USD),Address_Abazar,Address_Abbasabad,Address_Absard,...,Address_Waterfall,Address_West Ferdows Boulevard,Address_West Pars,Address_Yaftabad,Address_Yakhchiabad,Address_Yousef Abad,Address_Zafar,Address_Zaferanieh,Address_Zargandeh,Address_Zibadasht
0,63,1,1,1,1,1.850000e+09,61666.67,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,60,1,1,1,1,1.850000e+09,61666.67,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,79,2,1,1,1,5.500000e+08,18333.33,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,95,2,1,1,1,9.025000e+08,30083.33,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,123,2,1,1,1,7.000000e+09,233333.33,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3474,86,2,1,1,1,3.500000e+09,116666.67,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3475,83,2,1,1,1,6.800000e+09,226666.67,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3476,75,2,0,0,0,3.650000e+08,12166.67,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3477,105,2,1,1,1,5.600000e+09,186666.67,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [15]:
data.to_csv('Data/processed_data.csv', index=False, encoding='utf-8')

In [16]:
data

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Price,Price(USD),Address_Abazar,Address_Abbasabad,Address_Absard,...,Address_Waterfall,Address_West Ferdows Boulevard,Address_West Pars,Address_Yaftabad,Address_Yakhchiabad,Address_Yousef Abad,Address_Zafar,Address_Zaferanieh,Address_Zargandeh,Address_Zibadasht
0,63,1,1,1,1,1.850000e+09,61666.67,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,60,1,1,1,1,1.850000e+09,61666.67,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,79,2,1,1,1,5.500000e+08,18333.33,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,95,2,1,1,1,9.025000e+08,30083.33,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,123,2,1,1,1,7.000000e+09,233333.33,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3474,86,2,1,1,1,3.500000e+09,116666.67,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3475,83,2,1,1,1,6.800000e+09,226666.67,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3476,75,2,0,0,0,3.650000e+08,12166.67,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3477,105,2,1,1,1,5.600000e+09,186666.67,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [17]:
data.drop(columns=['Price(USD)'], inplace=True)
data.drop(columns=['Area'], inplace=True)

In [18]:
X = data.drop(columns=['Price'])
y = data['Price']

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
print(f'Train size: {X_train.shape}, Test size: {X_test.shape}')

Train size: (2764, 196), Test size: (692, 196)


In [21]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [22]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3456 entries, 0 to 3478
Columns: 197 entries, Room to Address_Zibadasht
dtypes: bool(192), float64(1), int64(4)
memory usage: 810.0 KB


In [23]:
object_columns = data.select_dtypes(include=['object']).columns
print(object_columns)


Index([], dtype='object')


In [24]:

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")

Mean Absolute Error (MAE): 2053958741.9815602
Mean Squared Error (MSE): 3.4201308368795947e+19
