In [None]:
import pandas as pd 

In [2]:
df = pd.read_csv('vn_housing_dataset.csv')


In [3]:
df.drop(['Unnamed: 0','Ngày','Địa chỉ'], axis = 1,inplace=True)

In [4]:
df = df.rename(columns = {"Quận":"district", 
                                  "Huyện":"ward", "Loại hình nhà ở":"type_of_housing",
                                 "Giấy tờ pháp lý":"legal_paper", "Số tầng":"num_floors",
                                 "Số phòng ngủ":"num_bed_rooms", "Diện tích":"squared_meter_area",
                                 "Dài":"length_meter", "Rộng":"width_meter", "Giá/m2":"price_in_million_per_square_meter"})

In [5]:
df.describe()

Unnamed: 0,district,ward,type_of_housing,legal_paper,num_floors,num_bed_rooms,squared_meter_area,length_meter,width_meter,price_in_million_per_square_meter
count,82495,82449,82465,53610,36399,82458,82495,19827,35445,82484
unique,29,309,4,3,29,11,504,487,410,8345
top,Quận Đống Đa,Phường Khương Trung,"Nhà ngõ, hẻm",Đã có sổ,5,4 phòng,40 m²,10 m,4 m,100 triệu/m²
freq,13991,2832,62537,52914,15770,29069,6606,3912,12021,2503


## Cleaning data and removing outliers

In [6]:
df.head(10)

Unnamed: 0,district,ward,type_of_housing,legal_paper,num_floors,num_bed_rooms,squared_meter_area,length_meter,width_meter,price_in_million_per_square_meter
0,Quận Cầu Giấy,Phường Nghĩa Đô,"Nhà ngõ, hẻm",Đã có sổ,4.0,5 phòng,46 m²,,,"86,96 triệu/m²"
1,Quận Thanh Xuân,Phường Kim Giang,"Nhà mặt phố, mặt tiền",,,3 phòng,37 m²,,,"116,22 triệu/m²"
2,Quận Hai Bà Trưng,Phường Minh Khai,"Nhà ngõ, hẻm",Đã có sổ,4.0,4 phòng,40 m²,10 m,4 m,65 triệu/m²
3,Quận Tây Hồ,Phường Thụy Khuê,"Nhà ngõ, hẻm",Đã có sổ,,6 phòng,51 m²,12.75 m,4 m,100 triệu/m²
4,Quận Thanh Xuân,Phường Kim Giang,"Nhà ngõ, hẻm",,,4 phòng,36 m²,9 m,4 m,"86,11 triệu/m²"
5,Quận Cầu Giấy,Phường Yên Hoà,"Nhà ngõ, hẻm",Đã có sổ,,nhiều hơn 10 phòng,46 m²,12.1 m,3.8 m,"104,35 triệu/m²"
6,Quận Đống Đa,Phường Trung Liệt,"Nhà ngõ, hẻm",,,3 phòng,52 m²,,4.5 m,"112,5 triệu/m²"
7,Quận Hai Bà Trưng,Phường Đống Mác,"Nhà mặt phố, mặt tiền",Đã có sổ,6.0,5 phòng,32 m²,,6.8 m,"184,38 triệu/m²"
8,Quận Tây Hồ,Phường Xuân La,"Nhà ngõ, hẻm",,,4 phòng,75 m²,12 m,6.5 m,120 triệu/m²
9,Quận Hà Đông,Phường Văn Quán,"Nhà ngõ, hẻm",Đã có sổ,4.0,3 phòng,41 m²,,3.5 m,"64,63 triệu/m²"


In [7]:
df = df.dropna()
df = df[df['num_bed_rooms'] != 'nhiều hơn 10 phòng']
df = df[df['num_floors'] != 'Nhiều hơn 10']

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11313 entries, 2 to 82362
Data columns (total 10 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   district                           11313 non-null  object
 1   ward                               11313 non-null  object
 2   type_of_housing                    11313 non-null  object
 3   legal_paper                        11313 non-null  object
 4   num_floors                         11313 non-null  object
 5   num_bed_rooms                      11313 non-null  object
 6   squared_meter_area                 11313 non-null  object
 7   length_meter                       11313 non-null  object
 8   width_meter                        11313 non-null  object
 9   price_in_million_per_square_meter  11313 non-null  object
dtypes: object(10)
memory usage: 972.2+ KB


In [9]:
set(df['num_floors'])

{'1',
 '10',
 '12',
 '2',
 '3',
 '33',
 '35',
 '4',
 '45',
 '5',
 '50',
 '52',
 '54',
 '55',
 '6',
 '7',
 '73',
 '8',
 '9'}

In [10]:
# Remove houses with "10 plus" floors and bed rooms, since this cannot be exactly quantified
df = df[df['num_floors'] != 'Nhiều hơn 10']
df = df[df['num_bed_rooms'] != 'nhiều hơn 10 phòng']

# Clean columns and convert numerical columns to float type
df['district'] = df['district'].str.replace('Quận ','').str.strip()
df['ward'] = df['ward'].str.replace('Phường ','').str.strip()
df['num_floors'] = df['num_floors'].str.strip().astype(float)
df['num_bed_rooms'] = df['num_bed_rooms'].str.replace(' phòng','').str.strip().astype(float)
df['squared_meter_area'] = df['squared_meter_area'].str.replace(' m²','').str.strip().astype(float)
df['length_meter'] = df['length_meter'].str.replace(' m','').str.strip().astype(float)
df['width_meter'] = df['width_meter'].str.replace(' m','').str.strip().astype(float)

# Clean and convert all prices to million/m2 instead of VND/m2 or billion/m2
df.loc[df['price_in_million_per_square_meter'].str.contains(' tỷ/m²'), 'price_in_million_per_square_meter'] = df.loc[df['price_in_million_per_square_meter'].str.contains(' tỷ/m²'), 'price_in_million_per_square_meter'].str.replace(' tỷ/m²','').str.replace('.','').str.replace(',','.').astype(float) * 1000
df.loc[df['price_in_million_per_square_meter'].str.contains(' triệu/m²', na=False), 'price_in_million_per_square_meter'] = df.loc[df['price_in_million_per_square_meter'].str.contains(' triệu/m²', na=False), 'price_in_million_per_square_meter'].str.replace(' triệu/m²','').str.replace(',','.').astype(float)
df.loc[df['price_in_million_per_square_meter'].str.contains(' đ/m²', na=False), 'price_in_million_per_square_meter'] = df.loc[df['price_in_million_per_square_meter'].str.contains(' đ/m²', na=False), 'price_in_million_per_square_meter'].str.replace(' đ/m²','').str.replace('.','').astype(float) * 0.000001

In [11]:
df.head()

Unnamed: 0,district,ward,type_of_housing,legal_paper,num_floors,num_bed_rooms,squared_meter_area,length_meter,width_meter,price_in_million_per_square_meter
2,Hai Bà Trưng,Minh Khai,"Nhà ngõ, hẻm",Đã có sổ,4.0,4.0,40.0,10.0,4.0,65.0
15,Long Biên,Bồ Đề,"Nhà ngõ, hẻm",Đã có sổ,5.0,4.0,52.0,12.0,4.2,93.27
24,Hà Đông,La Khê,"Nhà mặt phố, mặt tiền",Đã có sổ,5.0,5.0,90.0,18.0,5.0,108.89
29,Nam Từ Liêm,Tây Mỗ,"Nhà ngõ, hẻm",Đã có sổ,4.0,3.0,32.0,6.6,4.5,60.94
34,Huyện Thanh Trì,Xã Tả Thanh Oai,"Nhà ngõ, hẻm",Đã có sổ,3.0,2.0,42.0,11.0,4.0,29.76


In [12]:
set(df['legal_paper'])

{'Giấy tờ khác', 'Đang chờ sổ', 'Đã có sổ'}

In [13]:
df['num_floors'] = pd.to_numeric(df['num_floors'],downcast= 'integer')
df['num_bed_rooms'] = pd.to_numeric(df['num_bed_rooms'],downcast= 'integer')
df['squared_meter_area'] = pd.to_numeric(df['squared_meter_area'],downcast= 'float')
df['length_meter'] = pd.to_numeric(df['length_meter'],downcast= 'float')
df['width_meter'] = pd.to_numeric(df['width_meter'],downcast= 'float')
df['price_in_million_per_square_meter'] = pd.to_numeric(df['price_in_million_per_square_meter'],downcast= 'float')


In [14]:
df.head()

Unnamed: 0,district,ward,type_of_housing,legal_paper,num_floors,num_bed_rooms,squared_meter_area,length_meter,width_meter,price_in_million_per_square_meter
2,Hai Bà Trưng,Minh Khai,"Nhà ngõ, hẻm",Đã có sổ,4,4,40.0,10.0,4.0,65.0
15,Long Biên,Bồ Đề,"Nhà ngõ, hẻm",Đã có sổ,5,4,52.0,12.0,4.2,93.27
24,Hà Đông,La Khê,"Nhà mặt phố, mặt tiền",Đã có sổ,5,5,90.0,18.0,5.0,108.89
29,Nam Từ Liêm,Tây Mỗ,"Nhà ngõ, hẻm",Đã có sổ,4,3,32.0,6.6,4.5,60.94
34,Huyện Thanh Trì,Xã Tả Thanh Oai,"Nhà ngõ, hẻm",Đã có sổ,3,2,42.0,11.0,4.0,29.76


In [15]:
df.dtypes

district                              object
ward                                  object
type_of_housing                       object
legal_paper                           object
num_floors                              int8
num_bed_rooms                           int8
squared_meter_area                   float32
length_meter                         float32
width_meter                          float32
price_in_million_per_square_meter    float64
dtype: object

In [16]:
dummy_type_of_housing = pd.get_dummies(df.type_of_housing, prefix="housing_type")
dummy_type_of_housing = dummy_type_of_housing.astype(int)
dummy_legal_paper = pd.get_dummies(df.legal_paper, prefix="legal_paper")
dummy_legal_paper = dummy_legal_paper.astype(int)
dummy_district = pd.get_dummies(df.district, prefix="district")
dummy_district = dummy_district.astype(int)
dummy_ward = pd.get_dummies(df.ward, prefix="ward")
dummy_ward = dummy_ward.astype(int)
df_cleaned = pd.concat([df, dummy_type_of_housing, dummy_legal_paper, dummy_district, dummy_ward], axis=1)
df_cleaned = df_cleaned.drop(['district', 'ward', 'type_of_housing', 'legal_paper'], axis = 1)


In [17]:
df_cleaned.to_csv('data_new.csv')

In [18]:
df_cleaned.head(10)

Unnamed: 0,num_floors,num_bed_rooms,squared_meter_area,length_meter,width_meter,price_in_million_per_square_meter,housing_type_Nhà biệt thự,"housing_type_Nhà mặt phố, mặt tiền","housing_type_Nhà ngõ, hẻm",housing_type_Nhà phố liền kề,...,ward_Đại Mỗ,ward_Định Công,ward_Đống Mác,ward_Đồng Mai,ward_Đồng Nhân,ward_Đồng Tâm,ward_Đồng Xuân,ward_Đội Cấn,ward_Đức Giang,ward_Đức Thắng
2,4,4,40.0,10.0,4.0,65.0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
15,5,4,52.0,12.0,4.2,93.27,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
24,5,5,90.0,18.0,5.0,108.89,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
29,4,3,32.0,6.6,4.5,60.94,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
34,3,2,42.0,11.0,4.0,29.76,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
37,4,4,43.0,11.0,4.0,50.0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
41,4,4,38.0,10.0,3.0,68.42,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
48,3,5,200.0,22.0,9.0,47.25,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
63,5,9,77.0,17.0,4.0,2.08,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
68,5,4,30.0,9.0,3.4,82.67,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
df_cleaned.dtypes

num_floors               int8
num_bed_rooms            int8
squared_meter_area    float32
length_meter          float32
width_meter           float32
                       ...   
ward_Đồng Tâm           int32
ward_Đồng Xuân          int32
ward_Đội Cấn            int32
ward_Đức Giang          int32
ward_Đức Thắng          int32
Length: 281, dtype: object

In [20]:
def remove_outlier_IQR(df_cleaned, series):
    Q1=df_cleaned[series].quantile(0.25)
    Q3=df_cleaned[series].quantile(0.75)
    IQR=Q3-Q1
    df_final=df_cleaned[~((df_cleaned[series]<(Q1-1.5*IQR)) | (df_cleaned[series]>(Q3+1.5*IQR)))]
    return df_final

removed_outliers = df_cleaned
columns_to_remove_outliers = ['num_floors', 'num_bed_rooms', 'squared_meter_area', 'length_meter',
                              'width_meter', 'price_in_million_per_square_meter']
for column in columns_to_remove_outliers:
    removed_outliers = remove_outlier_IQR(removed_outliers, column)
    
print("The final length of the dataset is", str(len(removed_outliers)), "rows.")

The final length of the dataset is 7328 rows.


In [21]:
removed_outliers.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7328 entries, 2 to 82351
Columns: 281 entries, num_floors to ward_Đức Thắng
dtypes: float32(3), float64(1), int32(275), int8(2)
memory usage: 7.9 MB


In [22]:
removed_outliers.head(10)

Unnamed: 0,num_floors,num_bed_rooms,squared_meter_area,length_meter,width_meter,price_in_million_per_square_meter,housing_type_Nhà biệt thự,"housing_type_Nhà mặt phố, mặt tiền","housing_type_Nhà ngõ, hẻm",housing_type_Nhà phố liền kề,...,ward_Đại Mỗ,ward_Định Công,ward_Đống Mác,ward_Đồng Mai,ward_Đồng Nhân,ward_Đồng Tâm,ward_Đồng Xuân,ward_Đội Cấn,ward_Đức Giang,ward_Đức Thắng
2,4,4,40.0,10.0,4.0,65.0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
15,5,4,52.0,12.0,4.2,93.27,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
29,4,3,32.0,6.6,4.5,60.94,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
34,3,2,42.0,11.0,4.0,29.76,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
37,4,4,43.0,11.0,4.0,50.0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
41,4,4,38.0,10.0,3.0,68.42,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
68,5,4,30.0,9.0,3.4,82.67,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
70,4,2,18.0,6.0,3.2,105.56,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
71,5,5,33.0,11.0,3.0,136.36,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
76,5,3,32.0,11.0,3.2,89.06,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
import warnings
import os
import numpy as np
import datetime 
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense




In [24]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.Collecting tensorflow
  Obtaining dependency information for tensorflow from https://files.pythonhosted.org/packages/93/21/9b035a4f823d6aee2917c75415be9a95861ff3d73a0a65e48edbf210cec1/tensorflow-2.15.0-cp311-cp311-win_amd64.whl.metadata
  Using cached tensorflow-2.15.0-cp311-cp311-win_amd64.whl.metadata (3.6 kB)
Collecting tensorflow-intel==2.15.0 (from tensorflow)
  Obtaining dependency information for tensorflow-intel==2.15.0 from https://files.pythonhosted.org/packages/4c/48/1a5a15517f18eaa4ff8d598b1c000300b20c1bb0e624539d702117a0c369/tensorflow_intel-2.15.0-cp311-cp311-win_amd64.whl.metadata
  Using cached tensorflow_intel-2.15.0-cp311-cp311-win_amd64.whl.metadata (5.1 kB)
Collecting absl-py>=1.0.0 (from tensorflow-intel==2.15.0->tensorflow)
  Obtaining dependency information for absl-py>=1.0.0 from https://files.pythonhosted.org/packages/01/e4/dc0a1dcc4e74e08d7abedab278c795eef54a224363bb18f5692f416d834f/absl_py-2.0.0

In [26]:
housing = removed_outliers

# Separate predictors and response (price) variables
X = housing.loc[:, housing.columns != 'price_in_million_per_square_meter']
y = housing[['price_in_million_per_square_meter']]
to_be_scaled = ['num_floors', 'num_bed_rooms', 'squared_meter_area', 'length_meter', 'width_meter']

# Initiate scaler
PredictorScaler=StandardScaler()
TargetVarScaler=StandardScaler()

X_scaled = X
y_scaled = y

# Storing the fit object for reference and reverse the scaling later
PredictorScalerFit=PredictorScaler.fit(X_scaled[to_be_scaled])
TargetVarScalerFit=TargetVarScaler.fit(y_scaled)
 
# Generating the standardized values of X and y
X_scaled[to_be_scaled]=PredictorScalerFit.transform(X_scaled[to_be_scaled])
y_scaled=TargetVarScalerFit.transform(y)

X_array = np.array(X_scaled.values).astype("float32")
y_array = np.array(y_scaled).astype("float32")

X_train, X_test, y_train, y_test = train_test_split(X_array, y_array, test_size=0.2, random_state=2032)

# Sanity check to see if all train and test arrays have correct dimensions
if X_train.shape[0] == y_train.shape[0] and X_train.shape[1] == X_test.shape[1] and X_test.shape[0] == y_test.shape[0] and y_train.shape[1] == y_test.shape[1]:
    print("All train and test sets have correct dimensions.")

All train and test sets have correct dimensions.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_scaled[to_be_scaled]=PredictorScalerFit.transform(X_scaled[to_be_scaled])


In [27]:
# Create a dictionary of random parameters for the model
RF_random_grid = {'n_estimators': [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)],
               'max_features': ['auto', 'sqrt', 'log2'],
               'max_depth': [int(x) for x in np.linspace(10, 100, num = 10)],
               'min_samples_split': [2, 5, 10],
               'min_samples_leaf': [1, 2, 4],
               'bootstrap': [True, False]}

In [None]:
# Turn off TensorFlow messages and warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ["KMP_SETTINGS"] = "false"

# Create the base RF model and fit the random search
RF_regressor = RandomForestRegressor()
RF_random_search = RandomizedSearchCV(estimator=RF_regressor, param_distributions=RF_random_grid, n_iter=100, cv=5, 
                                      verbose=0, random_state=2022, n_jobs = -1).fit(X_train, np.ravel(y_train))
RF_best_params = RF_random_search.best_params_
RF_best_params

In [76]:
# Narrowing the parameters grid based on the best parameters given by the random search, then feed the grid to a grid search
RF_param_grid = {'n_estimators': [RF_best_params['n_estimators']-100, RF_best_params['n_estimators'], RF_best_params['n_estimators']+100],
               'max_features': ['sqrt', 'log2'],
               'max_depth': [RF_best_params['max_depth'] - 10, RF_best_params['max_depth'], RF_best_params['max_depth']+10],
               'min_samples_split': [5, 10],
               'min_samples_leaf': [1, 2],
               'bootstrap': [True, False]}

In [78]:
# Turn off TensorFlow messages and warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ["KMP_SETTINGS"] = "false"

# Create another base RF model and fit the grid search
RF_regressor_2 = RandomForestRegressor()
RF_grid_search = GridSearchCV(estimator=RF_regressor_2, param_grid=RF_param_grid, 
                              cv=3, n_jobs=-1, verbose=0).fit(X_train, np.ravel(y_train))

# Showing the best parameters
RF_grid_search.best_params_

144 fits failed out of a total of 432.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
144 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\admin\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\admin\anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\admin\anaconda3\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\admin\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParame

{'bootstrap': True,
 'max_depth': 50,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 200}

In [79]:
y_test_orig = TargetVarScalerFit.inverse_transform(y_test)
Test_Data = np.concatenate((PredictorScalerFit.inverse_transform(X_test[:,:5]), X_test[:,5:]), axis=1)
TestingData = pd.DataFrame(data=Test_Data, columns=X.columns)
TestingData['Price'] = y_test_orig

In [84]:
# Fitting a RF model with the best parameters
RF = RF_grid_search.best_estimator_

# Generating Predictions on testing data
RF_predictions = RF.predict(X_test)
RF_predictions = RF_predictions.reshape(-1, 1)
 
# Scaling the predicted Price data back to original price scale
RF_predictions = TargetVarScalerFit.inverse_transform(RF_predictions)
TestingData['RF_predictions'] = RF_predictions

TestingData[['Price', 'RF_predictions']].head()

In [80]:
# Fitting a RF model with the best parameters
RF = RF_grid_search.best_estimator_

# Generating Predictions on testing data
RF_predictions = RF.predict(X_test)
RF_predictions = RF_predictions.reshape(-1, 1)
 
# Scaling the predicted Price data back to original price scale
RF_predictions = TargetVarScalerFit.inverse_transform(RF_predictions)

# Reshape the predictions to 2D array
RF_predictions = RF_predictions.reshape(-1, 1)

TestingData['RF_predictions'] = RF_predictions

TestingData[['Price', 'RF_predictions']].head()

Unnamed: 0,Price,RF_predictions
0,58.329998,68.099378
1,62.5,72.440069
2,74.190002,77.606034
3,78.720001,67.998055
4,142.5,72.621196


In [81]:
def Accuracy_Score(orig, pred):
    MAPE = np.mean(100 * (np.abs(orig - pred) / orig))
    return(100-MAPE)

In [82]:
print("Accuracy for the RF model is:", str(Accuracy_Score(TestingData['Price'], TestingData['RF_predictions'])))

Accuracy for the RF model is: 84.23171906631465
