In [27]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [32]:
# Read 'Real estate.csv' (path is relative to the working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='Real estate.csv')

In [33]:
# Preview the top of the table: a heading line, then the first rows from head().
print("First few rows of the dataset:", data.head(), sep="\n")

First few rows of the dataset:
   No  X1 transaction date  X2 house age  \
0   1             2012.917          32.0   
1   2             2012.917          19.5   
2   3             2013.583          13.3   
3   4             2013.500          13.3   
4   5             2012.833           5.0   

   X3 distance to the nearest MRT station  X4 number of convenience stores  \
0                                84.87882                               10   
1                               306.59470                                9   
2                               561.98450                                5   
3                               561.98450                                5   
4                               390.56840                                5   

   X5 latitude  X6 longitude  Y house price of unit area  
0     24.98298     121.54024                        37.9  
1     24.98034     121.53951                        42.2  
2     24.98746     121.54391                        47.3  

In [34]:
# Report the table dimensions as a (row count, column count) tuple.
print("\nDataset shape:", (len(data.index), len(data.columns)))


Dataset shape: (414, 8)


In [35]:
# List the dtype pandas assigned to every column.
print("\nData types of columns:", data.dtypes, sep="\n")


Data types of columns:
No                                          int64
X1 transaction date                       float64
X2 house age                              float64
X3 distance to the nearest MRT station    float64
X4 number of convenience stores             int64
X5 latitude                               float64
X6 longitude                              float64
Y house price of unit area                float64
dtype: object


In [36]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
print("\nSummary statistics of numerical columns:", data.describe(), sep="\n")


Summary statistics of numerical columns:
               No  X1 transaction date  X2 house age  \
count  414.000000           414.000000    414.000000   
mean   207.500000          2013.148971     17.712560   
std    119.655756             0.281967     11.392485   
min      1.000000          2012.667000      0.000000   
25%    104.250000          2012.917000      9.025000   
50%    207.500000          2013.167000     16.100000   
75%    310.750000          2013.417000     28.150000   
max    414.000000          2013.583000     43.800000   

       X3 distance to the nearest MRT station  \
count                              414.000000   
mean                              1083.885689   
std                               1262.109595   
min                                 23.382840   
25%                                289.324800   
50%                                492.231300   
75%                               1454.279000   
max                               6488.021000   

       X4 n

In [37]:
# Count the missing entries in each column (isna is the same check as isnull).
print("\nMissing values:", data.isna().sum(), sep="\n")


Missing values:
No                                        0
X1 transaction date                       0
X2 house age                              0
X3 distance to the nearest MRT station    0
X4 number of convenience stores           0
X5 latitude                               0
X6 longitude                              0
Y house price of unit area                0
dtype: int64


In [39]:
# Split the data into features/target and train/test sets.
# X_train, y_train, X_test and y_test were used below but never defined anywhere
# in the notebook, so a fresh-kernel "Run All" stopped here with a NameError.
# NOTE(review): the original split cell is not in the notebook. The 80/20 split,
# the seed, and dropping the 'No' row-id column are assumptions -- confirm
# against the settings that produced the recorded scores.
TARGET_COLUMN = 'Y house price of unit area'
X = data.drop(columns=['No', TARGET_COLUMN])
y = data[TARGET_COLUMN]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train Random Forest model (fixed random_state for repeatable results)
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

In [40]:
# Fit a decision tree regressor on the training split.
# random_state is pinned so repeated runs build the same tree.
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X=X_train, y=y_train)

In [41]:
# Random forest predictions for the test features
rf_predictions = rf_model.predict(X=X_test)

In [42]:
# Score the random forest: mean squared error of its predictions against y_test
rf_mse = mean_squared_error(y_true=y_test, y_pred=rf_predictions)
print("Random Forest Mean Squared Error:", rf_mse)

Random Forest Mean Squared Error: 54.560840703725646


In [43]:
# Decision tree predictions for the test features
dt_predictions = dt_model.predict(X=X_test)

In [44]:
# Score the decision tree: mean squared error of its predictions against y_test
dt_mse = mean_squared_error(y_true=y_test, y_pred=dt_predictions)
print("Decision Tree Mean Squared Error:", dt_mse)

Decision Tree Mean Squared Error: 62.36728915662651


In [45]:
# Announce which model scored the lower (better) MSE.
# If neither strict comparison holds, the models are reported as similar.
print("\nPerformance Comparison:")
if dt_mse > rf_mse:
    print("Random Forest better than Decision Tree.")
elif dt_mse < rf_mse:
    print("Decision Tree is better than Random Forest.")
else:
    print("Both models have similar performance.")


Performance Comparison:
Random Forest better than Decision Tree.
