In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from lls import LinearLeastSquare
from sklearn.model_selection import train_test_split
from io import StringIO

In [2]:
data=pd.read_csv("input/HousePrice.csv")
data.head()

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
0,63,1,True,True,True,Shahran,1850000000.0,61666.67
1,60,1,True,True,True,Shahran,1850000000.0,61666.67
2,79,2,True,True,True,Pardis,550000000.0,18333.33
3,95,2,True,True,True,Shahrake Qods,902500000.0,30083.33
4,123,2,True,True,True,Shahrake Gharb,7000000000.0,233333.33


In [3]:
data = data[data['Area'].apply(lambda x: x.isnumeric())]
data.drop_duplicates(inplace=True)
data.dropna(inplace=True)
data.reset_index(drop = True, inplace = True)

In [4]:
# update dollar price to April 2024

data['Price(USD)'] = data['Price'] / 64000
data.head()

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
0,63,1,True,True,True,Shahran,1850000000.0,28906.25
1,60,1,True,True,True,Shahran,1850000000.0,28906.25
2,79,2,True,True,True,Pardis,550000000.0,8593.75
3,95,2,True,True,True,Shahrake Qods,902500000.0,14101.5625
4,123,2,True,True,True,Shahrake Gharb,7000000000.0,109375.0


In [5]:
# 5 most expensive houses

data_expensive = data.sort_values(by=['Price'], ascending=False)
data_expensive.head()

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
1606,420,4,True,True,True,Zaferanieh,92400000000.0,1443750.0
1704,705,5,True,True,False,Abazar,91000000000.0,1421875.0
405,400,5,True,True,False,Lavasan,85000000000.0,1328125.0
770,680,5,True,True,False,Ekhtiarieh,81600000000.0,1275000.0
1249,350,4,True,True,True,Niavaran,80500000000.0,1257812.5


In [7]:
# Use all possible features for X_train

data['Parking']=data['Parking'].replace([True,False],[0,1])
data['Warehouse']=data['Warehouse'].replace([True,False],[0,1])
data['Elevator']=data['Elevator'].replace([True,False],[0,1])
data.drop('Address',axis=1,inplace=True)
X = data[['Area', 'Room', 'Parking', 'Warehouse', 'Elevator']].to_numpy().astype(int)
Y = data[['Price']].to_numpy().astype(int)

  Y = data[['Price']].to_numpy().astype(int)


In [8]:
data['Area'] = pd.to_numeric(data['Area'])
data['Area']

0        63
1        60
2        79
3        95
4       123
       ... 
3237     63
3238     86
3239     83
3240    105
3241     82
Name: Area, Length: 3242, dtype: int64

In [9]:
data.head()

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Price,Price(USD)
0,63,1,0,0,0,1850000000.0,28906.25
1,60,1,0,0,0,1850000000.0,28906.25
2,79,2,0,0,0,550000000.0,8593.75
3,95,2,0,0,0,902500000.0,14101.5625
4,123,2,0,0,0,7000000000.0,109375.0


In [10]:
# Split dataset

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, shuffle=True, test_size=0.2)

In [11]:
# Fit the LLS model on training dataset

X_train

array([[185,   3,   0,   0,   0],
       [116,   2,   0,   0,   0],
       [130,   2,   0,   0,   1],
       ...,
       [ 40,   1,   1,   0,   1],
       [ 50,   1,   0,   0,   0],
       [ 80,   2,   0,   0,   0]])

In [12]:
lls= LinearLeastSquare()
lls.fit(X_train,Y_train)

array([[-3.32019649e+06],
       [-4.15019110e+08],
       [ 8.83683608e+08],
       [ 9.79983499e+07],
       [ 7.26835323e+08]])

In [13]:
# Evaluate model on test dataset

Y_pred = lls.predict(X_test)
print('Evaluate MAE:', lls.evaluate(Y_test, Y_pred, 'mae'))
print('Evaluate MSE:', lls.evaluate(Y_test, Y_pred, 'mse'))
print('Evaluate RMAE:', lls.evaluate(Y_test, Y_pred, 'rmse'))

Evaluate MAE: 1108935010.4303415
Evaluate MSE: 1.8008297239218112e+18
Evaluate RMAE: 1341949970.7223854


In [15]:
# Compare result with Scikit-Learn's results

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV 
import sklearn.metrics as metrics

In [17]:
lls= LinearRegression()
lls.fit(X_train,Y_train)

In [18]:
Y_pred = lls.predict(X_test)
print('Evaluate MAE:', metrics.mean_absolute_error(Y_pred, Y_test))
print('Evaluate MSE:', metrics.mean_squared_error(Y_pred, Y_test))
print('Evaluate RMAE:', np.sqrt(metrics.mean_squared_error(Y_pred, Y_test)))

Evaluate MAE: 1108501987.0909965
Evaluate MSE: 1.7991372454018796e+18
Evaluate RMAE: 1341319218.307812


In [19]:
lls= RidgeCV()
lls.fit(X_train,Y_train)

In [20]:
Y_pred = lls.predict(X_test)
print('Evaluate MAE:', metrics.mean_absolute_error(Y_pred, Y_test))
print('Evaluate MSE:', metrics.mean_squared_error(Y_pred, Y_test))
print('Evaluate RMAE:', np.sqrt(metrics.mean_squared_error(Y_pred, Y_test)))

Evaluate MAE: 1110307015.0970385
Evaluate MSE: 1.7994063871069345e+18
Evaluate RMAE: 1341419541.794041
