#### Importing Needed packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures

#### Read data

In [2]:
df = pd.read_csv('housePrice.csv')
df = df.dropna()
df.head()

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
0,63,1,True,True,True,Shahran,1850000000.0,61666.67
1,60,1,True,True,True,Shahran,1850000000.0,61666.67
2,79,2,True,True,True,Pardis,550000000.0,18333.33
3,95,2,True,True,True,Shahrake Qods,902500000.0,30083.33
4,123,2,True,True,True,Shahrake Gharb,7000000000.0,233333.33


### Data standardization

In [3]:
def str_to_float(x):
    return float(str(x).replace(',', ''))

#### Convert "Area" values to decimal numbers and Delete rows that have an incorrect Area 

In [4]:
df['Area'] = df['Area'].apply(str_to_float)
df = df[df['Area'] < 1000]

#### Convert "Price" values to decimal numbers

In [5]:
df['Price'] = df['Price'].apply(str_to_float)
df['Price(USD)'] = df['Price(USD)'].apply(str_to_float)

#### Delete rows that do not have an address

In [6]:
df = df.dropna(subset=['Address'])

#### Convert boolean values to numbers for Parking, Warehouse and Elevator properties

In [7]:
df['Parking'] = df['Parking'].astype(int)
df['Warehouse'] = df['Warehouse'].astype(int)
df['Elevator'] = df['Elevator'].astype(int)
df.head()

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
0,63.0,1,1,1,1,Shahran,1850000000.0,61666.67
1,60.0,1,1,1,1,Shahran,1850000000.0,61666.67
2,79.0,2,1,1,1,Pardis,550000000.0,18333.33
3,95.0,2,1,1,1,Shahrake Qods,902500000.0,30083.33
4,123.0,2,1,1,1,Shahrake Gharb,7000000000.0,233333.33


#### Data separation

In [8]:
msk = np.random.rand(len(df)) < 0.8
train = df[msk]
test = df[~msk]

train_x = np.asanyarray(train[['Area', 'Room', 'Parking', 'Warehouse', 'Elevator']])
train_y = np.asanyarray(train[['Price']])
                              
test_x = np.asanyarray(test[['Area', 'Room', 'Parking', 'Warehouse', 'Elevator']])
test_y = np.asanyarray(test[['Price']]) 

#### Simple linear regression

In [9]:
regr = linear_model.LinearRegression()
regr.fit(train_x, train_y)

print('Coefficients: ', regr.coef_)
print('Intercept: ', regr.intercept_)

Coefficients:  [[ 7.07956298e+07  1.59572680e+09 -3.11477056e+08  1.11838701e+09
   1.12748694e+09]]
Intercept:  [-7.16634439e+09]


In [10]:
test_y_ = regr.predict(test_x)

print('mean absolute error: %.2f' % np.mean(np.absolute(test_y_ - test_y)))
print('residual sum of squares (MSE): %.2f' % np.mean((test_y_ - test_y) ** 2))
print('r2-score: %.2f' % r2_score(test_y_, test_y))

mean absolute error: 2694409315.36
residual sum of squares (MSE): 22273494188156694528.00
r2-score: 0.18


#### Polynomial regression

In [11]:
poly = PolynomialFeatures(degree = 2)
train_x_poly = poly.fit_transform(train_x)
train_x_poly

train_y_ = regr.fit(train_x_poly, train_y)
# The coefficients
print ('Coefficients: ', regr.coef_)
print ('Intercept: ', regr.intercept_)

Coefficients:  [[ 0.00000000e+00  7.25928381e+07 -5.82860362e+09  7.79848995e+08
  -5.81588830e+08 -3.98372396e+08 -9.90379727e+04  2.93720733e+07
  -5.31987642e+07 -4.39226096e+06  6.03850681e+07  6.51152781e+07
   1.14049804e+09  1.57742618e+09 -1.31787630e+09  7.79848995e+08
   7.32547079e+08  2.41107720e+08 -5.81588830e+08 -1.46948249e+09
  -3.98372396e+08]]
Intercept:  [1.57609238e+09]


In [12]:
test_x_poly = poly.fit_transform(test_x)
test_y_ = regr.predict(test_x_poly)

print("Mean absolute error: %.2f" % np.mean(np.absolute(test_y_ - test_y)))
print("Residual sum of squares (MSE): %.2f" % np.mean((test_y_ - test_y) ** 2))
print("R2-score: %.2f" % r2_score(test_y,test_y_ ) )

Mean absolute error: 2378026249.54
Residual sum of squares (MSE): 16466382378332141568.00
R2-score: 0.70
