import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
# Apply seaborn's default theme to any matplotlib figures drawn later.
sns.set()

In [2]:
# Load the listings (price, size, year, view) from the CSV that sits next to
# the notebook, then preview the first rows.
csv_path = 'real_estate_price_size_year_view.csv'
raw_data = pd.read_csv(csv_path)
raw_data.head()

Unnamed: 0,price,size,year,view
0,234314.144,643.09,2015,No sea view
1,228581.528,656.22,2009,No sea view
2,281626.336,487.29,2018,Sea view
3,401255.608,1504.75,2015,No sea view
4,458674.256,1275.46,2009,Sea view


In [3]:
# Summary statistics for every column; include='all' also reports the
# categorical 'view' column (count / unique / top / freq).
raw_data.describe(include='all')

Unnamed: 0,price,size,year,view
count,100.0,100.0,100.0,100
unique,,,,2
top,,,,No sea view
freq,,,,51
mean,292289.47016,853.0242,2012.6,
std,77051.727525,297.941951,4.729021,
min,154282.128,479.75,2006.0,
25%,234280.148,643.33,2009.0,
50%,280590.716,696.405,2015.0,
75%,335723.696,1029.3225,2018.0,


In [4]:
# Encode the categorical 'view' column as a 0/1 dummy. Work on a copy so that
# raw_data stays untouched and this cell can be re-run safely.
view_codes = {'Sea view': 1, 'No sea view': 0}
data = raw_data.copy()
data['view'] = data['view'].map(view_codes)
# Series.map turns any label that is missing from view_codes into NaN without
# complaining; fail here rather than much later inside the model fit.
assert data['view'].notna().all(), "unexpected label in 'view' column"
data.head()

Unnamed: 0,price,size,year,view
0,234314.144,643.09,2015,0
1,228581.528,656.22,2009,0
2,281626.336,487.29,2018,1
3,401255.608,1504.75,2015,0
4,458674.256,1275.46,2009,1


In [5]:
# Split the frame into the regression target and its predictors.
feature_cols = ['size', 'year', 'view']
y = data['price']        # dependent variable
x = data[feature_cols]   # independent variables


In [6]:
# statsmodels does not add an intercept on its own, so prepend a constant
# column before fitting ordinary least squares of price on the predictors.
X = sm.add_constant(x)
ols_model = sm.OLS(y, X)
results = ols_model.fit()
results.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.913
Model:,OLS,Adj. R-squared:,0.91
Method:,Least Squares,F-statistic:,335.2
Date:,"Thu, 21 Jan 2021",Prob (F-statistic):,1.02e-50
Time:,23:35:27,Log-Likelihood:,-1144.6
No. Observations:,100,AIC:,2297.0
Df Residuals:,96,BIC:,2308.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-5.398e+06,9.94e+05,-5.431,0.000,-7.37e+06,-3.43e+06
size,223.0316,7.838,28.455,0.000,207.473,238.590
year,2718.9489,493.502,5.510,0.000,1739.356,3698.542
view,5.673e+04,4627.695,12.258,0.000,4.75e+04,6.59e+04

0,1,2,3
Omnibus:,29.224,Durbin-Watson:,1.965
Prob(Omnibus):,0.0,Jarque-Bera (JB):,64.957
Skew:,1.088,Prob(JB):,7.85e-15
Kurtosis:,6.295,Cond. No.,942000.0


# Let's try to solve this problem using sklearn linear regression

In [7]:
from sklearn.linear_model import LinearRegression

In [8]:
# Standardization: learn each feature's mean and standard deviation from x,
# then rescale x with those statistics.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(x)
x_scaled = scaler.transform(x)

In [9]:
# Fit the sklearn linear model on the standardized features.
reg = LinearRegression()
reg.fit(X=x_scaled, y=y)

LinearRegression()

In [10]:
# Coefficients of the model fitted on the standardized features, in the
# column order of x (size, year, view).
reg.coef_

array([66117.38856853, 12793.51409608, 28357.33672984])

In [11]:
# Intercept of the model fitted on the standardized features.
reg.intercept_

292289.4701599997

In [12]:
# R-squared of the fitted model, evaluated on the same data it was trained on.
reg.score(x_scaled,y)

0.9128639058979645

### Formula for Adjusted R^2

$R^2_{adj.} = 1 - (1-R^2)\times\frac{n-1}{n-p-1}$, where $n$ is the number of observations and $p$ is the number of predictors.

In [13]:
# (number of observations, number of features)
x.shape

(100, 3)

In [14]:
# Adjusted R-squared penalises R-squared for the number of predictors used.
# It needs three ingredients: the plain R-squared, the number of
# observations (rows of x) and the number of features (columns of x).
n, p = x.shape
r2 = reg.score(x_scaled, y)

# 1 - (1 - R^2) * (n - 1) / (n - p - 1)
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
adjusted_r2

0.9101409029572759

In [15]:
# Deliberately wrong: reg was fitted on standardized features, so feeding it
# raw (unscaled) values produces a meaningless price.
reg.predict([[750,2009,0]]) # predicting without doing standardization

array([75582500.71558337])

In [16]:
#prediction of new data
new_data = [[750,2009,0]]
new_data_scaled = scaler.transform(new_data)
reg.predict(new_data_scaled)

array([231727.85029188])

In [17]:
from sklearn.feature_selection import f_regression

# This function helps us select the most appropriate features for our regression

In [18]:
# Univariate F-tests: each feature is tested against y on its own.
f_regression(x_scaled,y)

# There are two output arrays
# The first one contains the F-statistic of each single-feature regression
# The second one contains the p-values of these F-statistics

(array([285.92105192,   0.85525799,  20.25908753]),
 array([8.12763222e-31, 3.57340758e-01, 1.86445030e-05]))

In [19]:
# Keep only the p-values (second element of the result). Raw x and x_scaled
# give the same p-values, so scaling does not matter here.
univariate_tests = f_regression(x, y)
p_values = univariate_tests[1]
p_values

array([8.12763222e-31, 3.57340758e-01, 1.86445030e-05])

In [20]:
# Round to three decimals for readability.
p_values.round(3)

array([0.   , 0.357, 0.   ])

In [21]:
# Summary table with one row per feature: the coefficient from the model
# fitted on standardized features, next to the rounded p-value from the
# univariate F-test (f_regression) for that feature.
reg_summary = pd.DataFrame({
    'Features': x.columns.values,
    'Coefficients': reg.coef_,
    'p-values': p_values.round(3),
})
reg_summary

Unnamed: 0,Features,Coefficients,p-values
0,size,66117.388569,0.0
1,year,12793.514096,0.357
2,view,28357.33673,0.0


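It seems that 'year' is not even significant according to its univariate F-test p-value (0.357), so it is a candidate for removal from the model. Note, however, that these p-values come from single-feature regressions; in the statsmodels multiple-regression summary above, 'year' has a p-value of 0.000 once 'size' and 'view' are included, so this conclusion should be checked against the full model before dropping the feature.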