### Simple Linear Regression Models

##### **Note:** statsmodels.api gives **one** output table with all the relevant test statistics whereas with sklearn, each test statistic needs to be extracted individually. 

In [1]:
import numpy as np
import pandas as pd 
import statsmodels.api as sm 
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression

##### **1. Using statsmodels.api:**

In [2]:
data = pd.read_csv('real_estate_price_size_year.csv')
data.head()

Unnamed: 0,price,size,year
0,234314.144,643.09,2015
1,228581.528,656.22,2009
2,281626.336,487.29,2018
3,401255.608,1504.75,2015
4,458674.256,1275.46,2009


In [3]:
# define variables:
x1 = data[['size','year']]
y = data['price']

In [4]:
x = sm.add_constant(x1)

In [5]:
# fit the model:
model = sm.OLS(y,x).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.776
Model:,OLS,Adj. R-squared:,0.772
Method:,Least Squares,F-statistic:,168.5
Date:,"Wed, 17 Apr 2024",Prob (F-statistic):,2.7700000000000004e-32
Time:,09:45:23,Log-Likelihood:,-1191.7
No. Observations:,100,AIC:,2389.0
Df Residuals:,97,BIC:,2397.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-5.772e+06,1.58e+06,-3.647,0.000,-8.91e+06,-2.63e+06
size,227.7009,12.474,18.254,0.000,202.943,252.458
year,2916.7853,785.896,3.711,0.000,1357.000,4476.571

0,1,2,3
Omnibus:,10.083,Durbin-Watson:,2.25
Prob(Omnibus):,0.006,Jarque-Bera (JB):,3.678
Skew:,0.095,Prob(JB):,0.159
Kurtosis:,2.08,Cond. No.,941000.0


##### **2. Using SKlearn:**

In [6]:
data_2 = pd.read_csv('real_estate_price_size_year.csv')
data_2.head()

Unnamed: 0,price,size,year
0,234314.144,643.09,2015
1,228581.528,656.22,2009
2,281626.336,487.29,2018
3,401255.608,1504.75,2015
4,458674.256,1275.46,2009


In [7]:
# define variables:
x = data[['size','year']]
y = data['price']

In [8]:
# fit the model
model = LinearRegression()
model.fit(x,y)

In [9]:
# extract intercept:
model.intercept_

-5772267.017463279

In [10]:
# extract coefficients:
model.coef_

array([ 227.70085401, 2916.78532684])

In [11]:
# extract p-values: 
f_regression(x,y)

(array([285.92105192,   0.85525799]), array([8.12763222e-31, 3.57340758e-01]))

In [12]:
# create data frame for variable coefficients and corresponding p-values:
model_summary = pd.DataFrame(data = x.columns.values, columns = ['Features'])

In [13]:
model_summary['Coefficients'] = model.coef_.round(4)
model_summary['p_values'] = f_regression(x,y)[1].round(3)
model_summary

Unnamed: 0,Features,Coefficients,p_values
0,size,227.7009,0.0
1,year,2916.7853,0.357


In [14]:
# R2:
model.score(x,y)

0.7764803683276795

In [15]:
def r2_adj(x,y):
    r2 = model.score(x,y)
    n = x.shape[0]                       # number of observations
    p = x.shape[1]                       # number of parameters in the model
    adjusted_r2 = 1-(1-r2)*(n-1)/(n-p-1) # standard formula
    return adjusted_r2

In [16]:
# adjusted R2:
r2_adj(x,y)

0.7718717161282502

In [17]:
pd.options.display.float_format = '{:.3f}'.format
reg_results = pd.DataFrame(data =['Intercept Coefficient', 'R2', 'Adjusted R2'] , columns = ['Features'])
reg_results['Value'] = [model.intercept_, model.score(x,y).round(4), r2_adj(x,y).round(3)]
reg_results

Unnamed: 0,Features,Value
0,Intercept Coefficient,-5772267.017
1,R2,0.776
2,Adjusted R2,0.772
