# Linear Regression

### Data Load & Library Load

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = 'https://raw.githubusercontent.com/blackdew/tensorflow1/master/csv/boston.csv'
df = pd.read_csv(data)

In [3]:
df

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


### Data Preprocessing

In [4]:
# 독립변수와 종속변수를 나누는 작업
x = df.iloc[:,0:13]
y = df.iloc[:, -1]

In [5]:
# 절편 추가
x["intercept"]=1
x = x[x.columns[::-1]] # 역순으로 배열

In [6]:
x.head()

Unnamed: 0,intercept,lstat,b,ptratio,tax,rad,dis,age,rm,nox,chas,indus,zn,crim
0,1,4.98,396.9,15.3,296,1,4.09,65.2,6.575,0.538,0,2.31,18.0,0.00632
1,1,9.14,396.9,17.8,242,2,4.9671,78.9,6.421,0.469,0,7.07,0.0,0.02731
2,1,4.03,392.83,17.8,242,2,4.9671,61.1,7.185,0.469,0,7.07,0.0,0.02729
3,1,2.94,394.63,18.7,222,3,6.0622,45.8,6.998,0.458,0,2.18,0.0,0.03237
4,1,5.33,396.9,18.7,222,3,6.0622,54.2,7.147,0.458,0,2.18,0.0,0.06905


In [7]:
y.head()

0    24.0
1    21.6
2    34.7
3    33.4
4    36.2
Name: medv, dtype: float64

### Least Square Method

$$ \hat{\beta} = (X^TX)^{-1}X^TY $$

In [8]:
beta_hat = np.linalg.inv(x.T@x)@x.T@y
beta_hat

0     36.459488
1     -0.524758
2      0.009312
3     -0.952747
4     -0.012335
5      0.306049
6     -1.475567
7      0.000692
8      3.809865
9    -17.766611
10     2.686734
11     0.020559
12     0.046420
13    -0.108011
dtype: float64

### Comparison

In [9]:
from sklearn.linear_model import LinearRegression
mlr = LinearRegression(fit_intercept=False)
mlr.fit(x, y) 

LinearRegression(fit_intercept=False)

In [10]:
print("라이브러리: ",list(map('{:.3f}'.format,mlr.coef_)))
print("커스텀: ",list(round(beta_hat,3)))

라이브러리:  ['36.459', '-0.525', '0.009', '-0.953', '-0.012', '0.306', '-1.476', '0.001', '3.810', '-17.767', '2.687', '0.021', '0.046', '-0.108']
커스텀:  [36.459, -0.525, 0.009, -0.953, -0.012, 0.306, -1.476, 0.001, 3.81, -17.767, 2.687, 0.021, 0.046, -0.108]


### Variance & Standard Error of $\hat{\beta}$ and p-value

$$ V(\hat{\beta}) = (X^TX)^{-1} \sigma^2 $$

$$  \hat{\sigma^2} = \frac{\sum(Y_i-\hat{Y_i})^2}{n-k-1} $$

* n: 표본 개수
* k: 변수의 개수(intercept 제외)

In [11]:
y_hat = x@np.array(beta_hat)
mse = np.sum((y-y_hat)**2)/(506-14)
variance_of_beta_hat = np.linalg.inv(x.T@x)*mse
se = np.sqrt(np.diag(variance_of_beta_hat))
se

array([5.10345881e+00, 5.07152782e-02, 2.68596494e-03, 1.30826756e-01,
       3.76053645e-03, 6.63464403e-02, 1.99454735e-01, 1.32097820e-02,
       4.17925254e-01, 3.81974371e+00, 8.61579756e-01, 6.14956890e-02,
       1.37274615e-02, 3.28649942e-02])

$$ t-value = \frac{\hat{\beta}_i - 0}{SE(\hat{\beta}_i)} \sim t(n-k-1) $$

In [17]:
from scipy.stats import t
p_val=[]
for i in range(14):
    p_temp = 2*(1- t.cdf(abs(beta_hat[i]/se[i]), 506-14))
    p_val.append(round(float(p_temp),3))
print("p-value: ",list(map('{:.3f}'.format,p_val)))

p-value:  ['0.000', '0.000', '0.001', '0.000', '0.001', '0.000', '0.000', '0.958', '0.000', '0.000', '0.002', '0.738', '0.001', '0.001']


### Let's make a summary table

In [13]:
table= pd.DataFrame()
table['Variable'] = x.columns
table['coef'] = beta_hat
table['S.E']= se
table['p-value'] = p_val
table.loc[(table['p-value'] < 0.05) & (table['p-value'] >= 0.01), 'star'] = '*'  
table.loc[(table['p-value'] < 0.01) & (table['p-value'] >= 0.001), 'star'] = '**' 
table.loc[(table['p-value'] < 0.001), 'star'] = '***' 

In [14]:
table

Unnamed: 0,Variable,coef,S.E,p-value,star
0,intercept,36.459488,5.103459,0.0,***
1,lstat,-0.524758,0.050715,0.0,***
2,b,0.009312,0.002686,0.001,**
3,ptratio,-0.952747,0.130827,0.0,***
4,tax,-0.012335,0.003761,0.001,**
5,rad,0.306049,0.066346,0.0,***
6,dis,-1.475567,0.199455,0.0,***
7,age,0.000692,0.01321,0.958,
8,rm,3.809865,0.417925,0.0,***
9,nox,-17.766611,3.819744,0.0,***


In [15]:
from statsmodels.formula.api import ols
df_ols = df.copy()
df_ols['intercept'] = 1
df_ols

model = ols("medv~lstat+b+ptratio+tax+rad+dis+age+rm+chas+nox+indus+zn+crim",data=df_ols).fit()
model.summary()

0,1,2,3
Dep. Variable:,medv,R-squared:,0.741
Model:,OLS,Adj. R-squared:,0.734
Method:,Least Squares,F-statistic:,108.1
Date:,"Thu, 05 May 2022",Prob (F-statistic):,6.72e-135
Time:,17:20:26,Log-Likelihood:,-1498.8
No. Observations:,506,AIC:,3026.0
Df Residuals:,492,BIC:,3085.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,36.4595,5.103,7.144,0.000,26.432,46.487
lstat,-0.5248,0.051,-10.347,0.000,-0.624,-0.425
b,0.0093,0.003,3.467,0.001,0.004,0.015
ptratio,-0.9527,0.131,-7.283,0.000,-1.210,-0.696
tax,-0.0123,0.004,-3.280,0.001,-0.020,-0.005
rad,0.3060,0.066,4.613,0.000,0.176,0.436
dis,-1.4756,0.199,-7.398,0.000,-1.867,-1.084
age,0.0007,0.013,0.052,0.958,-0.025,0.027
rm,3.8099,0.418,9.116,0.000,2.989,4.631

0,1,2,3
Omnibus:,178.041,Durbin-Watson:,1.078
Prob(Omnibus):,0.0,Jarque-Bera (JB):,783.126
Skew:,1.521,Prob(JB):,8.84e-171
Kurtosis:,8.281,Cond. No.,15100.0
