# 선형 회귀식의 계수를 찾는 법 - OLS VS. SGD
- 보스턴 집값 데이터 활용(RM VS Price)

### 필요한 모듈 import

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# 필요한 라이브러리 import 
from sklearn.datasets import load_boston
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

### 데이터 수집 및 분할

In [3]:
from sklearn.datasets import load_boston

# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older scikit-learn release -- confirm the environment.
boston = load_boston()

# Wrap the feature matrix in a DataFrame so a column can be selected by name.
df = pd.DataFrame(data=boston.data, columns=boston.feature_names)

# Single predictor: the RM column reshaped to (n_samples, 1); target is boston.target.
x = np.array(df['RM']).reshape(-1, 1)
y = boston.target

# Hold out 30% of the rows for testing; random_state=1 fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=1
)

In [6]:
def eval_score(y_test, y_pred):
    """Print MSE, RMSE and R^2 for a set of predictions.

    Args:
        y_test: True target values.
        y_pred: Predicted target values, aligned with ``y_test``.

    Returns:
        None. The three metrics are printed, each rounded to 3 decimals.
    """
    squared_error = mean_squared_error(y_test, y_pred)
    # Compute every metric before printing anything.
    metrics = (
        ('mse : ', squared_error),
        ('rmse : ', np.sqrt(squared_error)),  # RMSE is the square root of MSE
        ('r2 : ', r2_score(y_test, y_pred)),
    )
    for label, value in metrics:
        print(label, np.round(value, 3))

# 1. LinearRegression 모델을 사용한 경우

In [7]:
from sklearn.linear_model import LinearRegression

# Create the LinearRegression model and fit it on the training split.
ols = LinearRegression().fit(x_train, y_train)

# Coefficient and intercept: attributes ending in "_" are determined by fitting.
print(ols.coef_, ols.intercept_)

# Fitted regression equation.
print('y = {:.3f}x + {:.3f}'.format(ols.coef_[0], ols.intercept_))

# Predict on the held-out test split.
y_pred = ols.predict(x_test)

# Report MSE, RMSE and R^2.
eval_score(y_test, y_pred)

[8.46109164] -30.571032410898336
y = 8.461x + -30.571
mse :  36.517
rmse :  6.043
r2 :  0.602


# 2. SGDRegressor with hyperparameter

In [15]:
from sklearn.linear_model import SGDRegressor

# Create an SGDRegressor with default settings and fit it on the training
# split. The feature is passed in as-is (no scaling) in this cell.
# NOTE(review): no random_state is passed, so the fitted values can differ
# from run to run.
sgd = SGDRegressor().fit(x_train, y_train)

# Coefficient and intercept: attributes ending in "_" are determined by fitting.
print(sgd.coef_, sgd.intercept_)

# Fitted regression equation (the intercept is indexed with [0] here).
print('y = {:.3f}x + {:.3f}'.format(sgd.coef_[0], sgd.intercept_[0]))

# Predict on the held-out test split.
y_pred = sgd.predict(x_test)

# Report MSE, RMSE and R^2.
eval_score(y_test, y_pred)

[4.4667249] [-4.53246863]
y = 4.467x + -4.532
mse :  54.046
rmse :  7.352
r2 :  0.41


# 3. SGDRegressor with Scaling: 보스턴집값

In [16]:
# Standardization (z-score scaling) computed by hand.
train_mean = x_train.mean(axis=0)
train_std = x_train.std(axis=0)

# The training-set statistics are the reference: the exact same transform
# must also be applied to the test set.
x_train_scaled = (x_train - train_mean) / train_std
x_test_scaled = (x_test - train_mean) / train_std

# Create the model and fit it on the standardized training data.
sgd = SGDRegressor().fit(x_train_scaled, y_train)

# Coefficient and intercept: attributes ending in "_" are determined by fitting.
print(sgd.coef_, sgd.intercept_)

# Fitted regression equation (in terms of the standardized feature).
print('y = {:.3f}x + {:.3f}'.format(sgd.coef_[0], sgd.intercept_[0]))

# Predict on the standardized test data.
y_pred = sgd.predict(x_test_scaled)

# Report MSE, RMSE and R^2.
eval_score(y_test, y_pred)

[5.80646545] [22.32968233]
y = 5.806x + 22.330
mse :  36.654
rmse :  6.054
r2 :  0.6


# 4. SGD with StandardScaler()

In [18]:
# SGD with StandardScaler()
# Standardization (z-score scaling) done by scikit-learn instead of by hand.

# StandardScaler is used through its fit / transform methods.
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Create the model and fit it on the standardized training data.
sgd = SGDRegressor().fit(x_train_scaled, y_train)

# Coefficient and intercept: attributes ending in "_" are determined by fitting.
print(sgd.coef_, sgd.intercept_)

# Fitted regression equation (in terms of the standardized feature).
print('y = {:.3f}x + {:.3f}'.format(sgd.coef_[0], sgd.intercept_[0]))

# Predict on the standardized test data.
y_pred = sgd.predict(x_test_scaled)

# Report MSE, RMSE and R^2.
eval_score(y_test, y_pred)

[5.87188533] [22.34340748]
y = 5.872x + 22.343
mse :  36.461
rmse :  6.038
r2 :  0.602


# 5. Pipeline with StandardScaler, LinearRegression, SGDRegressor

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Chain standardization and SGD regression into one estimator, so the scaler
# is fitted as part of fit() and re-applied automatically in predict().
reg = make_pipeline(StandardScaler(),
                    SGDRegressor(max_iter=100000, eta0=0.01))

# Train the whole pipeline on the raw (unscaled) training split.
reg.fit(x_train, y_train)

# Coefficient and intercept: attributes ending in "_" are determined by fitting.
# Because make_pipeline() was used, the SGDRegressor sits at index 1 of `reg`.
print(reg[1].coef_, reg[1].intercept_)

# Fitted regression equation (in terms of the standardized feature).
print('y = {:.3f}x + {:.3f}'.format(reg[1].coef_[0], reg[1].intercept_[0]))

# Predict on the raw test split; the pipeline scales it internally.
y_pred = reg.predict(x_test)

# Report MSE, RMSE and R^2.
eval_score(y_test, y_pred)


######################################여기까지#############################