# 선형 회귀식의 계수를 찾는 법 - OLS VS. SGD
- 보스턴 집값 데이터 활용(RM VS Price)

### 필요한 모듈 import

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# 필요한 라이브러리 import 
from sklearn.datasets import load_boston
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

### 데이터 수집 및 분할

In [2]:
# Load the Boston housing data set and split it into train/test parts.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 - this cell presumably requires an older scikit-learn; confirm the
# pinned version before re-running.
from sklearn.datasets import load_boston

boston = load_boston()

# Keep all features in a DataFrame, labelled with the data set's feature names.
df = pd.DataFrame(data=boston.data, columns=boston.feature_names)

# Target vector, and the single "RM" feature as an (n_samples, 1) column
# because the estimators below are fitted on 2-D input.
y = boston.target
X = np.array(df["RM"])[:, np.newaxis]

# Hold out 30% of the rows for testing; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

In [15]:
def eval_score(y_test, y_pred):
    """Print MSE, RMSE and R^2 for a set of predictions.

    Args:
        y_test: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_test``.

    Returns:
        None. The three metrics are printed, each rounded to 3 decimals.
    """
    squared_err = mean_squared_error(y_test, y_pred)
    determination = r2_score(y_test, y_pred)

    # RMSE is derived from the MSE rather than recomputed from the data.
    report = (
        ("mse: ", squared_err),
        ('rmse:', np.sqrt(squared_err)),
        ('r2: ', determination),
    )
    for label, value in report:
        print(label, np.round(value, 3))

# 1. LinearRegression 모델을 사용한 경우

In [17]:
from sklearn.linear_model import LinearRegression

# Create the model object and train it in one step (fit returns the estimator).
ols = LinearRegression().fit(X_train, y_train)

# Inspect the coefficient and intercept: attributes with a trailing underscore
# are the ones determined by training.
ols_slope = ols.coef_[0]
ols_intercept = ols.intercept_
print(ols.coef_, ols_intercept)

# Regression equation (intercept in parentheses so a negative value reads cleanly).
print("y = {:.3f}X + ({:.3f})".format(ols_slope, ols_intercept))

# Predict on the held-out data and report MSE, RMSE and r2_score.
y_pred = ols.predict(X_test)
eval_score(y_test, y_pred)


[8.46109164] -30.571032410898336
y = 8.461X + (-30.571)
mse:  36.517
rmse: 6.043
r2:  0.602


# 2. SGDRegressor with hyperparameter

In [23]:
from sklearn.linear_model import SGDRegressor

# Create the model object.
# random_state is fixed because SGD shuffles the training data; without a seed
# the printed coefficients and scores change on every run of this cell.
sgd = SGDRegressor(random_state=42)

# Train the model on the raw (unscaled) feature.
sgd.fit(X_train, y_train)

# Inspect the coefficient and intercept: attributes with a trailing underscore
# are the ones determined by training.
print(sgd.coef_, sgd.intercept_)

# Regression equation
print("y = {:.3f}X + {:.3f}".format(sgd.coef_[0], sgd.intercept_[0]))

# Run predictions on the held-out data
y_pred = sgd.predict(X_test)

# MSE, RMSE, r2_score
eval_score(y_test, y_pred)

[3.98054487] [-2.76201309]
y = 3.981X + -2.762
mse:  56.636
rmse: 7.526
r2:  0.382


# 3. SGDRegressor with Scaling: 보스턴 집값

In [24]:
# Scaling (standardization), done by hand.
# The mean and standard deviation come from the training data only, and the
# same statistics are reused for the test data so no test information leaks in.
train_mean = np.mean(X_train, axis=0)
train_std = np.std(X_train, axis=0)

X_train_scaled = (X_train - train_mean) / train_std
X_test_scaled = (X_test - train_mean) / train_std

from sklearn.linear_model import SGDRegressor

# Create the model object.
# random_state is fixed because SGD shuffles the training data; without a seed
# the printed coefficients and scores change on every run, which makes this
# cell hard to compare against the other SGD variants.
sgd = SGDRegressor(random_state=42)

# Train the model on the standardized feature
sgd.fit(X_train_scaled, y_train)

# Inspect the coefficient and intercept: attributes with a trailing underscore
# are the ones determined by training.
print(sgd.coef_, sgd.intercept_)

# Regression equation (X here is the standardized feature, not the raw one)
print("y = {:.3f}X + {:.3f}".format(sgd.coef_[0], sgd.intercept_[0]))

# Run predictions on the standardized held-out data
y_pred = sgd.predict(X_test_scaled)

# MSE, RMSE, r2_score
eval_score(y_test, y_pred)

[5.8645052] [22.35825945]
y = 5.865X + 22.358
mse:  36.49
rmse: 6.041
r2:  0.602


In [25]:
# Display the raw (unscaled) training feature array produced by train_test_split.
X_train

array([[5.949],
       [5.966],
       [6.794],
       [6.595],
       [3.561],
       [6.538],
       [6.064],
       [6.631],
       [7.82 ],
       [8.78 ],
       [6.718],
       [6.006],
       [5.648],
       [6.23 ],
       [6.516],
       [6.38 ],
       [5.693],
       [6.302],
       [6.513],
       [5.783],
       [6.047],
       [7.079],
       [5.713],
       [6.245],
       [6.782],
       [5.88 ],
       [7.489],
       [5.885],
       [6.164],
       [6.405],
       [6.657],
       [7.107],
       [5.52 ],
       [5.986],
       [6.226],
       [7.691],
       [5.927],
       [5.87 ],
       [5.889],
       [6.525],
       [5.856],
       [7.313],
       [6.03 ],
       [6.081],
       [6.943],
       [5.968],
       [7.831],
       [5.875],
       [6.219],
       [4.628],
       [6.375],
       [6.297],
       [5.898],
       [6.678],
       [7.007],
       [6.041],
       [6.326],
       [5.879],
       [5.859],
       [6.376],
       [6.739],
       [6.096],
       [

# 4. SGD with StandardScaler()

In [27]:
# SGD with StandardScaler()
# Scaling (standardization)

# StandardScaler code
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training data only, then apply the same learned
# statistics to the test data so no test information leaks in.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

from sklearn.linear_model import SGDRegressor

# Create the model object.
# random_state is fixed because SGD shuffles the training data; without a seed
# the printed coefficients and scores change on every run, which makes this
# cell hard to compare against the other SGD variants.
sgd = SGDRegressor(random_state=42)

# Train the model on the standardized feature
sgd.fit(X_train_scaled, y_train)

# Inspect the coefficient and intercept: attributes with a trailing underscore
# are the ones determined by training.
print(sgd.coef_, sgd.intercept_)

# Regression equation (X here is the standardized feature, not the raw one)
print("y = {:.3f}X + {:.3f}".format(sgd.coef_[0], sgd.intercept_[0]))

# Run predictions on the standardized held-out data
y_pred = sgd.predict(X_test_scaled)

# MSE, RMSE, r2_score
eval_score(y_test, y_pred)

[5.82195706] [22.34514014]
y = 5.822X + 22.345
mse:  36.613
rmse: 6.051
r2:  0.601


# 5. Pipeline with StandardScaler, LinearRegression, SGDRegressor

In [28]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, r2_score

# Chain standardization and SGD regression into a single estimator.
# The loss is left at its default (squared error): the explicit name
# 'squared_loss' was deprecated in scikit-learn 1.0 and removed in 1.2,
# whereas relying on the default selects the same loss on every version.
reg = make_pipeline(
    StandardScaler(),
    SGDRegressor(max_iter=1000000, eta0=0.01, tol=0.0001, random_state=42),
)
reg.fit(X_train, y_train)

# Inspect the coefficient and intercept: attributes with a trailing underscore
# are the ones determined by training.
print(reg[1].coef_, reg[1].intercept_)

# Regression equation - because a pipeline is used, the SGDRegressor (and its
# fitted parameters) sits at index 1 of the reg object. X in the printed
# equation is the standardized feature produced by the scaler step.
print("y = {:.3f}X + {:.3f}".format(reg[1].coef_[0], reg[1].intercept_[0]))

# Run predictions; the pipeline scales X_test itself before predicting.
y_pred = reg.predict(X_test)

# MSE, RMSE, r2_score
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("MSE:", np.round(mse, 3))
print("RMSE: ", np.round(rmse, 3))
print("R2: ", np.round(r2, 3))

[5.84750366] [22.31897879]
y = 5.847504X + 22.319
MSE: 36.523
RMSE:  6.043
R2:  0.602




######################################여기까지#############################