## The Boston Housing Dataset

In [1]:
from sklearn.datasets import load_boston
import matplotlib.pyplot as plt
import numpy as np

In [2]:
boston = load_boston()
boston['data']

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
        4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
        9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
        4.0300e+00],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        5.6400e+00],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
        6.4800e+00],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        7.8800e+00]])

In [3]:
x_data = boston.data
y_data = boston.target.reshape(boston.target.size,1)

In [4]:
y_data.shape

(506, 1)

In [6]:
from sklearn import preprocessing

minmax_scale = preprocessing.MinMaxScaler(feature_range=(0,5)).fit(x_data)
x_scaled_data = minmax_scale.transform(x_data)

x_scaled_data[:3]

array([[0.00000000e+00, 9.00000000e-01, 3.39076246e-01, 0.00000000e+00,
        1.57407407e+00, 2.88752635e+00, 3.20803296e+00, 1.34601570e+00,
        0.00000000e+00, 1.04007634e+00, 1.43617021e+00, 5.00000000e+00,
        4.48399558e-01],
       [1.17961270e-03, 0.00000000e+00, 1.21151026e+00, 0.00000000e+00,
        8.64197531e-01, 2.73998850e+00, 3.91349125e+00, 1.74480990e+00,
        2.17391304e-01, 5.24809160e-01, 2.76595745e+00, 5.00000000e+00,
        1.02235099e+00],
       [1.17848872e-03, 0.00000000e+00, 1.21151026e+00, 0.00000000e+00,
        8.64197531e-01, 3.47192949e+00, 2.99691040e+00, 1.74480990e+00,
        2.17391304e-01, 5.24809160e-01, 2.76595745e+00, 4.94868627e+00,
        3.17328918e-01]])

In [16]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_scaled_data, y_data, test_size=0.33)
# 33%를 test set으로 사용하기 위해 데이터를 나눔
# 분리한 데이터를 받아오는 순서 중요

In [17]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((339, 13), (167, 13), (339, 1), (167, 1))

In [18]:
from sklearn import linear_model

regr = linear_model.LinearRegression(fit_intercept=True,
                                    normalize=False,
                                    copy_X=True,
                                    n_jobs=8)
# fit_intercept=False를 주면 절편을 0으로 설정
regr.fit(x_train, y_train)
regr

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=8, normalize=False)

In [19]:
regr.coef_, regr.intercept_  

(array([[-1.64408229,  0.8261525 ,  0.10044211,  0.50436211, -1.34556583,
          4.66996338, -0.22488571, -3.02255165,  1.22711207, -1.02373415,
         -1.58607004,  0.88012584, -3.74105608]]), array([23.18704573]))

In [20]:
print('Coefficients: ', regr.coef_)   # 기울기계수
print('intercept: ', regr.intercept_) # 절편

Coefficients:  [[-1.64408229  0.8261525   0.10044211  0.50436211 -1.34556583  4.66996338
  -0.22488571 -3.02255165  1.22711207 -1.02373415 -1.58607004  0.88012584
  -3.74105608]]
intercept:  [23.18704573]


In [21]:
regr.predict(x_data[:5])

array([[ 45.86214896],
       [ 62.05744243],
       [ 85.16297732],
       [109.87204627],
       [101.67528757]])

In [22]:
x_data[:5].dot(regr.coef_.T) + regr.intercept_  # x * a + b  => predict(x_data[:5]) 와 동일

array([[ 45.86214896],
       [ 62.05744243],
       [ 85.16297732],
       [109.87204627],
       [101.67528757]])

In [23]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [26]:
y_true = y_test
y_hat = regr.predict(x_test)

r2_score(y_true, y_hat), mean_absolute_error(y_true, y_hat), mean_squared_error(y_true, y_hat)

(0.7374913563571464, 3.252888434458642, 21.310195667580487)

In [27]:
y_true = y_train
y_hat = regr.predict(x_train)

r2_score(y_true, y_hat), mean_absolute_error(y_true, y_hat), mean_squared_error(y_true, y_hat)

(0.7368714087153716, 3.2915907542488414, 22.56786815587596)

In [28]:
regr.score(x_test, y_test)

0.7374913563571464