In [1]:
from sklearn.datasets import load_boston
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import pandas as pd

In [2]:
## load the Boston housing dataset through scikit-learn's bundled loader
## NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
## so this cell only runs on older releases -- confirm the pinned version.
boston_dataset = load_boston()
## (n_samples, n_features) of the raw feature matrix
boston_dataset.data.shape

(506, 13)

In [3]:
## degree-2 polynomial expansion: squares and pairwise products of the inputs;
## the generated matrix also carries a constant "1" column
polynomial_transformer = PolynomialFeatures(degree=2)

In [4]:
## expand the raw features into every term of the degree-2 polynomial
polynomial_data = polynomial_transformer.fit_transform(X=boston_dataset.data)

In [5]:
## sanity check of the expanded matrix: 13 raw features become 105 columns
## (1 constant + 13 linear + 91 degree-2 terms)
polynomial_data.shape

(506, 105)

In [6]:
## build the column name of every generated polynomial term
## get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
## favour of get_feature_names_out(); use the new accessor when the installed
## release has it and fall back to the old one otherwise.
if hasattr(polynomial_transformer, "get_feature_names_out"):
    ## the new accessor returns an array; list(...) keeps the result a plain list
    polynomial_features = list(
        polynomial_transformer.get_feature_names_out(boston_dataset.feature_names)
    )
else:
    polynomial_features = polynomial_transformer.get_feature_names(
        boston_dataset.feature_names
    )

In [7]:
## assemble the design matrix X: one named column per polynomial term
X = pd.DataFrame(data=polynomial_data, columns=polynomial_features)

In [8]:
## preview the design matrix (the rendered table elides middle rows and columns)
X

Unnamed: 0,1,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,...,TAX^2,TAX PTRATIO,TAX B,TAX LSTAT,PTRATIO^2,PTRATIO B,PTRATIO LSTAT,B^2,B LSTAT,LSTAT^2
0,1.0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,...,87616.0,4528.8,117482.40,1474.08,234.09,6072.570,76.194,157529.6100,1976.5620,24.8004
1,1.0,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,...,58564.0,4307.6,96049.80,2211.88,316.84,7064.820,162.692,157529.6100,3627.6660,83.5396
2,1.0,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,...,58564.0,4307.6,95064.86,975.26,316.84,6992.374,71.734,154315.4089,1583.1049,16.2409
3,1.0,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,...,49284.0,4151.4,87607.86,652.68,349.69,7379.581,54.978,155732.8369,1160.2122,8.6436
4,1.0,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,...,49284.0,4151.4,88111.80,1183.26,349.69,7422.030,99.671,157529.6100,2115.4770,28.4089
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,1.0,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,...,74529.0,5733.0,107013.27,2639.91,441.00,8231.790,203.070,153656.1601,3790.5433,93.5089
502,1.0,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,...,74529.0,5733.0,108353.70,2478.84,441.00,8334.900,190.680,157529.6100,3603.8520,82.4464
503,1.0,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,...,74529.0,5733.0,108353.70,1539.72,441.00,8334.900,118.440,157529.6100,2238.5160,31.8096
504,1.0,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,...,74529.0,5733.0,107411.85,1769.04,441.00,8262.450,136.080,154802.9025,2549.5560,41.9904


In [9]:
## target vector, kept as a single-column frame named MEDV
y = pd.DataFrame({'MEDV': boston_dataset.target})
y

Unnamed: 0,MEDV
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2
...,...
501,22.4
502,20.6
503,23.9
504,22.0


In [10]:
## hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=5,
)

In [11]:
## ordinary least-squares model, fitted on the training split only
model = LinearRegression()
model.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [12]:
## fitted parameters: the coefficient matrix first, then the intercept (theta_0)
print(model.coef_, model.intercept_, sep="\n")

[[ 2.55996369e-07 -5.09146959e+00 -1.65753983e-01 -5.97358604e+00
   2.43179263e+01  1.65180559e+02  2.19910116e+01  1.03167123e+00
  -5.66895775e+00  3.22443249e+00 -1.10055942e-02  5.35127787e+00
  -4.81524409e-02  7.53109325e-01  2.16774682e-03  2.69938772e-01
   5.87901385e-01  2.41731932e+00 -2.52413195e-02  8.92859572e-02
  -5.18832420e-03 -5.77807152e-02  3.55602049e-01 -3.86092281e-02
   5.43572101e-01 -3.18134358e-04  2.40035425e-02 -7.48850220e-04
  -7.16133310e-03 -1.06886010e-01 -1.27782609e+00  2.50137719e-02
   1.14111417e-04 -1.25254119e-02 -4.68024813e-03  6.05725185e-04
  -8.57873132e-03  1.85030053e-03 -4.64730601e-03  3.08484808e-02
  -2.09065897e-01  1.30035723e+00  3.13497405e-01  6.72540164e-04
   7.51823883e-02 -7.38014889e-03  4.23364348e-04 -6.72155118e-03
   6.42107774e-03 -5.32275093e-03  2.43179258e+01 -1.84845896e+01
  -6.89090796e+00  3.60375828e-02  3.05451225e+00 -4.09746374e-01
   2.34143012e-02 -8.47140007e-01  2.67079534e-02 -4.67786369e-01
  -4.67850

In [13]:
## predict the target for the held-out test rows with the fitted model
y_pred = model.predict(X=X_test)
y_pred

array([[42.00919669],
       [28.22372855],
       [25.52688254],
       [ 9.05454749],
       [33.5903981 ],
       [10.49656654],
       [23.0597188 ],
       [30.35330832],
       [24.2246103 ],
       [22.30153137],
       [33.11791368],
       [20.74987646],
       [20.19664017],
       [32.37012974],
       [27.35813074],
       [20.46264672],
       [13.68533394],
       [12.51890372],
       [15.88730519],
       [12.47719001],
       [ 3.72827179],
       [20.49819423],
       [44.0935012 ],
       [23.31624023],
       [33.2791559 ],
       [ 9.43400658],
       [24.71325022],
       [21.79459244],
       [24.06481669],
       [27.42603119],
       [15.32893991],
       [ 6.80742071],
       [16.76243454],
       [13.13446141],
       [25.10746984],
       [22.92666537],
       [29.58310464],
       [10.66362649],
       [47.75889196],
       [35.24353036],
       [19.90943076],
       [15.25028015],
       [28.15702648],
       [14.02415349],
       [26.11171521],
       [28

In [14]:
## evaluate the model's predictive power with RMSE on the test split
## scikit-learn metrics take (y_true, y_pred) in that order; plain MSE gives the
## same value either way, but the documented order stays correct if the metric
## or its options are changed later.
mean_squared_error(y_test, y_pred) ** 0.5

3.1965276512557117