In [51]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

from sklearn.datasets import fetch_california_housing

dataset = fetch_california_housing()

print("Số chiều dữ liệu input: ", dataset.data.shape)
print("Số chiều dữ liệu target: ", dataset.target.shape)
print()
print("5 mẫu dữ liệu đầu tiên:")
print("input: ", dataset.data[:5])
print("target: ",dataset.target[:5])



Số chiều dữ liệu input:  (20640, 8)
Số chiều dữ liệu target:  (20640,)

5 mẫu dữ liệu đầu tiên:
input:  [[ 8.32520000e+00  4.10000000e+01  6.98412698e+00  1.02380952e+00
   3.22000000e+02  2.55555556e+00  3.78800000e+01 -1.22230000e+02]
 [ 8.30140000e+00  2.10000000e+01  6.23813708e+00  9.71880492e-01
   2.40100000e+03  2.10984183e+00  3.78600000e+01 -1.22220000e+02]
 [ 7.25740000e+00  5.20000000e+01  8.28813559e+00  1.07344633e+00
   4.96000000e+02  2.80225989e+00  3.78500000e+01 -1.22240000e+02]
 [ 5.64310000e+00  5.20000000e+01  5.81735160e+00  1.07305936e+00
   5.58000000e+02  2.54794521e+00  3.78500000e+01 -1.22250000e+02]
 [ 3.84620000e+00  5.20000000e+01  6.28185328e+00  1.08108108e+00
   5.65000000e+02  2.18146718e+00  3.78500000e+01 -1.22250000e+02]]
target:  [4.526 3.585 3.521 3.413 3.422]


In [52]:
X = dataset.data
X_train = X[:16511]
y_train = dataset.target[:16511]
X_test = X[16512:]
y_test = dataset.target[16512:]


In [56]:
def linear_regression(X_train, y_train):
    
    X_train = np.hstack((np.ones((X_train.shape[0], 1)), X_train))
    
    A = np.dot(X_train.T, X_train)
    A = np.linalg.pinv(A)
    B = np.dot(X_train.T, y_train)
    
    w = np.dot(A, B)
    coef = w[1:]
    intercept = w[0]
    
    return coef, intercept

def predict(coef, intercept, X_test):
    y_pred = []

    for i in X_test:
        pred = intercept + coef.dot(i.T)
        y_pred.append(pred)

    return y_pred

In [62]:
coef_, intercept_ = linear_regression(X_train, y_train)
print("[w1, ... w_n] = ", coef)
print("w0 = ", intercept)

[w1, ... w_n] =  [ 4.47862520e-01  9.40946426e-03 -1.20984752e-01  7.10381374e-01
 -1.46681761e-06 -8.81535236e-03 -4.21709305e-01 -4.29758954e-01]
w0 =  -36.388862089930626


In [66]:
y_pred = predict(coef, intercept, X_test)

pd.DataFrame(data = np.array([y_test, y_pred, abs(y_test - y_pred)]).T,
                             columns = ["Thực tế", "Dự đoán", "Lệch"])

Unnamed: 0,Thực tế,Dự đoán,Lệch
0,1.656,1.316282,0.339718
1,1.261,1.587536,0.326536
2,0.944,1.117385,0.173385
3,0.919,1.195473,0.276473
4,1.243,1.363545,0.120545
...,...,...,...
4123,0.781,0.106467,0.674533
4124,0.771,0.530342,0.240658
4125,0.923,0.143771,0.779229
4126,0.847,0.295032,0.551968


In [68]:
loss = np.sqrt(mean_squared_error(y_test, y_pred))
print(loss)

0.7033437845420468


In [71]:
from sklearn.linear_model import LinearRegression

reg_lib = LinearRegression()
reg_lib.fit(X_train, y_train)

print("[w1, ... w_n] = ", reg_lib.coef_)
print("w0 = ", reg_lib.intercept_)

[w1, ... w_n] =  [ 4.47862520e-01  9.40946424e-03 -1.20984752e-01  7.10381373e-01
 -1.46681765e-06 -8.81535235e-03 -4.21709306e-01 -4.29758955e-01]
w0 =  -36.38886219765743


In [73]:
y_pred = reg_lib.predict(X_test)

pd.DataFrame(data = np.array([y_test, y_pred, abs(y_test - y_pred)]).T,
                             columns = ["Thực tế", "Dự đoán", "Lệch"])


Unnamed: 0,Thực tế,Dự đoán,Lệch
0,1.656,1.316282,0.339718
1,1.261,1.587536,0.326536
2,0.944,1.117385,0.173385
3,0.919,1.195473,0.276473
4,1.243,1.363545,0.120545
...,...,...,...
4123,0.781,0.106467,0.674533
4124,0.771,0.530342,0.240658
4125,0.923,0.143771,0.779229
4126,0.847,0.295032,0.551968


In [74]:
reg_lib_pred = reg_lib.predict(X_test)

loss = np.sqrt(mean_squared_error(y_test, reg_lib_pred))

print(loss)

0.7033437844973904
