In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
# cost or loss function
def cost(Y, Yhat):
    return np.mean((Yhat - Y) ** 2)

In [3]:
path = '../../data/KNN_Linear_Regression/real_estate.csv'

df = pd.read_csv(path)
df.head()

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,1,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012.917,19.5,306.5947,9,24.98034,121.53951,42.2
2,3,2013.583,13.3,561.9845,5,24.98746,121.54391,47.3
3,4,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
4,5,2012.833,5.0,390.5684,5,24.97937,121.54245,43.1


In [4]:
X = df.drop(['No', 'Y house price of unit area'], axis=1)
y = df['Y house price of unit area']

X_train = X[:350]
y_train = y[:350]

X_test = X[350:]
y_test = y[350:]

y_train = y_train.to_numpy().reshape(-1, 1)
y_test = y_test.to_numpy().reshape(-1, 1)

**Huấn luyện mô hình ANN với dữ liệu tập Train**

In [5]:
d0 = 6
d1 = h = 100 # size of hidden layer
d2 = C = 1
# initialize parameters randomly
W1 = 0.01*np.random.randn(d0, d1)
b1 = np.zeros((d1, 1))
W2 = 0.01*np.random.randn(d1, d2)
b2 = np.zeros((d2, 1))

N = X_train.T.shape[1]
eta = 0.0001 # learning rate

for i in range(10000):
    ## Feedforward
    Z1 = np.dot(W1.T, X_train.T) + b1
    A1 = np.where(Z1 > 0, Z1, 0.01 * Z1)  # LeakyReLU
    Z2 = np.dot(W2.T, A1) + b2
    Yhat = Z2

    # print loss after each 1000 iterations
    if i %1000 == 0:
        # compute the loss: average cross-entropy loss
        loss = cost(y_train, Yhat)
        print("iter %d, loss: %f" %(i, loss))

    # backpropagation
    E2 = (Yhat - y_train.T)/N
    dW2 = np.dot(A1, E2.T)
    db2 = np.sum(E2, axis = 1, keepdims = True)
    E1 = np.dot(W2, E2)
    E1 = np.where(Z1 > 0, E1, 0.01 * E1)  # Gradient of LeakyReLU
    dW1 = np.dot(X_train.T, E1.T)
    db1 = np.sum(E1, axis = 1, keepdims = True)
    
    # Gradient clipping (To avoid booming gradient)
    clip_value = 1.0
    dW1 = np.clip(dW1, -clip_value, clip_value)
    dW2 = np.clip(dW2, -clip_value, clip_value)
    db1 = np.clip(db1, -clip_value, clip_value)
    db2 = np.clip(db2, -clip_value, clip_value)
    
    # Gradient Descent update
    W1 += -eta*dW1
    b1 += -eta*db1
    W2 += -eta*dW2
    b2 += -eta*db2

iter 0, loss: 1619.422158
iter 1000, loss: 295.998531
iter 2000, loss: 299.180491
iter 3000, loss: 301.723951
iter 4000, loss: 303.189345
iter 5000, loss: 303.889088
iter 6000, loss: 304.157836
iter 7000, loss: 304.311610
iter 8000, loss: 304.402162
iter 9000, loss: 304.532492


**Dự đoán của mô hình với dữ liệu tập Validation**

In [6]:
Z1 = np.dot(W1.T, X_test.T) + b1
A1 = np.maximum(Z1, 0)
Z2 = np.dot(W2.T, A1) + b2

Z2

array([[41.96357739, 25.75312647, 22.07847278, 25.84087266, 26.70103715,
        46.62580896, 48.310085  , 48.78280111, 51.35861872, 24.7123093 ,
        46.07251665, 39.66036648, 30.80807107, 45.76125999, 34.02766924,
        23.38684415, 25.09072732, 24.92590449, 43.38421364, 23.08597189,
        45.59801821, 48.06487088, 44.07576299, 49.50241604, 46.08294698,
        26.09952623, 25.30383785, 53.79028097, 34.17025009, 49.50902029,
        46.06232515, 51.26930394, 20.26121731, 37.59803667, 19.01568018,
        49.82576329, 51.28092905, 23.98399376, 24.55133797, 43.16715309,
        39.85292267, 26.01245019, 35.70488579, 30.84796604, 16.27412519,
        39.44277799, 26.56605613, 28.15542196, 25.30383785, 48.59054023,
        38.77031043, 25.18258148, 48.21948144, 44.89904281, 45.48712687,
        47.27255524, 47.50723623, 24.77083621, 23.4016113 , 19.36465131,
        52.74621577, 42.8959496 , 51.47783519, 52.54135845]])

**Trung bình bình phương sai số khi dùng ANN**

In [7]:
print("MSE:", mean_squared_error(y_test, Z2[0]))

MSE: 55.27148609519361


**Khởi tạo và huấn luyện mô hình Linear Regression**

In [8]:
linR = LinearRegression()

linR.fit(X_train, y_train)

y_pred = linR.predict(X_test)

**Trung bình bình phương sai số khi dùng Linear Regression**

In [9]:
print("MSE:", mean_squared_error(y_test, y_pred))

MSE: 65.18991450477287


**So sánh giữa ANN và Linear Regression:**

* MSE của ANN nhỏ hơn của Linear Regression.

=> Sai số trung bình của Linear Regression lớn hơn của ANN.

=> Mô hình ANN dự đoán tốt hơn.