## Import library

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

## Read data and split to train data and validata data

In [2]:
def read_data(path):

    data = pd.read_csv(path)
    x = data.loc[:, ~data.columns.isin(['No', 'Y house price of unit area'])]
    x['X1 transaction date'] = x["X1 transaction date"].apply(np.floor)
    x['X2 house age'] = x["X2 house age"].apply(np.floor)

    y = data['Y house price of unit area']

    x = np.asarray(x)
    y = np.asarray(y)

    train_data = x[:350]
    train_label = y[:350]

    validate_data = x[350:]
    validate_label = y[350:]

    return train_data, train_label, validate_data, validate_label

path = "./real_estate.csv"
train_data, train_label, validate_data, validate_label = read_data(path)

print("\n train-data : ",train_data)
print("\n train-label",train_label)
print("\n test-data : ",validate_data)
print("\n test-label : ",validate_label)


 train-data :  [[2.0120000e+03 3.2000000e+01 8.4878820e+01 1.0000000e+01 2.4982980e+01
  1.2154024e+02]
 [2.0120000e+03 1.9000000e+01 3.0659470e+02 9.0000000e+00 2.4980340e+01
  1.2153951e+02]
 [2.0130000e+03 1.3000000e+01 5.6198450e+02 5.0000000e+00 2.4987460e+01
  1.2154391e+02]
 ...
 [2.0130000e+03 1.7000000e+01 6.4880210e+03 1.0000000e+00 2.4957190e+01
  1.2147353e+02]
 [2.0120000e+03 4.0000000e+00 2.5966070e+02 6.0000000e+00 2.4975850e+01
  1.2154516e+02]
 [2.0120000e+03 7.0000000e+00 1.0481010e+02 5.0000000e+00 2.4966740e+01
  1.2154067e+02]]

 train-label [ 37.9  42.2  47.3  54.8  43.1  32.1  40.3  46.7  18.8  22.1  41.4  58.1
  39.3  23.8  34.3  50.5  70.1  37.4  42.3  47.7  29.3  51.6  24.6  47.9
  38.8  27.   56.2  33.6  47.   57.1  22.1  25.   34.2  49.3  55.1  27.3
  22.9  25.3  47.7  46.2  15.9  18.2  34.7  34.1  53.9  38.3  42.   61.5
  13.4  13.2  44.2  20.7  27.   38.9  51.7  13.7  41.9  53.5  22.6  42.4
  21.3  63.2  27.7  55.   25.3  44.3  50.7  56.8  36.2  42.   59.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['X1 transaction date'] = x["X1 transaction date"].apply(np.floor)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['X2 house age'] = x["X2 house age"].apply(np.floor)


## QR

In [3]:
def qr_householder(A):
    # Compute QR decomposition of A using Householder reflection"
    M = A.shape[0]
    N = A.shape[1]

    # set Q to the identity matrix
    Q = np.identity(M)

    # set R to zero matrix
    R = np.copy(A)

    for n in range(N):
        # vector to transform
        x = A[n:, n]
        k = x.shape[0]

        # compute ro=-sign(x0)||x||
        ro = -np.sign(x[0]) * np.linalg.norm(x)

        # compute the householder vector v
        e = np.zeros(k)
        e[0] = 1
        v = (1 / (x[0] - ro)) * (x - (ro * e))

    # apply v to each column of A to find R
    for i in range(N):
        R[n:, i] = R[n:, i] - (2 / (v@v)) * ((np.outer(v, v)) @ R[n:, i])

    # apply v to each column of Q
    for i in range(M):
        Q[n:, i] = Q[n:, i] - (2 / (v@v)) * ((np.outer(v, v)) @ Q[n:, i])

    return Q.transpose(), R

## Linear regression

In [4]:
def linear_regression(train_data, train_label):
    x_bars = np.concatenate((np.ones((train_data.shape[0], 1)), train_data), axis=1)

    Q, R = qr_householder(x_bars) # QR decomposition
    R_pinv = np.linalg.pinv(R) # calculate inverse matrix of R
    A = np.dot(R_pinv, Q.T) # apply formula

    return np.dot(A, train_label)

## Run program 

In [5]:
w = linear_regression(train_data, train_label)
w = w.T.tolist()
print("Regression coef:\n")

line = ['Intercept', 'Transaction date', 'House age', 'Distance to the nearest MRT station', 'Number of convenience stores', 'Latitude', 'Longitude']
res = list(zip(line, w))
for o in res:
    print("{: >20}: {: >10}".format(*o))

Regression coef:

           Intercept: -10820.312080385853
    Transaction date: 2.878020659739805
           House age: -0.2885589413556099
Distance to the nearest MRT station: -0.004036629677440811
Number of convenience stores: 1.1727462062289342
            Latitude: 265.1488562997572
           Longitude: -12.755398675992446


## Run program with validate data and show mean square error

In [7]:
y_pre = [w[0]]*len(validate_data)
for val_index in range(0, len(validate_data)):
    for i in range(0,len(validate_data[val_index])):
        y_pre[val_index] += w[i+1]*validate_data[val_index][i]

mse = mean_squared_error(validate_label, y_pre)
print("validate data predict:\n",y_pre)
print("\nMean square error is:\n",mse)

validate data predict:
 [42.49237846043161, 32.863950098506166, 26.400027552639585, 35.80287870360962, 30.96948193631124, 49.89482972840392, 39.40824835324497, 52.36327375111932, 48.43694285506763, 28.237795620186716, 46.32657188638905, 40.93812803446235, 44.5388402171975, 48.79215090450043, 41.713313652814804, 29.583620363292766, 25.544776711832128, 30.271868215888617, 39.73650692034971, 28.22250341292238, 44.2540755260552, 43.33540816052482, 40.65852461957911, 45.078688754112136, 48.916844781819236, 30.542237869084147, 33.82689565051078, 48.87991443815122, 40.07479366978737, 51.38733578259303, 47.420655127150894, 54.77455165073252, 15.004224653407618, 37.2757191820956, 13.008643801449125, 53.261732418123756, 40.390296659954174, 31.81292338610342, 33.90658287574047, 40.288676087542854, 43.77069440716423, 30.1617247947338, 39.21883347175117, 44.82739915855291, 15.558403779918308, 38.38430045345058, 27.918551829692433, 45.64936975223054, 33.82689565051078, 38.87786238610124, 41.19734672