# Import Necessary Libraries

In [1]:
import numpy as np
from sklearn.datasets import load_boston
from sklearn.preprocessing import StandardScaler

# Load data

In [2]:
# Load the Boston housing dataset and split it into the feature matrix X
# and the target vector Y.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the pinned scikit-learn version before re-running this cell.
boston = load_boston()
X, Y = boston.data, boston.target

In [3]:
# Show the dimensions of the feature matrix as (rows, columns).
X.shape

(506, 13)

In [4]:
# Display the raw (unscaled) feature matrix; NumPy abbreviates the repr of large arrays.
X

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
        4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
        9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
        4.0300e+00],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        5.6400e+00],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
        6.4800e+00],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        7.8800e+00]])

# Apply Standard scaler

### Feature scaling brings the independent variables (features) of a dataset onto a common range.
### It is usually applied as a data pre-processing step.
### The raw features here have very different ranges, and gradient descent converges slowly (or diverges) on unscaled features, so scaling is an important step before running it.

In [9]:
# Fit a StandardScaler on the whole feature matrix and store the transformed
# features in X_scaled; the fitted scaler is kept in `scaler`.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Gradient Descent Implementation

### Hypothesis function
$$
    h_{\theta}(x) = \theta^{T}x
$$

In [10]:
def hypothesis(x , params):
    """Evaluate the linear hypothesis for one sample.

    Multiplies ``x`` and ``params`` element-wise and returns the sum of all
    the products.

    Parameters
    ----------
    x : array-like
        Feature values of a single sample (including any bias term).
    params : array-like
        Model parameters, broadcast-compatible with ``x``.

    Returns
    -------
    scalar
        Sum of the element-wise products.
    """
    weighted_terms = x * params
    return np.sum(weighted_terms)

### Gradient Descent
$$
\theta_j := \theta_j - \frac{\alpha}{m} \sum_{i = 1}^{m} \left(h_{\theta}(x^{(i)}) - y^{(i)}\right) x_{j}^{(i)}
$$

In [11]:
def gradientDescent(learningRate , n_iter , X_train , y_train , n_feature):
    """Fit a linear model with batch gradient descent.

    A constant column of ones is appended to ``X_train`` so the intercept is
    learned as the LAST entry of the returned parameter vector. All
    parameters start at 1 and are updated ``n_iter`` times with

        theta <- theta - (learningRate / m) * F.T @ (F @ theta - y)

    where ``F`` is the feature matrix including the bias column and ``m`` is
    the number of training samples.

    Parameters
    ----------
    learningRate : float
        Step size applied to the averaged gradient.
    n_iter : int
        Number of full-batch update steps; 0 returns the initial parameters.
    X_train : array-like of shape (m, n_feature)
        Training features, without a bias column.
    y_train : array-like of shape (m,) or (m, 1)
        Training targets; flattened to one dimension before use.
    n_feature : int
        Number of feature columns; must equal ``X_train.shape[1]``.

    Returns
    -------
    numpy.ndarray of shape (n_feature + 1,)
        Feature weights followed by the intercept.

    Raises
    ------
    ValueError
        If ``X_train`` is not a non-empty 2-D array, or if ``n_feature`` or
        the length of ``y_train`` does not match ``X_train``.
    """
    samples = np.asarray(X_train, dtype=float)
    # Flatten so a column-vector target cannot broadcast the residual to (m, m).
    targets = np.asarray(y_train, dtype=float).reshape(-1)

    if samples.ndim != 2 or samples.shape[0] == 0:
        raise ValueError("X_train must be a non-empty 2-D array")
    trainSize = samples.shape[0]
    if samples.shape[1] != n_feature:
        raise ValueError(
            "n_feature (%d) does not match X_train.shape[1] (%d)"
            % (n_feature, samples.shape[1])
        )
    if targets.shape[0] != trainSize:
        raise ValueError("y_train must contain one target per row of X_train")

    # Trailing column of ones: its weight acts as the intercept.
    features = np.hstack([samples, np.ones((trainSize, 1))])
    parameters = np.ones(n_feature + 1)
    step = learningRate / trainSize  # loop-invariant scale factor

    for _ in range(n_iter):
        # Residuals for every sample at once, then the summed gradient for
        # every parameter at once (replaces the per-row/per-column Python calls).
        residuals = features @ parameters - targets
        parameters = parameters - step * (features.T @ residuals)
    return parameters

# Run the algorithm

In [12]:
# Train on the standardized features; the returned vector holds one weight per
# feature followed by the intercept.
linregParams = gradientDescent(
    learningRate=0.001,
    n_iter=3500,
    X_train=X_scaled,
    y_train=Y,
    n_feature=X_scaled.shape[1],
)

In [13]:
def predict(x , params):
    """Predict the target for one sample from fitted parameters.

    Parameters
    ----------
    x : array-like
        Feature values of a single sample (no bias term).
    params : array-like
        Fitted parameters: the feature weights followed by the intercept as
        the last entry.

    Returns
    -------
    scalar
        Sum of the element-wise products of ``x`` and the weights, plus the
        intercept.
    """
    weights, bias = params[:-1], params[-1]
    weighted_sum = np.sum(x * weights)
    return weighted_sum + bias

In [14]:
# Call predict on every row (axis=1) of the scaled features, passing the fitted
# parameters as the extra argument.
predictions = np.apply_along_axis(predict , 1 , X_scaled , linregParams)

# Calculate MSE

In [15]:
# Mean squared error of the predictions against the targets. The predictions
# were made on the same data the model was trained on, so this is an
# in-sample (training) error.
((predictions - Y) ** 2).mean()

24.20431120057739