Modify the regression scratch code in our lecture such that:

- Implement early stopping in which if the absolute difference between old loss and new loss does not exceed certain threshold, we abort the learning.

- Implement options for stochastic gradient descent in which we use only one sample for training.  Make sure that sample does not repeat unless all samples are read at least once already.

- Put everything into class.

In [51]:
from sklearn.datasets import load_boston
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np

boston = load_boston()
X = boston.data
y = boston.target
m = X.shape[0]  
n = X.shape[1] 

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)


intercept = np.ones((X_train.shape[0], 1))
X_train = np.concatenate((intercept, X_train), axis=1)
intercept = np.ones((X_test.shape[0], 1))
X_test = np.concatenate((intercept, X_test), axis=1)

class LinearRegression:
    
    def __init__(self, alpha=0.001, max_iter=10000, 
            loss_old=10000, tol=1e-5, method="batch",size=10):
        self.alpha = alpha
        self.max_iter = max_iter
        self.loss_old = loss_old
        self.tol = tol
        self.method = method
        self.size  = size
        
   
        
    def h_theta(self, X):
        return X @ self.theta

    def mse(self, yhat, y):
        return ((yhat - y)**2 / yhat.shape[0]).sum()

    def gradient(self, X, error):
        return X.T @ error 
    def fit(self, X, y):
        self.theta = np.zeros(X.shape[1])
        iter_stop = 0
        mem_i = [] 
        for i in range(self.max_iter):
            if self.method == "batch":
                X_train = X
                y_train = y         
            
            elif self.method == 'sto':
                i = np.random.randint(X.shape[0])
                if i in mem_i:
                    i = np.random.randint(X.shape[0])
                X_train = X[i,:].reshape(1,-1)
                y_train = y[i]
                mem_i.append(i)
                if len(mem_i) == m:
                    mem_i = []
            
            elif self.method == 'mini':
                i = np.random.randint(X.shape[0])
                if i in mem_i:
                    i = np.random.randint(X.shape[0])
                X_train = X[i:i+self.size,:]
                y_train = y[i:i+self.size]
                mem_i.append(i)
                if len(mem_i) == m:
                    mem_i = []
            else:
                print('method : batch or sto or mini')
                break
            
            yhat = self.h_theta(X_train)
            loss_curr = ((yhat - y_train)**2).sum() / yhat.shape[0]
            diff = np.abs(loss_curr-self.loss_old)
            if  (diff<self.tol):
                iter_stop = i
                break
            self.loss_old = loss_curr
            error = yhat - y_train
            grad = self.gradient(X_train, error)
            self.theta = self.theta - self.alpha * grad
            

model = LinearRegression(method="sto")
model.fit(X_train, y_train)
yhat = model.h_theta(X_test)
mse = model.mse(yhat, y_test)


# print the mse
print("MSE: ", mse)

MSE:  27.583877854049504
