# Import Necessary Libraries

In [20]:
import numpy as np
import plotly.express as px
from copy import deepcopy
import pandas as pd
from sklearn.datasets import load_diabetes

# Load data

In [7]:
diabetes = load_diabetes()
X = diabetes.data
Y = diabetes.target

In [8]:
X.shape

(442, 10)

In [9]:
X

array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
         0.01990749, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
        -0.06833155, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
         0.00286131, -0.02593034],
       ...,
       [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
        -0.04688253,  0.01549073],
       [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
         0.04452873, -0.02593034],
       [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
        -0.00422151,  0.00306441]])

# Apply Standard scaler

### Feature scaling is a method to unify self-variables or feature ranges in data.
### In data processing, it is usually used in data pre-processing.
### Because in the original data, the range of variables is very different. Feature scaling is a necessary step in the calculation of stochastic gradient descent

In [10]:
class Scaler:
    
    def __init__(self):
        self.mean = None
        self.std = None
      
    
    def set_mean(self, X):
        self.mean = np.mean(X, axis=0)
        
    def set_std(self, X):
        self.std = np.std(X, axis=0)
        
    def transform(self, X):
        return (X - self.mean)/self.std
    
    def fit(self, X):
        self.set_mean(X)
        self.set_std(X)
        

In [11]:
scaler = Scaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Gradient Descent Implementation

### Hypothesis function
$$
    h_{\theta} = \theta^{T}x
$$

### Cost function
$$
    J(\theta) = \frac{1}{m} \sum_{i = 1}^{m} (h_{\theta}x^{(i)} - y^{(i)})^2 + \frac{\lambda}{2m} \sum_{j=1}^{n} \theta_j^2
$$

### Geadient Descent
$$
\theta_j = \theta_j - \frac{2\alpha}{m} \sum_{i = 1}^{m} (h_{\theta}x^{(i)} - y^{(i)})x_{j}^{(i)} + \frac{\alpha\lambda}{m} \sum_{j=1}^{n} \theta_j
$$

In [34]:
class LinearRegression:
    
    @staticmethod
    def add_bias(X):
        bias = np.ones((X.shape[0], 1))
        return np.concatenate([X, bias] , axis=1)
    
    def __init__(self, X, y, mu=0, std=1) -> None:
        self.X = self.add_bias(X)
        self.y = y
        self.w = np.random.normal(mu, std, size=(self.X.shape[1], 1))
        
    def get_L2(self, landa):
        return (landa/(2 * self.X.shape[0])) * np.sum(self.w ** 2)
        
    def cost(self, y_pred, landa, y_true=None):
        if y_true is not None:
            y = y_true
        else:
            y = self.y
        loss = (y_pred - y) ** 2
        return (np.mean(loss) + self.get_L2(landa))
    
    def predict(self, X):
        y_pred = np.dot(X, self.w).reshape((-1,))
        return y_pred
    
    
    def get_gradients(self, y_hat, landa):
        diff = (y_hat - self.y).reshape((-1, 1))
        reg_part = (landa/self.X.shape[0]) * self.w
        return 2*np.mean(diff * self.X, axis=0).reshape((-1,1)) + reg_part
    
    def update_weights(self, grad, lr):
        self.w -= lr * grad
        
    def evaluate(self, landa, X, y=None):
        X = self.add_bias(X)
        y_pred = self.predict(X)
        return self.cost(y_pred, landa, y)
    
    def fit(self, lr, landa, iter, print_cost=False):
        for it in range(iter):
            y_pred = self.predict(self.X)
            cost_it = self.cost(y_pred, landa)
            grad = self.get_gradients(y_pred, landa)
            self.update_weights(grad, lr)
            if print_cost:
                print(f'cost in iteration {it} : {cost_it}')
        

# Run the algorithm

In [35]:
linear_reg = LinearRegression(X_scaled, Y)
linear_reg.fit(0.01, 0.001, 500, True)

cost in iteration 0 : 29420.24001002392
cost in iteration 1 : 28116.416446680232
cost in iteration 2 : 26903.06823076966
cost in iteration 3 : 25770.634120129653
cost in iteration 4 : 24710.85357460639
cost in iteration 5 : 23716.572890486248
cost in iteration 6 : 22781.5809993244
cost in iteration 7 : 21900.470359954357
cost in iteration 8 : 21068.519078191504
cost in iteration 9 : 20281.590985465144
cost in iteration 10 : 19536.050912165643
cost in iteration 11 : 18828.692818116073
cost in iteration 12 : 18156.678803308037
cost in iteration 13 : 17517.487327059946
cost in iteration 14 : 16908.869221667584
cost in iteration 15 : 16328.810304699477
cost in iteration 16 : 15775.499578494953
cost in iteration 17 : 15247.302161353779
cost in iteration 18 : 14742.736226760755
cost in iteration 19 : 14260.453338486062
cost in iteration 20 : 13799.221663686638
cost in iteration 21 : 13357.91162586436
cost in iteration 22 : 12935.48362696092
cost in iteration 23 : 12530.977524888272
cost in i

# Cross Validation

In [15]:
def kfold(k, data):
    length = len(data) // k
    folds = []
    for fold_num in range(k):
        row_start, row_end = fold_num * length , (fold_num+1) * length
        if fold_num == k - 1:
            row_end += len(data)%k
        fold_data = data[row_start:row_end, :]
        X_fold, y_fold = fold_data[: , :-1], fold_data[:, -1]
        folds.append((X_fold, y_fold))
    return folds

def get_train_from_folds(folds, idx):
    folds.pop(idx)
    X_folds = [fold[0] for fold in folds]
    y_folds = [fold[1] for fold in folds]
    X_train = np.concatenate(X_folds, axis=0)
    y_train = np.concatenate(y_folds, axis=0)
    return X_train, y_train

def cross_validation(k, model, data, params, repeat=10):
    train_history, valid_history = [], []
    for r in range(repeat):
        np.random.shuffle(data)
        folds = kfold(k, data)
        train_losses, valid_losses = [], []
        for valid_ind, (X_valid, y_valid) in enumerate(folds):
            X_train, y_train = get_train_from_folds(deepcopy(folds), valid_ind)
            m = model(X_train, y_train, 0, 1)
            m.fit(**params)
            train_loss = m.evaluate(params['landa'], X_train)
            valid_loss = m.evaluate(params['landa'], X_valid, y_valid)
            train_losses.append(train_loss)
            valid_losses.append(valid_loss)
        train_history.extend(train_losses)
        valid_history.extend(valid_losses)
    return train_history, valid_history
            

In [16]:
def searchCV(k, model, data, params, iter=500):
    train_dict, valid_dict = {}, {}
    for lr_val in params['lr']:
        for lamda_val in params['landa']:
            curr_params = {'lr':lr_val, 'landa':lamda_val, 'iter':iter}
            train_his, valid_his = cross_validation(k, model, data, curr_params)
            train_dict[f'lr:{lr_val},landa:{lamda_val}'] = train_his
            valid_dict[f'lr:{lr_val},landa:{lamda_val}'] = valid_his
    return train_dict, valid_dict     

In [23]:
dt = np.concatenate([X_scaled, Y.reshape((-1,1))], axis=1)
params = {
    'lr':[0.01, 0.03, 0.1, 0.3],
    'landa':[0.01, 0.03, 0.1, 0.3, 0.5]
}
train_cv, valid_cv = searchCV(5, LinearRegression, dt, params)

In [24]:
px.box(pd.DataFrame(train_cv), log_y=True, title='Loss function of LinearRegression for various hyperparameters in train set')

In [25]:
px.box(pd.DataFrame(valid_cv), title='Loss function of LinearRegression for various hyperparameters in validation set')

# Linear Regression with `torch`

In [26]:
import torch

In [32]:
class LinearRegression:
    
    @staticmethod
    def add_bias(X):
        bias = torch.ones((X.shape[0], 1))
        return torch.cat([X, bias] , dim=1)
    
    def __init__(self, X, y, mu=0, std=1) -> None:
        self.X = self.add_bias(X)
        self.y = y
        self.w = torch.normal(mu, std, size=(self.X.shape[1], 1), requires_grad=True)
        
    def get_L2(self, landa):
        return (landa/(2 * self.X.shape[0])) * torch.sum(self.w ** 2)
        
    def cost(self, y_pred, landa, y_true=None):
        if y_true is not None:
            y = y_true
        else:
            y = self.y
        loss = (y_pred - y) ** 2
        return (torch.mean(loss) + self.get_L2(landa))
    
    def predict(self, X):
        y_pred = torch.matmul(X, self.w).reshape((-1,))
        return y_pred
    
    
    def update_weights(self, lr):
        with torch.no_grad():
            self.w -= lr * self.w.grad
        
    def evaluate(self, landa, X, y=None):
        X = self.add_bias(X)
        y_pred = self.predict(X)
        return self.cost(y_pred, landa, y)
    
    def fit(self, lr, landa, iter, print_cost=False):
        for it in range(iter):
            y_pred = self.predict(self.X)
            cost_it = self.cost(y_pred, landa)
            cost_it.backward()
            self.update_weights(lr)
            if print_cost:
                print(f'cost in iteration {it} : {cost_it}')
            self.w.grad.data.zero_()
        

In [33]:
torch.manual_seed(42)
X_scaled = torch.tensor(X_scaled).float()
y = torch.tensor(Y).float()
model = LinearRegression(X_scaled, y)
model.fit(0.01, 0.001, 500, True)

cost in iteration 0 : 29033.7109375
cost in iteration 1 : 27768.044921875
cost in iteration 2 : 26587.8515625
cost in iteration 3 : 25484.302734375
cost in iteration 4 : 24449.755859375
cost in iteration 5 : 23477.591796875
cost in iteration 6 : 22562.044921875
cost in iteration 7 : 21698.095703125
cost in iteration 8 : 20881.333984375
cost in iteration 9 : 20107.908203125
cost in iteration 10 : 19374.4140625
cost in iteration 11 : 18677.837890625
cost in iteration 12 : 18015.515625
cost in iteration 13 : 17385.072265625
cost in iteration 14 : 16784.375
cost in iteration 15 : 16211.5224609375
cost in iteration 16 : 15664.787109375
cost in iteration 17 : 15142.6123046875
cost in iteration 18 : 14643.583984375
cost in iteration 19 : 14166.41015625
cost in iteration 20 : 13709.90234375
cost in iteration 21 : 13272.9775390625
cost in iteration 22 : 12854.6318359375
cost in iteration 23 : 12453.9326171875
cost in iteration 24 : 12070.0244140625
cost in iteration 25 : 11702.095703125
cost in


To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).

