## **Imports**

In [None]:
# Importing Libraries
from sklearn.datasets import load_diabetes
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

## **Mini Batch Gradient Descent Explained**

*   **What it is:** A variation of Gradient Descent that uses a small, randomly selected subset (mini-batch) of the training data to compute the gradient and update model parameters in each iteration.

*   **Mechanism:**
    *   Divide the training data into mini-batches.
    *   For each epoch, iterate through these mini-batches.
    *   For each mini-batch, calculate the gradient of the loss function.
    *   Update the model parameters using the calculated gradient and a learning rate.

*   **Application:** Widely used in training various machine learning models, especially deep neural networks.

*   **Differences from others:**
    *   **Batch Gradient Descent:** Uses the entire dataset for each gradient calculation. Slower for large datasets, smoother convergence.
    *   **Stochastic Gradient Descent (SGD):** Uses a single data point for each gradient calculation. Faster, but with more noisy updates and potential oscillations during convergence.

*   **When to use:**
    *   When the dataset is too large to fit into memory (Batch Gradient Descent is not feasible).
    *   To achieve faster training compared to Batch Gradient Descent.
    *   To introduce some regularization effect and potentially escape shallow local minima (compared to Batch Gradient Descent).

*   **Advantages:**
    *   Faster convergence than Batch Gradient Descent for large datasets.
    *   More stable convergence than SGD.
    *   Can leverage vectorized operations for efficiency.

*   **Disadvantages:**
    *   Requires tuning the batch size.
    *   Convergence can still be noisy compared to Batch Gradient Descent.

*   **Overall:** Offers a good balance between the computational efficiency of SGD and the stability of Batch Gradient Descent, making it a popular choice for training large models.

## **Loading & Splitting Dataset**

In [None]:
# Load the diabetes dataset as a feature matrix X and a target vector y
X,y = load_diabetes(return_X_y=True)

In [None]:
# Show the dimensions of the feature matrix and the target vector, one per line
print(X.shape, y.shape, sep="\n")

(442, 10)
(442,)


In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

## **Simple Linear Regression for reference**

In [None]:
# Fit scikit-learn's LinearRegression on the training split as a reference model
reg = LinearRegression()
reg.fit(X_train,y_train)

In [None]:
# Show the reference model's coefficients, then its intercept, for later comparison
print(reg.coef_, reg.intercept_, sep="\n")

[  -9.15865318 -205.45432163  516.69374454  340.61999905 -895.5520019
  561.22067904  153.89310954  126.73139688  861.12700152   52.42112238]
151.88331005254167


In [None]:
# Predict on the held-out test set and report the reference model's R2 score
y_pred = reg.predict(X_test)
r2_score(y_test,y_pred)

0.4399338661568968

## **Building MBGD from Scratch**

In [None]:
import random

class MBGDRegressor:
    """Linear regression trained with mini-batch gradient descent.

    Parameters
    ----------
    batch_size : int
        Number of training rows sampled (without replacement) for each
        parameter update. Must satisfy ``1 <= batch_size <= n_samples``.
    learning_rate : float, default=0.01
        Step size applied to both the intercept and the coefficient gradients.
    epochs : int, default=100
        Number of passes; each pass performs ``n_samples // batch_size``
        updates.
    random_state : int or None, default=None
        Seed for a private ``random.Random`` used to draw the mini-batches.
        With ``None`` the global ``random`` module is used, so a prior
        ``random.seed(...)`` call still controls the run.

    Attributes
    ----------
    coef_ : ndarray of shape (n_features,), or None before ``fit``
    intercept_ : float, or None before ``fit``
    """

    def __init__(self,batch_size,learning_rate=0.01,epochs=100,random_state=None):

        self.coef_ = None
        self.intercept_ = None
        self.lr = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size
        self.random_state = random_state

    def fit(self,X_train,y_train):
        """Learn ``coef_`` and ``intercept_`` and print them when training ends.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
        y_train : array-like of shape (n_samples,)

        Raises
        ------
        ValueError
            If ``batch_size`` is smaller than 1 or larger than the number of
            training rows; such values would otherwise run zero updates and
            leave the initial parameters untouched.
        """
        # Accept lists as well as arrays; fancy indexing below needs ndarrays
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        n_samples = X_train.shape[0]

        if not 1 <= self.batch_size <= n_samples:
            raise ValueError(
                "batch_size must be between 1 and the number of training rows "
                f"({n_samples}); got {self.batch_size}"
            )

        # A seeded private generator when requested, otherwise the global module
        rng = random if self.random_state is None else random.Random(self.random_state)

        # Initialize coefficients and intercept
        self.intercept_ = 0
        self.coef_ = np.ones(X_train.shape[1])

        updates_per_epoch = n_samples // self.batch_size

        # Iterate over epochs
        for _ in range(self.epochs):
            # Iterate over mini-batches
            for _ in range(updates_per_epoch):
                # Each batch is an independent random draw, so rows may repeat
                # across batches of the same epoch and some rows may be skipped
                batch = rng.sample(range(n_samples), self.batch_size)
                X_batch = X_train[batch]

                # Residuals of the current parameters on this mini-batch
                residual = y_train[batch] - (np.dot(X_batch, self.coef_) + self.intercept_)

                # NOTE: the intercept gradient is averaged over the batch while the
                # coefficient gradient is summed, so coefficients take a step that is
                # batch_size times larger than a mean-based gradient would give.
                # Kept unchanged so existing learning-rate settings behave the same.
                intercept_der = -2 * np.mean(residual)
                coef_der = -2 * np.dot(residual, X_batch)

                # Update intercept and coefficients using gradients and learning rate
                self.intercept_ = self.intercept_ - (self.lr * intercept_der)
                self.coef_ = self.coef_ - (self.lr * coef_der)

        # Print the final intercept and coefficients
        print(self.intercept_,self.coef_)

    def predict(self,X_test):
        """Return ``X_test @ coef_ + intercept_`` for every row of ``X_test``."""
        return np.dot(X_test,self.coef_) + self.intercept_

In [None]:
# Build the from-scratch regressor with a batch size of 1/50th of the training rows
mbgd = MBGDRegressor(
    batch_size=X_train.shape[0] // 50,
    learning_rate=0.01,
    epochs=100,
)

In [None]:
# Train the from-scratch regressor; fit() prints the final intercept and coefficients
mbgd.fit(X_train, y_train)

153.2031057950466 [  30.83377517 -136.40819121  459.40506636  308.23158873  -29.69302993
  -98.01145643 -191.41578267  105.27030955  406.88271112  121.54480588]


In [None]:
# Predict test-set targets with the from-scratch model (overwrites the earlier y_pred)
y_pred = mbgd.predict(X_test)

In [None]:
# R2 score of the from-scratch model on the test set
r2_score(y_test,y_pred)

0.4524939681335609

## **Explanation of Implementation**

In [None]:
# Create a tiny dataset (2 rows, 2 features) so every step can be traced by hand
X_small = np.array([[1, 2], [3, 4]])
y_small = np.array([5, 7])

# Instantiate the MBGDRegressor with batch size 2 and 1 epoch
# NOTE(review): mbgd_small is not fitted or referenced again in the visible cells
mbgd_small = MBGDRegressor(batch_size=2, learning_rate=0.01, epochs=1)

In [None]:
# Modify the fit method to print intermediate results
class MBGDRegressor_explain:
    """Mini-batch gradient descent regressor whose ``fit`` narrates every step.

    The inline "worked example" comments trace one update for
    X = [[1, 2], [3, 4]], y = [5, 7], batch_size = 2 and learning_rate = 0.01.
    """

    def __init__(self, batch_size, learning_rate=0.01, epochs=100):
        self.batch_size = batch_size
        self.epochs = epochs
        self.lr = learning_rate
        # Learned parameters; populated by fit()
        self.intercept_ = None
        self.coef_ = None

    def fit(self, X_train, y_train):
        """Train on (X_train, y_train), printing each intermediate quantity."""
        # Starting point: intercept 0 and one coefficient of 1.0 per feature
        self.intercept_ = 0
        print(f"Initial intercept: {self.intercept_}")
        self.coef_ = np.ones(X_train.shape[1])  # worked example: [1., 1.]
        print(f"Initial coefficients: {self.coef_}")

        for epoch in range(1, self.epochs + 1):
            print(f"\nEpoch {epoch}/{self.epochs}")

            updates_this_epoch = int(X_train.shape[0] / self.batch_size)
            for batch_no in range(1, updates_this_epoch + 1):
                # Draw batch_size distinct row indices at random.
                # Worked example: always both rows, as [0, 1] or [1, 0].
                rows = random.sample(range(X_train.shape[0]), self.batch_size)
                print(f"Selected instance indices for batch {batch_no}: {rows}")

                # Predictions of the current parameters on the batch.
                # Worked example: [1*1 + 2*1 + 0, 3*1 + 4*1 + 0] = [3, 7]
                # (or [7, 3] when the rows come out in the other order).
                X_batch = X_train[rows]
                y_hat = np.dot(X_batch, self.coef_) + self.intercept_
                print(f"Predictions for batch {batch_no}: {y_hat}")

                # Residuals drive both gradients.
                # Worked example: [5 - 3, 7 - 7] = [2, 0] (or [0, 2]).
                residual = y_train[rows] - y_hat

                # Intercept gradient: -2 * mean(residual).
                # Worked example: -2 * mean([2, 0]) = -2, whatever the row order.
                intercept_der = -2 * np.mean(residual)
                print(f"Intercept gradient for batch {batch_no}: {intercept_der}")

                # Coefficient gradient: -2 * (residual . X_batch), summed over rows.
                # Worked example: -2 * [2*1 + 0*3, 2*2 + 0*4] = -2 * [2, 4] = [-4, -8];
                # the other row order gives the same vector.
                coef_der = -2 * np.dot(residual, X_batch)
                print(f"Coefficient gradients for batch {batch_no}: {coef_der}")

                # Step against the gradient, scaled by the learning rate.
                # Worked example: 0 - 0.01 * (-2) = 0.02
                self.intercept_ = self.intercept_ - self.lr * intercept_der
                print(f"Updated intercept after batch {batch_no}: {self.intercept_}")
                # Worked example: [1, 1] - 0.01 * [-4, -8] = [1.04, 1.08]
                self.coef_ = self.coef_ - self.lr * coef_der
                print(f"Updated coefficients after batch {batch_no}: {self.coef_}")

    def predict(self, X_test):
        """Return the linear prediction for every row of X_test."""
        linear_part = np.dot(X_test, self.coef_)
        return linear_part + self.intercept_

In [None]:
# Instantiate the verbose regressor and fit it on the two-row dataset so each step is printed
mbgd_explanantion = MBGDRegressor_explain(batch_size=2, learning_rate=0.01, epochs=1)
mbgd_explanantion.fit(X_small, y_small)

Initial intercept: 0
Initial coefficients: [1. 1.]

Epoch 1/1
Selected instance indices for batch 1: [1, 0]
Predictions for batch 1: [7. 3.]
Intercept gradient for batch 1: -2.0
Coefficient gradients for batch 1: [-4. -8.]
Updated intercept after batch 1: 0.02
Updated coefficients after batch 1: [1.04 1.08]


## **Mini Batch in Machine Learning**

Unlike deep-learning frameworks, scikit-learn's estimators have no `batch_size` parameter. Mini-batch training is instead approximated by repeatedly calling `SGDRegressor.partial_fit` on a randomly sampled batch of rows, as shown below.

In [None]:
# Importing SGDRegressor for Mini Batch implementation in scikit-learn
from sklearn.linear_model import SGDRegressor

In [None]:
# Initialize SGDRegressor with a constant learning rate of 0.1 (eta0 is the step size)
sgd = SGDRegressor(learning_rate='constant',eta0=0.1)

In [None]:
# Number of training rows drawn for each incremental update
batch_size = 35

# 100 incremental updates, each on a fresh random sample (without replacement) of rows
for i in range(100):
    instance = random.sample(range(X_train.shape[0]), batch_size)
    # NOTE(review): partial_fit is documented as running one epoch of SGD over the rows
    # it receives, so updates inside a batch are presumably per-sample — confirm
    sgd.partial_fit(X_train[instance], y_train[instance]) # Using partial_fit for mini-batch training

In [None]:
sgd.coef_ # Coefficients learned by the incremental partial_fit training

array([  64.18132266,  -68.51971095,  347.67219805,  244.92836136,
         19.56416235,  -26.09649701, -178.96512897,  126.8661091 ,
        324.0716205 ,  138.36583078])

In [None]:
sgd.intercept_ # Intercept learned by the incremental training (a length-1 array)

array([157.49431921])

In [None]:
y_pred = sgd.predict(X_test) # Predict on the test set (overwrites the earlier y_pred)

In [None]:
r2_score(y_test, y_pred) # R2 score of the SGDRegressor predictions on the test set

0.42780645886107593