# **Mini Batch Gradient Descent**

In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.datasets import load_diabetes

# Load the diabetes regression data as plain arrays (features, target).
X, y = load_diabetes(return_X_y=True)

print(X.shape)
print(y.shape, '\n')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2
)

# Ordinary least-squares fit: the reference solution that the
# gradient-descent models further down are compared against.
reg = LinearRegression().fit(X_train, y_train)

# Output layout: blank line, coefficients, blank line, intercept.
print()
print(reg.coef_, reg.intercept_, sep='\n\n')




(442, 10)
(442,) 


[  -9.15865318 -205.45432163  516.69374454  340.61999905 -895.5520019
  561.22067904  153.89310954  126.73139688  861.12700152   52.42112238]

151.88331005254167


## **Type 1**

In [3]:
# Row count and column count of the training matrix, one per line.
print(*X_train.shape[:2], sep='\n')

# np.arange(stop), np.arange(start, stop), np.arange(start, stop, step)
print(np.arange(10))
print(np.arange(1, 10))
print(np.arange(1, 10, 2))

# The dot product of two scalars is just their product.
print(np.dot(10, 3))


a = np.arange(1, 11)
b = np.arange(11, 21)
print()
print(a)
print(b)

# Three views of the same computation: the dot product of two 1-D arrays
# equals the sum of their element-wise product.
print(a @ b)
print(a * b)
print((a * b).sum())


353
10
[0 1 2 3 4 5 6 7 8 9]
[1 2 3 4 5 6 7 8 9]
[1 3 5 7 9]
30

[ 1  2  3  4  5  6  7  8  9 10]
[11 12 13 14 15 16 17 18 19 20]
935
[ 11  24  39  56  75  96 119 144 171 200]
935


In [4]:
import random

class MBGDRegressor:
    """Linear regression trained with mini-batch gradient descent.

    Minimises the mean squared error of ``y_hat = X @ coef_ + intercept_``.
    Every epoch the rows are shuffled and then consumed in consecutive
    slices of ``batch_size`` rows; each slice produces one gradient step.

    Parameters
    ----------
    batch_size : int
        Number of rows per gradient step (must be >= 1). The last batch of
        an epoch is smaller when the row count is not a multiple of it.
    learning_rate : float, default=0.01
        Step size applied to both the intercept and coefficient gradients.
    epochs : int, default=100
        Number of full passes over the training data.
    random_state : int or None, default=None
        Seed for the per-epoch shuffle. ``None`` keeps the original
        behaviour of shuffling with NumPy's global generator (so an earlier
        ``np.random.seed(...)`` call still controls the result).
    verbose : bool, default=True
        When true, ``fit`` prints the learned intercept and coefficients.

    Attributes
    ----------
    coef_ : ndarray of shape (n_features,) or None
        Learned weights; ``None`` until ``fit`` is called.
    intercept_ : float or None
        Learned bias term; ``None`` until ``fit`` is called.
    """

    def __init__(self, batch_size, learning_rate=0.01, epochs=100,
                 random_state=None, verbose=True):
        self.coef_ = None
        self.intercept_ = None
        self.batch_size = batch_size
        self.lr = learning_rate
        self.epochs = epochs
        self.random_state = random_state
        self.verbose = verbose

    def fit(self, X_train, y_train):
        """Learn ``coef_`` and ``intercept_`` from the training data.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
        y_train : array-like of shape (n_samples,) or (n_samples, 1)

        Returns
        -------
        self : MBGDRegressor
            The fitted estimator, so calls can be chained.

        Raises
        ------
        ValueError
            If ``batch_size`` is smaller than 1, ``X_train`` is not 2-D, or
            the number of rows in ``X_train`` and ``y_train`` differ.
        """
        # Coerce to arrays so lists / DataFrames support positional fancy
        # indexing, and flatten y: a column-vector target would otherwise
        # broadcast against y_hat into an (n, n) error matrix.
        X = np.asarray(X_train, dtype=float)
        y = np.asarray(y_train, dtype=float).ravel()

        if self.batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        if X.ndim != 2:
            raise ValueError("X_train must be a 2-D array")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X_train and y_train must have the same number of rows")

        # With no seed, fall back to the global generator (original behaviour).
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        self.intercept_ = 0
        self.coef_ = np.ones(X.shape[1])
        m = X.shape[0]  # Number of rows

        for _ in range(self.epochs):

            # Shuffle the data at the start of each epoch
            # This ensures batches are random but cover all data
            indices = np.arange(m)
            rng.shuffle(indices)
            X_shuffled = X[indices]
            y_shuffled = y[indices]

            # Loop through batches
            for j in range(0, m, self.batch_size):

                # Creating the Batch (slicing past the end just yields a
                # shorter final batch)
                X_batch = X_shuffled[j : j + self.batch_size]
                y_batch = y_shuffled[j : j + self.batch_size]

                # mx + b
                y_hat = np.dot(X_batch, self.coef_) + self.intercept_

                # error = y - y_hat
                error = y_batch - y_hat

                # Gradients of the batch MSE w.r.t. intercept and weights
                intercept_der = -2 * np.mean(error)
                coef_der = -2 * np.dot(X_batch.T, error) / X_batch.shape[0]

                self.intercept_ = self.intercept_ - (self.lr * intercept_der)
                self.coef_ = self.coef_ - (self.lr * coef_der)

        if self.verbose:
            print("Intercept:", self.intercept_)
            print("Coefs:", self.coef_)

        return self

    def predict(self, X_test):
        """Return ``X_test @ coef_ + intercept_`` for the fitted model."""
        return np.dot(np.asarray(X_test, dtype=float), self.coef_) + self.intercept_

In [8]:
# MBGDRegressor.fit shuffles the rows with NumPy's global generator, so seed
# it here; otherwise every run produces different weights and a different score.
np.random.seed(42)

reg = MBGDRegressor(10)  # batch_size=10, default learning_rate and epochs
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

# NOTE: the label below says "ACCURACY", but the value is the R^2 score
# (coefficient of determination), not a classification accuracy.
print('\nACCURACY : ', r2_score(y_test, y_pred))
 

Intercept: 150.61523531440818
Coefs: [ 42.11192634   2.76500316 128.21371529  97.49399045  35.56480679
  24.06830272 -76.01798806  76.58061608 123.50963287  70.62740726]

ACCURACY :  0.2709687070257346


### **Understanding Batch Slicing Logic**
The line `X_batch = X_shuffled[j : j + self.batch_size]` uses Python slicing to grab specific chunks of data.
**Example Scenario:**
- Total Rows (`m`) = **100**
- Batch Size (`batch_size`) = **10**
- Loop: `range(0, 100, 10)` $\rightarrow$ `j` takes values `0, 10, 20...`
---
#### **Iteration 1 ($j=0$)**
- **Start Index:** $0$
- **End Index:** $0 + 10 = 10$
- **Code:** `X_shuffled[0 : 10]`
- **Action:** Grabs rows **0 to 9** (First 10 rows).
#### **Iteration 2 ($j=10$)**
- **Start Index:** $10$
- **End Index:** $10 + 10 = 20$
- **Code:** `X_shuffled[10 : 20]`
- **Action:** Grabs rows **10 to 19** (Next 10 rows).
#### **Iteration 3 ($j=20$)**
- **Start Index:** $20$
- **End Index:** $20 + 10 = 30$
- **Code:** `X_shuffled[20 : 30]`
- **Action:** Grabs rows **20 to 29**.
---
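This process repeats until the entire dataset has been processed in chunks of 10. When `m` is not a multiple of the batch size (for example, the 353 training rows used above), the final slice simply returns the remaining rows (3 in that case), because Python slicing stops at the end of the array instead of raising an error.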

## **Type 2**

In [26]:
from sklearn.linear_model import SGDRegressor
from sklearn.utils import shuffle

# Constant schedule: every update uses the same step size eta0.
model = SGDRegressor(learning_rate='constant', eta0=0.1, random_state=42)

batch_size = 10
epochs = 10
n_samples = X_train.shape[0]

for epoch in range(epochs):
    # Reshuffle X and y together each epoch; seeding with the epoch number
    # gives a different but repeatable order on every pass.
    X_shuffled, y_shuffled = shuffle(X_train, y_train, random_state=epoch)

    for start in range(0, n_samples, batch_size):
        stop = start + batch_size

        # KEY PART: partial_fit() instead of fit().
        # partial_fit() continues from the weights learned so far, whereas a
        # plain fit() call discards the previous solution and starts over
        # (warm_start is off by default).
        model.partial_fit(X_shuffled[start:stop], y_shuffled[start:stop])

print("Final Intercept:", model.intercept_)
print("Final Coefficients:", model.coef_, '\n')

y_pred = model.predict(X_test)

print(r2_score(y_test, y_pred))



Final Intercept: [147.5832478]
Final Coefficients: [  50.1616004   -71.63788668  351.35743363  245.71158932   19.41568848
  -24.91945094 -174.2600333   130.28248919  316.73524097  124.75413176] 

0.43097349182018674


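- `fit()`: Expects ALL the training data at once (so it must fit in memory) and trains from scratch; anything learned in a previous call is discarded unless `warm_start=True`.
- `partial_fit()`: Designed for "Online Learning" or "Out-of-Core Learning". It takes a small chunk, updates the existing weights, and the chunk can then be discarded from memory. Feeding it one batch at a time follows the Mini-Batch Gradient Descent workflow, but note that `SGDRegressor` still updates the weights one sample at a time within each chunk, so it approximates, rather than exactly reproduces, the averaged-gradient update implemented in Type 1.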