In [138]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import warnings
warnings.filterwarnings('ignore')

In [139]:
from sklearn.datasets import load_diabetes

# Load the diabetes data as a feature matrix X and a target vector y.
X, y = load_diabetes(return_X_y=True)

print(X.shape)
print(y.shape, '\n')

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2
)

# Baseline to compare the hand-written SGD model against: scikit-learn's
# ordinary least squares regression (fit() returns the fitted estimator).
reg = LinearRegression().fit(X_train, y_train)

print('COEFFICIENT : ', reg.coef_)
print('\nINTERCEPT : ', reg.intercept_)

# NOTE: the number printed under the "ACCURACY" label is the R^2 score on the test split.
y_pred = reg.predict(X_test)
print('\n\nACCURACY : ', r2_score(y_test, y_pred))

(442, 10)
(442,) 

COEFFICIENT :  [  -9.15865318 -205.45432163  516.69374454  340.61999905 -895.5520019
  561.22067904  153.89310954  126.73139688  861.12700152   52.42112238]

INTERCEPT :  151.88331005254167


ACCURACY :  0.4399338661568969


In [140]:

class SGDRegressor:
    """Linear regression trained with stochastic gradient descent (SGD).

    The model is ``y_hat = X @ coef_ + intercept_``. Each update draws ONE
    training row at random (with replacement) and takes a gradient step on the
    squared error of that single row, so one epoch performs ``n_samples``
    parameter updates.

    Parameters
    ----------
    learning_rate : float, default=0.01
        Step size applied to every gradient update.
    epochs : int, default=100
        Number of outer passes; each pass performs ``n_samples`` updates.
    random_state : int or None, default=None
        Seed for the row sampling. ``None`` draws from NumPy's global random
        state (so ``np.random.seed`` still controls it); an integer makes
        ``fit`` reproducible regardless of the global state.

    Attributes
    ----------
    coef_ : ndarray of shape (n_features,), or None before ``fit``
        Learned weights.
    intercept_ : float, or None before ``fit``
        Learned bias term.
    """

    def __init__(self, learning_rate=0.01, epochs=100, random_state=None):
        self.coef_ = None
        self.intercept_ = None
        self.lr = learning_rate
        self.epochs = epochs
        self.random_state = random_state

    def fit(self, X_train, y_train):
        """Learn ``coef_`` and ``intercept_`` from the training data.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
            Converted with ``np.asarray``, so nested lists and pandas objects
            are indexed by position, not by label.
        y_train : array-like of shape (n_samples,)
            Target values; flattened to one dimension.

        Raises
        ------
        ValueError
            If ``X_train`` is not 2-D or the number of targets differs from
            the number of rows.

        Notes
        -----
        Prints the final intercept and coefficients when training ends.
        """
        X = np.asarray(X_train, dtype=float)
        y = np.asarray(y_train, dtype=float).ravel()

        if X.ndim != 2:
            raise ValueError(
                "X_train must be 2-D (n_samples, n_features); got %d dimension(s)" % X.ndim
            )
        if y.shape[0] != X.shape[0]:
            raise ValueError(
                "X_train has %d rows but y_train has %d values" % (X.shape[0], y.shape[0])
            )

        n_samples, n_features = X.shape

        # Without a seed, keep drawing from NumPy's global generator; with a
        # seed, use a private generator so results do not depend on global state.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        # Starting guess: bias 0, every weight 1.
        self.intercept_ = 0.0
        self.coef_ = np.ones(n_features)

        for _ in range(self.epochs):
            for _ in range(n_samples):
                # Random row position in [0, n_samples); the upper bound is exclusive.
                idx = rng.randint(0, n_samples)
                row = X[idx]

                y_hat = np.dot(row, self.coef_) + self.intercept_
                residual = y[idx] - y_hat

                # Batch gradient w.r.t. the intercept is (-2/m) * sum(y - y_hat).
                # With a single sample, m and the sum disappear:
                #   d/db (y - y_hat)^2 = 2 * (y - y_hat) * (-1) = -2 * (y - y_hat)
                intercept_der = -2 * residual
                # Same chain rule for the weights, with d(y_hat)/dw = x:
                #   d/dw (y - y_hat)^2 = -2 * (y - y_hat) * x
                coef_der = -2 * (residual * row)

                # Both gradients were computed from the same y_hat before either
                # parameter moved. Rule: new = old - lr * gradient.
                self.intercept_ = self.intercept_ - (self.lr * intercept_der)
                self.coef_ = self.coef_ - (self.lr * coef_der)

        # Kept so the notebook still shows the learned parameters after training.
        print(self.intercept_, '\n', self.coef_)

    def predict(self, X_test):
        """Return ``X_test @ coef_ + intercept_`` for every row of ``X_test``."""
        return np.dot(X_test, self.coef_) + self.intercept_

In [141]:
# Show the training matrix dimensions: full shape, then row count, then column count.
for dims in (X_train.shape, X_train.shape[0], X_train.shape[1]):
    print(dims)



(353, 10)
353
10


In [142]:
# Draw one random row position in [0, number of training rows); the upper bound is exclusive.
idx = np.random.randint(X_train.shape[0])
print(X_train.shape[0], idx, sep='\n')


353
60


In [143]:
# Seed NumPy's global generator first: SGDRegressor picks its rows with
# np.random.randint, so without a seed the weights and score change on every run.
np.random.seed(2)

sgd = SGDRegressor(learning_rate=0.1, epochs=50)     # learning rate and number of epochs are both tunable
sgd.fit(X_train, y_train)
y_pred = sgd.predict(X_test)

# NOTE: the number printed under the "ACCURACY" label is the R^2 score on the test split.
print('\nACCURACY : ', r2_score(y_test, y_pred))


150.82518045928134 
 [ -28.45906415 -179.29454378  555.50437855  321.72921451  -69.01448495
  -78.15231814 -179.74588136   61.13558172  535.69130563   43.44501262]

ACCURACY :  0.4445668463710014


## **1. The Logic of the Two Loops**
The SGD algorithm uses two nested loops to train the model.

### **The Outer Loop (`epochs`)**
*   **Code:** `for i in range(self.epochs):`
*   **Purpose:** Controls how many times we cycle through the entire training process.
*   **Why:** One pass through the data is rarely enough. We need to repeat the process multiple times to fine-tune the weights effectively.

### **The Inner Loop (`data rows`)**
*   **Code:** `for j in range(X_train.shape[0]):`
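*   **Purpose:** Runs as many times as there are rows (samples) in your training data. The counter `j` only counts the updates; the row actually used is picked at random inside the loop, so within one epoch some rows may be used more than once and others not at all.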
*   **What happens inside:**
    1.  Pick **ONE random row** (`idx`).
    2.  Calculate the error for *that specific row*.
    3.  **Immediately update** the weights (`coef_` and `intercept_`) based on that single error.


## **2. Total Number of Updates**
Unlike Batch Gradient Descent (which updates once per epoch), SGD updates **once per row, i.e. as many times per epoch as there are rows**.

**Formula:**
$$ \text{Total Updates} = \text{Number of Rows} \times \text{Number of Epochs} $$

**Example Scenario:**
*   **Data Rows:** 1,000
*   **Epochs:** 100

| Loop | Iterations |
| :--- | :--- |
| Inner Loop | 1,000 times (per epoch) |
| Outer Loop | 100 times |
| **Total Weight Updates** | **$1,000 \times 100 = 100,000$ times** |
> *Note: It is multiplication, not power ($1000^{100}$).*


## **3. Why We Only Keep the Final Values**
You might wonder: *"Doesn't the variable just get overwritten every time? Do we lose the previous learning?"*
**No, we don't lose learning. We build upon it.**
*   `self.coef_` and `self.intercept_` are **persistent variables**.
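*   Every update nudges the current values using the error of one row. A single step is noisy and can occasionally move the wrong way, but on average the values "step" closer to the optimal solution.

**The Process:**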
1.  **Update #1:** Value starts at `1.0` (Guess) $\rightarrow$ updates to `0.9` (Slightly better).
2.  **Update #2:** Starts at `0.9` $\rightarrow$ updates to `0.85` (Better).
3.  ...
4.  **Update #100,000:** Starts at a value that is already close to optimal $\rightarrow$ receives one more small adjustment.

The value at the end of the 100th epoch represents the **culmination of 100,000 small updates**. We only need this final, most refined version to make future predictions.