In [1]:
from sklearn.datasets import load_diabetes

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [2]:
# Load the diabetes regression data as a (features, target) pair instead of a Bunch object.
X,y = load_diabetes(return_X_y=True)

In [3]:
# Show the dimensions of the feature matrix and of the target vector, one per line.
print(X.shape, y.shape, sep="\n")

(442, 10)
(442,)


In [4]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so the split is reproducible.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)

In [5]:
# Baseline: scikit-learn's LinearRegression, used as the reference for the
# hand-written gradient-descent regressor defined further down.
reg = LinearRegression()
reg.fit(X_train,y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [6]:
# Baseline parameters: the per-feature coefficients first, then the intercept.
print(reg.coef_, reg.intercept_, sep="\n")

[  -9.15865318 -205.45432163  516.69374454  340.61999905 -895.5520019
  561.22067904  153.89310954  126.73139688  861.12700152   52.42112238]
151.88331005254167


In [7]:
# Score the baseline on the held-out rows; R^2 is the yardstick reused later for the custom model.
y_pred = reg.predict(X_test)
r2_score(y_test,y_pred)

0.4399338661568968

In [8]:
# (n_samples, n_features) of the training split; the second entry is what sizes the coefficient vector.
X_train.shape

(353, 10)

In [9]:
# np.ones(n) builds an array of the given shape filled with ones.
# X_train.shape[1] is the number of feature columns (10 for the diabetes data),
# so this yields ten ones: one starting weight per feature, which is how
# GDRegressor.fit initialises its coefficients.
np.ones(X_train.shape[1])

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [15]:
# NOTE(review): this repeats the previous cell verbatim, and its execution count (15) is out of
# sequence; it can be dropped once the notebook is re-run top to bottom.
np.ones(X_train.shape[1])

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [21]:
# The same ten ones can also be produced with a list comprehension. Note that the
# result is a plain Python list of ints, not a NumPy float array.
a = [1 for _ in range(X_train.shape[1])]
a

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [10]:
class GDRegressor:
    """Linear regression trained with full-batch gradient descent on the MSE loss.

    Parameters
    ----------
    learning_rate : float, default=0.01
        Step size applied to every gradient update.
    epochs : int, default=100
        Number of gradient updates; each one uses the whole training set.

    Attributes
    ----------
    coef_ : ndarray of shape (n_features,), or None before ``fit``
        Learned weight for each feature.
    intercept_ : float, or None before ``fit``
        Learned bias term.
    """

    def __init__(self, learning_rate=0.01, epochs=100):
        self.coef_ = None
        self.intercept_ = None
        self.lr = learning_rate
        self.epochs = epochs

    def fit(self, X_train, y_train):
        """Learn ``coef_`` and ``intercept_`` from the training data.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
        y_train : array-like of shape (n_samples,) or (n_samples, 1)

        Returns
        -------
        self : GDRegressor
            The fitted estimator, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X_train`` is not a non-empty 2-D array or ``y_train`` does
            not supply exactly one target per row of ``X_train``.
        """
        X = np.asarray(X_train, dtype=float)
        y = np.asarray(y_train, dtype=float)

        if X.ndim != 2 or X.shape[0] == 0:
            raise ValueError("X_train must be a non-empty 2-D array of shape (n_samples, n_features)")

        # A column-vector target would broadcast `y - y_hat` to (n, n) and
        # silently corrupt both gradients, so flatten it up front.
        if y.ndim == 2 and y.shape[1] == 1:
            y = y.ravel()
        if y.ndim != 1 or y.shape[0] != X.shape[0]:
            raise ValueError("y_train must hold exactly one target value per row of X_train")

        n_samples, n_features = X.shape

        # Starting point: zero bias, unit weight on every feature.
        self.intercept_ = 0.0
        self.coef_ = np.ones(n_features)

        for _ in range(self.epochs):
            # Both gradients are evaluated from the same residual, i.e. the
            # parameters are updated simultaneously rather than one after the other.
            residual = y - (np.dot(X, self.coef_) + self.intercept_)

            intercept_der = -2 * np.mean(residual)
            coef_der = -2 * np.dot(residual, X) / n_samples

            self.intercept_ = self.intercept_ - (self.lr * intercept_der)
            self.coef_ = self.coef_ - (self.lr * coef_der)

        # Kept from the original implementation: report the learned parameters.
        print(self.intercept_,self.coef_)
        return self

    def predict(self, X_test):
        """Return predictions ``X_test @ coef_ + intercept_``.

        Parameters
        ----------
        X_test : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples,)

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.coef_ is None:
            raise RuntimeError("GDRegressor is not fitted yet; call fit() before predict()")
        return np.dot(np.asarray(X_test, dtype=float), self.coef_) + self.intercept_

### y_hat calculation


The predicted values (y_hat) are calculated using the formula:

$$\hat{y} = X_{train} \cdot \text{coef} + \text{intercept}$$

---

## 📊 Understanding Each Component

### 1️⃣ **X_train (Feature Matrix)**

This is your **input data** containing all the features (independent variables) for training.

**Structure:**
- **Rows** = Number of samples (data points)
- **Columns** = Number of features (variables)

**Example:** For the diabetes dataset with 353 training samples and 10 features:

$$X_{train} = \begin{bmatrix}
x_{1,1} & x_{1,2} & \cdots & x_{1,10} \\
x_{2,1} & x_{2,2} & \cdots & x_{2,10} \\
\vdots & \vdots & \ddots & \vdots \\
x_{353,1} & x_{353,2} & \cdots & x_{353,10}
\end{bmatrix}_{353 \times 10}$$

Each row represents one patient's measurements (age, BMI, blood pressure, etc.)

---

### 2️⃣ **coef (Coefficient Vector)**

These are the **weights** that determine how much each feature contributes to the prediction. They are **learned during training** through gradient descent.

**Initial Value:** `np.ones(X_train.shape[1])` → All coefficients start at 1.0

**After Training:** Updated using gradient descent to minimize error

**Structure:** A 1D array with one coefficient per feature (shape `(10,)` in NumPy; written below as a column vector)

$$\text{coef} = \begin{bmatrix}
w_1 \\
w_2 \\
\vdots \\
w_{10}
\end{bmatrix}_{10 \times 1}$$

**How it's calculated:**

Each epoch, coefficients are updated using this formula:

$$\text{coef}_{new} = \text{coef}_{old} - \alpha \cdot \frac{\partial \text{MSE}}{\partial \text{coef}}$$

Where:
- $\alpha$ = learning rate (controls step size)
- $\frac{\partial \text{MSE}}{\partial \text{coef}} = -2 \cdot \frac{X_{train}^T \cdot (y_{train} - \hat{y})}{n}$ (gradient of Mean Squared Error)

---

### 3️⃣ **intercept (Bias Term)**

The **y-intercept** of the regression line - it shifts the prediction up or down.

**Initial Value:** `0`

**After Training:** Updated using gradient descent

**How it's calculated:**

$$\text{intercept}_{new} = \text{intercept}_{old} - \alpha \cdot \frac{\partial \text{MSE}}{\partial \text{intercept}}$$

Where:
- $\frac{\partial \text{MSE}}{\partial \text{intercept}} = -2 \cdot \text{mean}(y_{train} - \hat{y})$ (gradient)

---

## 🔢 Step-by-Step Calculation Example

Let's use a simplified example with **3 samples** and **2 features**:

**Given:**

$$X_{train} = \begin{bmatrix}
1 & 2 \\
3 & 4 \\
5 & 6
\end{bmatrix}, \quad
\text{coef} = \begin{bmatrix}
0.5 \\
1.5
\end{bmatrix}, \quad
\text{intercept} = 2$$

**Matrix Multiplication:**

$$X_{train} \cdot \text{coef} = \begin{bmatrix}
1 & 2 \\
3 & 4 \\
5 & 6
\end{bmatrix} \cdot \begin{bmatrix}
0.5 \\
1.5
\end{bmatrix} = \begin{bmatrix}
1(0.5) + 2(1.5) \\
3(0.5) + 4(1.5) \\
5(0.5) + 6(1.5)
\end{bmatrix} = \begin{bmatrix}
3.5 \\
7.5 \\
11.5
\end{bmatrix}$$

**Adding Intercept:**

$$\hat{y} = \begin{bmatrix}
3.5 \\
7.5 \\
11.5
\end{bmatrix} + 2 = \begin{bmatrix}
5.5 \\
9.5 \\
13.5
\end{bmatrix}$$

So the predictions for the 3 samples are: **5.5, 9.5, and 13.5**


In [11]:
# Custom regressor: 1000 full-batch gradient updates with a step size of 0.1.
gdr = GDRegressor(epochs=1000,learning_rate=0.1)

In [12]:
# Train on the same split as the baseline; fit() prints the learned intercept and coefficients.
gdr.fit(X_train,y_train)

151.94042847773682 [  62.27835432  -24.14017912  262.40285385  192.20751489   39.48809013
   10.26886323 -142.50597903  124.33312557  244.33510843  119.34350233]


In [13]:
# Predict on the held-out rows (this rebinds y_pred, replacing the baseline's predictions).
y_pred = gdr.predict(X_test)

In [14]:
# Held-out R^2 for the gradient-descent model, to compare with the LinearRegression score above.
# NOTE(review): a lower score here would suggest gradient descent has not fully converged
# with the chosen epochs / learning rate -- confirm by raising epochs.
r2_score(y_test,y_pred)

0.3971698388048742