Execute OLS by yourself in python and after that using numpy

 ### 1. Importing the required libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes

### 2. Loading and exploring the dataset

In [2]:
data = load_diabetes()
data # Note that dataset is a dictionary which has multiple keys and arrays as the values


{'data': array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
          0.01990749, -0.01764613],
        [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
         -0.06833155, -0.09220405],
        [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
          0.00286131, -0.02593034],
        ...,
        [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
         -0.04688253,  0.01549073],
        [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
          0.04452873, -0.02593034],
        [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
         -0.00422151,  0.00306441]]),
 'target': array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
         69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
         68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
         87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
        259.,  53., 190., 142.,  75., 142., 155., 225.,  59

Now as we know that X is the features or the input and Y is the label or the output so we will store the following in 2 variable named as the same X and Y

In [3]:
X = data['data']
y = data['target']

print(X.shape)
print(y.shape)

# Note that X is a matrix with 442 rows and 10 columns but Y is only 442 rows and no columns which means we have 1 output which accounts from 10 FEATURES.

print('Featues: ' , data.feature_names)
print("First 5 sample entries: " , data.target[:5])

(442, 10)
(442,)
Featues:  ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']
First 5 sample entries:  [151.  75. 141. 206. 135.]


In [4]:
data.keys()

dict_keys(['data', 'target', 'frame', 'DESCR', 'feature_names', 'data_filename', 'target_filename', 'data_module'])

### 3. Preparing the data

Scaling the data to be in a common range. Point to note is that the features are already scaled properly but target needs to be scaled. So its fine if you wont scale the features.

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

feature_scaler = StandardScaler()
target_scaler = StandardScaler()

X = feature_scaler.fit_transform(X)

y = y.reshape(-1,1) # The need to reshape is to use the fit_tranfrom function as it needs an empty column to be present which we dont have in the case of y


y = target_scaler.fit_transform(y)

y = y.ravel() # This function will again delete that extra column which we made to use the fit_transform function

X, X_test, y, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# The order of writing the variables is important as X should store 80% of X as the training data and X_test should store 20% of X as testing data but if the seqeunce is not maintained then 20% of x will go into y which will not be good.

### 4. Parameter Initaitalisation

m,n = shape of X<br>
w = weights <br>
b = bias<br>



In [6]:
m, n = X.shape
w = np.zeros(n) # 0 matrix of shape [10,]
b = 0 # bias term initialising to 0

Note that weight and bias both are initialised to 0 for the start

### 5. Defining the Prediction Function

y. = sigma(w*x +b)<br> y. (y hat) is the predicted value

In [7]:
def predict(X, w, b):
    return np.dot(X,w) + b

Note that the shape of X is [m,n] and shape of w is [n,] so the resulted will be a column only to which b will be added separately

### 6. Defining the cost function

Cost function is the average error<br>
J = (1/2m) sigma (y.-y)^2<br>
Note that the factor of 1/2m is to do the average and adjust the 2 which we will get by differentiating

In [8]:
def compute_cost(X, y, w, b):
    m = len(y)
    y_pred = predict(X, w, b)
    cost = (1/2*m)*np.sum((y_pred - y)**2)

    return cost

### 7. Computing the gradients 

Differentiate J with respect to w and b to find the change necessary in both the parameters. 

dw = (1/m) sigma (y. - y)X <br>
db = (1/m) sigma (y. - y)

In [9]:
def gradient(X,y,w,b):

    m = len(y)
    y_pred = predict(X,w,b)
    error = y_pred-y

    dw = (1/m) * np.dot(X.T,error) # multiplying X and error after transposing X to match the shape
    db = (1/m) * np.sum(error)

    return dw,db

### 8. Updating Parameters Using Gradient Descent

After adjusting the gradient values<br> 
w = w - lr * dw<br>
b = b - lr * db<br>
wehre lr is thew learning rate

In [10]:
def update_parameters(w,b,dw,db,learning_rate):
    w = w - learning_rate*dw
    b = b - learning_rate*db

    return w,b

### 9. Training the Model

Now we will start with w and b = 0 and then we will update the parameters depending upon the lr to get as close as we can to the best fit line

In [11]:
w = np.zeros(n)
b = 0

learning_rate = 0.001
num_iterations = 10000
cost_history = [] # we are making a list to store all the values of cost which are recorded along the for loop

parameters = [] # a list to store all the parameter values

for i in range(num_iterations):
    y_pred = predict(X,w,b)
    cost = compute_cost(X,y,w,b)
    dw,db = gradient(X,y,w,b)
    w , b = update_parameters(w,b,dw,db,learning_rate)

    if i%1000 == 0:
        cost_history.append(cost)
        print(f"Iteration = {i} -- Cost = {cost}")

        parameters = {"weights": w.tolist(), 'bias': b}

Iteration = 0 -- Cost = 63870.89407683908
Iteration = 1000 -- Cost = 32151.785459341572
Iteration = 2000 -- Cost = 30721.66661623815
Iteration = 3000 -- Cost = 30504.864721573933
Iteration = 4000 -- Cost = 30458.66004944069
Iteration = 5000 -- Cost = 30443.434541459155
Iteration = 6000 -- Cost = 30435.283266921517
Iteration = 7000 -- Cost = 30429.077489019484
Iteration = 8000 -- Cost = 30423.50912472783
Iteration = 9000 -- Cost = 30418.20542317004


### 9. Evaluating the model