# Gradient Descent

In [94]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

## Obtaining Data

In [95]:
# Load the diabetes regression dataset directly as a (features, target) pair
# instead of the default Bunch container.
X, y = load_diabetes(return_X_y=True)

In [96]:
# Hold out 20% of the rows as the final test set. The fixed random_state keeps
# the split identical across runs. X_ / y_ are the remaining 80%.
X_, X_test, y_, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [97]:
# Split the remaining rows again: 20% of them become the validation set used
# for hyper-parameter selection, the rest is the actual training set.
X_train, X_val, y_train, y_val = train_test_split(X_, y_, test_size=0.2, random_state=42)

## Using sklearn LinearRegression

Let's first train a model using sklearn's `LinearRegression`. We can then use its results to verify those obtained by our own implementations.

In [98]:
# Reference model with scikit-learn's default settings.
sk_lr = LinearRegression()

In [99]:
# Fit on the training split only; validation and test rows are not used here.
sk_lr.fit(X_train, y_train)

In [100]:
sk_lr.intercept_

147.8363360547508

In [101]:
sk_lr.coef_

array([   7.0366653 , -246.78306312,  547.24888098,  324.79496319,
       -921.9697261 ,  472.19859546,  172.52537595,  292.87021204,
        764.64280056,   83.23467618])

In [102]:
sk_lr.score(X_train, y_train)

0.5305594711173106

In [103]:
sk_lr.score(X_test, y_test)

0.45058896473344867

# Batch Gradient Descent

In [123]:
class BGD:
    """Linear regression fitted with batch gradient descent on the mean squared error.

    Every epoch computes one gradient from the *whole* training set and takes a
    single step of size ``lr`` against it.

    Parameters
    ----------
    lr : float, default=0.01
        Learning rate (step size) applied to both gradients.
    epochs : int, default=200
        Number of full-batch update steps performed by ``fit``.

    Attributes
    ----------
    coef : numpy.ndarray of shape (n_features,) or None
        Learned weights; ``None`` until ``fit`` has been called.
    intercept : float or None
        Learned bias term; ``None`` until ``fit`` has been called.
    """

    def __init__(self, lr=0.01, epochs=200):
        self.coef = None
        self.intercept = None
        self.lr = lr
        self.epochs = epochs

    def fit(self, X_train, y_train):
        """Learn ``coef`` and ``intercept`` from the training data.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
        y_train : array-like of shape (n_samples,) or (n_samples, 1)

        Raises
        ------
        ValueError
            If ``X_train`` is not 2-D, is empty, or its row count differs from
            the number of targets.
        """
        # Normalise inputs to plain float arrays so lists and pandas objects
        # behave the same as ndarrays.
        X = np.asarray(X_train, dtype=float)
        # Flatten the target: a (m, 1) column would otherwise broadcast against
        # the (m,) predictions into an (m, m) error matrix without any error.
        y = np.asarray(y_train, dtype=float).ravel()

        if X.ndim != 2:
            raise ValueError(f"X_train must be 2-dimensional, got {X.ndim} dimension(s)")
        m, n = X.shape
        if m == 0:
            raise ValueError("X_train must contain at least one sample")
        if y.shape[0] != m:
            raise ValueError(
                f"X_train has {m} samples but y_train has {y.shape[0]} values"
            )

        # initialize coefficients and intercept
        self.coef = np.ones(n)
        self.intercept = 0.0

        for _ in range(self.epochs):

            # predictions using old parameters
            y_hat = (X @ self.coef) + self.intercept
            errors = y - y_hat

            # gradients of mean((y - y_hat)**2) w.r.t. intercept and weights
            intercept_gradient = -2 * np.mean(errors)
            coef_gradient = (-2 / m) * (X.T @ errors)

            # updating model parameters
            self.intercept = self.intercept - self.lr * intercept_gradient
            self.coef = self.coef - self.lr * coef_gradient

    @property
    def intercept_(self):
        """Learned intercept (sklearn-style read-only alias of ``intercept``)."""
        return self.intercept

    @property
    def coef_(self):
        """Learned weights (sklearn-style read-only alias of ``coef``)."""
        return self.coef

    def predict(self, X_test):
        """Return ``X_test @ coef + intercept`` for the given feature rows."""
        return (X_test @ self.coef) + self.intercept

    def score(self, features, target):
        """Return the R^2 score of the predictions for ``features`` against ``target``."""
        predictions = self.predict(features)
        r2 = r2_score(target, predictions)
        return r2

In [124]:
# Exhaustive search over every (learning rate, epoch count) pair. Each
# candidate is fitted on the training split and its R^2 is recorded for both
# the training and the validation split.
lrs = [0.5, 0.2, 0.1, 0.01, 0.002, 0.0002, 0.0001]
epochs = [200, 500, 1000, 2000]

train_results, val_results = [], []

for step_size in lrs:
    for n_epochs in epochs:
        cbgd = BGD(step_size, n_epochs)
        cbgd.fit(X_train, y_train)

        # shared part of both result records
        setting = {"learning_rate": step_size, "epochs": n_epochs}
        train_results.append({**setting, "score": cbgd.score(X_train, y_train)})
        val_results.append({**setting, "score": cbgd.score(X_val, y_val)})

In [125]:
# Show the single configuration with the highest validation score.
pd.DataFrame(val_results).sort_values(by="score", ascending=False).head(1)

Unnamed: 0,learning_rate,epochs,score
7,0.2,2000,0.465556


In [126]:
# Refit with the configuration that ranked first on the validation split
# (learning rate 0.2, 2000 epochs). Training still uses X_train only.
final_model = BGD(0.2, 2000)
final_model.fit(X_train, y_train)

In [127]:
final_model.score(X_train, y_train)

0.5211432553998432

In [128]:
final_model.score(X_test, y_test)

0.454693107603353

In [129]:
final_model.intercept_

147.84986573691683

In [130]:
final_model.coef_

array([  29.5956226 , -195.00751728,  495.90797043,  297.74795821,
        -68.54576974, -135.56121984, -214.91100487,  148.86051514,
        382.71835819,  151.16703583])

# Stochastic Gradient Descent

In [131]:
class SGD:
    """Linear regression fitted with stochastic gradient descent on the squared error.

    Every update uses ONE training row picked uniformly at random. An "epoch"
    is ``n_samples`` such draws; rows are drawn independently, so within one
    epoch a row may be used several times or not at all.

    Parameters
    ----------
    lr : float, default=0.01
        Learning rate (step size) applied to both gradients.
    epochs : int, default=200
        Number of epochs; the total number of updates is ``epochs * n_samples``.
    random_state : int or None, default=None
        Seed for the row sampling. ``None`` draws from NumPy's global random
        state (the historical behaviour); an integer makes ``fit`` reproducible.

    Attributes
    ----------
    coef : numpy.ndarray of shape (n_features,) or None
        Learned weights; ``None`` until ``fit`` has been called.
    intercept : float or None
        Learned bias term; ``None`` until ``fit`` has been called.
    """

    def __init__(self, lr=0.01, epochs=200, random_state=None):
        self.coef = None
        self.intercept = None
        self.lr = lr
        self.epochs = epochs
        self.random_state = random_state

    def fit(self, X_train, y_train):
        """Learn ``coef`` and ``intercept`` from the training data.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
        y_train : array-like of shape (n_samples,) or (n_samples, 1)

        Raises
        ------
        ValueError
            If ``X_train`` is not 2-D, is empty, or its row count differs from
            the number of targets.
        """
        # Positional indexing below (X[idx], y[idx]) is only correct on plain
        # arrays; pandas objects would do label-based lookup instead.
        X = np.asarray(X_train, dtype=float)
        # Flatten so each y[idx] is a scalar and the intercept stays a scalar.
        y = np.asarray(y_train, dtype=float).ravel()

        if X.ndim != 2:
            raise ValueError(f"X_train must be 2-dimensional, got {X.ndim} dimension(s)")
        m, n = X.shape
        if m == 0:
            raise ValueError("X_train must contain at least one sample")
        if y.shape[0] != m:
            raise ValueError(
                f"X_train has {m} samples but y_train has {y.shape[0]} values"
            )

        # Without a seed keep using the global NumPy RNG so existing callers
        # (including ones that call np.random.seed themselves) are unaffected.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        # initialize coefficients and intercept
        self.coef = np.ones(n)
        self.intercept = 0.0

        for _epoch in range(self.epochs):
            for _step in range(m):

                # fetch random sample
                idx = rng.randint(0, m)
                sample = X[idx]

                # prediction using old parameters
                y_hat = (sample @ self.coef) + self.intercept
                error = y[idx] - y_hat

                # gradients of (y - y_hat)**2 for this single sample
                intercept_gradient = -2 * error
                coef_gradient = -2 * sample * error

                # updating model parameters
                self.intercept = self.intercept - self.lr * intercept_gradient
                self.coef = self.coef - self.lr * coef_gradient

    @property
    def intercept_(self):
        """Learned intercept (sklearn-style read-only alias of ``intercept``)."""
        return self.intercept

    @property
    def coef_(self):
        """Learned weights (sklearn-style read-only alias of ``coef``)."""
        return self.coef

    def predict(self, X_test):
        """Return ``X_test @ coef + intercept`` for the given feature rows."""
        return (X_test @ self.coef) + self.intercept

    def score(self, features, target):
        """Return the R^2 score of the predictions for ``features`` against ``target``."""
        predictions = self.predict(features)
        r2 = r2_score(target, predictions)
        return r2

In [132]:
# Exhaustive search over every (learning rate, epoch count) pair. Each
# candidate is fitted on the training split and its R^2 is recorded for both
# the training and the validation split.
lrs = [0.5, 0.2, 0.1, 0.01, 0.002, 0.0002, 0.0001]
epochs = [200, 500, 1000, 2000]

train_results, val_results = [], []

for step_size in lrs:
    for n_epochs in epochs:
        csgd = SGD(step_size, n_epochs)
        csgd.fit(X_train, y_train)

        # shared part of both result records
        setting = {"learning_rate": step_size, "epochs": n_epochs}
        train_results.append({**setting, "score": csgd.score(X_train, y_train)})
        val_results.append({**setting, "score": csgd.score(X_val, y_val)})

In [133]:
# Show the single configuration with the highest validation score.
pd.DataFrame(val_results).sort_values(by="score", ascending=False).head(1)

Unnamed: 0,learning_rate,epochs,score
12,0.01,200,0.515423


In [134]:
# Refit with the configuration that ranked first on the validation split
# (learning rate 0.01, 200 epochs). This rebinds `final_model`, replacing the
# batch-gradient-descent model fitted earlier.
# NOTE(review): SGD samples rows via NumPy's global RNG and no seed is set in
# the visible cells, so the scores and parameters below change from run to run.
final_model = SGD(0.01, 200)
final_model.fit(X_train, y_train)

In [135]:
final_model.score(X_train, y_train)

0.5203623971236517

In [136]:
final_model.score(X_test, y_test)

0.4354726307675987

In [137]:
final_model.intercept_

142.65495551248677

In [138]:
final_model.coef_

array([  15.95618262, -228.91817716,  527.19026189,  303.58369177,
        -84.25484352, -151.18771549, -217.4293114 ,  147.4414896 ,
        406.42818352,  137.53855735])