In [2]:
from sklearn.datasets import load_diabetes
data = load_diabetes()
X = data.data
y = data.target
X.shape

# Linear regression is a statistical method used to model the relationship between a dependent variable (often
# denoted as yyy) and one or more independent variables (denoted as XXX). The goal is to find the linear
# relationship that best predicts yyy from XXX.

(442, 10)

In [3]:
# Gradient descent
# Gradient descent is an iterative optimization algorithm used to minimize the cost function in linear regression. The
# goal is to find the coefficients (weights) of the linear model that minimize the difference between the predicted
# and actual values of the dependent variable.


import numpy as np
def gradient_descent(X, y, alpha=0.01, iterations=1000):
    m, n = X.shape
    theta = np.zeros(n)
    cost_history = []
    for _ in range(iterations):
        predictions = X.dot(theta)
        errors = predictions - y
        gradient = (1/m) * X.T.dot(errors)
        theta -= alpha * gradient
        cost = (1/(2*m)) * np.sum(errors**2)
        cost_history.append(cost)
    return theta, cost_history
theta,cost_history=gradient_descent(X,y)
print(theta)


# Gradient descent can efficiently find the optimal solution but may converge to a local minimum if the cost
# function is non-convex. The learning rate α\alphaα plays a crucial role in the convergence speed and stability.

[  6.50828076   1.28806748  20.8265647   15.61326631   7.24508406
   5.85554099 -13.90078374  15.01590614  19.99313426  13.39901343]


In [4]:
# LEAST SQUARES LINEAR REGRESSION

# Least squares is a traditional approach to finding the coefficients that minimize the sum of squared differences
# between observed and predicted values.

def least_squares(X, y):
 return np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y)
print(least_squares(X,y))

# The least squares method is computationally efficient for small datasets but becomes infeasible for large datasets
# due to the matrix inversion requirement. It also does not handle multicollinearity well.

[ -10.0098663  -239.81564367  519.84592005  324.3846455  -792.17563855
  476.73902101  101.04326794  177.06323767  751.27369956   67.62669218]


In [5]:
# Polynomial regression models the relationship between the independent variable XXX and the dependent variable
# yyy as an nnn-th degree polynomial.

from sklearn.preprocessing import PolynomialFeatures
def polynomial_regression(X, y, degree):
    poly = PolynomialFeatures(degree)
    X_poly = poly.fit_transform(X)
    theta = np.linalg.inv(X_poly.T.dot(X_poly)).dot(X_poly.T).dot(y)
    return theta
print(polynomial_regression(X,y,3))

# Polynomial regression can model more complex relationships but may suffer from overfitting, especially when the
# degree of the polynomial is high.

[ 2.16468886e+15 -7.27762660e+12  5.78333416e+15  4.02478758e+12
  9.15821182e+12 -4.17396778e+12  4.03229959e+11 -4.23439525e+12
  2.81824099e+12 -1.72544286e+12  5.65878986e+12  1.79151459e+03
 -1.94240520e+13 -3.35423655e+03 -3.99890352e+03  7.99579451e+06
 -7.02685506e+06 -2.99180951e+06 -1.95202294e+04 -2.62773687e+06
  1.11798650e+04 -9.56777103e+17  1.07421946e+13  2.44433516e+13
 -1.11403473e+13  1.07621504e+12 -1.13016440e+13  7.52191076e+12
 -4.60522648e+12  1.51033620e+13 -9.12682565e+03  4.05804403e+03
  1.81232462e+07 -1.59365415e+07 -6.76426796e+06  2.46433453e+04
 -5.96033278e+06 -2.50304127e+01 -5.09757469e+03 -4.34209355e+07
  3.81661217e+07  1.62299064e+07 -6.39353925e+03  1.42816071e+07
 -2.02959687e+03  7.20402003e+07 -1.30507252e+08 -2.73074166e+07
  1.82477150e+07 -3.29293926e+07 -1.22752484e+07  5.90505947e+07
  2.54329098e+07 -1.60558834e+07  3.02233005e+07  1.07888277e+07
  1.64010891e+05 -6.77012119e+06  3.56802734e+06  4.60133271e+06
  3.88458293e+04 -6.03050

In [6]:
# Least Absolute Shrinkage and Selection Operator
# Lasso Regression is a regularization technique used to prevent overfitting. It improves linear regression by adding a penalty term to the standard regression equation.
# It works by minimizing the sum of squared differences between the observed and predicted values by fitting a line to the data.
from sklearn.linear_model import Lasso
def lasso_regression(X, y, alpha=1.0):
    lasso = Lasso(alpha=alpha)
    lasso.fit(X, y)
    return lasso.coef_
print(lasso_regression(X,y))

# LASSO is useful for feature selection and can improve model interpretability by reducing the number of features.
# However, it may lead to bias in the coefficients.

[  0.          -0.         367.70385976   6.29885756   0.
   0.          -0.           0.         307.6054181    0.        ]


In [7]:
# Ridge regression, also known as  L2 regularization, is a technique used in linear regression to address the problem of multicollinearity
# among predictor variables. Multicollinearity occurs when independent variables in a regression model are highly correlated, 
# which can lead to unreliable and unstable estimates of regression coefficients.
# Ridge regression is a variation of linear regression, specifically designed to address multicollinearity in the dataset.
from sklearn.linear_model import Ridge
def ridge_regression(X, y, alpha=1.0):
    ridge = Ridge(alpha=alpha)
    ridge.fit(X, y)
    return ridge.coef_
print(ridge_regression(X,y))
# Ridge regression reduces overfitting by penalizing large coefficients. It is particularly effective in the presence of
# multicollinearity but does not perform feature selection like LASSO.

[  29.46611189  -83.15427636  306.35268015  201.62773437    5.90961437
  -29.51549508 -152.04028006  117.3117316   262.94429001  111.87895644]


In [8]:
# # Comparison of Methods

# | **Method**               | **Advantages**                                                               | **Disadvantages**                                                        | **Use Cases**                                             |
# |----------------------------|----------------------------------------------------------------------------|---------------------------------------------------------------------------|------------------------------------------------------------|
# | **Gradient Descent**       | Efficient for large datasets, no need for matrix inversion.                | Requires tuning of learning rate, may converge slowly.                   | Large datasets, online learning.                          |
# | **Least Squares**          | Simple, direct computation.                                                | Infeasible for large datasets, sensitive to multicollinearity.           | Small datasets, baseline model.                           |
# | **Polynomial Regression**  | Captures non-linear relationships.                                         | Prone to overfitting, complex model interpretation.                      | When relationship between variables is non-linear.        |
# | **LASSO Regression**       | Performs feature selection, interpretable model.                          | May introduce bias, depends heavily on regularization term.              | Sparse models, feature selection.                         |
# | **Ridge Regression**       | Reduces overfitting, handles multicollinearity well.                       | Does not perform feature selection.                                      | When multicollinearity is a concern.                      |
