<a href="https://colab.research.google.com/github/Ehtisham1053/Optimization-ML-Algorithms/blob/main/Stochastic_Gradient_Descent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [6]:
# Load the dataset; 'price' is the regression target, every other column is a feature.
df = pd.read_csv('audi.csv')
y = df['price']
x = df.drop(columns='price')

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Columns routed to each preprocessing step; any remaining column is passed through unchanged.
encoded_cols = ['transmission', 'fuelType', 'model']
scaled_cols = ['mileage', 'tax', 'mpg', 'engineSize', 'year']

one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first')
c = ColumnTransformer(
    [
        ('encoder', one_hot, encoded_cols),
        ('scaler', StandardScaler(), scaled_cols),
    ],
    remainder='passthrough',
)

# Fit the transformer on the training split only, then reuse it for the test split.
x_train = c.fit_transform(x_train)
x_test = c.transform(x_test)



In [11]:
import numpy as np

class StochasticGradientDescent:
    """Linear regression fitted with per-sample gradient descent.

    After every individual training sample the weights and the bias are moved
    against the gradient of that sample's halved squared error
    (``0.5 * error ** 2``).

    Parameters
    ----------
    learning_rate : float, default 0.01
        Step size applied to every per-sample gradient.
    epochs : int, default 1000
        Number of full passes over the training data.
    shuffle : bool, default False
        If True, the samples are visited in a fresh random order on every
        epoch. The default keeps a fixed, in-order pass.
    random_state : int or None, default None
        Seed for the shuffling generator; only used when ``shuffle`` is True.

    Attributes
    ----------
    m : numpy.ndarray or None
        Learned weight vector, one entry per feature (None before ``fit``).
    b : float or None
        Learned bias term (None before ``fit``).
    """

    def __init__(self, learning_rate=0.01, epochs=1000, shuffle=False, random_state=None):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.shuffle = shuffle
        self.random_state = random_state
        self.m = None
        self.b = None

    @staticmethod
    def _as_target(y):
        """Return ``y`` as a flat float array (accepts lists, Series, column vectors)."""
        return np.asarray(y, dtype=float).ravel()

    def fit(self, X, y):
        """Learn weights and bias from the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like with n_samples values (list, ndarray, pandas Series, ...)

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or ``X`` and ``y`` disagree on the sample count.
        """
        X = np.asarray(X, dtype=float)
        # np.asarray handles Series, ndarrays and lists alike, unlike y.to_numpy().
        y = self._as_target(y)

        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        n_samples, n_features = X.shape
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X and y have inconsistent lengths: {n_samples} != {y.shape[0]}"
            )

        self.m = np.zeros(n_features)
        self.b = 0.0

        rng = np.random.default_rng(self.random_state) if self.shuffle else None

        for _ in range(self.epochs):
            # A new permutation per epoch when shuffling, otherwise the natural order.
            order = rng.permutation(n_samples) if rng is not None else range(n_samples)
            for i in order:
                row = X[i]
                y_pred = np.dot(row, self.m) + self.b
                error = y_pred - y[i]

                # Gradient of 0.5 * error**2 for this single sample.
                dm = row * error
                db = error

                self.m -= self.learning_rate * dm
                self.b -= self.learning_rate * db

    def predict(self, X):
        """Return the linear prediction ``X @ m + b`` for each row of ``X``."""
        return np.dot(np.asarray(X, dtype=float), self.m) + self.b

    def _residuals(self, X, y):
        """Return prediction minus target, with ``y`` flattened to avoid (n, n) broadcasting."""
        return self.predict(X) - self._as_target(y)

    def mse(self, X, y):
        """Mean squared error of the predictions for ``X`` against ``y``."""
        return np.mean(self._residuals(X, y) ** 2)

    def mae(self, X, y):
        """Mean absolute error of the predictions for ``X`` against ``y``."""
        return np.mean(np.abs(self._residuals(X, y)))

    def rmse(self, X, y):
        """Root mean squared error (square root of ``mse``)."""
        return np.sqrt(self.mse(X, y))

    def r2_score(self, X, y):
        """Coefficient of determination: 1 - SS_residual / SS_total.

        Not finite when ``y`` is constant, because SS_total is then zero.
        """
        y = self._as_target(y)
        y_mean = np.mean(y)
        ss_total = np.sum((y - y_mean) ** 2)
        ss_residual = np.sum((y - self.predict(X)) ** 2)
        return 1 - (ss_residual / ss_total)

    def adjusted_r2(self, X, y):
        """R² adjusted for the number of features k: 1 - (1 - R²)(n - 1)/(n - k - 1).

        Only meaningful when the number of samples n exceeds k + 1.
        """
        # np.shape also works for nested lists, which have no .shape attribute.
        n, k = np.shape(X)
        r2 = self.r2_score(X, y)
        return 1 - ((1 - r2) * (n - 1) / (n - k - 1))


In [12]:
# Train on the training split, then predict the held-out rows.
model = StochasticGradientDescent(learning_rate=0.1, epochs=1000)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

# Each metric is evaluated on the test split and printed next to its label.
metric_report = (
    ("MSE:", model.mse),
    ("MAE:", model.mae),
    ("RMSE:", model.rmse),
    ("R² Score:", model.r2_score),
    ("Adjusted R² Score:", model.adjusted_r2),
)
for label, metric in metric_report:
    print(label, metric(x_test, y_test))


MSE: 24707862.106628567
MAE: 3391.789984154302
RMSE: 4970.70036379468
R² Score: 0.83651198296297
Adjusted R² Score: 0.8339428855523882


# Stochastic Gradient Descent (SGD)
Stochastic Gradient Descent updates the model parameters after each individual training sample rather than after computing the gradient over the entire dataset. This makes each update much cheaper but introduces noise, leading to a more erratic convergence path.

## When to Use:
SGD is best for very large datasets where loading the entire dataset at once is not feasible. It is useful when training models on real-time or online data streams and when some level of randomness is acceptable to escape local minima.

## Advantages:
* Faster updates, making it more suitable for large-scale and real-time learning.
* Can escape shallow local minima due to its randomness, which helps in non-convex problems.
* Works well when computational power is limited.

## Disadvantages:
* High variance in updates, leading to an unstable path toward convergence.
* With a constant learning rate, it might never fully converge to the optimal solution and instead keeps fluctuating around it.
* Requires careful tuning of the learning rate to balance speed and accuracy.