In [3]:
import copy
import random
import warnings
from typing import Tuple
# from typing import Annotated

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import MaxNLocator
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
# Print the version of the Python interpreter that runs this notebook.
from platform import python_version
print(python_version())

3.7.3


In [10]:
# Show every warning each time it is triggered, so that repeated warnings
# issued through warnings.warn() are not hidden after their first occurrence.
import warnings
warnings.simplefilter('always', Warning)

<h1> Linear Regression </h1>

Regularization:

- It is almost always preferable to have at least a little bit of regularization -> generally avoid plain Linear Regression.

- Ridge is a good default, but

- if only a few features are useful, you should prefer Lasso or Elastic Net -> they tend to reduce the useless features’ weights down to zero.

- Elastic Net is preferred over Lasso -> Lasso may behave erratically when the number of features is greater than the number of training instances or when several features are strongly correlated.

~ Aurélien Géron

<h3> Normal Equation </h3>

In [7]:
class linear_regression_NE:
    """
    Linear regression solved in closed form with the Normal Equation:

        theta_best = inv(X_b.T . X_b) . X_b.T . y

    where X_b is X with a leading column of ones (the bias feature).

    m - number of training instances, n - number of features

    - fast for large m
    - no out-of-core support
    - slow for large n
    - 0 hyperparameters
    - no scaling required
    """

    def __init__(self):
        # Everything below is populated by fit() / predict().
        self.X_b = None         # training matrix with the bias column
        self.theta_best = None  # fitted parameters (bias first)
        self.X_test_b = None    # last matrix passed to predict(), with bias column
        self.y_pred = None      # last predictions returned by predict()


    def fit(self, X:np.ndarray, y:np.ndarray):
        """Computes theta_best from the training data using the Normal Equation."""
        bias_column = np.ones((X.shape[0], 1))
        self.X_b = np.c_[bias_column, X]

        design_t = self.X_b.T
        gram_inverse = np.linalg.inv(design_t.dot(self.X_b))
        self.theta_best = gram_inverse.dot(design_t).dot(y)


    def predict(self, X_test:np.ndarray) -> np.ndarray:
        """Returns X_test_b . theta_best, where X_test_b is X_test with the bias column."""
        n_rows = X_test.shape[0]
        self.X_test_b = np.c_[np.ones((n_rows, 1)), X_test]
        self.y_pred = self.X_test_b.dot(self.theta_best)
        return self.y_pred

<h3> Singular Value Decomposition </h3>

In [None]:
class linear_regression_SVD:
    """
    Simplified implementation of linear regression using Singular Value Decomposition.

    The parameters are computed as theta = pinv(X_b) . y, where X_b is X with a
    leading column of ones and pinv(X_b) = V . diag(1 / s) . U.T is assembled from
    the reduced SVD X_b = U . diag(s) . V.T. Singular values below `threshold`
    are treated as zero.

    m - number of training instances, n - number of features

    - fast for large m
    - no out-of-core support
    - slow for large n
    - 0 hyperparameters
    - no scaling required
    - from sklearn.linear_model import LinearRegression
    """

    def __init__(self):
        self.X_b = None       # training matrix with the bias column
        self.X_test_b = None  # last matrix passed to predict(), with bias column
        self.U = None         # left singular vectors of X_b (reduced form)
        self.E_vec = None     # singular values of X_b
        self.V_t = None       # transposed right singular vectors of X_b (reduced form)
        self.E_ = None        # diagonal matrix of the inverted (thresholded) singular values
        self.X_b_ = None      # pseudoinverse of X_b
        self.theta = None     # fitted parameters (bias first)
        self.y_pred = None    # last predictions returned by predict()


    def fit(self, X:np.ndarray, y:np.ndarray, threshold:float=0.0001):
        """
        Fits theta through the pseudoinverse of the bias-augmented training matrix.

        threshold - singular values strictly smaller than this absolute value are
                    treated as zero when the pseudoinverse is built.
        """
        self.X_b = np.c_[np.ones((X.shape[0], 1)), X]

        # Reduced SVD: U is (m, k), V_t is (k, n + 1) with k = min(m, n + 1). It avoids
        # the (m, m) U matrix and also works when there are fewer instances than columns.
        self.U, self.E_vec, self.V_t = np.linalg.svd(self.X_b, full_matrices=False)

        # Invert only the singular values that are not below the threshold. The
        # threshold is applied to the singular values themselves, never to their
        # reciprocals, so large singular values are kept.
        negligible = self.E_vec < threshold
        inverted = np.zeros_like(self.E_vec)
        inverted[~negligible] = 1 / self.E_vec[~negligible]
        self.E_ = np.diag(inverted)

        # Pseudoinverse of X_b.
        self.X_b_ = self.V_t.T.dot(self.E_).dot(self.U.T)

        # Calculate theta
        self.theta = self.X_b_.dot(y)


    def predict(self, X_test:np.ndarray) -> np.ndarray:
        """Returns X_test_b . theta, where X_test_b is X_test with the bias column."""
        self.X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]
        self.y_pred = self.X_test_b.dot(self.theta)
        return self.y_pred

<h3> Batch Gradient Descent </h3>

In [5]:
class linear_regression_BGD:
    """
    Implementation of linear regression using Batch Gradient Descent.

    m - number of training instances, n - number of features

    - slow for large m
    - no out-of-core support
    - fast for large n
    - 2 hyperparameters
    - scaling required
    """

    # Values accepted by the `regularization` argument of fit().
    _REGULARIZATIONS = (None, 'ridge', 'lasso', 'elastic')

    def __init__(self):
        self.X_b = None        # training matrix with the bias column
        self.X_test_b = None   # last matrix passed to predict(), with bias column
        self.m = None          # number of training instances
        self.theta_path = []   # theta after every update (initial theta first)
        self.gradient = None   # gradient used in the last update
        self.theta = None      # current parameters (bias first)
        self.y_pred = None     # last predictions returned by predict()


    @staticmethod
    def _add_bias(X:np.ndarray) -> np.ndarray:
        """Returns X with a leading column of ones (the bias feature)."""
        return np.c_[np.ones((X.shape[0], 1)), X]


    @staticmethod
    def _penalty_gradient(theta:np.ndarray, regularization:str, alpha:float, mix_ratio:float) -> np.ndarray:
        """Returns the (sub)gradient of the regularization term; the bias theta[0] is never penalized."""
        penalty = np.zeros(np.shape(theta))
        weights = theta[1:]
        if regularization == 'ridge':
            penalty[1:] = alpha * weights
        elif regularization == 'lasso':
            penalty[1:] = alpha * np.sign(weights)
        elif regularization == 'elastic':
            penalty[1:] = mix_ratio * alpha * np.sign(weights) + (1 - mix_ratio) * alpha * weights
        return penalty


    def fit(self, X:np.ndarray, y:np.ndarray, n_iterations:int, theta:np.ndarray=None, eta:float=0.01,
            eta_reducer:float=1.0, debugger:bool=True, alpha:float=0.1, mix_ratio:float=0.5, regularization:str=None):
        """
        Runs n_iterations full-batch gradient steps on the MSE cost.

        theta - starting parameters (bias first). None starts from zeros, sized to
                the number of columns of X plus one.
        eta - initial learning rate.
        eta_reducer - factor the learning rate is multiplied by after every iteration
                (1.0 keeps it constant).
        debugger - when True, a warning is issued and the descent stops as soon as
                theta contains a non-finite value.

        regularization = [None, 'ridge', 'lasso', 'elastic']
        - ridge, lasso, elastic requires to set alpha.
        - elastic requires to set mix_ratio
        - lasso path tends to bounce when some of theta's components reach 0 (the slope
        changes abruptly there), so it is a good idea to set eta_reducer < 1 to gradually
        reduce eta in order to converge to the global minimum.

        Raises ValueError for an unknown regularization name.
        """
        if regularization not in self._REGULARIZATIONS:
            raise ValueError("regularization must be one of [None, 'ridge', 'lasso', 'elastic'], got %r" % (regularization,))

        y = np.asarray(y)
        self.X_b = self._add_bias(X)
        self.m = self.X_b.shape[0]

        if theta is None:
            self.theta = np.zeros(self.X_b.shape[1])
        else:
            self.theta = np.asarray(theta, dtype=float)
        self.theta_path = [self.theta]

        step_size = eta
        for _ in range(n_iterations):
            residuals = self.X_b.dot(self.theta) - y
            self.gradient = 2/self.m * self.X_b.T.dot(residuals)
            if regularization is not None:
                self.gradient = self.gradient + self._penalty_gradient(self.theta, regularization, alpha, mix_ratio)

            self.theta = self.theta - step_size * self.gradient
            self.theta_path.append(self.theta)
            # Decay the learning rate itself; theta must not be rescaled.
            step_size = step_size * eta_reducer

            if debugger and not np.isfinite(self.theta).all():
                warnings.warn('Infinite value of theta. Further calculations may lead to errors. Path ended.', Warning)
                break

        self.theta_path = np.array(self.theta_path)


    def predict(self, X_test:np.ndarray) -> np.ndarray:
        """Returns X_test_b . theta, where X_test_b is X_test with the bias column."""
        self.X_test_b = self._add_bias(X_test)
        self.y_pred = self.X_test_b.dot(self.theta)
        return self.y_pred

<h3> Stochastic Gradient Descent </h3>

In [None]:
class linear_regression_SGD:
    """
    Implementation of linear regression using Stochastic Gradient Descent.

    m - number of training instances, n - number of features

    - fast for large m
    - out-of-core support
    - fast for large n
    - 2 or more hyperparameters
    - scaling required
    - from sklearn.linear_model import SGDRegressor
    """

    # Values accepted by the `regularization` argument of fit().
    _REGULARIZATIONS = (None, 'ridge', 'lasso', 'elastic')

    def __init__(self):
        self.X_b = None            # training matrix with the bias column
        self.X_test_b = None       # last matrix passed to predict(), with bias column
        self.m = None              # number of training instances
        self.theta_path = []       # theta after every update (initial theta first)
        self.gradient = None       # gradient used in the last update
        self.theta = None          # current parameters (bias first)
        self.y_pred = None         # last predictions returned by predict()
        self.random_sample = None  # index of the last sampled training instance
        self.x_i = None            # last sampled row of X_b
        self.y_i = None            # target of the last sampled instance
        self.eta = None            # learning rate used in the last update


    @staticmethod
    def _add_bias(X:np.ndarray) -> np.ndarray:
        """Returns X with a leading column of ones (the bias feature)."""
        return np.c_[np.ones((X.shape[0], 1)), X]


    @staticmethod
    def _penalty_gradient(theta:np.ndarray, regularization:str, alpha:float, mix_ratio:float) -> np.ndarray:
        """Returns the (sub)gradient of the regularization term; the bias theta[0] is never penalized."""
        penalty = np.zeros(np.shape(theta))
        weights = theta[1:]
        if regularization == 'ridge':
            penalty[1:] = alpha * weights
        elif regularization == 'lasso':
            penalty[1:] = alpha * np.sign(weights)
        elif regularization == 'elastic':
            penalty[1:] = mix_ratio * alpha * np.sign(weights) + (1 - mix_ratio) * alpha * weights
        return penalty


    def fit(self, X:np.ndarray, y:np.ndarray,  n_epochs:int, theta:np.ndarray=None,
            t0:float=10, t1:float=100, debugger:bool=True, alpha:float=0.1,
            mix_ratio:float=0.5, regularization:str=None):
        """
        Runs n_epochs * m single-instance gradient steps on the squared error.

        Every step draws one training instance with np.random.randint (i.e. with
        replacement; seed NumPy's global generator for reproducible runs) and uses
        the learning rate t0 / (t + t1), where t counts the steps done so far.

        theta - starting parameters (bias first). None starts from zeros, sized to
                the number of columns of X plus one.
        debugger - when True, a warning is issued and the descent stops as soon as
                theta contains a non-finite value.

        regularization = [None, 'ridge', 'lasso', 'elastic']
        - ridge, lasso, elastic requires to set alpha.
        - elastic requires to set mix_ratio

        Raises ValueError for an unknown regularization name.
        """
        if regularization not in self._REGULARIZATIONS:
            raise ValueError("regularization must be one of [None, 'ridge', 'lasso', 'elastic'], got %r" % (regularization,))

        y = np.asarray(y)
        # Only the training data is needed here; test data belongs to predict().
        self.X_b = self._add_bias(X)
        self.m = self.X_b.shape[0]

        if theta is None:
            self.theta = np.zeros(self.X_b.shape[1])
        else:
            self.theta = np.asarray(theta, dtype=float)
        self.theta_path = [self.theta]

        diverged = False
        for epoch in range(n_epochs):
            for step in range(self.m):
                self.random_sample = np.random.randint(0, self.m)
                self.x_i = self.X_b[self.random_sample]
                self.y_i = y[self.random_sample]

                error = self.x_i.dot(self.theta) - self.y_i
                self.gradient = 2 * self.x_i.T.dot(error)
                if regularization is not None:
                    self.gradient = self.gradient + self._penalty_gradient(self.theta, regularization, alpha, mix_ratio)

                # Learning schedule: the step size shrinks as more updates are done.
                self.eta = t0 / (epoch * self.m + step + t1)
                self.theta = self.theta - self.eta * self.gradient
                self.theta_path.append(self.theta)

                if debugger and not np.isfinite(self.theta).all():
                    warnings.warn('Infinite value of theta. Further calculations may lead to errors. Path ended.', Warning)
                    diverged = True
                    break

            if diverged:
                break

        self.theta_path = np.array(self.theta_path)

    def predict(self, X_test:np.ndarray) -> np.ndarray:
        """Returns X_test_b . theta, where X_test_b is X_test with the bias column."""
        self.X_test_b = self._add_bias(X_test)
        self.y_pred = self.X_test_b.dot(self.theta)
        return self.y_pred

<h3> Mini-batch Gradient Descent </h3>

In [None]:
class linear_regression_MbGD:
    """
    Implementation of linear regression using Mini-batch Gradient Descent.

    m - number of training instances, n - number of features

    - fast for large m
    - out-of-core support
    - fast for large n
    - 2 or more hyperparameters
    - scaling required
    - from sklearn.linear_model import SGDRegressor
    """

    # Values accepted by the `regularization` argument of fit().
    _REGULARIZATIONS = (None, 'ridge', 'lasso', 'elastic')

    def __init__(self):
        self.X_b = None             # training matrix with the bias column
        self.X_test_b = None        # last matrix passed to predict(), with bias column
        self.m = None               # number of training instances
        self.theta_path = []        # theta after every update (initial theta first)
        self.gradient = None        # gradient used in the last update
        self.theta = None           # current parameters (bias first)
        self.y_pred = None          # last predictions returned by predict()
        self.random_samples = None  # row indices of the last mini-batch
        self.x_i = None             # rows of X_b in the last mini-batch
        self.y_i = None             # targets of the last mini-batch
        self.batch_size = None      # number of instances per mini-batch
        self.eta = None             # learning rate used in the last update


    @staticmethod
    def _add_bias(X:np.ndarray) -> np.ndarray:
        """Returns X with a leading column of ones (the bias feature)."""
        return np.c_[np.ones((X.shape[0], 1)), X]


    @staticmethod
    def _penalty_gradient(theta:np.ndarray, regularization:str, alpha:float, mix_ratio:float) -> np.ndarray:
        """Returns the (sub)gradient of the regularization term; the bias theta[0] is never penalized."""
        penalty = np.zeros(np.shape(theta))
        weights = theta[1:]
        if regularization == 'ridge':
            penalty[1:] = alpha * weights
        elif regularization == 'lasso':
            penalty[1:] = alpha * np.sign(weights)
        elif regularization == 'elastic':
            penalty[1:] = mix_ratio * alpha * np.sign(weights) + (1 - mix_ratio) * alpha * weights
        return penalty


    def fit(self, X:np.ndarray, y:np.ndarray, n_epochs:int, batch_size_ratio:float=0.1, theta:np.ndarray=None,
            t0:float=10, t1:float=100, debugger:bool=True, alpha:float=0.1, mix_ratio:float=0.5, regularization:str=None):
        """
        Runs n_epochs * m mini-batch gradient steps on the MSE cost.

        Every step draws ceil(batch_size_ratio * m) distinct training instances with
        random.sample (seed the standard `random` module for reproducible runs) and
        uses the learning rate t0 / (t + t1), where t counts the steps done so far.

        NOTE(review): each epoch performs m updates rather than m / batch_size, so one
        "epoch" visits about m * batch_size instances. Kept as is for compatibility.

        batch_size_ratio - fraction of the training set used per mini-batch; must be
                positive, values above 1 are capped at the whole training set.
        theta - starting parameters (bias first). None starts from zeros, sized to
                the number of columns of X plus one.
        debugger - when True, a warning is issued and the descent stops as soon as
                theta contains a non-finite value.

        regularization = [None, 'ridge', 'lasso', 'elastic']
        - ridge, lasso, elastic requires to set alpha.
        - elastic requires to set mix_ratio

        Raises ValueError for an unknown regularization name or a non-positive
        batch_size_ratio.
        """
        if regularization not in self._REGULARIZATIONS:
            raise ValueError("regularization must be one of [None, 'ridge', 'lasso', 'elastic'], got %r" % (regularization,))
        if batch_size_ratio <= 0:
            raise ValueError("batch_size_ratio must be positive, got %r" % (batch_size_ratio,))

        y = np.asarray(y)
        # Only the training data is needed here; test data belongs to predict().
        self.X_b = self._add_bias(X)
        self.m = self.X_b.shape[0]

        if theta is None:
            self.theta = np.zeros(self.X_b.shape[1])
        else:
            self.theta = np.asarray(theta, dtype=float)
        self.theta_path = [self.theta]

        # random.sample cannot draw more distinct instances than the training set holds.
        self.batch_size = min(int(np.ceil(batch_size_ratio * self.m)), self.m)

        diverged = False
        for epoch in range(n_epochs):
            for step in range(self.m):
                self.random_samples = random.sample(range(0, self.m), self.batch_size)
                self.x_i = self.X_b[self.random_samples]
                self.y_i = y[self.random_samples]

                residuals = self.x_i.dot(self.theta) - self.y_i
                self.gradient = 2 / self.batch_size * self.x_i.T.dot(residuals)
                if regularization is not None:
                    self.gradient = self.gradient + self._penalty_gradient(self.theta, regularization, alpha, mix_ratio)

                # Learning schedule: the step size shrinks as more updates are done.
                self.eta = t0 / (epoch * self.m + step + t1)
                self.theta = self.theta - self.eta * self.gradient
                self.theta_path.append(self.theta)

                if debugger and not np.isfinite(self.theta).all():
                    warnings.warn('Infinite value of theta. Further calculations may lead to errors. Path ended.', Warning)
                    diverged = True
                    break

            if diverged:
                break

        self.theta_path = np.array(self.theta_path)


    def predict(self, X_test:np.ndarray) -> np.ndarray:
        """Returns X_test_b . theta, where X_test_b is X_test with the bias column."""
        self.X_test_b = self._add_bias(X_test)
        self.y_pred = self.X_test_b.dot(self.theta)
        return self.y_pred

<h3> Ridge Regression - Closed-form </h3>

In [None]:
class ridge_regression_Cf:
    """
    Ridge regression solved with the closed-form equation:

        theta_best = inv(X_b.T . X_b + alpha * A) . X_b.T . y

    where X_b is X with a leading column of ones and A is the identity matrix
    with its top-left entry zeroed, so the bias term is not regularized.

    m - number of training instances, n - number of features

    - fast for large m
    - no out-of-core support
    - slow for large n
    - 1 hyperparameter (alpha)
    """

    def __init__(self):
        # Everything below is populated by fit() / predict().
        self.X_b = None         # training matrix with the bias column
        self.A = None           # identity matrix with A[0, 0] == 0
        self.theta_best = None  # fitted parameters (bias first)
        self.X_test_b = None    # last matrix passed to predict(), with bias column
        self.y_pred = None      # last predictions returned by predict()


    def fit(self, X:np.ndarray, y:np.ndarray, alpha:float=0.1):
        """Computes theta_best for the regularization strength alpha."""
        bias_column = np.ones((X.shape[0], 1))
        self.X_b = np.c_[bias_column, X]

        n_columns = self.X_b.shape[1]
        self.A = np.identity(n_columns)
        self.A[0, 0] = 0  # leave the bias term unpenalized

        design_t = self.X_b.T
        regularized_gram = design_t.dot(self.X_b) + alpha * self.A
        self.theta_best = np.linalg.inv(regularized_gram).dot(design_t).dot(y)


    def predict(self, X_test:np.ndarray) -> np.ndarray:
        """Returns X_test_b . theta_best, where X_test_b is X_test with the bias column."""
        n_rows = X_test.shape[0]
        self.X_test_b = np.c_[np.ones((n_rows, 1)), X_test]
        self.y_pred = self.X_test_b.dot(self.theta_best)
        return self.y_pred

<h1> Tools </h1>

In [28]:
def linear_regression_compare_paths(paths:list, path_labels:list, figsize:tuple=(7,4), legend_loc:str='upper left', legend_fontsize:int=16,
                                   label_fontize:int=20, markers:list=['s', '+', 'o']):
    """
    Draws several two-dimensional theta paths on one figure for comparison.

    paths - sequence of 2-D arrays; column 0 is drawn on the x axis, column 1 on the y axis.
    path_labels - legend label of every path, in the same order as paths.
    markers - marker styles, reused cyclically when there are more paths than markers.
    """

    plt.figure(figsize=figsize)

    marker_count = len(markers)
    for index, path in enumerate(paths):
        plt.plot(
            path[:, 0],
            path[:, 1],
            marker=markers[index % marker_count],
            linewidth=1,
            alpha=0.5,
            label=path_labels[index],
        )

    plt.legend(loc=legend_loc, fontsize=legend_fontsize)
    plt.xlabel(r"$\theta_x$", fontsize=label_fontize)
    plt.ylabel(r"$\theta_y$", fontsize=label_fontize, rotation=0)
    plt.axis()
    plt.show()

In [6]:
def plot_learning_curves(model, X, y, end_iteration:int, start_iteration:int=1, return_errors=False, model_hyperparameters=None,
                         random_state=None):
    """
    Plots learning curves: the RMSE on the training subset and on the validation set (y axis)
    as a function of the training set size (x axis).

    20% of (X, y) is held out for validation. For every size m in
    range(start_iteration, end_iteration) the model is re-fitted on the first m
    training instances and evaluated on those instances and on the validation set.

    model - object exposing fit(X, y, **hyperparameters) and predict(X).
    model_hyperparameters - keyword arguments forwarded to model.fit (None means no extras).
    random_state - forwarded to train_test_split; set it to make the split reproducible.
    return_errors - when True, returns (train_errors, val_errors) as lists of MSE values
                    (the plot shows their square roots).
    """
    fit_kwargs = {} if model_hyperparameters is None else model_hyperparameters

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=random_state)
    train_sizes = list(range(start_iteration, end_iteration))
    train_errors, val_errors = [], []
    for m in train_sizes:
        model.fit(X_train[:m], y_train[:m], **fit_kwargs)
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)
        train_errors.append(mean_squared_error(y_train[:m], y_train_predict))
        val_errors.append(mean_squared_error(y_val, y_val_predict))

    # Plot against the actual training set sizes so the x axis matches its label.
    plt.plot(train_sizes, np.sqrt(train_errors), "r-+", linewidth=2, label="Train")
    plt.plot(train_sizes, np.sqrt(val_errors), "b-", linewidth=3, label="Val")
    plt.legend()
    plt.xlabel('Training set size')
    plt.ylabel('RMSE')
    plt.axis()
    plt.show()

    if return_errors:
        return train_errors, val_errors

In [None]:
def early_stopping(X_train, y_train, X_val, y_val, model, epochs_number):
    """
    Early stopping implementation for iterative learning Linear Regression.

    The model is trained one epoch at a time, each time continuing from its current
    theta (model.fit(X_train, y_train, 1, theta=model.theta)). After every epoch the
    validation RMSE is computed, and a deep copy of the model is kept whenever the
    RMSE improves on the best value seen so far. The validation RMSE is plotted and
    the best epoch and theta are printed.

    model - object exposing a `theta` attribute, fit(X, y, n_epochs, theta=...) and predict(X).
    epochs_number - number of single-epoch training rounds; must be at least 1.

    Returns the copy of the model taken at the epoch with the lowest validation RMSE.
    Raises ValueError when epochs_number is smaller than 1.
    """
    if epochs_number < 1:
        raise ValueError('epochs_number must be at least 1, got %r' % (epochs_number,))

    minimum_val_error = float('inf')
    best_epoch = None
    best_model = None
    val_errors = []
    best_val_errors = []
    best_epochs = []

    for epoch in range(epochs_number):
        # Warm start: continue from the parameters reached in the previous epoch.
        model.fit(X_train, y_train, 1, theta=model.theta)
        y_val_predict = model.predict(X_val)
        val_error = np.sqrt(mean_squared_error(y_val, y_val_predict))
        val_errors.append(val_error)

        if val_error < minimum_val_error:
            minimum_val_error = val_error
            best_epoch = epoch
            # Snapshot the model, because later epochs keep modifying it.
            best_model = copy.deepcopy(model)
            best_val_errors.append(val_error)
            best_epochs.append(best_epoch)

    if best_model is None:
        # Every validation error compared as "not smaller" (e.g. NaN from a diverged model).
        raise ValueError('No epoch produced a finite validation error.')

    plt.plot(best_epochs, best_val_errors, "g .", linewidth=3, label='Best RMSE by epochs')
    plt.plot(np.arange(best_epoch+1, epochs_number), val_errors[best_epoch+1:], "r .", linewidth=1, label='Worser/equal RMSE since last best')
    plt.legend()
    plt.xlim((0, epochs_number-1))
    plt.title('Validation RMSE')
    plt.xlabel('epoch')
    plt.ylabel('RMSE')
    plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))

    plt.show()

    print('best_epoch: ', best_epoch)
    print('best_theta: ', best_model.theta)

    return best_model

In [None]:
def OneDimension_decision_boundary(X, y, x_new, y_proba):
    """
    Plots the class probabilities and the decision boundary of a one-feature dataset.

    X, y - instances and their 0/1 labels, drawn as blue squares (0) and green triangles (1).
    x_new - feature values at which the probabilities were evaluated.
    y_proba - probabilities for x_new; column 0 is drawn as "Class 0", column 1 as "Class 1".

    The boundary is the smallest x_new value whose column-1 probability is at least 0.5;
    it is printed before the figure is drawn.
    """
    class1_likely = y_proba[:, 1] >= 0.5
    decision_boundary = np.min(x_new[class1_likely])
    print(decision_boundary)

    plt.figure(figsize=(8, 3))

    # Training instances, one marker style per class.
    negatives = y == 0
    positives = y == 1
    plt.plot(X[negatives], y[negatives], "bs")
    plt.plot(X[positives], y[positives], "g^")

    # Vertical dotted line at the boundary, followed by the two probability curves.
    boundary_xs = [decision_boundary, decision_boundary]
    plt.plot(boundary_xs, [-1, 2], "k:", linewidth=2)
    plt.plot(x_new, y_proba[:, 1], "g-", linewidth=2, label="Class 1")
    plt.plot(x_new, y_proba[:, 0], "b--", linewidth=2, label="Class 0")

    # Annotation and arrows pointing towards each class region.
    plt.text(decision_boundary+0.02, 0.15, "Decision  boundary", fontsize=14, color="k", ha="center")
    plt.arrow(decision_boundary, 0.08, -0.3, 0, head_width=0.05, head_length=0.1, fc='b', ec='b')
    plt.arrow(decision_boundary, 0.92, 0.3, 0, head_width=0.05, head_length=0.1, fc='g', ec='g')

    plt.xlabel("Feature value", fontsize=14)
    plt.ylabel("Probability", fontsize=14)
    plt.legend(loc="center left", fontsize=14)
    plt.ylim([-0.02, 1.02])
    plt.show()

<h1> Logistic Regression </h1>

The Gradient Descent algorithm can also be used for logistic regression. Ridge, Lasso and Elastic Net regularization also work here, just as they do for other linear models.

In [8]:
def logistic_function(t):
    """
    Logistic (sigmoid) function 1 / (1 + exp(-t)), evaluated without overflow.

    t - scalar or array-like of real numbers.

    Returns a NumPy scalar for scalar input, otherwise an array of the same shape as t.
    """
    t = np.asarray(t, dtype=float)
    # exp() only ever receives non-positive values here, so it cannot overflow.
    decay = np.exp(-np.abs(t))
    # For t >= 0: 1 / (1 + exp(-t)); for t < 0 the equivalent form exp(t) / (1 + exp(t)).
    sigma = np.where(t >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with an empty tuple turns a 0-d result into a scalar and leaves arrays as they are.
    return sigma[()]

<h3> Batch Gradient Descent </h3>

In [5]:
class logistic_regression_BGD:
    """
    Implementation of logistic regression using Batch Gradient Descent.

    Binary classifier: labels are expected to be 0 or 1, and the estimated
    probability of class 1 is logistic_function(X_b . theta), where X_b is X
    with a leading column of ones.
    """

    # Values accepted by the `regularization` argument of fit().
    _REGULARIZATIONS = (None, 'ridge', 'lasso', 'elastic')

    def __init__(self):
        self.X_b = None           # training matrix with the bias column
        self.X_test_b = None      # last matrix passed to predict()/predict_proba(), with bias column
        self.m = None             # number of training instances
        self.theta_path = []      # theta after every update (initial theta first)
        self.gradient = None      # gradient used in the last update
        self.theta = None         # current parameters (bias first)
        self.y_pred = None        # last class predictions returned by predict()
        self.y_pred_proba = None  # last estimated probabilities of class 1


    @staticmethod
    def _add_bias(X:np.ndarray) -> np.ndarray:
        """Returns X with a leading column of ones (the bias feature)."""
        return np.c_[np.ones((X.shape[0], 1)), X]


    @staticmethod
    def _penalty_gradient(theta:np.ndarray, regularization:str, alpha:float, mix_ratio:float) -> np.ndarray:
        """Returns the (sub)gradient of the regularization term; the bias theta[0] is never penalized."""
        penalty = np.zeros(np.shape(theta))
        weights = theta[1:]
        if regularization == 'ridge':
            penalty[1:] = alpha * weights
        elif regularization == 'lasso':
            penalty[1:] = alpha * np.sign(weights)
        elif regularization == 'elastic':
            penalty[1:] = mix_ratio * alpha * np.sign(weights) + (1 - mix_ratio) * alpha * weights
        return penalty


    def _positive_proba(self, X_test:np.ndarray) -> np.ndarray:
        """Stores and returns the estimated probability of class 1 for every row of X_test."""
        self.X_test_b = self._add_bias(X_test)
        self.y_pred_proba = np.array(logistic_function(self.X_test_b.dot(self.theta)))
        return self.y_pred_proba


    def fit(self, X:np.ndarray, y:np.ndarray, n_iterations:int, theta:np.ndarray=None, eta:float=0.01,
            eta_reducer:float=1.0, debugger:bool=True, alpha:float=0.1, mix_ratio:float=0.5, regularization:str=None):
        """
        Runs n_iterations full-batch gradient steps on the log loss.

        theta - starting parameters (bias first). None starts from zeros, sized to
                the number of columns of X plus one.
        eta - initial learning rate.
        eta_reducer - factor the learning rate is multiplied by after every iteration
                (1.0 keeps it constant).
        debugger - when True, a warning is issued and the descent stops as soon as
                theta contains a non-finite value.

        regularization = [None, 'ridge', 'lasso', 'elastic']
        - ridge, lasso, elastic requires to set alpha.
        - elastic requires to set mix_ratio
        - lasso path tends to bounce when some of theta's components reach 0 (the slope
        changes abruptly there), so it is a good idea to set eta_reducer < 1 to gradually
        reduce eta in order to converge to the global minimum.

        Raises ValueError for an unknown regularization name.
        """
        if regularization not in self._REGULARIZATIONS:
            raise ValueError("regularization must be one of [None, 'ridge', 'lasso', 'elastic'], got %r" % (regularization,))

        y = np.asarray(y)
        self.X_b = self._add_bias(X)
        self.m = self.X_b.shape[0]

        if theta is None:
            self.theta = np.zeros(self.X_b.shape[1])
        else:
            self.theta = np.asarray(theta, dtype=float)
        self.theta_path = [self.theta]

        step_size = eta
        for _ in range(n_iterations):
            errors = logistic_function(self.X_b.dot(self.theta)) - y
            self.gradient = 1/self.m * self.X_b.T.dot(errors)
            if regularization is not None:
                self.gradient = self.gradient + self._penalty_gradient(self.theta, regularization, alpha, mix_ratio)

            self.theta = self.theta - step_size * self.gradient
            self.theta_path.append(self.theta)
            # Decay the learning rate itself; theta must not be rescaled.
            step_size = step_size * eta_reducer

            if debugger and not np.isfinite(self.theta).all():
                warnings.warn('Infinite value of theta. Further calculations may lead to errors. Path ended.', Warning)
                break

        self.theta_path = np.array(self.theta_path)


    def predict_proba(self, X_test:np.ndarray) -> np.ndarray:
        """Returns an (instances, 2) array: column 0 is P(class 0), column 1 is P(class 1)."""
        positive = self._positive_proba(X_test)
        # Columns follow the class labels, so column k holds the probability of class k.
        return np.column_stack((1 - positive, positive))

    def predict(self, X_test:np.ndarray) -> np.ndarray:
        """Returns 1.0 where the estimated probability of class 1 is at least 0.5, else 0.0."""
        positive = self._positive_proba(X_test)
        self.y_pred = (positive >= 0.5).astype(float)
        return self.y_pred

<h3> Stochastic Gradient Descent </h3>

In [None]:
class logistic_regression_SGD:
    """
    Implementation of logistic regression using Stochastic Gradient Descent.

    Binary classifier: labels are expected to be 0 or 1, and the estimated
    probability of class 1 is logistic_function(X_b . theta), where X_b is X
    with a leading column of ones.
    """

    # Values accepted by the `regularization` argument of fit().
    _REGULARIZATIONS = (None, 'ridge', 'lasso', 'elastic')

    def __init__(self):
        self.X_b = None            # training matrix with the bias column
        self.X_test_b = None       # last matrix passed to predict()/predict_proba(), with bias column
        self.m = None              # number of training instances
        self.theta_path = []       # theta after every update (initial theta first)
        self.gradient = None       # gradient used in the last update
        self.theta = None          # current parameters (bias first)
        self.y_pred = None         # last class predictions returned by predict()
        self.y_pred_proba = None   # last estimated probabilities of class 1
        self.random_sample = None  # index of the last sampled training instance
        self.x_i = None            # last sampled row of X_b
        self.y_i = None            # label of the last sampled instance
        self.eta = None            # learning rate used in the last update


    @staticmethod
    def _add_bias(X:np.ndarray) -> np.ndarray:
        """Returns X with a leading column of ones (the bias feature)."""
        return np.c_[np.ones((X.shape[0], 1)), X]


    @staticmethod
    def _penalty_gradient(theta:np.ndarray, regularization:str, alpha:float, mix_ratio:float) -> np.ndarray:
        """Returns the (sub)gradient of the regularization term; the bias theta[0] is never penalized."""
        penalty = np.zeros(np.shape(theta))
        weights = theta[1:]
        if regularization == 'ridge':
            penalty[1:] = alpha * weights
        elif regularization == 'lasso':
            penalty[1:] = alpha * np.sign(weights)
        elif regularization == 'elastic':
            penalty[1:] = mix_ratio * alpha * np.sign(weights) + (1 - mix_ratio) * alpha * weights
        return penalty


    def _positive_proba(self, X_test:np.ndarray) -> np.ndarray:
        """Stores and returns the estimated probability of class 1 for every row of X_test."""
        self.X_test_b = self._add_bias(X_test)
        self.y_pred_proba = np.array(logistic_function(self.X_test_b.dot(self.theta)))
        return self.y_pred_proba


    def fit(self, X:np.ndarray, y:np.ndarray,  n_epochs:int, theta:np.ndarray=None,
            t0:float=10, t1:float=100, debugger:bool=True, alpha:float=0.1,
            mix_ratio:float=0.5, regularization:str=None):
        """
        Runs n_epochs * m single-instance gradient steps on the log loss.

        Every step draws one training instance with np.random.randint (i.e. with
        replacement; seed NumPy's global generator for reproducible runs) and uses
        the learning rate t0 / (t + t1), where t counts the steps done so far.

        theta - starting parameters (bias first). None starts from zeros, sized to
                the number of columns of X plus one.
        debugger - when True, a warning is issued and the descent stops as soon as
                theta contains a non-finite value.

        regularization = [None, 'ridge', 'lasso', 'elastic']
        - ridge, lasso, elastic requires to set alpha.
        - elastic requires to set mix_ratio

        Raises ValueError for an unknown regularization name.
        """
        if regularization not in self._REGULARIZATIONS:
            raise ValueError("regularization must be one of [None, 'ridge', 'lasso', 'elastic'], got %r" % (regularization,))

        y = np.asarray(y)
        self.X_b = self._add_bias(X)
        self.m = self.X_b.shape[0]

        if theta is None:
            self.theta = np.zeros(self.X_b.shape[1])
        else:
            self.theta = np.asarray(theta, dtype=float)
        self.theta_path = [self.theta]

        diverged = False
        for epoch in range(n_epochs):
            for step in range(self.m):
                self.random_sample = np.random.randint(0, self.m)
                self.x_i = self.X_b[self.random_sample]
                self.y_i = y[self.random_sample]

                error = logistic_function(self.x_i.dot(self.theta)) - self.y_i
                self.gradient = self.x_i.T.dot(error)
                if regularization is not None:
                    self.gradient = self.gradient + self._penalty_gradient(self.theta, regularization, alpha, mix_ratio)

                # Learning schedule: the step size shrinks as more updates are done.
                self.eta = t0 / (epoch * self.m + step + t1)
                self.theta = self.theta - self.eta * self.gradient
                self.theta_path.append(self.theta)

                if debugger and not np.isfinite(self.theta).all():
                    warnings.warn('Infinite value of theta. Further calculations may lead to errors. Path ended.', Warning)
                    diverged = True
                    break

            if diverged:
                break

        self.theta_path = np.array(self.theta_path)

    def predict_proba(self, X_test:np.ndarray) -> np.ndarray:
        """Returns an (instances, 2) array: column 0 is P(class 0), column 1 is P(class 1)."""
        positive = self._positive_proba(X_test)
        # Columns follow the class labels, so column k holds the probability of class k.
        return np.column_stack((1 - positive, positive))

    def predict(self, X_test:np.ndarray) -> np.ndarray:
        """Returns 1.0 where the estimated probability of class 1 is at least 0.5, else 0.0."""
        positive = self._positive_proba(X_test)
        self.y_pred = (positive >= 0.5).astype(float)
        return self.y_pred

<h3> Mini-batch Gradient Descent </h3>

In [None]:
class logistic_regression_MbGD:
    """
    Implementation of logistic regression using Mini-batch Gradient Descent.

    Binary classifier: labels are expected to be 0 or 1, and the estimated
    probability of class 1 is logistic_function(X_b . theta), where X_b is X
    with a leading column of ones.
    """

    # Values accepted by the `regularization` argument of fit().
    _REGULARIZATIONS = (None, 'ridge', 'lasso', 'elastic')

    def __init__(self):
        self.X_b = None             # training matrix with the bias column
        self.X_test_b = None        # last matrix passed to predict()/predict_proba(), with bias column
        self.m = None               # number of training instances
        self.theta_path = []        # theta after every update (initial theta first)
        self.gradient = None        # gradient used in the last update
        self.theta = None           # current parameters (bias first)
        self.y_pred = None          # last class predictions returned by predict()
        self.y_pred_proba = None    # last estimated probabilities of class 1
        self.random_samples = None  # row indices of the last mini-batch
        self.x_i = None             # rows of X_b in the last mini-batch
        self.y_i = None             # labels of the last mini-batch
        self.batch_size = None      # number of instances per mini-batch
        self.eta = None             # learning rate used in the last update


    @staticmethod
    def _add_bias(X:np.ndarray) -> np.ndarray:
        """Returns X with a leading column of ones (the bias feature)."""
        return np.c_[np.ones((X.shape[0], 1)), X]


    @staticmethod
    def _penalty_gradient(theta:np.ndarray, regularization:str, alpha:float, mix_ratio:float) -> np.ndarray:
        """Returns the (sub)gradient of the regularization term; the bias theta[0] is never penalized."""
        penalty = np.zeros(np.shape(theta))
        weights = theta[1:]
        if regularization == 'ridge':
            penalty[1:] = alpha * weights
        elif regularization == 'lasso':
            penalty[1:] = alpha * np.sign(weights)
        elif regularization == 'elastic':
            penalty[1:] = mix_ratio * alpha * np.sign(weights) + (1 - mix_ratio) * alpha * weights
        return penalty


    def _positive_proba(self, X_test:np.ndarray) -> np.ndarray:
        """Stores and returns the estimated probability of class 1 for every row of X_test."""
        self.X_test_b = self._add_bias(X_test)
        self.y_pred_proba = np.array(logistic_function(self.X_test_b.dot(self.theta)))
        return self.y_pred_proba


    def fit(self, X:np.ndarray, y:np.ndarray, n_epochs:int, batch_size_ratio:float=0.1, theta:np.ndarray=None,
            t0:float=10, t1:float=100, debugger:bool=True, alpha:float=0.1, mix_ratio:float=0.5, regularization:str=None):
        """
        Runs n_epochs * m mini-batch gradient steps on the log loss.

        Every step draws ceil(batch_size_ratio * m) distinct training instances with
        random.sample (seed the standard `random` module for reproducible runs) and
        uses the learning rate t0 / (t + t1), where t counts the steps done so far.

        NOTE(review): each epoch performs m updates rather than m / batch_size, so one
        "epoch" visits about m * batch_size instances. Kept as is for compatibility.

        batch_size_ratio - fraction of the training set used per mini-batch; must be
                positive, values above 1 are capped at the whole training set.
        theta - starting parameters (bias first). None starts from zeros, sized to
                the number of columns of X plus one.
        debugger - when True, a warning is issued and the descent stops as soon as
                theta contains a non-finite value.

        regularization = [None, 'ridge', 'lasso', 'elastic']
        - ridge, lasso, elastic requires to set alpha.
        - elastic requires to set mix_ratio

        Raises ValueError for an unknown regularization name or a non-positive
        batch_size_ratio.
        """
        if regularization not in self._REGULARIZATIONS:
            raise ValueError("regularization must be one of [None, 'ridge', 'lasso', 'elastic'], got %r" % (regularization,))
        if batch_size_ratio <= 0:
            raise ValueError("batch_size_ratio must be positive, got %r" % (batch_size_ratio,))

        y = np.asarray(y)
        # Only the training data is needed here; test data belongs to predict().
        self.X_b = self._add_bias(X)
        self.m = self.X_b.shape[0]

        if theta is None:
            self.theta = np.zeros(self.X_b.shape[1])
        else:
            self.theta = np.asarray(theta, dtype=float)
        self.theta_path = [self.theta]

        # random.sample cannot draw more distinct instances than the training set holds.
        self.batch_size = min(int(np.ceil(batch_size_ratio * self.m)), self.m)

        diverged = False
        for epoch in range(n_epochs):
            for step in range(self.m):
                self.random_samples = random.sample(range(0, self.m), self.batch_size)
                self.x_i = self.X_b[self.random_samples]
                self.y_i = y[self.random_samples]

                errors = logistic_function(self.x_i.dot(self.theta)) - self.y_i
                # Log-loss gradient averaged over the batch (no factor 2, unlike the MSE gradient).
                self.gradient = 1 / self.batch_size * self.x_i.T.dot(errors)
                if regularization is not None:
                    self.gradient = self.gradient + self._penalty_gradient(self.theta, regularization, alpha, mix_ratio)

                # Learning schedule: the step size shrinks as more updates are done.
                self.eta = t0 / (epoch * self.m + step + t1)
                self.theta = self.theta - self.eta * self.gradient
                self.theta_path.append(self.theta)

                if debugger and not np.isfinite(self.theta).all():
                    warnings.warn('Infinite value of theta. Further calculations may lead to errors. Path ended.', Warning)
                    diverged = True
                    break

            if diverged:
                break

        self.theta_path = np.array(self.theta_path)


    def predict_proba(self, X_test:np.ndarray) -> np.ndarray:
        """Returns an (instances, 2) array: column 0 is P(class 0), column 1 is P(class 1)."""
        positive = self._positive_proba(X_test)
        # Columns follow the class labels, so column k holds the probability of class k.
        return np.column_stack((1 - positive, positive))

    def predict(self, X_test:np.ndarray) -> np.ndarray:
        """Returns 1.0 where the estimated probability of class 1 is at least 0.5, else 0.0."""
        positive = self._positive_proba(X_test)
        self.y_pred = (positive >= 0.5).astype(float)
        return self.y_pred