In [2]:
import numpy as np
import random
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import warnings
from matplotlib.ticker import MaxNLocator
from matplotlib.colors import ListedColormap
from typing import Tuple
from itertools import cycle
from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from diagrams import Diagram, Edge, Node
import itertools
import random
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# from typing import Annotated

In [3]:
import warnings
warnings.simplefilter('always', Warning)

<h1> Linear Regression </h1>

Regularization:

- It is almost always preferable to have at least a little bit of regularization -> generally avoid plain Linear Regression.

- Ridge is a good default, but

- if only a few features are useful, you should prefer Lasso or Elastic Net -> they tend to reduce the useless features’ weights down to zero.

- Elastic Net is preferred over Lasso -> Lasso may behave erratically when the number of features is greater than the number of training instances or when several features are strongly correlated.

~ Aurélien Géron

<h3> Normal Equation </h3>

In [7]:
class linear_regression_NE:
    """
    Implementation of linear regression using Normal Equation.

    m - number of training instances, n - number of features

    - fast for large m
    - no out-of-core support
    - slow for large n
    - 0 hyperparameters
    - no scaling required
    """
    
    def __init__(self):
        self.theta_best = None
        
        
    def fit(self, X:np.ndarray, y:np.ndarray):
        X_b = np.c_[np.ones((X.shape[0], 1)), X]
        self.theta_best = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y)
        
        
    def predict(self, X_test:np.ndarray) -> np.ndarray:
        X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]
        y_pred = X_test_b.dot(self.theta_best)
        return y_pred

<h3> Singular Value Decomposition </h3>

In [None]:
class linear_regression_SVD:
    """
    Simplified implementation of linear regression using Singular Value Decomposition.
    
    - fast for large m
    - no out-of-core support
    - slow for large n
    - 0 hyperparameters
    - no scaling required
    - from sklearn.linear_model import LinearRegression
    """
    
    def __init__(self):
        self.theta = None
        
        
    def fit(self, X:np.ndarray, y:np.ndarray, threshold:float=0.0001):
        
    
        X_b = np.c_[np.ones((X.shape[0], 1)), X]

        # Calculate SVD
        U, E_vec, V_t = np.linalg.svd(X_b)

        # Calculate pseudoinverse

        for i in range(len(E_vec)):
            if E_vec[i] < threshold:
                E_vec[i] = 0
            else:
                E_vec[i] = 1 / E_vec[i]

        E_vec[E_vec < threshold] = 0
        E_ = np.vstack([np.diag(E_vec), np.zeros([X_b.shape[0] - len(np.diag(E_vec)), X_b.shape[1]])])
        X_b_ = V_t.T.dot(E_.T).dot(U.T)

        # Calculate theta
        self.theta = X_b_.dot(y)
        
        
    def predict(self, X_test:np.ndarray) -> np.ndarray:
        X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]
        X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]
        y_pred = X_test_b.dot(self.theta)
        return y_pred

<h3> Batch Gradient Descent </h3>

In [95]:
class linear_regression_BGD:
    """
    Implementation of linear regression using Batch Gradient Descent.

    - slow for large m1. Linear Regression vs Ridge Regression vs Lasso Regression vs Elastic Net Regression.

    - no out-of-core support
    - fast for large n
    - 2 hyperparameters
    - scaling required
    """
    
    def __init__(self):
        self.X_test_b = None
        self.theta_path = []
        self.theta = None
        
        
    def fit(self, X:np.ndarray, y:np.ndarray, n_iterations:int, theta:np.ndarray=np.array([0,0]), eta:float=0.01,
            eta_reducer:float=1.0, debugger:bool=True, alpha:float=0.1, mix_ratio:float=0.5, regularization:str=None):
        """
        regularization = [None, 'ridge', 'lasso', 'elastic']
        - ridge, lasso, elastic requires to set alpha.
        - elastic requires to set mix_ratio
        - lasso path tends to bounds, when some theta's numbers changes to 0 (then slopes changes abruptly).
        So it's good idea to set eta_reducer to gradually reduce eta in order to converge to the global minimum.
        """
        X_b = np.c_[np.ones((X.shape[0], 1)), X]

        m = X_b.shape[0]
        
        if type(theta) == type(None):
        
            self.theta = np.array([0] * (X.shape[1] + 1))
            self.theta_path = [self.theta]
            
        else:
            
            self.theta = theta
            self.theta_path = [theta]

        for i in range(n_iterations):
            if regularization == None:
                gradient = 2/m * X_b.T.dot(X_b.dot(self.theta) - y)
            elif regularization == 'ridge':
                gradient = 2/m * X_b.T.dot(X_b.dot(self.theta) - y) + alpha * np.array([0, *self.theta[1:]])
            elif regularization == 'lasso':
                gradient = 2/m * X_b.T.dot(X_b.dot(self.theta) - y) + alpha * np.array([0, *np.sign(self.theta[1:])])
            elif regularization == 'elastic':
                gradient = 2/m * X_b.T.dot(X_b.dot(self.theta) - y) + mix_ratio * alpha * np.array([0, *np.sign(self.theta[1:])]) + (1 - mix_ratio) * alpha * np.array([0, *self.theta[1:]])
            
            self.theta = eta_reducer * (self.theta - eta * gradient)
            self.theta_path.append(self.theta)
            
            if debugger and not np.isfinite(self.theta).all():
                warnings.warn('Infinite value of theta. Further calculations may lead to errors. Path ended.', Warning)
                break

        self.theta_path = np.array(self.theta_path)
    
    
    def predict(self, X_test:np.ndarray) -> np.ndarray:
        self.X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]
        y_pred = self.X_test_b.dot(self.theta)
        return y_pred

<h3> Stochastic Gradient Descent </h3>

In [None]:
class linear_regression_SGD:
    """
    Implementation of linear regression using Stochastic Gradient Descent.

    - fast for large m
    - out-of-core support
    - fast for large n
    - 2 or more hyperparameters
    - scaling required
    - from sklearn.linear_model import SGDRegressor
    """
    
    def __init__(self):
        self.X_test_b = None
        self.theta_path = []
        self.theta = None
        
        
    def fit(self, X:np.ndarray, y:np.ndarray,  n_epochs:int, theta:np.ndarray=np.array([0,0]),
            t0:float=10, t1:float=100, debugger:bool=True, alpha:float=0.1,
            mix_ratio:float=0.5, regularization:str=None, random_state=None):
        """
        regularization = [None, 'ridge', 'lasso', 'elastic']
        - ridge, lasso, elastic requires to set alpha.
        - elastic requires to set mix_ratio
        """
        
        random.seed(random_state)
        
        def learning_rate(t):
            return t0 / (t + t1)

        X_b = np.c_[np.ones((X.shape[0], 1)), X]
        m = X.shape[0]
        
        if type(theta) == type(None):
        
            self.theta = np.array([0] * (X.shape[1] + 1))
            self.theta_path = [self.theta]
            
        else:
            
            self.theta = theta
            self.theta_path = [theta]

        for i in range(n_epochs):
            for j in range(m):
                random_sample = np.random.randint(0, m)
                x_i = X_b[random_sample]
                y_i = y[random_sample]
                
                if regularization == None:
                    gradient = 2 * x_i.T.dot(x_i.dot(self.theta) - y_i)
                elif regularization == 'ridge':
                    gradient = 2 * x_i.T.dot(x_i.dot(self.theta) - y_i) + alpha * np.array([0, *self.theta[1:]])
                elif regularization == 'lasso':
                    gradient = 2 * x_i.T.dot(x_i.dot(self.theta) - y_i) + alpha * np.array([0, *np.sign(self.theta[1:])])
                elif regularization == 'elastic':
                    gradient = 2 * x_i.T.dot(x_i.dot(self.theta) - y_i) + mix_ratio * alpha * np.array([0, *np.sign(self.theta[1:])]) + (1 - mix_ratio) * alpha * np.array([0, *self.theta[1:]])
                
                self.eta = learning_rate(i * m + j)
                self.theta = self.theta - self.eta * gradient
                self.theta_path.append(self.theta)
                
                if debugger and not np.isfinite(self.theta).all():
                    warnings.warn('Infinite value of theta. Further calculations may lead to errors. Path ended.', Warning)
                    break
                    
            else:
                continue
            break
            
        self.theta_path = np.array(self.theta_path)
    
    def predict(self, X_test:np.ndarray) -> np.ndarray:
        self.X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]
        y_pred = self.X_test_b.dot(self.theta)
        return y_pred

<h2> Mini-batch Gradient Descent </h2>

In [None]:
class linear_regression_MbGD:
    """
    Implementation of linear regression using Mini-batch Gradient Descent.

    - fast for large m
    - out-of-core support
    - fast for large n
    - 2 or more hyperparameters
    - scaling required
    - from sklearn.linear_model import SGDRegressor
    """
    
    def __init__(self):
        self.X_test_b = None
        self.theta_path = []
        self.theta = None
        
        
    def fit(self, X:np.ndarray, y:np.ndarray, n_epochs:int, batch_size_ratio:float=0.1, theta:np.ndarray=np.array([0,0]),
            t0:float=10, t1:float=100, debugger:bool=True, alpha:float=0.1, mix_ratio:float=0.5, regularization:str=None, random_state=None):
        """
        regularization = [None, 'ridge', 'lasso', 'elastic']
        - ridge, lasso, elastic requires to set alpha.
        - elastic requires to set mix_ratio
        """
        
        random.seed(random_state)
        
        def learning_rate(t):
            return t0 / (t + t1)

        X_b = np.c_[np.ones((X.shape[0], 1)), X]
        m = X.shape[0]

        if type(theta) == type(None):
        
            self.theta = np.array([0] * (X.shape[1] + 1))
            self.theta_path = [self.theta]
            
        else:
            
            self.theta = theta
            self.theta_path = [theta]
        
        self.batch_size = int(np.ceil(batch_size_ratio*len(X)))
        
        for i in range(n_epochs):
            for j in range(m):
                random_samples = random.sample(range(0, m), self.batch_size)
                x_i = X_b[random_samples]
                y_i = y[random_samples]
                
                if regularization == None:
                    gradient = 2 / self.batch_size * x_i.T.dot(x_i.dot(self.theta) - y_i)
                elif regularization == 'ridge':
                    gradient = 2 / self.batch_size * x_i.T.dot(x_i.dot(self.theta) - y_i) + alpha * np.array([0, *self.theta[1:]])
                elif regularization == 'lasso':
                    gradient = 2 / self.batch_size * x_i.T.dot(x_i.dot(self.theta) - y_i) + alpha * np.array([0, *np.sign(self.theta[1:])])
                elif regularization == 'elastic':
                    gradient = 2 / self.batch_size * x_i.T.dot(x_i.dot(self.theta) - y_i) + mix_ratio * alpha * np.array([0, *np.sign(self.theta[1:])]) + (1 - mix_ratio) * alpha * np.array([0, *self.theta[1:]])
                
                self.eta = learning_rate(i * m + j)
                self.theta = self.theta - self.eta * gradient
                self.theta_path.append(self.theta)
                
                if debugger and not np.isfinite(self.theta).all():
                    warnings.warn('Infinite value of theta. Further calculations may lead to errors. Path ended.', Warning)
                    break
                
            else:
                continue
            break
            
        self.theta_path = np.array(self.theta_path)
    
    
    def predict(self, X_test:np.ndarray) -> np.ndarray:
        self.X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]
        y_pred = self.X_test_b.dot(self.theta)
        return y_pred

<h2> Ridge Regression - Closed-form </h2>

In [None]:
class ridge_regression_Cf:
    """
    Implementation of ridge regression using Closed-form.

    m - number of training instances, n - number of features

    - fast for large m
    - no out-of-core support
    - slow for large n
    - 0 hyperparameters
    - no scaling required
    """
    
    def __init__(self):
        self.theta_best = None
        
        
    def fit(self, X:np.ndarray, y:np.ndarray, alpha:float=0.1):
        X_b = np.c_[np.ones((X.shape[0], 1)), X]
        A = np.identity(X_b.shape[1])
        A[0][0] = 0
        self.theta_best = np.linalg.inv(X_b.T.dot(X_b) + alpha * A).dot(X_b.T).dot(y)
        
        
    def predict(self, X_test:np.ndarray) -> np.ndarray:
        X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]
        y_pred = X_test_b.dot(self.theta_best)
        return y_pred

<h1> Tools </h1>

In [28]:
def linear_regression_compare_paths(paths:list, path_labels:list, figsize:tuple=(7,4), legend_loc:str='upper left', legend_fontsize:int=16,
                                   label_fontize:int=20, markers:list=['s', '+', 'o']):
    """
    Plots theta (two-dimensional) paths for different linear regression implementations.
    """
    
    plt.figure(figsize=figsize)
    for i in range(len(paths)):
        marker = markers[i%len(markers)]
        path_label = 'path ' + str(i)
        plt.plot(paths[i][:, 0], paths[i][:, 1], marker=marker, linewidth=1, alpha=0.5, label=path_labels[i])

    plt.legend(loc=legend_loc, fontsize=legend_fontsize)
    plt.xlabel(r"$\theta_x$", fontsize=label_fontize)
    plt.ylabel(r"$\theta_y$", fontsize=label_fontize, rotation=0)
    plt.axis()
    plt.show()

In [96]:
def plot_learning_curves(model, X, y, end_iteration:int, start_iteration:int=1, return_errors=False, model_hyperparameters={}, random_state=None):
    """
    Plots learning curves- functions where y: performance on training set and validation set, x: training set size 
    """

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=random_state)
    train_errors, val_errors = [], []
    for m in range(start_iteration, end_iteration):
        model.fit(X_train[:m], y_train[:m], **model_hyperparameters)
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)
        train_errors.append(mean_squared_error(y_train[:m], y_train_predict))
        val_errors.append(mean_squared_error(y_val, y_val_predict))
    plt.plot(np.sqrt(train_errors), "r-+", linewidth=2, label="Train")
    plt.plot(np.sqrt(val_errors), "b-", linewidth=3, label="Val")
    plt.legend()
    plt.xlabel('Training set size')
    plt.ylabel('RMSE')
    plt.axis()
    plt.show()
    
    if return_errors:
        return train_errors, val_errors

In [None]:
def early_stopping(X_train, y_train, X_val, y_val, model, epochs_number):
    """
    Early stopping implementation for iterative learning Linear Regression.
    """
    minimum_val_error = float('inf')
    best_epoch = None
    best_model = None
    val_errors = []
    best_val_errors = []
    best_epochs = []
    
    
    for epoch in range(epochs_number):
        model.fit(X_train, y_train, 1, theta=model.theta)
        y_val_predict = model.predict(X_val)
        val_error = np.sqrt(mean_squared_error(y_val, y_val_predict))
        val_errors.append(val_error)
    
        if val_error < minimum_val_error:
            minimum_val_error = val_error
            best_epoch = epoch
            best_model = copy.deepcopy(model)
            best_val_errors.append(val_error)
            best_epochs.append(best_epoch)
            
    plt.plot(best_epochs, best_val_errors, "g .", linewidth=3, label='Best RMSE by epochs')
    plt.plot(np.arange(best_epoch+1, epochs_number), val_errors[best_epoch+1:], "r .", linewidth=1, label='Worser/equal RMSE since last best')
    plt.legend()
    plt.xlim((0, epochs_number-1))
    plt.title('Validation RMSE')
    plt.xlabel('epoch')
    plt.ylabel('RMSE')
    plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
    
    plt.show()
    
    print('best_epoch: ', best_epoch)
    print('best_theta: ', best_model.theta)
    
    return best_model

In [2]:
def decision_boundary(X, y, fitted_model):
    """
    Plots decision boundary for 1,2 or 3 feature dataset.
    """
    if X.shape[1] == 1:
        
        x_new = np.linspace(np.min(X), np.max(X), 1000).reshape(-1, 1)
        y_proba = fitted_model.predict_proba(x_new)
        decision_boundary = x_new[y_proba[:, 1] >= 0.5][0]

        plt.figure(figsize=(8, 3))
        plt.plot(X[y==0], y[y==0], "bs")
        plt.plot(X[y==1], y[y==1], "g^")
        plt.plot([decision_boundary, decision_boundary], [-1, 2], "k:", linewidth=2)

        plt.plot(x_new, y_proba[:, 1], "g--", linewidth=2, label="Class 1")
        plt.plot(x_new, y_proba[:, 0], "b-", linewidth=2, label="Class 0")

        plt.text(decision_boundary+0.02, 0.15, "Decision  boundary", fontsize=14, color="k", ha="center")
        plt.arrow(decision_boundary, 0.08, -0.3, 0, head_width=0.05, head_length=0.1, fc='b', ec='b')
        plt.arrow(decision_boundary, 0.92, 0.3, 0, head_width=0.05, head_length=0.1, fc='g', ec='g')
        plt.xlabel("Feature value", fontsize=14)
        plt.ylabel("Probability", fontsize=14)
        plt.legend(loc="center left", fontsize=14)
        plt.ylim([-0.02, 1.02])
        plt.show()
    elif X.shape[1] == 2:

        theta = fitted_model.theta

        plt.figure(figsize=(10, 4))

        left_right = np.array([np.min(X[:,0]), np.max(X[:,0])])
        down_up = np.array([np.min(X[:,1]), np.max(X[:,1])])

        x0, x1 = np.meshgrid(
                np.linspace(left_right[0], left_right[1], 500).reshape(-1, 1),
                np.linspace(down_up[0], down_up[1], 500).reshape(-1, 1),
            )
        X_new = np.c_[x0.ravel(), x1.ravel()]

        y_proba = fitted_model.predict_proba(X_new)

        zz = y_proba[:, 1].reshape(x0.shape)
        contour = plt.contour(x0, x1, zz, cmap=plt.cm.brg)
        plt.clabel(contour, inline=1, fontsize=12)

        plt.plot(X[y==0, 0], X[y==0, 1], "bs")
        plt.plot(X[y==1, 0], X[y==1, 1], "g^")

        boundary = -(theta[1] * left_right + theta[0]) / theta[2] # x_2  = -c / b - a / b * x_1 https://www.youtube.com/watch?v=3qzWeokRYTA&ab_channel=AppliedAICourse

        plt.plot(left_right, boundary, "k--", linewidth=3)

        plt.xlabel("Feature 1", fontsize=14)
        plt.ylabel("Feature 2", fontsize=14)

        plt.show()
            
    elif X.shape[1] == 3:
            
       # Interactions with plot may not work with jupyter lab. Please use jupyter notebook instead.

        theta = fitted_model.theta

        fig = plt.figure()
        ax = fig.gca(projection='3d')
        ax.set_aspect("auto")

        c = np.zeros_like(X[:,1], dtype=float)
        c[y == 1] = 1

        ax.scatter(X[:,0], X[:,1], X[:,2], c=c, cmap="brg")

        x = np.linspace(np.min(X[:,0]), np.max(X[:,0]), 100)
        y = np.linspace(np.min(X[:,1]), np.max(X[:,1]), 100)

        x, y = np.meshgrid(x,y)
        boundary = -(theta[1] * x + theta[2] * y + theta[0]) / theta[3]

        ax.plot_surface(x, y, boundary)

        ax.set_title("3D plot")
        ax.set_xlabel('Feature 1')
        ax.set_ylabel('Feature 2')
        ax.set_zlabel('Feature 3')

        plt.show()
            
    else: 
        warnings.warn('Wrong number of features. Please use dataset with 1,2 or 3 features instead.', Warning)

In [None]:
def decision_boundary_multiclass(X:np.ndarray, y:np.ndarray, n_classes:int):
    """
    Decision boundary for 2 featuers for multi-class target value.
    """
    
    left_right = [np.min(X[:,0]), np.max(X[:,0])]
    down_up = [np.min(X[:,1]), np.max(X[:,1])]
    
    x0, x1 = np.meshgrid(
        np.linspace(left_right[0], left_right[1], 500).reshape(-1, 1),
        np.linspace(down_up[0], down_up[1], 500).reshape(-1, 1),
    )
    
    X_new = np.c_[x0.ravel(), x1.ravel()]
    
    y_proba = softmax_reg.predict_proba(X_new)
    y_predict = softmax_reg.predict(X_new)

    zz1 = y_proba[:, 1].reshape(x0.shape)
    zz = y_predict.reshape(x0.shape)

    plt.figure(figsize=(10, 4))
    
    for i in range(n_classes):
        label = 'Class ' + str(i)
        plt.plot(X[y==i, 0], X[y==i, 1], ".", label=label)

    plt.contourf(x0, x1, zz, cmap='Pastel1')
    contour = plt.contour(x0, x1, zz1, cmap=plt.cm.brg)
    plt.clabel(contour, inline=1, fontsize=12)
    plt.xlabel("Feature 1", fontsize=14)
    plt.ylabel("Feature 2", fontsize=14)
    plt.legend(loc="center left", fontsize=14)
    plt.axis([left_right[0], left_right[1], down_up[0], down_up[1]])
    plt.show()

<h1> Logistic Regression </h1>

We can use Gradient Descent algorithm also for logistic regression. Ridge, Lasso and Elastic Net regularizations also works here, like for other linear models.

In [8]:
def logistic_function(t):
    return 1 / (1 + np.exp(-t))

<h3> Batch Gradient Descent </h3>

In [5]:
class logistic_regression_BGD:
    """
    Implementation of logistic regression using Batch Gradient Descent.
    """
    
    def __init__(self):
        self.X_test_b = None
        self.theta_path = []
        self.theta = None
        
        
    def fit(self, X:np.ndarray, y:np.ndarray, n_iterations:int, theta:np.ndarray=np.array([0,0]), eta:float=0.01,
            eta_reducer:float=1.0, debugger:bool=True, alpha:float=0.1, mix_ratio:float=0.5, regularization:str=None):
        """
        regularization = [None, 'ridge', 'lasso', 'elastic']
        - ridge, lasso, elastic requires to set alpha.
        - elastic requires to set mix_ratio
        - lasso path tends to bounds, when some theta's numbers changes to 0 (then slopes changes abruptly).
        So it's good idea to set eta_reducer to gradually reduce eta in order to converge to the global minimum.
        """
        X_b = np.c_[np.ones((X.shape[0], 1)), X]

        m = X_b.shape[0]
        
        if type(theta) == type(None):
        
            self.theta = np.array([0] * (X.shape[1] + 1))
            self.theta_path = [self.theta]
            
        else:
            
            self.theta = theta
            self.theta_path = [theta]

        for i in range(n_iterations):
            if regularization == None:
                gradient = 1/m * X_b.T.dot(logistic_function(X_b.dot(self.theta)) - y)
            elif regularization == 'ridge':
                gradient = 1/m * X_b.T.dot(logistic_function(X_b.dot(self.theta)) - y) + alpha * np.array([0, *self.theta[1:]])
            elif regularization == 'lasso':
                gradient = 1/m * X_b.T.dot(logistic_function(X_b.dot(self.theta)) - y) + alpha * np.array([0, *np.sign(self.theta[1:])])
            elif regularization == 'elastic':
                gradient = 1/m * X_b.T.dot(logistic_function(X_b.dot(self.theta)) - y) + mix_ratio * alpha * np.array([0, *np.sign(self.theta[1:])]) + (1 - mix_ratio) * alpha * np.array([0, *self.theta[1:]])
            
            self.theta = eta_reducer * (self.theta - eta * gradient)
            self.theta_path.append(self.theta)
            
            if debugger and not np.isfinite(self.theta).all():
                warnings.warn('Infinite value of theta. Further calculations may lead to errors. Path ended.', Warning)
                break

        self.theta_path = np.array(self.theta_path)
    
    
    def predict_proba(self, X_test:np.ndarray) -> np.ndarray:
        self.X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]
        y_pred_proba = np.array(logistic_function(self.X_test_b.dot(self.theta)))
        return np.array([1 - y_pred_proba, y_pred_proba]).T
    
    def predict(self, X_test:np.ndarray) -> np.ndarray:
        self.X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]
        y_pred_proba = np.array(logistic_function(self.X_test_b.dot(self.theta)))
        y_pred = np.zeros([len(y_pred_proba)])
        y_pred[y_pred_proba >= 0.5] = 1
        return y_pred

<h3> Stochastic Gradient Descent </h3>

In [None]:
class logistic_regression_SGD:
    """
    Implementation of logistic regression using Stochastic Gradient Descent.
    """
    
    def __init__(self):
        self.X_test_b = None
        self.theta_path = []
        self.theta = None
        
        
    def fit(self, X:np.ndarray, y:np.ndarray,  n_epochs:int, theta:np.ndarray=np.array([0,0]),
            t0:float=10, t1:float=100, debugger:bool=True, alpha:float=0.1,
            mix_ratio:float=0.5, regularization:str=None, random_state=None):
        """
        regularization = [None, 'ridge', 'lasso', 'elastic']
        - ridge, lasso, elastic requires to set alpha.
        - elastic requires to set mix_ratio
        """
        
        random.seed(random_state)
        
        def learning_rate(t):
            return t0 / (t + t1)

        X_b = np.c_[np.ones((X.shape[0], 1)), X]
        m = X.shape[0]
        
        if type(theta) == type(None):
        
            self.theta = np.array([0] * (X.shape[1] + 1))
            self.theta_path = [self.theta]
            
        else:
            
            self.theta = theta
            self.theta_path = [theta]

        for i in range(n_epochs):
            for j in range(m):
                random_sample = np.random.randint(0, m)
                x_i = X_b[random_sample]
                y_i = y[random_sample]
                
                if regularization == None:
                    gradient = x_i.T.dot(logistic_function(x_i.dot(self.theta)) - y_i)
                elif regularization == 'ridge':
                    gradient = x_i.T.dot(logistic_function(x_i.dot(self.theta)) - y_i) + alpha * np.array([0, *self.theta[1:]])
                elif regularization == 'lasso':
                    gradient = x_i.T.dot(logistic_function(x_i.dot(self.theta)) - y_i) + alpha * np.array([0, *np.sign(self.theta[1:])])
                elif regularization == 'elastic':
                    gradient = x_i.T.dot(logistic_function(x_i.dot(self.theta)) - y_i) + mix_ratio * alpha * np.array([0, *np.sign(self.theta[1:])]) + (1 - mix_ratio) * alpha * np.array([0, *self.theta[1:]])
                
                self.eta = learning_rate(i * m + j)
                self.theta = self.theta - self.eta * gradient
                self.theta_path.append(self.theta)
                
                if debugger and not np.isfinite(self.theta).all():
                    warnings.warn('Infinite value of theta. Further calculations may lead to errors. Path ended.', Warning)
                    break
                    
            else:
                continue
            break
            
        self.theta_path = np.array(self.theta_path)
    
    def predict_proba(self, X_test:np.ndarray) -> np.ndarray:
        self.X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]
        y_pred_proba = np.array(logistic_function(self.X_test_b.dot(self.theta)))
        return np.array([1 - y_pred_proba, y_pred_proba]).T
    
    def predict(self, X_test:np.ndarray) -> np.ndarray:
        self.X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]
        y_pred_proba = np.array(logistic_function(self.X_test_b.dot(self.theta)))
        y_pred = np.zeros([len(y_pred_proba)])
        y_pred[y_pred_proba >= 0.5] = 1
        return y_pred

<h2> Mini-batch Gradient Descent </h2>

In [None]:
class logistic_regression_MbGD:
    """
    Implementation of logistic regression using Mini-batch Gradient Descent.
    """
    
    def __init__(self):
        self.X_test_b = None
        self.theta_path = []
        self.theta = None
        
        
    def fit(self, X:np.ndarray, y:np.ndarray, n_epochs:int, batch_size_ratio:float=0.1, theta:np.ndarray=np.array([0,0]),
            t0:float=10, t1:float=100, debugger:bool=True, alpha:float=0.1, mix_ratio:float=0.5, regularization:str=None, random_state=None):
        """
        regularization = [None, 'ridge', 'lasso', 'elastic']
        - ridge, lasso, elastic requires to set alpha.
        - elastic requires to set mix_ratio
        """
        
        random.seed(random_state)
        
        def learning_rate(t):
            return t0 / (t + t1)

        X_b = np.c_[np.ones((X.shape[0], 1)), X]
        self.X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]
        m = X.shape[0]

        if type(theta) == type(None):
        
            self.theta = np.array([0] * (X.shape[1] + 1))
            self.theta_path = [self.theta]
            
        else:
            
            self.theta = theta
            self.theta_path = [theta]
        
        self.batch_size = int(np.ceil(batch_size_ratio*len(X)))
        
        for i in range(n_epochs):
            for j in range(m):
                random_samples = random.sample(range(0, m), self.batch_size)
                x_i = X_b[random_samples]
                y_i = y[random_samples]
                
                if regularization == None:
                    gradient = 2 / self.batch_size * x_i.T.dot(logistic_function(x_i.dot(self.theta)) - y_i)
                elif regularization == 'ridge':
                    gradient = 2 / self.batch_size * x_i.T.dot(logistic_function(x_i.dot(self.theta)) - y_i) + alpha * np.array([0, *self.theta[1:]])
                elif regularization == 'lasso':
                    gradient = 2 / self.batch_size * x_i.T.dot(logistic_function(x_i.dot(self.theta)) - y_i) + alpha * np.array([0, *np.sign(self.theta[1:])])
                elif regularization == 'elastic':
                    gradient = 2 / self.batch_size * x_i.T.dot(logistic_function(x_i.dot(self.theta)) - y_i) + mix_ratio * alpha * np.array([0, *np.sign(self.theta[1:])]) + (1 - mix_ratio) * alpha * np.array([0, *self.theta[1:]])
                
                self.eta = learning_rate(i * m + j)
                self.theta = self.theta - self.eta * gradient
                self.theta_path.append(self.theta)
                
                if debugger and not np.isfinite(self.theta).all():
                    warnings.warn('Infinite value of theta. Further calculations may lead to errors. Path ended.', Warning)
                    break
                
            else:
                continue
            break
            
        self.theta_path = np.array(self.theta_path)
    
    
    def predict_proba(self, X_test:np.ndarray) -> np.ndarray:
        self.X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]
        y_pred_proba = np.array(logistic_function(self.X_test_b.dot(self.theta)))
        return np.array([1 - y_pred_proba, y_pred_proba]).T
    
    
    def predict(self, X_test:np.ndarray) -> np.ndarray:
        self.X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]
        y_pred_proba = np.array(logistic_function(self.X_test_b.dot(self.theta)))
        y_pred = np.zeros([len(y_pred_proba)])
        y_pred[y_pred_proba >= 0.5] = 1
        return y_pred

<h1> Softmax Regression </h1>

In [12]:
def softmax_score(X:np.ndarray, theta:np.ndarray) -> np.ndarray:
    """
    Calculates softmax score.
    
    X = [[1, x1, x2, ...]
         [1, x1, x2, ...]
         [1, x1, x2, ...]
         [1, x1, x2, ...]
         [     ...      ]]
        
    theta = [[theta1, theta1', theta1'', ...]
             [theta2, theta2', theta2'', ...]
             [theta3, theta3', theta3'', ...]
             [           ...           ]]
    """
    
    return X.dot(theta)

In [13]:
def softmax_function(softmax_scores:np.ndarray) -> np.ndarray:
    """
    Calculate Softmax function. Returns estimated probabilities.
    """

    score = np.zeros_like(softmax_scores, dtype=float)
    
    try: 
        denominator = np.sum(np.exp(softmax_scores), axis=1)

        for i in range(softmax_scores.shape[1]):
            score[:,i] = np.exp(softmax_scores[:,i]) / denominator
    except:
        denominator = np.sum(np.exp(softmax_scores), axis=0)

        for i in range(len(softmax_scores)):
            score[i] = np.exp(softmax_scores[i]) / denominator
            
        score = np.expand_dims(score, axis=0)

    return score

In [14]:
def softmax_regression_classifier_prediction(estimated_probabilities:np.ndarray) -> np.ndarray:
    """
    Calculate softmax regression classifier prediction.
    """
    
    return np.argmax(estimated_probabilities, axis=1)

In [15]:
def cross_entropy_gradient(X:np.ndarray, y:np.ndarray, p:np.ndarray, n_classes:int) -> np.ndarray:
    """
    Cross entropy vector for class k.
    p - estimated probabilities
    """
    
    m = X.shape[0]

    k = n_classes
    y_k = np.zeros([len(y), k])

    for i in range(k):
        ind = np.where(y==i)[0]
        y_k[ind, i] = 1

    if y_k.shape[0] !=1:
        return 1/m * X.T.dot(p - y_k)
    else:
        X = np.expand_dims(X, axis=0)
        return X.T.dot(p - y_k)

<h3> Batch Gradient Descent </h3>

In [16]:
class softmax_regression_BGD:
    """
    Implementation of softmax regression using Batch Gradient Descent.
    """
    
    def __init__(self):
        self.X_test_b = None
        self.theta_path = []
        self.theta = None


    def fit(self, X:np.ndarray, y:np.ndarray, n_iterations:int, n_classes:int, theta:np.ndarray=np.array([0,0]), eta:float=0.01,
            eta_reducer:float=1.0, debugger:bool=True, alpha:float=0.1, mix_ratio:float=0.5, regularization:str=None):
        """
        regularization = [None, 'ridge', 'lasso', 'elastic']
        - ridge, lasso, elastic requires to set alpha.
        - elastic requires to set mix_ratio
        - lasso path tends to bounds, when some theta's numbers changes to 0 (then slopes changes abruptly).
        So it's good idea to set eta_reducer to gradually reduce eta in order to converge to the global minimum.
        """
        X_b = np.c_[np.ones((X.shape[0], 1)), X]

        m = X_b.shape[0]
        
        if type(theta) == type(None):
        
            self.theta = np.array([0] * (X.shape[1] + 1))
            self.theta_path = [self.theta]
            
        else:
            
            self.theta = theta
            self.theta_path = [theta]

        for i in range(n_iterations):
            p = softmax_function(softmax_score(X_b, self.theta))
            
            if regularization == None:
                gradient = cross_entropy_gradient(X_b, y, p, n_classes)
            elif regularization == 'ridge':
                gradient = cross_entropy_gradient(X_b, y, p, n_classes) + alpha * np.array([[0] * self.theta.shape[1], *self.theta[1:]])
            elif regularization == 'lasso':
                gradient = cross_entropy_gradient(X_b, y, p, n_classes) + alpha * np.array([[0] * self.theta.shape[1], *np.sign(self.theta[1:])])
            elif regularization == 'elastic':
                gradient = cross_entropy_gradient(X_b, y, p, n_classes) + mix_ratio * alpha * np.array([[0] * self.theta.shape[1], *np.sign(self.theta[1:])]) + (1 - mix_ratio) * alpha * np.array([[0] * self.theta.shape[1], *self.theta[1:]])
            
            self.theta = eta_reducer * (self.theta - eta * gradient)
            self.theta_path.append(self.theta)
            
            if debugger and not np.isfinite(self.theta).all():
                warnings.warn('Infinite value of theta. Further calculations may lead to errors. Path ended.', Warning)
                break

        self.theta_path = np.array(self.theta_path)
    
    
    def predict_proba(self, X_test:np.ndarray) -> np.ndarray:
        self.X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]
        return softmax_function(softmax_score(self.X_test_b, self.theta))
    
    def predict(self, X_test:np.ndarray) -> np.ndarray:
        self.X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]
        return softmax_regression_classifier_prediction(softmax_function(softmax_score(self.X_test_b, self.theta)))

<h3> Stochastic Gradient Descent </h3>

In [None]:
class softmax_regression_SGD:
    """
    Implementation of softmax regression using Stochastic Gradient Descent.
    """
    
    def __init__(self):
        self.X_test_b = None
        self.theta_path = []
        self.theta = None
        
    def fit(self, X:np.ndarray, y:np.ndarray,  n_epochs:int, n_classes:int, theta:np.ndarray=np.array([0,0]),
            t0:float=10, t1:float=100, debugger:bool=True, alpha:float=0.1, random_state=None,
            mix_ratio:float=0.5, regularization:str=None):
        """
        regularization = [None, 'ridge', 'lasso', 'elastic']
        - ridge, lasso, elastic requires to set alpha.
        - elastic requires to set mix_ratio
        """
        
        random.seed(random_state)
        
        def learning_rate(t):
            return t0 / (t + t1)

        X_b = np.c_[np.ones((X.shape[0], 1)), X]
        m = X.shape[0]
        
        if type(theta) == type(None):
        
            self.theta = np.array([0] * (X.shape[1] + 1))
            self.theta_path = [self.theta]
            
        else:
            
            self.theta = theta
            self.theta_path = [theta]

        for i in range(n_epochs):
            for j in range(m):
                random_sample = np.random.randint(0, m)
                x_i = X_b[random_sample]
                y_i = np.array([y[random_sample]])
                
                p = softmax_function(softmax_score(x_i, self.theta))

                if regularization == None:
                    gradient = cross_entropy_gradient(x_i, y_i, p, n_classes)
                elif regularization == 'ridge':
                    gradient = cross_entropy_gradient(x_i, y_i, p, n_classes) + alpha * np.array([[0] * self.theta.shape[1], *self.theta[1:]])
                elif regularization == 'lasso':
                    gradient = cross_entropy_gradient(x_i, y_i, p, n_classes) + alpha * np.array([[0] * self.theta.shape[1], *np.sign(self.theta[1:])])
                elif regularization == 'elastic':
                    gradient = cross_entropy_gradient(x_i, y_i, p, n_classes) + mix_ratio * alpha * np.array([[0] * self.theta.shape[1], *np.sign(self.theta[1:])]) + (1 - mix_ratio) * alpha * np.array([[0] * self.theta.shape[1], *self.theta[1:]])
                    
                self.eta = learning_rate(i * m + j)
                self.theta = self.theta - self.eta * gradient
                self.theta_path.append(self.theta)
                
                if debugger and not np.isfinite(self.theta).all():
                    warnings.warn('Infinite value of theta. Further calculations may lead to errors. Path ended.', Warning)
                    break
                    
            else:
                continue
            break
            
        self.theta_path = np.array(self.theta_path)
    
    def predict_proba(self, X_test:np.ndarray) -> np.ndarray:
        self.X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]
        return softmax_function(softmax_score(self.X_test_b, self.theta))
    
    def predict(self, X_test:np.ndarray) -> np.ndarray:
        self.X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]
        return softmax_regression_classifier_prediction(softmax_function(softmax_score(self.X_test_b, self.theta)))

<h2> Mini-batch Gradient Descent </h2>

In [8]:
class softmax_regression_MbGD:
    """
    Implementation of logistic regression using Mini-batch Gradient Descent.
    """
    
    def __init__(self):
        self.X_test_b = None
        self.theta_path = []
        self.theta = None
        
        
    def fit(self, X:np.ndarray, y:np.ndarray, n_epochs:int, n_classes:int, batch_size_ratio:float=0.1, theta:np.ndarray=np.array([0,0]),
            t0:float=10, t1:float=100, debugger:bool=True, alpha:float=0.1, mix_ratio:float=0.5, regularization:str=None, random_state=None):
        """
        regularization = [None, 'ridge', 'lasso', 'elastic']
        - ridge, lasso, elastic requires to set alpha.
        - elastic requires to set mix_ratio
        """
        
        random.seed(random_state)
        
        def learning_rate(t):
            return t0 / (t + t1)
        

        X_b = np.c_[np.ones((X.shape[0], 1)), X]
        self.X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]
        m = X.shape[0]

        if type(theta) == type(None):
        
            self.theta = np.array([0] * (X.shape[1] + 1))
            self.theta_path = [self.theta]
            
        else:
            
            self.theta = theta
            self.theta_path = [theta]
        
        self.batch_size = int(np.ceil(batch_size_ratio*len(X)))
        
        for i in range(n_epochs):
            for j in range(m):
                random_samples = random.sample(range(0, m), self.batch_size)
                x_i = X_b[random_samples]
                y_i = y[random_samples]
                
                p = softmax_function(softmax_score(x_i, self.theta))
                
                if regularization == None:
                    gradient = cross_entropy_gradient(x_i, y_i, p, n_classes)
                elif regularization == 'ridge':
                    gradient = cross_entropy_gradient(x_i, y_i, p, n_classes) + alpha * np.array([[0] * self.theta.shape[1], *self.theta[1:]])
                elif regularization == 'lasso':
                    gradient = cross_entropy_gradient(x_i, y_i, p, n_classes) + alpha * np.array([[0] * self.theta.shape[1], *np.sign(self.theta[1:])])
                elif regularization == 'elastic':
                    gradient = cross_entropy_gradient(x_i, y_i, p, n_classes) + mix_ratio * alpha * np.array([[0] * self.theta.shape[1], *np.sign(self.theta[1:])]) + (1 - mix_ratio) * alpha * np.array([[0] * self.theta.shape[1], *self.theta[1:]])
                
                self.eta = learning_rate(i * m + j)
                self.theta = self.theta - self.eta * gradient
                self.theta_path.append(self.theta)
                
                if debugger and not np.isfinite(self.theta).all():
                    warnings.warn('Infinite value of theta. Further calculations may lead to errors. Path ended.', Warning)
                    break
                
            else:
                continue
            break
            
        self.theta_path = np.array(self.theta_path)
    
    
    def predict_proba(self, X_test:np.ndarray) -> np.ndarray:
        self.X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]
        return softmax_function(softmax_score(self.X_test_b, self.theta))
    
    def predict(self, X_test:np.ndarray) -> np.ndarray:
        self.X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]
        return softmax_regression_classifier_prediction(softmax_function(softmax_score(self.X_test_b, self.theta)))

<h1>Decision Trees</h1>

In [3]:
def gini_impurity(y:np.ndarray) -> float:
    """
    Calculates gini impurity for a given y column vector.
    """
    unique, counts = np.unique(y, return_counts=True)

    return 1 - sum((counts/len(y))**2)

# example
# gini_impurity(numpy.array([[0, 3, 0, 1, 0, 1, 2, 1, 0, 0, 0, 0, 1, 3, 4]]).T)

In [4]:
def entropy(y:np.ndarray) -> float:
    
    
    unique, counts = np.unique(y, return_counts=True)
    print(np.argmax(counts))
    ratio = counts/len(y)
    
    return sum(- ratio * np.log2(ratio + 1e-16)) # to avoid log(0)
    
    
# example
# entropy(numpy.array([[1, 3, 1, 1, 1, 1, 2, 1, 1, 0, 0, 0, 1, 3, 4]]).T)

In [90]:
class decision_tree_classification():
    
    def __init__(self):
        self.max_depth = None
        self.features_order = None
        self.thresholds_order = None
        self.predicted_labels = None
        self.node_type = None
    
    def fit(self, X:np.ndarray, y:np.ndarray, max_depth:int=3, thresholds_count:int=10,
            counter=1, features_order=None, thresholds_order=None, predicted_labels=None, node_type=[]) -> [list, list, list, list]:
        
        if counter==1:
            node_type = []
            features_order=[]
            thresholds_order=[]
            predicted_labels=[]
        
        if counter <= max_depth:
                
            unique, counts = np.unique(y, return_counts=True)
            m = X.shape[0]
            
            if len(unique) == 1:
                predicted_label = unique[np.argmax(counts)]
                predicted_labels.append(predicted_label)
                node_type.append('leaf')
                
            else:
                predicted_label = unique[np.argmax(counts)]
                predicted_labels.append(predicted_label)
                
                
                min_ = np.inf
                threshold_found = False
                
                for i in range(X.shape[1]):
                    
                    median = np.median(X[:, i])
                    percentiles = np.linspace(0, 1, thresholds_count+2)[1:-1]
                    thresholds = percentiles * median
                    
                    for thresh in thresholds:

                        left_ind = np.where((X[:, i] < thresh) | (X[:, i] == thresh))
                        right_ind = np.where(X[:, i] > thresh)

                        m_left = len(left_ind[0])
                        m_right = len(right_ind[0])
                        if m_left != 0 and m_right != 0:
                            G_left = gini_impurity(y[left_ind])
                            G_right = gini_impurity(y[right_ind])

                            if m_left / m * G_left + m_right / m *G_right < min_:
                                min_ = m_left / m * G_left + m_right / m *G_right
                                feature = i
                                threshold = thresh
                                threshold_found = True
                if threshold_found:
                            
                    node_type.append('node '+str(m_left)+' '+str(m_right))
                    left_ind = np.where((X[:, feature] < threshold) | (X[:, feature] == threshold)) 
                    right_ind = np.where((X[:, feature] > threshold))

                    X_left = X[left_ind]
                    X_right = X[right_ind]

                    y_left = y[left_ind]
                    y_right = y[right_ind]

                    features_order.append(feature)
                    thresholds_order.append(threshold)

                    self.fit(X_left, y_left, max_depth, thresholds_count=thresholds_count,
                    counter=counter+1, features_order=features_order, thresholds_order=thresholds_order, predicted_labels=predicted_labels, node_type=node_type)

                    self.fit(X_right, y_right, max_depth, thresholds_count=thresholds_count,
                    counter=counter+1, features_order=features_order, thresholds_order=thresholds_order, predicted_labels=predicted_labels, node_type=node_type)
                else:
                    node_type.append('leaf')

                    predicted_label = unique[np.argmax(counts)]
                    predicted_labels.append(predicted_label)
            
        else:
            node_type.append('leaf')
            unique, counts = np.unique(y, return_counts=True)

            predicted_label = unique[np.argmax(counts)]
            predicted_labels.append(predicted_label)

        
        self.max_depth = max_depth
        self.features_order = cycle(features_order)
        self.thresholds_order = cycle(thresholds_order)
        self.predicted_labels = cycle(predicted_labels)
        self.node_type = cycle(node_type)
        
        return features_order, thresholds_order, predicted_labels, node_type
    
    def predict(self, Xy, counter=1, index=[]):
        node_type = next(self.node_type)
        if counter == 1:
            Xy = np.concatenate([Xy, np.zeros([Xy.shape[0], 1])], axis=1)

            index = np.arange(0, Xy.shape[0], 1)
            
        if counter <= self.max_depth:
            if node_type == 'leaf':
                label = next(self.predicted_labels)
                Xy[index,-1] = label
                
            else:
            
            
                feature = next(self.features_order)
                threshold = next(self.thresholds_order)
                label = next(self.predicted_labels)

                Xy[index,-1] = label

                l_ind = np.where((Xy[index, feature] < threshold) | (Xy[index, feature] == threshold))
                r_ind = np.where((Xy[index, feature] > threshold))

                left_index = index[l_ind]
                right_index = index[r_ind]


                self.predict(Xy, counter=counter+1, index=left_index)
                self.predict(Xy, counter=counter+1, index=right_index)
        else:
            label = next(self.predicted_labels)
            Xy[index,-1] = label

        return Xy[:,-1]

    
    def plot(self, count=1, root=0, nodes={}):
        
        def create_nodes(max_depth, count=1, root=0, nodes={}, node_type=None, color='purple'):
            
            
            if count == 1:
                global node_number
                node_number = 0
                node_type = next(self.node_type)
            
            if node_type != 'leaf':
                
                feature = next(self.features_order)
                threshold = next(self.thresholds_order)
                label = next(self.predicted_labels)
                
                nodes[node_number] = Node('feature "'+str(feature)+'" <= '+str(threshold)+'\n'+'class: '+str(label), height="1", width="4", color=color)
                
                if count != 1:
                    nodes[root] >> nodes[node_number]
                    
                root = node_number
                
                node_number += 1
                
                create_nodes(max_depth, count=count+1, root=root, nodes=nodes, node_type=next(self.node_type), color="green")
                
                
                node_number += 1
                
                create_nodes(max_depth, count=count+1, root=root, nodes=nodes, node_type=next(self.node_type), color="red")
                
            else:
                
                label = next(self.predicted_labels)
                
                nodes[node_number] = Node('leaf, class: '+str(label), height="1", width="4", color=color)
                
                nodes[root] >> nodes[node_number]
                
                node_number += 1
                
                
                
        with Diagram('My Tree (green child - condition is True, red child - condition is False)', direction='TB'):
            create_nodes(self.max_depth)



## example

# X, y = make_classification(n_samples=100, n_features=5, n_informative=2, n_redundant=3, n_clusters_per_class=1, n_classes=3, random_state = 42)

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# y_train = np.expand_dims(y_train, axis=1)

# max_depth = 5

# dt = decision_tree_classification()

# features_order, thresholds_order, predicted_labels, node_type = dt.fit(X_train, y_train, max_depth)

# y_pred = dt.predict(X_test)

# accuracy_score(y_test, y_pred, normalize=True)

# dt.plot()

# print(features_order)

# accuracy_score(y_test, y_pred, normalize=True)

In [93]:
class extra_tree_classification():
    
    def __init__(self):
        self.max_depth = None
        self.features_order = None
        self.thresholds_order = None
        self.predicted_labels = None
        self.node_type = None
    
    def fit(self, X:np.ndarray, y:np.ndarray, max_depth:int=3, thresholds_count:int=10,
            counter=1, features_order=None, thresholds_order=None, predicted_labels=None, node_type=[], random_state=None) -> [list, list, list, list]:
        
        random.seed(random_state)
        
        if counter==1:
            node_type = []
            features_order=[]
            thresholds_order=[]
            predicted_labels=[]
        
        if counter <= max_depth:
                
            unique, counts = np.unique(y, return_counts=True)
            m = X.shape[0]
            
            if len(unique) == 1:
                predicted_label = unique[np.argmax(counts)]
                predicted_labels.append(predicted_label)
                node_type.append('leaf')
                
            else:
                predicted_label = unique[np.argmax(counts)]
                predicted_labels.append(predicted_label)
                
                feature = np.random.randint(0, X.shape[1])
                min_ = np.inf
                threshold_found = False
                
                median = np.median(X[:, feature])
                percentiles = np.linspace(0, 1, thresholds_count+2)[1:-1]
                thresholds = percentiles * median
                
                for thresh in thresholds:

                    left_ind = np.where((X[:, feature] < thresh) | (X[:, feature] == thresh))
                    right_ind = np.where(X[:, feature] > thresh)

                    m_left = len(left_ind[0])
                    m_right = len(right_ind[0])
                    if m_left != 0 and m_right != 0:
                        G_left = gini_impurity(y[left_ind])
                        G_right = gini_impurity(y[right_ind])

                        if m_left / m * G_left + m_right / m *G_right < min_:
                            min_ = m_left / m * G_left + m_right / m *G_right
                            threshold = thresh
                            threshold_found = True
                        
                if threshold_found:
                            
                    node_type.append('node '+str(m_left)+' '+str(m_right))
                    left_ind = np.where((X[:, feature] < threshold) | (X[:, feature] == threshold)) 
                    right_ind = np.where((X[:, feature] > threshold))

                    X_left = X[left_ind]
                    X_right = X[right_ind]

                    y_left = y[left_ind]
                    y_right = y[right_ind]

                    features_order.append(feature)
                    thresholds_order.append(threshold)

                    self.fit(X_left, y_left, max_depth,
                    counter=counter+1, features_order=features_order, thresholds_order=thresholds_order, predicted_labels=predicted_labels, node_type=node_type)

                    self.fit(X_right, y_right, max_depth,
                    counter=counter+1, features_order=features_order, thresholds_order=thresholds_order, predicted_labels=predicted_labels, node_type=node_type)
                else:
                    node_type.append('leaf')

                    predicted_label = unique[np.argmax(counts)]
                    predicted_labels.append(predicted_label)
            
        else:
            node_type.append('leaf')
            unique, counts = np.unique(y, return_counts=True)

            predicted_label = unique[np.argmax(counts)]
            predicted_labels.append(predicted_label)

        
        self.max_depth = max_depth
        self.features_order = cycle(features_order)
        self.thresholds_order = cycle(thresholds_order)
        self.predicted_labels = cycle(predicted_labels)
        self.node_type = cycle(node_type)
        
        return features_order, thresholds_order, predicted_labels, node_type
    
    def predict(self, Xy, counter=1, index=[]):
        node_type = next(self.node_type)
        if counter == 1:
            Xy = np.concatenate([Xy, np.zeros([Xy.shape[0], 1])], axis=1)

            index = np.arange(0, Xy.shape[0], 1)
            
        if counter <= self.max_depth:
            if node_type == 'leaf':
                label = next(self.predicted_labels)
                Xy[index,-1] = label
                
            else:
            
            
                feature = next(self.features_order)
                threshold = next(self.thresholds_order)
                label = next(self.predicted_labels)

                Xy[index,-1] = label

                l_ind = np.where((Xy[index, feature] < threshold) | (Xy[index, feature] == threshold))
                r_ind = np.where((Xy[index, feature] > threshold))

                left_index = index[l_ind]
                right_index = index[r_ind]


                self.predict(Xy, counter=counter+1, index=left_index)
                self.predict(Xy, counter=counter+1, index=right_index)
        else:
            label = next(self.predicted_labels)
            Xy[index,-1] = label

        return Xy[:,-1]

    
    def plot(self, count=1, root=0, nodes={}):
        
        def create_nodes(max_depth, count=1, root=0, nodes={}, node_type=None, color='purple'):
            
            
            if count == 1:
                global node_number
                node_number = 0
                node_type = next(self.node_type)
            
            if node_type != 'leaf':
                
                feature = next(self.features_order)
                threshold = next(self.thresholds_order)
                label = next(self.predicted_labels)
                
                nodes[node_number] = Node('feature "'+str(feature)+'" <= '+str(threshold)+'\n'+'class: '+str(label), height="1", width="3", color=color)
                
                if count != 1:
                    nodes[root] >> nodes[node_number]
                    
                root = node_number
                
                node_number += 1
                
                create_nodes(max_depth, count=count+1, root=root, nodes=nodes, node_type=next(self.node_type), color="green")
                
                
                node_number += 1
                
                create_nodes(max_depth, count=count+1, root=root, nodes=nodes, node_type=next(self.node_type), color="red")
                
            else:
                
                label = next(self.predicted_labels)
                
                nodes[node_number] = Node('leaf, class: '+str(label), height="1", width="3", color=color)
                
                nodes[root] >> nodes[node_number]
                
                node_number += 1
                
                
                
        with Diagram('My Tree (green child - condition is True, red child - condition is False)', direction='TB'):
            create_nodes(self.max_depth)



## example

# X, y = make_classification(n_samples=100, n_features=5, n_informative=2, n_redundant=3, n_clusters_per_class=1, n_classes=3, random_state = 42)

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# y_train = np.expand_dims(y_train, axis=1)

# max_depth = 5

# dt = extra_tree_classification()

# features_order, thresholds_order, predicted_labels, node_type = dt.fit(X_train, y_train, max_depth)

# y_pred = dt.predict(X_test)

# accuracy_score(y_test, y_pred, normalize=True)

# dt.plot()

# print(features_order)

# accuracy_score(y_test, y_pred, normalize=True)

In [92]:
class decision_tree_regression():
    
    def __init__(self):
        self.max_depth = None
        self.features_order = None
        self.thresholds_order = None
        self.predicted_labels = None
        
    
    def fit(self, X:np.ndarray, y:np.ndarray, max_depth:int=3, thresholds_count:int=10,
            counter=1, features_order=None, thresholds_order=None, predicted_labels=None, node_type=[]) -> [list, list, list, list]:
        
        if counter==1:
            node_type = []
            features_order=[]
            thresholds_order=[]
            predicted_labels=[]
        
        if counter <= max_depth:
                
            m = X.shape[0]
            
            if m == 1:
                node_type.append('leaf')
                predicted_label = np.average(y)
                predicted_labels.append(predicted_label)
                
            else:
                predicted_label = np.average(y)
                predicted_labels.append(predicted_label)
                
                
                min_ = np.inf

                threshold_found = False
                
                for i in range(X.shape[1]):
                    
                    median = np.median(X[:, i])
                    percentiles = np.linspace(0, 1, thresholds_count+2)[1:-1]
                    thresholds = percentiles * median
                    
                    for thresh in thresholds:

                        left_ind = np.where((X[:, i] < thresh) | (X[:, i] == thresh))
                        right_ind = np.where(X[:, i] > thresh)

                        m_left = len(left_ind[0])
                        m_right = len(right_ind[0])
                        
                        if m_left != 0 and m_right != 0:
                            average_left = np.sum(y[left_ind])/m_left
                            average_right = np.sum(y[right_ind])/m_right

                            MSE_left = np.sum((average_left - y[left_ind])**2)
                            MSE_right = np.sum((average_right - y[right_ind])**2)

                            if m_left / m * MSE_left + m_right / m *MSE_right < min_:
                                min_ = m_left / m * MSE_left + m_right / m *MSE_right
                                feature = i
                                threshold = thresh
                                threshold_found = True
                if threshold_found:
                            

                    node_type.append('node '+str(m_left)+' '+str(m_right))
                    left_ind = np.where((X[:, feature] < threshold) | (X[:, feature] == threshold)) 
                    right_ind = np.where((X[:, feature] > threshold))

                    X_left = X[left_ind]
                    X_right = X[right_ind]

                    y_left = y[left_ind]
                    y_right = y[right_ind]

                    features_order.append(feature)
                    thresholds_order.append(threshold)

                    self.fit(X_left, y_left, max_depth, thresholds_count=thresholds_count,
                    counter=counter+1, features_order=features_order, thresholds_order=thresholds_order, predicted_labels=predicted_labels, node_type=node_type)

                    self.fit(X_right, y_right, max_depth, thresholds_count=thresholds_count,
                    counter=counter+1, features_order=features_order, thresholds_order=thresholds_order, predicted_labels=predicted_labels, node_type=node_type)
                else:
                    node_type.append('leaf')
                    predicted_label = np.average(y)
                    predicted_labels.append(predicted_label)
                
            
        else:
            node_type.append('leaf')
            predicted_label = np.average(y)
            predicted_labels.append(predicted_label)

        
        self.max_depth = max_depth
        self.features_order = cycle(features_order)
        self.thresholds_order = cycle(thresholds_order)
        self.predicted_labels = cycle(predicted_labels)
        self.node_type = cycle(node_type)
        
        return features_order, thresholds_order, predicted_labels, node_type
    
    def predict(self, Xy, counter=1, index=[]):
        node_type = next(self.node_type)
        if counter == 1:
            Xy = np.concatenate([Xy, np.zeros([Xy.shape[0], 1])], axis=1)

            index = np.arange(0, Xy.shape[0], 1)
            
        if counter <= self.max_depth:
            if node_type == 'leaf':
                label = next(self.predicted_labels)
                Xy[index,-1] = label
                
            else:
            
            
                feature = next(self.features_order)
                threshold = next(self.thresholds_order)
                label = next(self.predicted_labels)

                Xy[index,-1] = label

                l_ind = np.where((Xy[index, feature] < threshold) | (Xy[index, feature] == threshold))
                r_ind = np.where((Xy[index, feature] > threshold))

                left_index = index[l_ind]
                right_index = index[r_ind]


                self.predict(Xy, counter=counter+1, index=left_index)
                self.predict(Xy, counter=counter+1, index=right_index)
        else:
            label = next(self.predicted_labels)
            Xy[index,-1] = label

        return Xy[:,-1]
    
    
    def plot(self, count=1, root=0, nodes={}):
        
        def create_nodes(max_depth, count=1, root=0, nodes={}, node_type=None, color='purple'):
            
            
            if count == 1:
                global node_number
                node_number = 0
                node_type = next(self.node_type)
            
            if node_type != 'leaf':
                
                feature = next(self.features_order)
                threshold = next(self.thresholds_order)
                label = next(self.predicted_labels)
                
                nodes[node_number] = Node('feature "'+str(feature)+'" <= '+str(threshold)+'\n'+'class: '+str(label), height="1", width="3", color=color)
                
                if count != 1:
                    nodes[root] >> nodes[node_number]
                    
                root = node_number
                
                node_number += 1
                
                create_nodes(max_depth, count=count+1, root=root, nodes=nodes, node_type=next(self.node_type), color="green")
                
                
                node_number += 1
                
                create_nodes(max_depth, count=count+1, root=root, nodes=nodes, node_type=next(self.node_type), color="red")
                
            else:
                
                label = next(self.predicted_labels)
                
                nodes[node_number] = Node('leaf, class: '+str(label), height="1", width="3", color=color)
                
                nodes[root] >> nodes[node_number]
                
                node_number += 1
                
                
                
        with Diagram('My Tree (green child - condition is True, red child - condition is False)', direction='TB'):
            create_nodes(self.max_depth)

## example

# X, y = make_regression(n_samples=100, n_features=10, n_informative=10)

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# y_train = np.expand_dims(y_train, axis=1)

# max_depth = 5

# dt = decision_tree_regression()

# features_order, thresholds_order, predicted_labels, node_type = dt.fit(X_train, y_train, max_depth)

# y_pred = dt.predict(X_test)


# dt.plot()

# mean_squared_error(y_test, y_pred)

In [91]:
class extra_tree_regression():
    
    def __init__(self):
        self.max_depth = None
        self.features_order = None
        self.thresholds_order = None
        self.predicted_labels = None
        
    
    def fit(self, X:np.ndarray, y:np.ndarray, max_depth:int=3, thresholds_count:int=10, random_state=None,
            counter=1, features_order=None, thresholds_order=None, predicted_labels=None, node_type=[]) -> [list, list, list, list]:
        
        random.seed(random_state)
        
        if counter==1:
            node_type = []
            features_order=[]
            thresholds_order=[]
            predicted_labels=[]
        
        if counter <= max_depth:
                
            m = X.shape[0]
            
            if m == 1:
                node_type.append('leaf')
                predicted_label = np.average(y)
                predicted_labels.append(predicted_label)
                
            else:
                predicted_label = np.average(y)
                predicted_labels.append(predicted_label)
                
                
                min_ = np.inf

                feature = np.random.randint(0, X.shape[1])
                
                threshold_found = False

                median = np.median(X[:, feature])
                percentiles = np.linspace(0, 1, thresholds_count+2)[1:-1]
                thresholds = percentiles * median
                
                for thresh in thresholds:

                    left_ind = np.where((X[:, feature] < thresh) | (X[:, feature] == thresh))
                    right_ind = np.where(X[:, feature] > thresh)

                    m_left = len(left_ind[0])
                    m_right = len(right_ind[0])

                    if m_left != 0 and m_right != 0:
                        average_left = np.sum(y[left_ind])/m_left
                        average_right = np.sum(y[right_ind])/m_right

                        MSE_left = np.sum((average_left - y[left_ind])**2)
                        MSE_right = np.sum((average_right - y[right_ind])**2)

                        if m_left / m * MSE_left + m_right / m *MSE_right < min_:
                            min_ = m_left / m * MSE_left + m_right / m *MSE_right
                            threshold = thresh
                            threshold_found = True
                if threshold_found:
                            

                    node_type.append('node '+str(m_left)+' '+str(m_right))
                    left_ind = np.where((X[:, feature] < threshold) | (X[:, feature] == threshold)) 
                    right_ind = np.where((X[:, feature] > threshold))

                    X_left = X[left_ind]
                    X_right = X[right_ind]

                    y_left = y[left_ind]
                    y_right = y[right_ind]

                    features_order.append(feature)
                    thresholds_order.append(threshold)

                    self.fit(X_left, y_left, max_depth, thresholds_count=thresholds_count,
                    counter=counter+1, features_order=features_order, thresholds_order=thresholds_order, predicted_labels=predicted_labels, node_type=node_type)

                    self.fit(X_right, y_right, max_depth, thresholds_count=thresholds_count,
                    counter=counter+1, features_order=features_order, thresholds_order=thresholds_order, predicted_labels=predicted_labels, node_type=node_type)
                else:
                    node_type.append('leaf')
                    predicted_label = np.average(y)
                    predicted_labels.append(predicted_label)
                
            
        else:
            node_type.append('leaf')
            predicted_label = np.average(y)
            predicted_labels.append(predicted_label)

        
        self.max_depth = max_depth
        self.features_order = cycle(features_order)
        self.thresholds_order = cycle(thresholds_order)
        self.predicted_labels = cycle(predicted_labels)
        self.node_type = cycle(node_type)
        
        return features_order, thresholds_order, predicted_labels, node_type
    
    def predict(self, Xy, counter=1, index=[]):
        node_type = next(self.node_type)
        if counter == 1:
            Xy = np.concatenate([Xy, np.zeros([Xy.shape[0], 1])], axis=1)

            index = np.arange(0, Xy.shape[0], 1)
            
        if counter <= self.max_depth:
            if node_type == 'leaf':
                label = next(self.predicted_labels)
                Xy[index,-1] = label
                
            else:
            
            
                feature = next(self.features_order)
                threshold = next(self.thresholds_order)
                label = next(self.predicted_labels)

                Xy[index,-1] = label

                l_ind = np.where((Xy[index, feature] < threshold) | (Xy[index, feature] == threshold))
                r_ind = np.where((Xy[index, feature] > threshold))

                left_index = index[l_ind]
                right_index = index[r_ind]


                self.predict(Xy, counter=counter+1, index=left_index)
                self.predict(Xy, counter=counter+1, index=right_index)
        else:
            label = next(self.predicted_labels)
            Xy[index,-1] = label

        return Xy[:,-1]
    
    
    def plot(self, count=1, root=0, nodes={}):
        
        def create_nodes(max_depth, count=1, root=0, nodes={}, node_type=None, color='purple'):
            
            
            if count == 1:
                global node_number
                node_number = 0
                node_type = next(self.node_type)
            
            if node_type != 'leaf':
                
                feature = next(self.features_order)
                threshold = next(self.thresholds_order)
                label = next(self.predicted_labels)
                
                nodes[node_number] = Node('feature "'+str(feature)+'" <= '+str(threshold)+'\n'+'class: '+str(label), height="1", width="3", color=color)
                
                if count != 1:
                    nodes[root] >> nodes[node_number]
                    
                root = node_number
                
                node_number += 1
                
                create_nodes(max_depth, count=count+1, root=root, nodes=nodes, node_type=next(self.node_type), color="green")
                
                
                node_number += 1
                
                create_nodes(max_depth, count=count+1, root=root, nodes=nodes, node_type=next(self.node_type), color="red")
                
            else:
                
                label = next(self.predicted_labels)
                
                nodes[node_number] = Node('leaf, class: '+str(label), height="1", width="3", color=color)
                
                nodes[root] >> nodes[node_number]
                
                node_number += 1
                
                
                
        with Diagram('My Tree (green child - condition is True, red child - condition is False)', direction='TB'):
            create_nodes(self.max_depth)

## example

# X, y = make_regression(n_samples=100, n_features=10, n_informative=10)

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# y_train = np.expand_dims(y_train, axis=1)

# max_depth = 5

# dt = extra_tree_regression()

# features_order, thresholds_order, predicted_labels, node_type = dt.fit(X_train, y_train, max_depth)

# y_pred = dt.predict(X_test)


# dt.plot()

# mean_squared_error(y_test, y_pred)

<h1> Neural Networks </h1>

<h2> Perceptron </h2>

<h4> Step functions </h4>

In [5]:
def heavside(z:float) -> int:
    """
    Most common step function used in Perceptrons
    """
    return 0 if z < 0 else 1

In [6]:
def sgn(z:float) -> int:
    """
    Sometimes used in Perceptrons.
    In general it should be -1, 0, 1.
    """
    if z < 0:
        return 0
    elif z == 0:
        return None
    else:
        return 1

<h4> Computing the outputs of a fully connected layer <h4>

In [7]:
def single_fully_output(X, W, activation_function):

    z = X.dot(W)
#     print(X.ndim)
#     print(z)
    return np.vectorize(activation_function)(z)

## example

# X = np.array([[1,2]])
# W = np.array([[0.1, 0.34, 0.765],[0.2, 0.45, -0.232]])
# b = np.array([[1,0.2,0.34]])

# single_fully_output(X, W, b, heavside)

<h4> Perceptron learning rule <h4/>

In [8]:
def perceptron_learning_rule(X, y, W, eta, activation_function=heavside, n_iterations=1000):
    
    
    if X.ndim == 1:
        X = np.expand_dims(X, axis=0)
        
    for i in range(n_iterations):
        y_pred = single_fully_output(X, W, activation_function)
        

        W = W + eta * X.T.dot((y - y_pred)) 
            
    return W


## example

# X = np.array([[1, 1,2]])
# y = np.array([[1,1,0]])
# W = np.array([[1,0.2,0.34], [0.1, 0.34, 0.765],[0.2, 0.45, -0.232]])


# print(perceptron_learning_rule(X, y, W, eta=0.1))
# print(W)

In [9]:
def perceptron(X, y, n_classses, step_function=heavside):
    X = np.c_[np.ones(X.shape[0]),X]


    onehot_encoder = OneHotEncoder(sparse=False)
    integer_encoded = y.reshape(len(y), 1)
    y = onehot_encoder.fit_transform(integer_encoded)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    
    lenW = X.shape[1] * n_classses
    
    W = np.ones(lenW)
    
    W = np.reshape(W, [X.shape[1], n_classses])
    
    for i in range(X_train.shape[0]):
        W = perceptron_learning_rule(X_train[i], y_train[i], W, eta=0.001, n_iterations=200, activation_function=step_function)
        
    y_pred = single_fully_output(X_test, W, step_function)
    
    return accuracy_score(y_test, y_pred)

# # example 1

# X, y = make_classification(n_samples=100, n_features=2, n_informative=1, n_redundant=1, n_clusters_per_class=1, n_classes=2, random_state = 42)

# perceptron(X, y, 2, heavside)

# # example 2

# X, y = make_classification(n_samples=100, n_features=2, n_informative=2, n_redundant=0, n_clusters_per_class=1, n_classes=3, random_state = 42)

# perceptron(X, y, 3, heavside)

<h2> Multi-Layer Perceptron  TBC.</h2>

<h3> Activation functions </h3>

In [10]:
def sigmoid_function(z:float) -> float:
    """
    (Logistic activation function)
    WAS commonly used.
    - S-shaped
    - output from 0 to 1
    """
    return 1 / ( 1 + np.exp(-z))


def hyperbolic_tangent_function(z:float) -> float:
    """
    - similar to logistic function

    - S-shaped

    - continous, differentiable

    - output from -1 to 1 -> output +- centered at 0 at the beggininng of trianing -> speeds up convergence
    """
    return (np.exp(2*z) - 1) / ( np.exp(2*z) + 1 )


def rectified_linear_unit_function(z:float) -> float:
    """
    - commonly used especially for very deep networks ( is much easier to train than sigmoid )
    
    - continous, unforunately not differentialbe at 0

    - derivative is 0 for z < 0

    - fast to compute and works well

    - reduces some issues with Gradient Descent
    
    - Motivation: Partially from biology neutrons - they are inactive (0) or active (z)
    
    """
    return max(0.0, z)


# example

# meshgrid = np.arange(-3, 3, 0.5)

# plt.plot(meshgrid, np.vectorize(sigmoid_function)(meshgrid), 'r', label='sigmoid_function')
# plt.plot(meshgrid, np.vectorize(hyperbolic_tangent_function)(meshgrid), 'g', label='hyperbolic_tangent_function')
# plt.plot(meshgrid, np.vectorize(rectified_linear_unit_function)(meshgrid), 'b--o', label='rectified_linear_unit_function')
# plt.legend()
# plt.show()

<h3> Forward propagation </h3>

In [11]:
def log_loss(y_true:np.ndarray, y_pred:np.ndarray) -> float:
    """
    Log loss cost function.
    """
    m = y_true.shape[0]
    return - 1 / m * np.sum(
        y_true * np.log(y_pred + 1e-16) + (
            1 - y_true) * np.log(1 - y_pred + 1e-16))

In [331]:
def predict_binary_classification(X:np.ndarray, N:list=[], W=None) -> [np.ndarray, np.ndarray, np.ndarray, np.ndarray, list]:
    """
    N = [number_of_neutrons_in_1st_hidden_layer, number_of_neutrons_in_2st_hidden_layer...]
    """

    X = np.c_[np.ones(X.shape[0]), X]
    
    if W == None:
        W = np.zeros([len(N)+1], object)
        W[0] = np.random.rand(X.shape[1],N[0])

        z = X.dot(W[0])

        a = np.zeros([len(N)+1], object)

        a[0] = np.vectorize(sigmoid_function)(z)
        a[0] = np.c_[np.ones(a[0].shape[0]), a[0]]

        i = 0 # if only 1 hidden layer
        for i in range(1, len(N)):

            W[i] = np.random.rand(a[i-1].shape[1], N[i])
            z = a[i-1].dot(W[i])

            a[i] = np.vectorize(sigmoid_function)(z)
            a[i] = np.c_[np.ones(a[i].shape[0]), a[i]]

        W[i+1] = np.random.rand(a[i].shape[1], 1)
        z = a[i].dot(W[i+1])
        a[i+1] = np.vectorize(sigmoid_function)(z)
    else:
        z = X.dot(W[0])

        a = np.zeros([len(N)+1], object)

        a[0] = np.vectorize(sigmoid_function)(z)
        a[0] = np.c_[np.ones(a[0].shape[0]), a[0]]

        i = 0 # if only 1 hidden layer
        for i in range(1, len(N)):

            z = a[i-1].dot(W[i])

            a[i] = np.vectorize(sigmoid_function)(z)
            a[i] = np.c_[np.ones(a[i].shape[0]), a[i]]

        z = a[i].dot(W[i+1])
        a[i+1] = np.vectorize(sigmoid_function)(z)
        
        
        
    y_pred = a[i+1]
    
    return y_pred, a, W, N


# example

# X, y = make_classification(n_samples=100, n_features=2, n_informative=1, n_redundant=1, n_clusters_per_class=1, n_classes=2, random_state=42)

# y_pred, a, W, N = predict_binary_classification(X, N=[10])

# log_loss(y, y_pred)

184.76835396283133

<h3> Backpropagation </h3>

In [20]:
def backpropagation_binary_classification(X, y, a, W, N):
    
    # na poczatku dla layerów: input | hidden | output
    
    # dE = ..........
    
    X = np.c_[np.ones(X.shape[0]), X]
    
    dE = np.zeros([*W.shape], object)
    
    for i in range(len(dE)):
        dE[i] = np.zeros([*W[i].shape, X.shape[0]], object)
    
    # loop for hidden layer
    for inst in range(a[0].shape[0]):
        for h in range(N[0]+1):
            dE[1][h][0][inst] = (a[1][inst] - y[inst]) *  a[0][inst][h] 
        
    for inst in range(a[0].shape[0]):
        for h in range(N[0]):
            for i in range(X.shape[1]):
                dE[0][i][h][inst] = dE[1][h][0][inst] * a[0][inst][h] * (1 - a[0][inst][h]) * X[inst][i]
    
    return dE
    
    
# example

# X, y = make_classification(n_samples=100, n_features=2, n_informative=1, n_redundant=1, n_clusters_per_class=1, n_classes=2, random_state=42)

# y_pred, a, W, N = predict_binary_classification(X, N=[10])

# log_loss(y, y_pred)

# backpropagation_binary_classification(X, y, a, W, N)

<h3> Training </h3>

In [486]:
def train_MLP(X, W , dE, eta):
    """
    MLP training
    """
    
    X = np.c_[np.ones(X.shape[0]), X]
    
    for h in range(N[0]):
        for i in range(X.shape[1]):
            W[0][i][h] = W[0][i][h] - eta * np.average(dE[0][i][h])

        W[1][h][0] = W[1][h][0] - eta * np.average(dE[1][h][0])

    return W


# example

X, y = make_classification(n_samples=10000, n_features=2, n_informative=1, n_redundant=1, n_clusters_per_class=1, n_classes=2, random_state=42)

n_epochs = 100
X_ = np.split(X, n_epochs)
y_ = np.split(y, n_epochs)

y_pred, a, W, N = predict_binary_classification(X, N=[10])

print('pierwszy: ', log_loss(y, y_pred))

y_pred[y_pred > 0.5] = 1
y_pred[y_pred <= 0.5] = 0
print('accuracy: ', accuracy_score(y, y_pred))

for i in range(n_epochs):
    if i == 0:
        y_pred, a, W, N = predict_binary_classification(X_[i], N=[10])
    else:
        y_pred, a, W, N = predict_binary_classification(X_[i], N=[10], W=W)

    error1 = log_loss(y_[i], y_pred)
    print('error1: ', error1)

    dE = backpropagation_binary_classification(X_[i], y_[i], a, W, N)

    j=0
    error2 = np.inf
    while error2 > error1:
        j += 1
    
        W = train_MLP(X_[i], W , dE, eta=10**(-j))

        y_pred, a, W, N = predict_binary_classification(X_[i], N=[10], W=W)
        
        if j > 10:
            break
        error2 = log_loss(y_[i], y_pred)
        print(error2)
    print('error2: ', error2)

y_pred, a, W, N = predict_binary_classification(X, N=[10], W=W)
print('ostatni: ', log_loss(y, y_pred))

y_pred[y_pred > 0.5] = 1
y_pred[y_pred <= 0.5] = 0
print('accuracy: ', accuracy_score(y, y_pred))

pierwszy:  15802.863653737542
accuracy:  0.5005
error1:  145.3896705806304
136.85191513630883
error2:  136.85191513630883
error1:  139.65978095659278
131.18567205842504
error2:  131.18567205842504
error1:  128.84726761509177


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


121.32052792745614
error2:  121.32052792745614
error1:  141.69871084416158
131.13470977439968
error2:  131.13470977439968
error1:  117.12817682054643
110.24601457643745
error2:  110.24601457643745
error1:  123.15816268117564
114.45540344521123
error2:  114.45540344521123
error1:  98.92461448053076
94.2258732636021
error2:  94.2258732636021
error1:  100.67945130043016
95.45641147910601
error2:  95.45641147910601
error1:  95.5273663840502
91.01392475962642
error2:  91.01392475962642
error1:  77.07538855140695
75.52315153361296
error2:  75.52315153361296
error1:  80.04518880489209
78.11285212887839
error2:  78.11285212887839
error1:  84.92682040159909
82.16486126724052
error2:  82.16486126724052
error1:  81.78777111924927


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


79.53488745163584
error2:  79.53488745163584
error1:  85.74181995785649
82.46740182610742
error2:  82.46740182610742
error1:  76.53664655685323
75.26345588166204
error2:  75.26345588166204
error1:  76.13040798887425
74.95262609627429
error2:  74.95262609627429
error1:  75.4155848150935
74.3674399023878
error2:  74.3674399023878
error1:  75.73300068517955
74.61409551010846
error2:  74.61409551010846
error1:  69.41913597022862
69.33207223201471
error2:  69.33207223201471
error1:  73.68523086369488
72.9463743244381
error2:  72.9463743244381
error1:  74.07066811094701
73.19609017274783
error2:  73.19609017274783
error1:  69.91999590050476
69.85399333203091
error2:  69.85399333203091


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


error1:  68.33874758587868
68.36993361506666
68.37314989106947
68.37347249432592
68.37350476440703
68.37350799151271
68.37350831422424
68.37350834649541
68.37350834972253
68.37350835004523
68.37350835007751
sada
error2:  68.37350835007751
error1:  69.9756612389805
69.9223046338536
error2:  69.9223046338536
error1:  70.29897823285378
70.2202016184099
error2:  70.2202016184099
error1:  74.49182032306554
73.14527024667086
error2:  73.14527024667086
error1:  71.99164678038149
71.55666675453125
error2:  71.55666675453125
error1:  69.42927154895105
69.46358131942405
69.46709069861342
69.46744242001053
69.46747759998448
69.4674811180602
69.46748146986856
69.4674815050494
69.46748150856749
69.4674815089193
69.46748150895449
sada
error2:  69.46748150895449
error1:  69.2015174053805


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


69.24041323528625
69.24447000664617
69.24487735309746
69.24491810443315
69.24492217973362
69.24492258726534
69.24492262801853
69.24492263209385
69.24492263250137
69.24492263254213
sada
error2:  69.24492263254213
error1:  71.72304735120154
71.36707554655915
error2:  71.36707554655915
error1:  70.33487566932716
70.30999051330731
error2:  70.30999051330731
error1:  69.61625392397515
69.65396675176454
69.65783235239543
69.65821985498675
69.6582586146705
69.65826249073314
69.65826287834034
69.65826291710106
69.65826292097714
69.65826292136474
69.65826292140352
sada
error2:  69.65826292140352
error1:  71.56591684512144
71.22371727378362
error2:  71.22371727378362
error1:  70.4340070907097
70.37995241334632
error2:  70.37995241334632
error1:  70.8221567841306
70.71724033950134
error2:  70.71724033950134
error1:  69.52227985924094


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


69.58379569839578
69.59052020376878
69.59119837458431
69.59126624885945
69.59127303685891
69.59127371566457
69.59127378354519
69.59127379033325
69.59127379101207
69.59127379107993
sada
error2:  69.59127379107993
error1:  71.11335060598459
70.92113222153169
error2:  70.92113222153169
error1:  70.35698126233387
70.35379342142993
error2:  70.35379342142993
error1:  71.44067778396605
70.86776227102116
error2:  70.86776227102116
error1:  70.13994850048576
70.16416360283233
70.1666160697197
70.16686162753743
70.16688618643205
70.16688864235265
70.16688888794502
70.16688891250426
70.16688891496018
70.16688891520577
70.16688891523032
sada
error2:  70.16688891523032
error1:  70.65380596419547
70.46189546447336
error2:  70.46189546447336
error1:  70.47121113695434
70.4786802632947
70.47946381749237
70.47954254080013


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


70.47955041681128
70.4795512044492
70.47955128321335
70.47955129108976
70.47955129187741
70.47955129195618
70.47955129196406
sada
error2:  70.47955129196406
error1:  69.30969156671797
69.24107026562102
error2:  69.24107026562102
error1:  70.15665646218152
70.21939769800525
70.22582836419102
70.22647299564808
70.22653747444157
70.22654392247739
70.22654456728255
70.22654463176308
70.22654463821112
70.22654463885593
70.22654463892043
sada
error2:  70.22654463892043
error1:  70.17591488894571
70.2261223039495
70.23123371757237
70.23174576558074
70.23179697944796
70.23180210092536
70.23180261307401
70.23180266428888
70.23180266941037
70.23180266992252
70.23180266997372
sada
error2:  70.23180266997372
error1:  69.6691361247833


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


69.71186771058302
69.71742647431982
69.71799517302203
69.71805217108094
69.71805787216869
69.71805844229027
69.71805849930257
69.7180585050038
69.71805850557392
69.71805850563094
sada
error2:  69.71805850563094
error1:  70.56654811732359
70.62489045565798
70.63080403963929
70.63139619230063
70.63145541551013
70.63146133791054
70.63146193015136
70.63146198937545
70.63146199529787
70.63146199589012
70.63146199594934
sada
error2:  70.63146199594934
error1:  69.78684577945047
69.86222981080842
69.87045445124029
69.87128376594093
69.87136676590559
69.87137506658699
69.87137589666197
69.87137597966954
69.8713759879703
69.87137598880038
69.87137598888339
sada
error2:  69.87137598888339
error1:  71.40786917966429


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


71.26093945245934
error2:  71.26093945245934
error1:  70.15510157506155
70.2436275507722
70.25295818593946
70.25389602109074
70.25398985231334
70.25399923591264
70.25400017427735
70.25400026811388
70.25400027749754
70.2540002784359
70.25400027852973
sada
error2:  70.25400027852973
error1:  70.32408536455276
70.42537310814976
70.4359145900926
70.43697285915809
70.43707872726702
70.43708931448995
70.43709037321636
70.43709047908904
70.43709048967631
70.43709049073503
70.43709049084092
sada
error2:  70.43709049084092
error1:  71.80481875095393
71.6552156542963
error2:  71.6552156542963
error1:  68.9984400368826
69.00211519206233
69.00428818405601
69.00452348886952
69.00454719935638
69.00454957220506
69.00454980950794
69.0045498332384
69.00454983561144


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


69.00454983584876
69.00454983587248
sada
error2:  69.00454983587248
error1:  70.04442131826926
70.15294735788659
70.16437392177703
70.16552230710597
70.16563720291741
70.16564869307133
70.16564984209245
70.16564995699463
70.16564996848483
70.16564996963385
70.16564996974876
sada
error2:  70.16564996974876
error1:  71.95543441155195
71.79941791441132
error2:  71.79941791441132
error1:  71.24447285161439
71.26949199181206
71.27203500173819
71.27228971571725
71.27231519124703
71.27231773884134
71.27231799360118
71.27231801907716
71.27231802162476
71.27231802187951
71.272318021905
sada
error2:  71.272318021905
error1:  71.82081363810867


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


71.7732232213478
error2:  71.7732232213478
error1:  70.79312024093691
70.89279956246467
70.90301387937252
70.90403777204159
70.90414018591538
70.90415042754883
70.90415145171464
70.90415155413125
70.9041515643729
70.90415156539706
70.90415156549949
sada
error2:  70.90415156549949
error1:  71.67013607996547
71.63673067218579
error2:  71.63673067218579
error1:  71.26881688744194
71.33940875895614
71.34654613307829
71.34726065372861
71.34733211362739
71.34733925969559
71.3473399743032
71.34734004576397
71.34734005291004
71.34734005362466
71.34734005369612
sada
error2:  71.34734005369612
error1:  69.40914230791543
69.44395250510023
69.4489751652455
69.44949279489047
69.44954471143745
69.44954990462791
69.44955042396232
69.44955047589592
69.44955048108928
69.44955048160861
69.44955048166055
sada
error2:  69.44955048166055
error1:  71.07076784802982


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


71.17038681058955
71.18052504057917
71.18154062601447
71.18164220218144
71.18165235997436
71.18165337575543
71.18165347733355
71.18165348749136
71.18165348850715
71.18165348860873
sada
error2:  71.18165348860873
error1:  71.32627751539985
71.3988945065118
71.40623305998174
71.40696768484946
71.40704115503242
71.40704850212768
71.40704923683798
71.40704931030902
71.40704931765612
71.40704931839082
71.40704931846429
sada
error2:  71.40704931846429
error1:  73.60076664188504
73.2398721684834
error2:  73.2398721684834
error1:  69.71899156909195
69.810877920181
69.8214004957472
69.82246604220305
69.82257272968631
69.82258339976298
69.82258446678394
69.82258457348615
69.82258458415637
69.82258458522341
69.82258458533009
sada
error2:  69.82258458533009
error1:  70.27411511921788
70.40369945517885
70.4175354279732
70.41892776903931
70.4190670905518
70.41908102357709
70.41908241688836
70.41908255621958
70.41908257015268


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


70.41908257154601
70.41908257168534
sada
error2:  70.41908257168534
error1:  72.9820535824166
72.89349959725354
error2:  72.89349959725354
error1:  70.83513982849767
70.98774561978774
71.00359061104439
71.00518093783685
71.00534002877613
71.00535593845264
71.00535752942613
71.00535768852353
71.00535770443327
71.00535770602424
71.00535770618335
sada
error2:  71.00535770618335
error1:  73.30088364799609
73.21066872832856
error2:  73.21066872832856
error1:  72.34277741418394
72.33839398490903
error2:  72.33839398490903
error1:  71.94894359799638
72.0040084433141
72.00956334012014
72.01011931561062
72.01017491801946
72.01018047830895
72.01018103433839
72.01018108994134
72.01018109550164
72.01018109605766
72.01018109611326
sada
error2:  72.01018109611326
error1:  72.3577585303249
72.42731766535881


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


72.43433384391085
72.43503606611762
72.4351062943835
72.43511331727053
72.43511401955985
72.43511408978878
72.43511409681169
72.43511409751397
72.43511409758419
sada
error2:  72.43511409758419
error1:  68.80140029800938
68.8635955791373
68.87166038522487
68.87248523692723
68.87256790572538
68.87257617444139
68.87257700133134
68.87257708402052
68.87257709228945
68.87257709311632
68.87257709319901
sada
error2:  68.87257709319901
error1:  72.97631228583006
73.02947509032795
73.03485172019859
73.0353899888823
73.03544382180984
73.03544920516319
73.03544974349911
73.03544979733272
73.03544980271607
73.03544980325442
73.03544980330825
sada
error2:  73.03544980330825
error1:  73.80171809649852
73.55887094636043
error2:  73.55887094636043
error1:  69.81858631468272
69.90968527516604
69.92042574531453
69.92151601861882


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


69.92162520813169
69.9216361287047
69.92163722077824
69.92163732998574
69.9216373409065
69.92163734199858
69.92163734210779
sada
error2:  69.92163734210779
error1:  73.23520509043462
73.31072993763246
73.31835629443685
73.31911967134508
73.31919601645068
73.3192036510354
73.31920441449459
73.31920449084053
73.31920449847513
73.31920449923858
73.31920449931494
sada
error2:  73.31920449931494
error1:  72.34773449452732
72.46709849284906
72.47916809522282
72.48037638672012
72.48049722918168
72.48050931356094
72.4805105220002
72.48051064284415
72.48051065492854
72.48051065613699
72.48051065625783
sada
error2:  72.48051065625783
error1:  73.37289787622105
73.3335275887176
error2:  73.3335275887176
error1:  74.03784978897895
73.99176016126945
error2:  73.99176016126945
error1:  74.5958892153954
74.32564814291449
error2:  74.32564814291449
error1:  73.7483676535582


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


73.80527024029465
73.81102915731032
73.81160573785948
73.81166340280514
73.81166916936861
73.81166974602566
73.81166980369137
73.81166980945792
73.81166981003459
73.81166981009225
sada
error2:  73.81166981009225
error1:  73.75725258298908
73.7445946526997
error2:  73.7445946526997
error1:  73.8884114135167
73.8229211854972
error2:  73.8229211854972
error1:  73.38466252211408
73.41532176657368
73.41844749808523
73.41876067085934
73.41879199413452
73.41879512652204
73.4187954397614
73.41879547108532
73.41879547421772
73.41879547453095
73.41879547456227
sada
error2:  73.41879547456227
error1:  73.56194274097244
73.65950031728526
73.66933606708106
73.6703204439166
73.67041888962065
73.67042873427127
73.67042971873713
73.67042981718372
73.67042982702839
73.67042982801284
73.67042982811128
sada
error2:  73.67042982811128


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


error1:  75.06860796107614
74.41465717041564
error2:  74.41465717041564
error1:  71.58428092645283
71.7787175670559
71.79979464100037
71.80191860213479
71.8021311607042
71.80215241818563
71.80215454395001
71.8021547565266
71.80215477778427
71.80215477991004
71.80215478012262
sada
error2:  71.80215478012262
error1:  73.3670695942236
73.20851637278382
error2:  73.20851637278382
error1:  73.57809183428982
73.69529134277076
73.70711200151179
73.7082950753864
73.70841339285477
73.70842522470241
73.70842640788818
73.70842652620676
73.70842653803864
73.7084265392218
73.70842653934014
sada
error2:  73.70842653934014
error1:  73.41029652807678
73.5671898040145
73.58307846145883
73.58466931806798
73.58482842363507
73.58484433439084
73.58484592546841
73.58484608457618
73.58484610048697
73.58484610207805
73.58484610223715
sada
error2:  73.58484610223715
error1:  73.82562867598402


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


73.98682276887217
74.00313557402418
74.00476878597055
74.00493212647696
74.00494846072071
74.00495009414702
74.00495025748967
74.00495027382394
74.00495027545736
74.0049502756207
sada
error2:  74.0049502756207
error1:  74.39202589733269
74.36741197124883
error2:  74.36741197124883
error1:  74.25262022878282
74.25846680816177
74.25915357800962
74.25922327735704
74.25923025751665
74.25923095563483
74.25923102544768
74.25923103242899
74.25923103312712
74.25923103319693
74.2592310332039
sada
error2:  74.2592310332039
error1:  75.4949599568184
75.0995103995265
error2:  75.0995103995265
error1:  73.81358889370505
74.0218900279151
74.04311226100606
74.04523839537978
74.04545104791761
74.04547231356239
74.04547444013076
74.04547465278765
74.04547467405334
74.04547467617991
74.04547467639257
sada
error2:  74.04547467639257
error1:  75.11060780610913


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


74.92619325692516
error2:  74.92619325692516
error1:  72.70926287881251
72.91154365487718
72.93267131206105
72.93479303336952
72.93500529501576
72.93502652207549
72.93502864479042
72.93502885706198
72.93502887828915
72.93502888041186
72.93502888062415
sada
error2:  72.93502888062415
error1:  74.9527328318026
74.74271703988425
error2:  74.74271703988425
error1:  72.81056923083847
72.97969348285197
72.997385407067
72.99916236460137
72.99934013797582
72.99935791608945
72.99935969390856
72.99935987169056
72.99935988946876
72.99935989124658
72.99935989142436
sada
error2:  72.99935989142436
ostatni:  7471.274892657614
accuracy:  0.9303
