# In [1]:
import numpy as np

class MLPRegressor:
    """Fully connected feed-forward network for single-output regression.

    Hidden layers use a sigmoid activation and the single output unit is
    linear. Training is full-batch gradient descent on the residual
    ``prediction - y`` averaged over the samples.

    Parameters
    ----------
    hidden_layer_sizes : sequence of int, default (100,)
        Number of units in each hidden layer, in order.
    learning_rate : float, default 0.01
        Step size applied to every gradient update.
    n_iter : int, default 100
        Number of full-batch passes performed by ``fit``.

    Attributes
    ----------
    weights : list of ndarray
        One ``(n_in, n_out)`` matrix per layer; populated by ``fit``.
    biases : list of ndarray
        One ``(1, n_out)`` row vector per layer; populated by ``fit``.
    """

    def __init__(self, hidden_layer_sizes=(100,), learning_rate=0.01, n_iter=100):
        self.hidden_layer_sizes = hidden_layer_sizes
        self.learning_rate = learning_rate
        self.n_iter = n_iter
        self.weights = []
        self.biases = []

    def _sigmoid(self, x):
        """Return the logistic function of ``x`` element-wise.

        Only ``exp`` of non-positive values is evaluated, so large-magnitude
        inputs do not overflow.
        """
        x = np.asarray(x, dtype=float)
        decay = np.exp(-np.abs(x))  # always in (0, 1]
        # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- same function.
        return np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def _sigmoid_deriv(self, sigm):
        """Return the sigmoid derivative given the sigmoid *output* ``sigm``."""
        return sigm * (1 - sigm)

    def _linear(self, x):
        """Identity activation used by the output layer."""
        return x

    def _linear_deriv(self, x):
        """Derivative of the identity activation (the constant 1)."""
        return 1

    def _initialize(self, input_size):
        """Create fresh parameters for a network with ``input_size`` inputs.

        Weights are drawn from a standard normal scaled by 0.01 and biases
        start at zero. Any previously stored parameters are discarded so
        that repeated ``fit`` calls do not stack extra layers.
        """
        layer_sizes = [input_size] + list(self.hidden_layer_sizes) + [1]

        # Start from empty lists: appending to leftovers from an earlier fit
        # would leave mismatched layers behind.
        self.weights = []
        self.biases = []
        for n_in, n_out in zip(layer_sizes[:-1], layer_sizes[1:]):
            self.weights.append(np.random.randn(n_in, n_out) * 0.01)
            self.biases.append(np.zeros((1, n_out)))

    def forward(self, X):
        """Run a forward pass and cache the intermediate values.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples, 1)
            Output of the final linear layer.

        Notes
        -----
        Stores ``self.layer_outputs`` (the input followed by every layer's
        activation) and ``self.pre_activations`` for use by ``backward``.
        """
        X = np.asarray(X)
        self.layer_outputs = [X]
        self.pre_activations = []

        current_input = X
        # Every layer except the last is a sigmoid hidden layer.
        for weight, bias in zip(self.weights[:-1], self.biases[:-1]):
            pre_activation = np.dot(current_input, weight) + bias
            self.pre_activations.append(pre_activation)

            current_input = self._sigmoid(pre_activation)
            self.layer_outputs.append(current_input)

        pre_activation_output = np.dot(current_input, self.weights[-1]) + self.biases[-1]
        self.pre_activations.append(pre_activation_output)

        output = self._linear(pre_activation_output)
        self.layer_outputs.append(output)

        return output

    def backward(self, X, y):
        """Back-propagate the error of the most recent ``forward`` pass.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Only its first dimension is used, to average the gradients.
        y : array-like with n_samples elements
            Targets; reshaped to a column vector internally.

        Returns
        -------
        w_grad, b_grad : list of ndarray
            Gradients aligned with ``self.weights`` and ``self.biases``.
        """
        n_samples = np.asarray(X).shape[0]
        targets = np.asarray(y).reshape(-1, 1)
        deltas = [None] * len(self.weights)

        output_err = self.layer_outputs[-1] - targets
        deltas[-1] = output_err * self._linear_deriv(self.layer_outputs[-1])

        # Walk the hidden layers from last to first, pushing the error back
        # through the next layer's weights.
        for i in range(len(self.weights) - 2, -1, -1):
            err_hidden = np.dot(deltas[i + 1], self.weights[i + 1].T)
            deltas[i] = err_hidden * self._sigmoid_deriv(self.layer_outputs[i + 1])

        # layer_outputs[i] is the input fed to layer i; zip stops at the
        # shorter ``deltas`` list, so the final output is not paired.
        w_grad = [
            np.dot(layer_input.T, delta) / n_samples
            for layer_input, delta in zip(self.layer_outputs, deltas)
        ]
        b_grad = [
            np.sum(delta, axis=0, keepdims=True) / n_samples
            for delta in deltas
        ]

        return w_grad, b_grad

    def fit(self, X, y):
        """Train the network on ``X`` and ``y`` with full-batch gradient descent.

        Parameters are re-initialised on every call, so refitting starts
        from scratch.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like with n_samples elements

        Returns
        -------
        MLPRegressor
            ``self``, to allow call chaining.
        """
        X = np.asarray(X)
        y = np.asarray(y)

        self._initialize(X.shape[1])

        for _ in range(self.n_iter):
            self.forward(X)  # caches the activations backward() needs
            w_grad, b_grad = self.backward(X, y)

            for i, (grad_w, grad_b) in enumerate(zip(w_grad, b_grad)):
                self.weights[i] -= self.learning_rate * grad_w
                self.biases[i] -= self.learning_rate * grad_b

        return self

    def predict(self, X):
        """Return predictions for ``X`` as an array of shape (n_samples, 1)."""
        return self.forward(X)