In [None]:
import numpy as np

def gradient_descent(loss_fn, initial_params, learning_rate, num_iterations, gradient_method='explicit'):
    """
    Perform gradient descent optimization.

    Parameters:
    - loss_fn: A function that computes the loss and its gradient. It should accept a parameter vector and return the loss and gradient as separate arrays.
    - initial_params: Initial parameter vector.
    - learning_rate: Learning rate for gradient descent.
    - num_iterations: Number of iterations to run gradient descent.
    - gradient_method: 'explicit' for using the explicit gradient formula, 'approximate' for gradient estimation.

    Returns:
    - params: The optimized parameter vector.
    - losses: A list of loss values at each iteration.
    """
    params = np.array(initial_params)  # Ensure params is a NumPy array
    losses = []

    for iteration in range(num_iterations):
        loss, gradient = loss_fn(params)
        params -= learning_rate * gradient  # Ensure gradient is a NumPy array
        losses.append(loss)

        # Print progress at each iteration
        print(f"Iteration {iteration + 1}/{num_iterations}: Loss = {loss:.4f}, Parameters = {params}")

    return params, losses

def estimate_gradient(loss_fn, params, epsilon=1e-5):
    """
    Estimate the gradient of a loss function using finite differences.

    Parameters:
    - loss_fn: A function that computes the loss. It should accept a parameter vector and return the loss as a scalar.
    - params: Parameter vector at which to estimate the gradient.
    - epsilon: Small perturbation for finite differences.

    Returns:
    - gradient: Estimated gradient vector.
    """
    num_params = len(params)
    gradient = np.zeros(num_params)

    for i in range(num_params):
        perturbed_params = params.copy()
        perturbed_params[i] += epsilon
        loss_plus = loss_fn(perturbed_params)
        perturbed_params[i] -= 2 * epsilon
        loss_minus = loss_fn(perturbed_params)
        gradient[i] = (loss_plus - loss_minus) / (2 * epsilon)

    return gradient

# Example usage:

# Define a simple loss function and its gradient (explicitly).
def simple_loss(params):
    x, y = params
    loss = (x**2 + y**2)
    gradient = np.array([2 * x, 2 * y])
    return loss, gradient

# Initial parameter guess
initial_params = np.array([1.0, 1.0])

# Run gradient descent with explicit gradient calculation
optimized_params_explicit, losses_explicit = gradient_descent(simple_loss, initial_params, learning_rate=0.1, num_iterations=100, gradient_method='explicit')

# Run gradient descent with gradient estimation
optimized_params_approx, losses_approx = gradient_descent(simple_loss, initial_params, learning_rate=0.1, num_iterations=100, gradient_method='approximate')

In [6]:
import numpy as np

# Generate some sample data
np.random.seed(0)
X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1)

# Define the linear regression model
def linear_regression(X, theta):
    return X.dot(theta)

# Define the mean squared error loss function
def mean_squared_error(y_true, y_pred):
    return ((y_true - y_pred) ** 2).mean()

# Stochastic Gradient Descent with random initial points
def sgd_random_start(X, y, learning_rate=0.001, n_starts, n_iterations=1000):
    best_theta = None
    best_loss = float('inf')

    for _ in range(n_starts):
        # Randomly initialize the parameters
        theta = np.random.randn(2, 1)

        for iteration in range(n_iterations):
            random_index = np.random.randint(len(X))
            xi = X[random_index:random_index+1]
            yi = y[random_index:random_index+1]
            gradients = -2 * xi.T.dot(yi - linear_regression(xi, theta))
            theta -= learning_rate * gradients

        y_pred = linear_regression(X, theta)
        current_loss = mean_squared_error(y, y_pred)

        if current_loss < best_loss:
            best_loss = current_loss
            best_theta = theta

    return best_theta

learning_rate = 0.01
n_starts = 10
n_iterations = 1000

X_b = np.c_[np.ones((100, 1)), X]  # Add bias term
best_theta = sgd_random_start(X_b, y, learning_rate, n_starts, n_iterations)

print("Best Theta:", best_theta)


Best Theta: [[4.22374805]
 [2.94546154]]


In [3]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.datasets import mnist
from tensorflow.keras.optimizers import SGD
import numpy as np

# Define your neural network model here
model = keras.Sequential([
    layers.Flatten(input_shape=(28, 28)),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(10, activation='softmax')
])

# Load the MNIST dataset
(train_images, train_labels), (_, _) = mnist.load_data()
train_images = train_images / 255.0

# Compile the model
model.compile(optimizer=SGD(learning_rate=0.01),  # Initial learning rate
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Define a custom learning rate schedule
def cyclic_learning_rate(epoch, base_lr, max_lr, step_size):
    cycle = 1 + epoch // (2 * step_size)
    x = abs(epoch / step_size - 2 * cycle + 1)
    lr = base_lr + (max_lr - base_lr) * max(0, 1 - x)
    return lr

# Training loop
num_epochs = 10
step_size = 5
base_lr = 0.001
max_lr = 0.01

best_accuracy = 0.0
for epoch in range(num_epochs):
    # Update learning rate
    lr = cyclic_learning_rate(epoch, base_lr, max_lr, step_size)
    model.optimizer.lr.assign(lr)  # Update the learning rate in TensorFlow/Keras


    # Training
    history = model.fit(train_images, train_labels, epochs=1, verbose=1)

    # Save model parameters when learning rate vanishes
    if lr == 0:
        model.save_weights(f'model_epoch_{epoch}.h5')

    # Validation (you need to define the validation dataset and validation loop)
    # Compute validation accuracy and update best_accuracy if needed

    accuracy = history.history['accuracy'][0]
    print(f'Epoch [{epoch}/{num_epochs}] LR: {lr:.5f} Loss: {history.history["loss"][0]:.4f} Accuracy: {accuracy:.2f}%')

# Load the best model parameters observed during training
best_model = keras.Sequential([
    layers.Flatten(input_shape=(28, 28)),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(10, activation='softmax')
])
best_model.compile(optimizer=SGD(learning_rate=0.01),
                   loss='sparse_categorical_crossentropy',
                   metrics=['accuracy'])
best_model.load_weights('best_model.h5')

# Now you can use the best_model for inference or further evaluation.


Epoch [0/10] LR: 0.00100 Loss: 1.7255 Accuracy: 0.55%
Epoch [1/10] LR: 0.00280 Loss: 0.6877 Accuracy: 0.83%
Epoch [2/10] LR: 0.00460 Loss: 0.4009 Accuracy: 0.89%
Epoch [3/10] LR: 0.00640 Loss: 0.3182 Accuracy: 0.91%
Epoch [4/10] LR: 0.00820 Loss: 0.2716 Accuracy: 0.92%
Epoch [5/10] LR: 0.01000 Loss: 0.2347 Accuracy: 0.93%
Epoch [6/10] LR: 0.00820 Loss: 0.2040 Accuracy: 0.94%
Epoch [7/10] LR: 0.00640 Loss: 0.1836 Accuracy: 0.95%
Epoch [8/10] LR: 0.00460 Loss: 0.1707 Accuracy: 0.95%
Epoch [9/10] LR: 0.00280 Loss: 0.1617 Accuracy: 0.95%


FileNotFoundError: ignored

In [9]:
import numpy as np

# Generate some sample data
np.random.seed(0)
X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.rand(100, 1)

# Define the linear regression model and loss function
def linear_regression(X, theta):
    return X.dot(theta)

def mean_squared_error(y_true, y_pred):
    return ((y_true - y_pred) ** 2).mean()

# Number of parameter sets and learning rate
num_parameter_sets = 10
learning_rate = 0.01
num_epochs = 1000

# Initialize multiple parameter sets with different initializations
initial_parameters = [np.random.randn(1, 1) for _ in range(num_parameter_sets)]
converged_parameters = []

# Training loop for each parameter set
for theta_init in initial_parameters:
    theta = theta_init.copy()

    for epoch in range(num_epochs):
        # Calculate gradients
        y_pred = linear_regression(X, theta)
        gradient = -2 * X.T.dot(y - y_pred)

        # Update parameters using gradient descent
        theta -= learning_rate * gradient

        # Check for convergence based on a threshold (e.g., small gradient norm)
        if np.linalg.norm(gradient) < 1e-3:
            converged_parameters.append(theta.copy())
            break

# Check if there are converged parameters before calculating the mean
if converged_parameters:
    # Make predictions using each set of converged parameters
    predictions = []
    for theta in converged_parameters:
        y_pred = linear_regression(X, theta)
        predictions.append(y_pred)

    # Calculate final output predictions as the mean of all model predictions
    final_predictions = np.mean(predictions, axis=0)

    # Calculate the final mean squared error
    mse = mean_squared_error(y, final_predictions)
    print("Final Mean Squared Error:", mse)
else:
    print("No converged parameters found.")


No converged parameters found.


In [10]:
import numpy as np

class MultiClassLogisticClassifier:
    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.weights = None
        self.bias = None
        self.classes = None

    def fit(self, X, y):
        # Determine the unique classes in the target variable
        self.classes = np.unique(y)
        num_features = X.shape[1]
        num_classes = len(self.classes)

        # Initialize weights and bias
        self.weights = np.zeros((num_classes, num_features))
        self.bias = np.zeros(num_classes)

        # One-hot encode the target variable
        y_encoded = np.zeros((len(y), num_classes))
        for i, class_label in enumerate(self.classes):
            y_encoded[:, i] = (y == class_label)

        # Gradient Descent
        for _ in range(self.n_iterations):
            linear_model = np.dot(X, self.weights.T) + self.bias
            probabilities = self.softmax(linear_model)

            # Calculate the gradient of the loss
            gradient_weights = (1 / len(X)) * np.dot(probabilities.T, X)
            gradient_bias = (1 / len(X)) * np.sum(probabilities, axis=0)

            # Update weights and bias
            self.weights -= self.learning_rate * gradient_weights
            self.bias -= self.learning_rate * gradient_bias

    def predict(self, X):
        linear_model = np.dot(X, self.weights.T) + self.bias
        probabilities = self.softmax(linear_model)
        predicted_classes = np.argmax(probabilities, axis=1)

        # Map predicted indices to class labels
        predicted_labels = [self.classes[i] for i in predicted_classes]
        return predicted_labels

    def softmax(self, z):
        exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
        return exp_z / exp_z.sum(axis=1, keepdims=True)


In [20]:
# Example usage
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the Iris dataset
iris = load_iris()
X, y = iris.data, iris.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=70)

# Create and train the classifier
classifier = MultiClassLogisticClassifier(learning_rate=0.001, n_iterations=10000)
classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.3333333333333333


In [21]:
import numpy as np

def compute_mse_gradient(X, y, weights, predicted_probabilities):
    """
    Compute the gradient of the Mean Squared Error (MSE) loss with respect to the weights.

    Parameters:
    - X: Input features (shape: [N, K], where N is the number of samples, K is the number of features).
    - y: True labels (one-hot encoded, shape: [N, C], where C is the number of classes).
    - weights: Model weights (shape: [C, K]).
    - predicted_probabilities: Predicted probabilities for each class (shape: [N, C]).

    Returns:
    - gradient: Gradient of the MSE loss with respect to the weights (shape: [C, K]).
    """
    N = X.shape[0]
    C = weights.shape[0]
    K = weights.shape[1]

    gradient = np.zeros((C, K))

    for i in range(N):
        for j in range(C):
            error = predicted_probabilities[i, j] - y[i, j]
            gradient[j, :] += 2 * error * predicted_probabilities[i, j] * (1 - predicted_probabilities[i, j]) * X[i, :]

    gradient /= N

    return gradient


In [24]:
import numpy as np

class MultiClassLogisticClassifier:
    def __init__(self, learning_rate=0.01, n_iterations=1000, batch_size=None, optimizer="batch"):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.weights = None
        self.bias = None
        self.classes = None
        self.batch_size = batch_size
        self.optimizer = optimizer

    def fit(self, X, y):
        # Determine the unique classes in the target variable
        self.classes = np.unique(y)
        num_features = X.shape[1]
        num_classes = len(self.classes)

        # Initialize weights and bias
        self.weights = np.zeros((num_classes, num_features))
        self.bias = np.zeros(num_classes)

        if self.optimizer == "batch":
            for _ in range(self.n_iterations):
                gradient_weights, gradient_bias = self.compute_gradients(X, y)
                self.weights -= self.learning_rate * gradient_weights
                self.bias -= self.learning_rate * gradient_bias

        elif self.optimizer == "sgd":
            for _ in range(self.n_iterations):
                for i in range(len(X)):
                    xi, yi = X[i:i+1], y[i:i+1]
                    gradient_weights, gradient_bias = self.compute_gradients(xi, yi)
                    self.weights -= self.learning_rate * gradient_weights
                    self.bias -= self.learning_rate * gradient_bias

        elif self.optimizer == "mini-batch":
            for _ in range(self.n_iterations):
                for i in range(0, len(X), self.batch_size):
                    xi, yi = X[i:i+self.batch_size], y[i:i+self.batch_size]
                    gradient_weights, gradient_bias = self.compute_gradients(xi, yi)
                    self.weights -= self.learning_rate * gradient_weights
                    self.bias -= self.learning_rate * gradient_bias

    def predict(self, X):
        linear_model = np.dot(X, self.weights.T) + self.bias
        probabilities = self.softmax(linear_model)
        predicted_classes = np.argmax(probabilities, axis=1)

        # Map predicted indices to class labels
        predicted_labels = [self.classes[i] for i in predicted_classes]
        return predicted_labels

    def softmax(self, z):
        exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
        return exp_z / exp_z.sum(axis=1, keepdims=True)

    def compute_gradients(self, X, y):
        linear_model = np.dot(X, self.weights.T) + self.bias
        probabilities = self.softmax(linear_model)

        # Calculate the gradient of the loss
        gradient_weights = (1 / len(X)) * np.dot(probabilities.T, X) - np.dot(y.T, X)
        gradient_bias = (1 / len(X)) * np.sum(probabilities - y, axis=0)

        return gradient_weights, gradient_bias

classifier_batch = MultiClassLogisticClassifier(learning_rate=0.1, n_iterations=1000, optimizer="batch")
classifier_sgd = MultiClassLogisticClassifier(learning_rate=0.1, n_iterations=1000, optimizer="sgd")
classifier_mini_batch = MultiClassLogisticClassifier(learning_rate=0.1, n_iterations=1000, batch_size=32, optimizer="mini-batch")
