### Optimise alpha (learning rate) using validation set

In [2]:
import numpy as np
import pandas as pd

#### Preparing validation data

In [4]:
# Read the CSV files for the testing data
X_val = pd.read_csv('data/X_val.csv').values  # Convert the DataFrame to a NumPy array
y_val = pd.read_csv('data/y_val.csv').values  # Convert the DataFrame to a NumPy array

# Print the shape of the arrays to verify
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)

X_val shape: (27, 3)
y_val shape: (27, 1)


#### Feature Scaling

In [6]:
class StandardScaler():
    def fit(self, X):
        self.mean = X.mean(axis=0)
        self.std = X.std(axis=0)

        # the standard deviation can be 0 in certain cases,
        #  which provokes 'devision-by-zero' errors; we can
        #  avoid this by adding a small amount if std==0
        self.std[self.std == 0] = 0.00001

    def transform(self, X):
        return (X - self.mean) / self.std

    def inverse_transform(self, X_scaled):
        return X_scaled * self.std + self.mean

In [7]:
scaler_val = StandardScaler()
scaler_val.fit(X_val)

X_val_scaled = scaler_val.transform(X_val)

In [8]:
n_samples_val = X_val.shape[0]
X_val_with_bias = np.c_[np.ones(n_samples_val), X_val_scaled]  # Add bias term as a column of ones

In [9]:
# Find the number of classes
n_classes = np.max(y_val)

# Create an empty one-hot encoded array
y_one_hot_val = np.zeros((y_val.size, n_classes))

# Populate the one-hot array
y_one_hot_val[np.arange(y_val.size), y_val.flatten() - 1] = 1  # Flatten y_val to 1D and subtract 1 for zero-indexing

##### Intialise Theta values

In [11]:
from numpy.random import uniform

# Number of features (1 bias term + 3 features)
n_features = 4

# Initialize thetas: Random weights for each feature and class
#initial_thetas = np.array([uniform(-1, 1) for _ in range(n_features * n_classes)]).reshape(n_features, n_classes)

# Initialize thetas with random values between -1 and 1
initial_thetas = np.random.uniform(-1, 1, size=(n_features, n_classes))  # Shape: (n_features, n_classes)

initial_thetas

array([[-0.27013521,  0.80223834,  0.33135093],
       [-0.63340223,  0.55902406,  0.3065858 ],
       [ 0.12444586,  0.92766619, -0.7410244 ],
       [ 0.74065042,  0.45738951,  0.03656996]])

#### Preparing train data

In [28]:
# Read the CSV files for the training data
X_train = pd.read_csv('data/X_train.csv').values  # Convert the DataFrame to a NumPy array
y_train = pd.read_csv('data/y_train.csv').values  # Convert the DataFrame to a NumPy array

# Print the shape of the arrays to verify
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

X_train shape: (126, 3)
y_train shape: (126, 1)


In [63]:
scaler_train = StandardScaler()
scaler_train.fit(X_train)

X_train_scaled = scaler_train.transform(X_train)

In [32]:
n_samples_train = X_train.shape[0]
# Add a bias term (column of ones) to the scaled training dataset to account for the intercept term in logistic regression (theta_bias)
X_train_with_bias = np.c_[np.ones(n_samples_train), X_train_scaled]

In [34]:
# Create an empty one-hot encoded array
y_one_hot_train = np.zeros((y_train.size, n_classes))

# Populate the one-hot array
y_one_hot_train[np.arange(y_train.size), y_train.flatten() - 1] = 1  # Flatten y_train to 1D and subtract 1 for zero-indexing

#### Implementing Gradient descent for obtaining optimized theta values

In [37]:
def linear_regression(data, thetas):
    """
    Perform linear regression to compute raw scores (logits).
    data: np.ndarray, shape (n_samples, n_features) - Input feature data.
    thetas: np.ndarray, shape (n_features, n_classes) - Model parameters.
    
    Returns:
    logits: np.ndarray, shape (n_samples, n_classes) - Raw logits for each class.
    """
    return data @ thetas

In [39]:
def softmax(z):
    """
    Compute the softmax probabilities for multiclass classification.
    z: np.ndarray, shape (n_samples, n_classes) - Raw linear logits for each class.
    
    Returns:
    probabilities: np.ndarray, shape (n_samples, n_classes) - Normalized probabilities.
    """
    # Subtract max value in each row for numerical stability
    z_exp = np.exp(z - np.max(z, axis=1, keepdims=True))
    probabilities = z_exp / np.sum(z_exp, axis=1, keepdims=True)
    return probabilities

In [41]:
def logistic_regression(data, thetas):
    """
    Perform multiclass classification using softmax.
    data: np.ndarray, shape (n_samples, n_features) - Input feature data.
    thetas: np.ndarray, shape (n_features, n_classes) - Model parameters.
    
    Returns:
    probabilities: np.ndarray, shape (n_samples, n_classes) - Probabilities for each class.
    """
    z = linear_regression(data, thetas)  # Compute raw logits
    h = softmax(z)                       # Apply softmax to logits
    return h

In [43]:
def cross_entropy_loss(y_true, y_pred):
    """
    Compute the cross-entropy loss for multiclass classification.

    Parameters:
    y_true: np.ndarray, shape (n_samples, n_classes) - One-hot encoded ground truth labels.
    y_pred: np.ndarray, shape (n_samples, n_classes) - Predicted probabilities (from softmax).

    Returns:
    loss: float - Cross-entropy loss.
    """
    # Avoid log(0) issues by adding a small epsilon
    epsilon = 1e-12
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)

    # Compute cross-entropy loss
    loss = -np.sum(y_true * np.log(y_pred)) / y_true.shape[0]
    return loss

In [45]:
def gradient_descent(data, y_true, original_thetas, alpha, iterations):
    """
    Perform gradient descent optimization for multiclass logistic regression.

    Parameters:
    data: np.ndarray, shape (n_samples, n_features) - Feature matrix with bias term.
    y_true: np.ndarray, shape (n_samples, n_classes) - One-hot encoded ground truth labels.
    original_thetas: np.ndarray, shape (n_features, n_classes) - Initial model parameters.
    alpha: float - Learning rate for gradient descent.
    iterations: int - Number of iterations to perform.

    Returns:
    optimized_thetas: np.ndarray, shape (n_features, n_classes) - Optimized model parameters.
    error_history: list - Cross-entropy error at each iteration.
    """
    optimized_thetas = original_thetas.copy()
    n_samples = data.shape[0]
    error_history = np.zeros(iterations)
    accuracy_history = np.zeros(iterations)

    for i in range(iterations):
        # Compute predictions using logistic regression (softmax applied)
        logits = np.dot(data, optimized_thetas)  # Shape: (n_samples, n_classes)
        exp_logits = np.exp(logits - np.max(logits, axis=1, keepdims=True))  # Numerical stability
        y_pred = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)      # Shape: (n_samples, n_classes)

        # Compute cross-entropy loss and record it
        error = -np.sum(y_true * np.log(y_pred + 1e-12)) / n_samples  # Cross-entropy loss
        error_history[i] = error

        # Compute accuracy
        y_pred_labels = np.argmax(y_pred, axis=1)  # Predicted class labels
        y_true_labels = np.argmax(y_true, axis=1)  # True class labels
        accuracy_ = round((y_pred_labels == y_true_labels).mean() * 100, 2)
        accuracy_history[i] = accuracy_
        
        # Compute gradient of cross-entropy loss
        gradient = np.dot(data.T, (y_pred - y_true)) / n_samples  # Shape: (n_features, n_classes)

        # Update parameters using gradient descent
        optimized_thetas -= alpha * gradient

    return optimized_thetas, error_history, accuracy_history

In [47]:
# Perform gradient descent
alpha = 0.01        # Learning rate
iterations = 300  # Number of iterations
trained_thetas, error_history, accuracy_history = gradient_descent(X_train_with_bias, y_one_hot_train, initial_thetas, alpha, iterations)

#### Optimize Alpha using Train dataset, Validation set and Optimized theta values

In [49]:
def tune_alpha(train_data, train_labels, val_data, val_labels, thetas):
    """
    Tune the learning rate alpha using a validation dataset.

    Parameters:
    train_data: np.ndarray, shape (n_train_samples, n_features + 1) - Training feature matrix with bias term.
    train_labels: np.ndarray, shape (n_train_samples, n_classes) - One-hot encoded training labels.
    val_data: np.ndarray, shape (n_val_samples, n_features + 1) - Validation feature matrix with bias term.
    val_labels: np.ndarray, shape (n_val_samples, n_classes) - One-hot encoded validation labels.
    thetas: np.ndarray, shape (n_features + 1, n_classes) - Initial model parameters.
    alphas: list - List of learning rates to try.
    iterations: int - Number of iterations for gradient descent.

    Returns:
    best_alpha: float - The learning rate that minimizes the validation error.
    validation_errors: dict - Validation errors for each alpha.
    """
    alphas = [0.00001, 0.0001, 0.001, 0.01, 0.1]
    
    validation_errors = {}
    iterations = 100
    for alpha in alphas:
        # Train the model on the training set
        optimized_thetas, error, accuracy = gradient_descent(train_data, train_labels, thetas, alpha, iterations)

        # Compute predictions on the validation set
        #logits = np.dot(val_data, optimized_thetas)
        #exp_logits = np.exp(logits - np.max(logits, axis=1, keepdims=True))  # Numerical stability
        #y_pred = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)
        y_pred = logistic_regression(val_data, optimized_thetas)

        # Compute cross-entropy loss on the validation set
        #val_error = -np.sum(val_labels * np.log(y_pred + 1e-12)) / val_data.shape[0]
        val_error = cross_entropy_loss(val_labels, y_pred)
        validation_errors[alpha] = val_error

    # Select the alpha with the minimum validation error
    best_alpha = min(validation_errors, key=validation_errors.get)

    return best_alpha, validation_errors

In [51]:
optimized_alpha, validation_errors = tune_alpha(X_train_with_bias, y_one_hot_train, X_val_with_bias, y_one_hot_val, trained_thetas)

In [53]:
optimized_alpha

0.1

In [55]:
validation_errors

{1e-05: 0.4099772192785057,
 0.0001: 0.4093095911643733,
 0.001: 0.4028528317820569,
 0.01: 0.35473424634456496,
 0.1: 0.2396892330209989}