In [9]:
import numpy as np
from sklearn.datasets import load_iris

In [20]:
# Softmax regression on iris: load the dataset as pandas objects and keep only
# the two petal measurements as input features.
iris = load_iris(as_frame=True)
petal_columns = ['petal length (cm)', 'petal width (cm)']
X = iris.data[petal_columns].to_numpy()
y = iris.target.to_numpy()
# Prepend a constant column of ones so the bias is handled like any other weight.
X_with_bias_term = np.c_[np.ones(len(X)), X]

In [67]:
# Shuffle the row indices once (seeded for reproducibility) and cut the
# permutation into contiguous train / validation / test segments.
test_ratio = 0.2
validation_ratio = 0.2
total_size = len(X_with_bias_term)

test_size = int(test_ratio * total_size)
validation_size = int(validation_ratio * total_size)
train_size = total_size - test_size - validation_size

np.random.seed(42)
rnd_indices = np.random.permutation(total_size)

# Slice with explicit end offsets instead of negative indices: `[-test_size:]`
# selects the *entire* array when test_size == 0 (and `[train_size:-0]` is
# empty), which would silently put training rows in the test set.
valid_end = train_size + validation_size
train_indices = rnd_indices[:train_size]
valid_indices = rnd_indices[train_size:valid_end]
test_indices = rnd_indices[valid_end:]

# Fancy indexing returns copies, so each split owns its own data.
X_train, y_train = X_with_bias_term[train_indices], y[train_indices]
X_valid, y_valid = X_with_bias_term[valid_indices], y[valid_indices]
X_test, y_test = X_with_bias_term[test_indices], y[test_indices]

In [96]:
# manual one-hot encoder

def to_one_hot(y, n_classes=None):
    """One-hot encode an array of non-negative integer class labels.

    Parameters
    ----------
    y : array-like of int
        Class labels; each label selects the column that is set to 1.
    n_classes : int, optional
        Number of columns in the result. Defaults to ``y.max() + 1``, which
        is only correct when the highest class actually occurs in ``y``;
        pass it explicitly when encoding a subset (e.g. a validation split)
        that may be missing some classes.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``y.shape + (n_classes,)``.

    Raises
    ------
    ValueError
        If ``y`` is empty and ``n_classes`` is not given.
    IndexError
        If a label is ``>= n_classes``.
    """
    y = np.asarray(y)
    if n_classes is None:
        n_classes = y.max() + 1
    # Row k of the identity matrix is the one-hot vector for class k.
    return np.eye(n_classes)[y]

# One-hot encode the labels of each split (order: train, test, validation).
y_train_one_hot, y_test_one_hot, y_valid_one_hot = map(
    to_one_hot, (y_train, y_test, y_valid)
)

In [107]:
# Standardize the feature columns in place. Column 0 (the bias term) is left
# untouched, and the mean/std come from the training split only so that no
# statistics of the validation or test data are used.
mean = X_train[:, 1:].mean(axis=0)
std = X_train[:, 1:].std(axis=0)
for features in (X_train, X_test, X_valid):
    features[:, 1:] = (features[:, 1:] - mean) / std

In [108]:
# implementing the softmax function

def softmax(logits):
    """Compute the row-wise softmax of a 2-D array of scores.

    Parameters
    ----------
    logits : array-like, shape (n_rows, n_classes)
        Unnormalized scores, one row per sample.

    Returns
    -------
    numpy.ndarray
        Array of the same shape whose rows are non-negative and sum to 1.
    """
    logits = np.asarray(logits)
    # Softmax is invariant to adding a constant per row; subtracting the row
    # maximum keeps every exponent <= 0 so np.exp cannot overflow to inf
    # (which would otherwise yield inf / inf = nan).
    shifted = logits - logits.max(axis=1, keepdims=True)
    exps = np.exp(shifted)
    return exps / exps.sum(axis=1, keepdims=True)

In [114]:
# Model dimensions: one input per column of X_train (bias column included)
# and one output per distinct label present in the training targets.
n_inputs = X_train.shape[-1]
n_outputs = np.unique(y_train).size

In [124]:
# Train softmax regression with batch gradient descent, reporting the
# validation cross-entropy every 1000 iterations.
eta = .5  # learning rate
epoch = 5001  # total number of gradient steps
m = len(X_train)
epsilon = 1e-5  # tiny offset so np.log never receives an exact 0 probability

np.random.seed(42)
theta = np.random.randn(n_inputs, n_outputs)  # random initial parameter values

for iteration in range(epoch):
    logits = X_train @ theta
    y_proba = softmax(logits)
    # `epoch` is the fixed iteration count, so the periodic check has to use
    # the loop counter.
    if iteration % 1000 == 0:
        y_proba_valid = softmax(X_valid @ theta)
        xenthropy_losses = -(y_valid_one_hot * np.log(y_proba_valid + epsilon))
        # Sum over classes, average over validation samples.
        print(iteration, xenthropy_losses.sum(axis=1).mean())
    error = y_proba - y_train_one_hot
    gradients = 1 / m * X_train.T @ error
    theta = theta - eta * gradients

In [135]:
# Evaluate on the validation split: pick the most probable class for every
# row and report the fraction of rows predicted correctly.

logits = X_valid @ theta
y_proba = softmax(logits)
y_predict = np.argmax(y_proba, axis=1)

accuracy_score = np.mean(y_predict == y_valid)
accuracy_score

0.9333333333333333

In [136]:
# Same batch gradient descent training, now with an L2 penalty on the
# weights. The bias row theta[0] is excluded from the penalty.

eta = .5  # learning rate
epoch = 5001  # total number of gradient steps
m = len(X_train)
epsilon = 1e-5  # tiny offset so np.log never receives an exact 0 probability
alpha = .01  # regularization strength

np.random.seed(42)
theta = np.random.randn(n_inputs, n_outputs)  # random initial parameter values

for iteration in range(epoch):
    logits = X_train @ theta
    y_proba = softmax(logits)
    # `epoch` is the fixed iteration count, so the periodic check has to use
    # the loop counter.
    if iteration % 1000 == 0:
        y_proba_valid = softmax(X_valid @ theta)
        xenthropy_losses = -(y_valid_one_hot * np.log(y_proba_valid + epsilon))
        # Half the squared norm of the non-bias weights (a scalar).
        l2_loss = 1 / 2 * (theta[1:] ** 2).sum()
        total_loss = xenthropy_losses.sum(axis=1).mean() + alpha * l2_loss
        print(iteration, total_loss)
    error = y_proba - y_train_one_hot
    gradients = 1 / m * X_train.T @ error
    # Gradient of the penalty: zero for the bias row, alpha * theta elsewhere.
    gradients += np.r_[np.zeros([1, n_outputs]), alpha * theta[1:]]
    theta = theta - eta * gradients

In [137]:
# Validation accuracy of the L2-regularized model: most probable class per
# row, then the fraction of rows predicted correctly.
logits = X_valid @ theta
y_proba = softmax(logits)
y_predict = np.argmax(y_proba, axis=1)

accuracy_score = np.mean(y_predict == y_valid)
accuracy_score

0.9333333333333333