# Optimizing Softmax Regression with MCMC

In [1]:
import numpy as np
np.seterr(divide='ignore', invalid='ignore')
import matplotlib.pyplot as plt
%matplotlib inline
import math
import emcee
import corner

## Softmax Regression

In [2]:
def softmax(z):
    """Return the row-wise softmax of a 2-D array of class scores.

    Parameters
    ----------
    z : array-like with one row of scores per sample (the reduction runs
        along axis 1).

    Returns
    -------
    ndarray of the same shape as ``z`` whose rows are non-negative and sum to 1.

    The argument is never modified; a shifted copy is used internally.
    """
    # Subtract each row's own maximum before exponentiating.  Softmax is
    # invariant to a per-row constant shift, and shifting by the row maximum
    # guarantees the largest exponent in every row is exp(0) = 1, so no row can
    # overflow or underflow to an all-zero denominator.
    shifted = z - np.max(z, axis=1, keepdims=True)
    exp_scores = np.exp(shifted)
    return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    
def one_hot(y, C):
    """One-hot encode the labels ``y`` over ``C`` classes.

    Each label is compared against the class indices ``0 .. C-1``; the matching
    position becomes 1.0 and every other position 0.0.  ``y`` is expected to be
    a 1-D array, which yields a result of shape ``(len(y), C)``.
    """
    class_ids = np.arange(C)
    label_column = y[:, None]          # one label per row, broadcast against class_ids
    matches = class_ids == label_column
    return matches.astype(float)

class SoftmaxRegression:
    """Softmax (multinomial logistic) regression trained by a pluggable optimizer.

    After ``fit`` the learned weights are stored in ``self.w`` with one row per
    input column (plus a trailing bias row when ``add_bias`` is true) and one
    column per class.
    """

    def __init__(self, add_bias=True):
        self.add_bias = add_bias

    def _design_matrix(self, x):
        """Return ``x`` as a 2-D array, with a trailing column of ones if a bias is used."""
        if x.ndim == 1:
            x = x[:, None]
        if self.add_bias:
            x = np.column_stack([x, np.ones(x.shape[0])])
        return x

    def fit(self, x, y, optimizer, x_vali=None, y_vali=None):
        """Fit the weights on ``(x, y)`` and return ``self``.

        Parameters
        ----------
        x : ndarray of features, 1-D (single feature) or 2-D (samples by features).
        y : array of class labels; the number of classes is taken to be
            ``max(y) + 1``, so labels are expected to be 0-based.
        optimizer : object exposing
            ``run(gradient_fn, x, y, w0, x_vali, y_vali)`` and returning the
            final weight matrix.
        x_vali, y_vali : optional validation data handed to the optimizer;
            each falls back to the corresponding training array when omitted.
        """
        if x_vali is None:
            x_vali = x
        if y_vali is None:
            y_vali = y
        x_vali = self._design_matrix(x_vali)
        x = self._design_matrix(x)

        D = x.shape[1]
        # Cast so that float-typed label arrays still give an integer dimension.
        C = int(np.max(y)) + 1

        def gradient(x, y, w):                          # define the gradient function
            yh = softmax(np.dot(x, w))                  # predictions
            y = one_hot(y, C)                           # labels with one-hot encoding
            # divide by the number of points because the cost is a mean over them
            return np.dot(x.T, (yh - y)) / x.shape[0]

        w0 = np.zeros((D, C))                           # initialize the weights to 0, dimension is D*C
        self.w = optimizer.run(gradient, x, y, w0, x_vali, y_vali)      # run the optimizer to get the optimal weights
        return self

    def predict(self, x):
        """Return the class probabilities for ``x``, one row per sample.

        ``fit`` must have been called first so that ``self.w`` exists.
        """
        return softmax(np.dot(self._design_matrix(x), self.w))

In [3]:
# Note: softmax regression outputs a probability for every class, so for
# classification each output row is converted to the label of its most probable class.

def to_classlabel(z):
    """Map each row of class probabilities to the index of its largest entry."""
    labels = np.argmax(z, axis=1)
    return labels

## Optimizer 1: Bayesian inference with MCMC

## Optimizer 2: Gradient Descent (for comparison)

In [4]:
def create_mini_batches(x, y, batch_size):
    """Shuffle a dataset and split it into mini-batches of at most ``batch_size`` samples.

    Parameters
    ----------
    x : array-like whose first axis indexes samples.
    y : array-like of labels with one entry per sample of ``x``.
    batch_size : positive number, upper bound on the samples in one batch.

    Returns
    -------
    list of ``(x_mini, y_mini)`` tuples.  Together the batches contain every
    sample exactly once, each sample stays paired with its label, and both
    arrays keep their original dtypes.  An empty dataset yields an empty list.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be positive")
    x = np.asarray(x)
    y = np.asarray(y)
    n_samples = x.shape[0]
    if n_samples == 0:
        return []
    # Shuffle indices rather than a stacked copy of the data, so labels are not
    # cast to the feature dtype.
    order = np.random.permutation(n_samples)
    # True division before the ceiling: rounds the batch count *up*, so no batch
    # exceeds batch_size and batch_size > n_samples still gives one batch.
    n_minibatches = math.ceil(n_samples / batch_size)
    return [(x[idx], y[idx]) for idx in np.array_split(order, n_minibatches)]

class ImprovedGradientDescent:
    """Mini-batch gradient descent with momentum and accuracy-based early stopping.

    Parameters
    ----------
    learning_rate : step size of each weight update.
    max_iters : bound on the epoch counter; the counter starts at 1 and
        training continues only while it is below ``max_iters``.
    epsilon : training also stops once the norm of the last (momentum-smoothed)
        gradient is no larger than this value.
    record_history : when true, the weights after every epoch are appended to
        ``self.w_history``.
    momentum : weight given to the previous smoothed gradient; the fresh
        mini-batch gradient gets ``1 - momentum``.
    batch_size : passed to ``create_mini_batches`` each epoch.
    patience : number of consecutive epochs without a new best validation
        accuracy after which training stops early.
    """

    def __init__(self, learning_rate=0.1, max_iters=1e4, epsilon=1e-8, record_history=False, momentum=0.5, batch_size=20, patience=20):
        self.learning_rate = learning_rate
        self.max_iters = max_iters
        self.record_history = record_history
        self.epsilon = epsilon
        self.momentum = momentum
        self.batch_size = batch_size
        self.patience = patience
        if record_history:
            self.w_history = []                         #to store the weight history for visualization

    def run(self, gradient_fn, x, y, w, x_vali=None, y_vali=None):
        """Optimise ``w`` and return the weights with the best validation accuracy.

        Parameters
        ----------
        gradient_fn : callable ``gradient_fn(x_mini, y_mini, w)`` returning the
            gradient with respect to ``w`` on one mini-batch.
        x, y : training data, re-shuffled into mini-batches every epoch.
        w : initial weights; never modified in place.
        x_vali, y_vali : validation data used for early stopping; each falls
            back to the corresponding training array when omitted.
            ``np.dot(x_vali, w)`` is evaluated directly, so ``x_vali`` must be
            laid out like ``x``.

        Returns
        -------
        The weights that achieved the highest validation accuracy, or the
        initial ``w`` if no epoch scored above zero.
        """
        if x_vali is None:
            x_vali = x
        if y_vali is None:
            y_vali = y
        grad = np.inf
        grad_old = 0
        t = 1
        accuracy_best = 0
        num_worse = 0
        w_best = w
        while np.linalg.norm(grad) > self.epsilon and t < self.max_iters:
            for x_mini, y_mini in create_mini_batches(x, y, self.batch_size):
                # blend the previous smoothed gradient with the current mini-batch gradient
                grad = self.momentum*grad_old + (1-self.momentum)*gradient_fn(x_mini, y_mini, w)
                w = w - self.learning_rate * grad       # weight update step (creates a new array)
                grad_old = grad                         # the current gradient is the old gradient for the next iteration
            if self.record_history:
                self.w_history.append(w)
            accuracy = (to_classlabel(softmax(np.dot(x_vali, w))) == y_vali).astype(float).mean()
            if accuracy > accuracy_best:
                num_worse = 0
                accuracy_best = accuracy
                w_best = w
            else:
                num_worse += 1
            if num_worse >= self.patience:              # early stopping
                break
            t += 1
        return w_best

## Performance Analysis 1: Iris dataset

In [5]:
from sklearn import datasets
import time

In [6]:
iris = datasets.load_iris()

# Attach the targets to the feature matrix as a final column so that a single
# shuffle keeps every sample paired with its label.
iris_data = np.column_stack((iris['data'], iris['target']))
np.random.shuffle(iris_data)

# The first three quarters of the shuffled rows are used for training and the
# remainder for testing; the last column holds the label in both parts.
n_train = len(iris_data) * 3 // 4
train, test = iris_data[:n_train], iris_data[n_train:]

train_x, train_y = train[:, :-1], train[:, -1].astype(int)
test_x, test_y = test[:, :-1], test[:, -1].astype(int)

In [7]:
# Time the construction of the optimizer and the fit call together, then
# measure accuracy on the held-out test split.
start = time.time()
optimizer_iris_GD = ImprovedGradientDescent(learning_rate=0.01, momentum=0.9, batch_size=1)
regressor_iris_GD = SoftmaxRegression().fit(train_x, train_y, optimizer_iris_GD)
stop = time.time()
train_time = stop - start

test_yh = regressor_iris_GD.predict(test_x)
test_accuracy = np.mean(to_classlabel(test_yh) == test_y)   # fraction of correctly labelled test samples
print("The training time is", train_time)
print("The test accuracy is", test_accuracy)

The training time is 0.1765596866607666
The test accuracy is 0.9473684210526315
