In [13]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [16]:
from common.gradients import check_gradient, numerical_gradient

In [17]:


def SoftMaxRegression_implementation(ThetaFlat, X, Y=None, return_probabilities=False, use_numerical_gradient=True):
    """
    Compute the outputs of a softmax classifier, or its loss and gradient.

    Parameters
    ----------
    ThetaFlat :
        flat array of parameters containing (n_features*n_classes) entries
    X :
        array of features, shape n_features x n_samples
    Y :
        optional array of integer class indices with n_samples entries in
        total (e.g. shape 1 x n_samples); it is flattened before use and is
        required unless return_probabilities is True
    return_probabilities :
        if True, the probabilities are returned and Y is not used
        if False, the loss and gradient are computed on the X,Y pairs
    use_numerical_gradient :
        if True (the default), the gradient is obtained by calling
        numerical_gradient on this function's loss; if False, the
        closed-form gradient is returned

    Returns
    -------
    O :
        (only when return_probabilities is True) n_classes x n_samples array
        of class probabilities; every column sums to 1
    (L, dLdTheta) :
        (otherwise) the average per-sample negative log-likelihood and its
        gradient with respect to ThetaFlat, reshaped to ThetaFlat.shape

    Raises
    ------
    ValueError
        if Y is None while return_probabilities is False
    """
    # X is num_features x num_samples
    num_features, num_samples = X.shape

    # Theta is num_features x num_classes; recover it from the flat vector
    Theta = ThetaFlat.reshape(num_features, -1)

    # Activations of the softmax neurons, num_classes x num_samples
    A = Theta.T.dot(X)

    # Softmax over the class axis. Subtracting each sample's maximum
    # activation leaves the result unchanged but keeps exp() from overflowing.
    E = np.exp(A - A.max(axis=0, keepdims=True))
    O = E / E.sum(axis=0, keepdims=True)
    # Sanity check: each column must be a probability distribution
    # (this also trips on NaN activations)
    assert np.allclose(O.sum(axis=0), 1.0, rtol=0.0, atol=1e-5)

    if return_probabilities:
        return O

    if Y is None:
        raise ValueError("Y is required when return_probabilities is False")

    # The loss is the average per-sample NLL (negative log-likelihood),
    # i.e. minus the mean log-probability assigned to each sample's true class
    Yr = np.asarray(Y).ravel()
    sample_idx = np.arange(num_samples)
    correct_class_likelihoods = np.log(O[Yr, sample_idx])
    L = -1.0 / num_samples * np.sum(correct_class_likelihoods)

    if use_numerical_gradient:
        # Differentiate the loss numerically; the inner call disables this
        # branch so the recursion stops after one level.
        dLdTheta = numerical_gradient(
            lambda T: SoftMaxRegression_implementation(T, X, Y, use_numerical_gradient=False),
            ThetaFlat)
    else:
        # For softmax + cross-entropy, dNLL/dA = O - onehot(Y). Because L is
        # the *average* over samples, the derivative carries the same
        # 1/num_samples factor.
        dLdA = O.copy()
        dLdA[Yr, sample_idx] -= 1.0  # each (class, sample) pair occurs once
        dLdA /= num_samples

        # Chain rule through A = Theta.T.dot(X)
        dLdTheta = np.dot(X, dLdA.T)

    # Reshape the gradient to the shape of ThetaFlat, as flat-vector
    # optimizers such as fmin_l_bfgs_b expect
    return L, dLdTheta.reshape(ThetaFlat.shape)

# Cost function used for training on the iris data
def iris_log_reg_cost(Theta):
    """Evaluate SoftMaxRegression_implementation for Theta on (IrisXFull, IrisY).

    The positional False is return_probabilities, so the function's non-probability
    result is returned. use_numerical_gradient is not passed and keeps its default.
    """
    return SoftMaxRegression_implementation(Theta, IrisXFull, IrisY, False)

# Verify the gradient computation, first at the all-zero parameter vector ...
check_gradient(iris_log_reg_cost, np.zeros(3 * 5))
# ... then at a random parameter vector with entries drawn from [-1, 1)
check_gradient(iris_log_reg_cost, 2.0 * np.random.rand(3 * 5) - 1.0)



NameError: global name 'IrisXFull' is not defined