# Loss Functions

This notebook implements several loss functions in Python. Each function is written twice: once with explicit for loops, which is inefficient but easier to follow, and once in vectorized form for actual use. The two versions are then checked for equivalence and compared in running time. Finally, the numerically estimated gradient is compared against the analytically computed one.

The four loss functions are:
* Logistic Loss
* Hinge Loss/SVMs
* Simple Two Layer Function
* Least Squares Loss

In [1]:
import random
import numpy as np

# Seed the generator so that Restart & Run All regenerates the same data
# (and therefore the same losses and gradient differences) on every run.
SEED = 0
random.seed(SEED)

# n = number of instances
n = 100 # Change to 10000 to verify scalability

# m = number of features
m = 1000 # Keep at 1000 (or larger) when checking numerical stability

# X = random dataset: n rows of m features, each an integer in [-100, 100]
# plus a fractional part in [0, 1)
X = [[random.randint(-100, 100) + random.random() for _ in range(m)] for _ in range(n)]

# y = random n-dimensional vector of -1/+1 labels
y = [-1 if(random.randint(0,1)==0) else 1 for _ in range(n)]
# y_cont = random n-dimensional vector of continuous targets in [0, 1)
y_cont = [random.random() for _ in range(n)]

# w = random m-dimensional vector with entries in [-1, 2)
w = [random.randint(-1, 1) + random.random() for _ in range(m)]

# Define the numerical gradient function:
def numericalGrad(funObj, w, epsilon):
    """Estimate the gradient of ``funObj`` at ``w`` with central differences.

    Parameters
    ----------
    funObj : callable
        Maps a 1-D parameter array to a scalar value.
    w : sequence of numbers
        Point at which the gradient is estimated. It is not modified.
    epsilon : float
        Step added to / subtracted from one coordinate at a time.

    Returns
    -------
    numpy.ndarray
        Array of ``len(w)`` floats; entry ``i`` is
        ``(funObj(w + epsilon * e_i) - funObj(w - epsilon * e_i)) / (2 * epsilon)``.
    """
    # Work on a float copy: copying an integer-valued ``w`` as-is yields an
    # integer array, and storing ``w[i] +/- epsilon`` into it would silently
    # truncate the perturbation and corrupt the estimate.
    base = np.array(w, dtype=float)
    num_params = len(base)
    grad = np.zeros(num_params)
    for i in range(num_params):
        wp = base.copy()
        wn = base.copy()
        wp[i] = base[i] + epsilon
        wn[i] = base[i] - epsilon
        grad[i] = (funObj(wp) - funObj(wn)) / (2 * epsilon)
    return grad

In [2]:
# Logistic Loss

import numpy as np
import statistics as stats
import time

# For loop based loss function:
def LogisticLossFun_ForLoop(w, X, y, lam):
    """L2-regularized logistic loss and gradient, one sample at a time.

    Computes ``f = lam * sum_j w[j]**2 + sum_i log(1 + exp(-y[i] * w.X[i]))``
    and its gradient with respect to ``w``.

    Parameters
    ----------
    w : sequence of float
        Weight vector.
    X : sequence of sequences of float
        One row of ``len(w)`` features per sample.
    y : sequence of numbers
        One label per row of ``X`` (the notebook uses -1/+1).
    lam : float
        Regularization strength.

    Returns
    -------
    list
        ``[f, g]`` with ``f`` the scalar loss and ``g`` a numpy array of
        length ``len(w)``.
    """
    # Sizes come from the arguments rather than from notebook-level globals,
    # so the function also works on data other than the global X and w.
    num_features = len(w)
    num_samples = len(X)

    reg_loss = lam * sum(w[j] ** 2 for j in range(num_features))
    reg_grad = np.array([2 * lam * w[j] for j in range(num_features)])

    data_loss = 0
    data_grad = np.zeros(num_features)
    for i in range(num_samples):
        # Compute the margin once per sample instead of once per use.
        margin = y[i] * np.dot(w, X[i])
        if margin > 0:
            # exp(-margin) lies in (0, 1], so it cannot overflow.
            decay = np.exp(-margin)
            data_loss += np.log1p(decay)
            # 1 / (1 + exp(margin)) rewritten in terms of exp(-margin).
            weight = decay / (1 + decay)
        else:
            # exp(margin) lies in (0, 1] because margin <= 0.
            growth = np.exp(margin)
            data_loss += np.log1p(growth) - margin
            weight = 1 / (1 + growth)
        data_grad = data_grad + (-y[i] * weight) * np.array(X[i])

    return [reg_loss + data_loss, reg_grad + data_grad]

# Vectorized version of loss function:
def LogisticLossFun(w, X, y, lam):
    """Vectorized L2-regularized logistic loss and gradient.

    Computes ``f = lam * sum(w**2) + sum_i log(1 + exp(-y[i] * w.X[i]))``
    and its gradient with respect to ``w``.

    Parameters
    ----------
    w : array-like of float, length m
        Weight vector.
    X : array-like of float, n rows of m features
        Data matrix.
    y : array-like of numbers, length n
        One label per row of ``X`` (the notebook uses -1/+1).
    lam : float
        Regularization strength.

    Returns
    -------
    list
        ``[f, g]`` with ``f`` the scalar loss and ``g`` a numpy array of
        length m.
    """
    w_arr = np.asarray(w, dtype=float)
    y_arr = np.asarray(y, dtype=float)
    margins = y_arr * np.dot(X, w_arr)

    # Only exp(-|margin|) is ever evaluated. It lies in (0, 1], so no term can
    # overflow however large the margins are.
    decay = np.exp(-np.abs(margins))

    # log(1 + exp(-z)) == log1p(exp(-|z|)) + max(-z, 0) for every real z.
    logexp = np.log1p(decay) + np.maximum(-margins, 0)

    # 1 / (1 + exp(z)), written with exp(-|z|) for both signs of z.
    weight = np.where(margins > 0, decay / (1 + decay), 1 / (1 + decay))

    # np.sum reduces inside numpy instead of looping over elements in Python.
    f = lam * np.sum(np.square(w_arr)) + np.sum(logexp)
    g = 2 * lam * w_arr + np.dot(-y_arr * weight, X)

    return [f, g]

# Made sure it is numerically stable and scalable -> Change m and n above


# Compare the for loop based loss function with the vectorized version:
# Time both implementations on the same inputs. perf_counter() is a monotonic,
# high-resolution clock intended for measuring short durations; time() reports
# wall-clock time and can be too coarse for sections this short.
t0 = time.perf_counter()
forLoop = LogisticLossFun_ForLoop(w, X, y, 1)
t1 = time.perf_counter()
vect = LogisticLossFun(w, X, y, 1)
t2 = time.perf_counter()

# Absolute difference of the losses and largest absolute difference of any
# gradient component between the two implementations.
fDiff = abs(forLoop[0] - vect[0])
gMaxDiff = max(abs(forLoop[1] - vect[1]))

#    Verify that the for loop based and vectorized loss functions produce the same results:
print("For loop and vectorized loss function comparisons:")
print("    For loop version took", t1-t0, "time.")
print("    Vectorized version took", t2-t1, "time.")

if fDiff == 0: print("    Loss functions are equal.")
elif fDiff < 1e-5: print("    Loss functions are very close to equal. Difference=", fDiff)
else: print("    Loss functions are NOT equal. Difference=", fDiff)
    
if gMaxDiff == 0: print("    Gradient functions are equal.")
elif gMaxDiff < 1e-5: print("    Gradient functions are very close to equal. Max difference=", gMaxDiff)
else: print("    Gradient functions are NOT equal. Max difference=", gMaxDiff)


# Numerically compute the gradient:
#    Define the function object (loss only, as a function of w):
funObj = lambda w: LogisticLossFun(w, X, y, 1)[0]

#    Verify that numerical and calculated gradients are the same (or very close):
diff = abs(numericalGrad(funObj, w, 0.00001) - vect[1])
print("\n\nNumerical and calculated gradient comparisons:")
print("    Largest difference between gradients =", max(diff))
print("    Smallest difference between gradients =", min(diff))
print("    Average difference between gradients =", stats.mean(diff))
print("    Median difference between gradients =", stats.median(diff))

For loop and vectorized loss function comparisons:
    For loop version took 0.07178616523742676 time.
    Vectorized version took 0.015626907348632812 time.
    Loss functions are very close to equal. Difference= 1.4551915228366852e-11
    Gradient functions are very close to equal. Max difference= 2.2737367544323206e-13


Numerical and calculated gradient comparisons:
    Largest difference between gradients = 3.9140019012506855e-06
    Smallest difference between gradients = 8.060823120104033e-10
    Average difference between gradients = 9.06342876058952e-07
    Median difference between gradients = 7.408490390048428e-07


In [3]:
# Hinge Loss/SVMs

import numpy as np
import statistics as stats
import time

# For loop based loss function:
def HingeLossFun_ForLoop(w, X, y, lam):
    """L2-regularized hinge loss and a subgradient, one sample at a time.

    Computes ``f = lam * sum_j w[j]**2 + sum_i max(0, 1 - y[i] * w.X[i])``.
    A sample contributes ``-y[i] * X[i]`` to the gradient when its margin
    ``y[i] * w.X[i]`` is below 1 and nothing otherwise.

    Parameters
    ----------
    w : sequence of float
        Weight vector.
    X : sequence of sequences of float
        One row of ``len(w)`` features per sample.
    y : sequence of numbers
        One label per row of ``X`` (the notebook uses -1/+1).
    lam : float
        Regularization strength.

    Returns
    -------
    list
        ``[f, g]`` with ``f`` the scalar loss and ``g`` a numpy array of
        length ``len(w)``.
    """
    # Sizes come from the arguments rather than from notebook-level globals.
    num_features = len(w)
    num_samples = len(X)

    reg_loss = lam * sum(w[j] ** 2 for j in range(num_features))
    reg_grad = np.array([2 * lam * w[j] for j in range(num_features)])

    data_loss = 0
    data_grad = np.zeros(num_features)
    for i in range(num_samples):
        # One dot product per sample, shared by the loss and the gradient.
        margin = y[i] * np.dot(w, X[i])
        data_loss += max(0, 1 - margin)
        if margin < 1:
            data_grad = data_grad + (-y[i] * np.array(X[i]))

    return [reg_loss + data_loss, reg_grad + data_grad]

# Vectorized version of loss function:
def HingeLossFun(w, X, y, lam):
    """Vectorized L2-regularized hinge loss and a subgradient.

    Computes ``f = lam * sum(w**2) + sum_i max(0, 1 - y[i] * w.X[i])``.
    Samples whose margin ``y[i] * w.X[i]`` is at least 1 contribute nothing
    to the gradient; every other sample contributes ``-y[i] * X[i]``.

    Parameters
    ----------
    w : array-like of float, length m
        Weight vector.
    X : array-like of float, n rows of m features
        Data matrix.
    y : array-like of numbers, length n
        One label per row of ``X`` (the notebook uses -1/+1).
    lam : float
        Regularization strength.

    Returns
    -------
    list
        ``[f, g]`` with ``f`` the scalar loss and ``g`` a numpy array of
        length m.
    """
    w_arr = np.asarray(w)
    margins = np.multiply(y, np.dot(X, w_arr))

    # Zero the labels of samples that already satisfy the margin so that they
    # drop out of the gradient sum below.
    active_y = np.where(margins >= 1, 0, y)

    # np.sum reduces inside numpy instead of looping over elements in Python.
    f = lam * np.sum(np.square(w_arr)) + np.sum(np.maximum(1 - margins, 0))
    g = 2 * lam * w_arr + np.dot(-1 * active_y, X)

    return [f, g]

# Made sure it is numerically stable and scalable -> Change m and n above


# Compare the for loop based loss function with the vectorized version:
# Time both implementations on the same inputs. perf_counter() is a monotonic,
# high-resolution clock intended for measuring short durations; time() reports
# wall-clock time and can be too coarse for sections this short.
t0 = time.perf_counter()
forLoop = HingeLossFun_ForLoop(w, X, y, 1)
t1 = time.perf_counter()
vect = HingeLossFun(w, X, y, 1)
t2 = time.perf_counter()

# Absolute difference of the losses and largest absolute difference of any
# gradient component between the two implementations.
fDiff = abs(forLoop[0] - vect[0])
gMaxDiff = max(abs(forLoop[1] - vect[1]))

#    Verify that the for loop based and vectorized loss functions produce the same results:
print("For loop and vectorized loss function comparisons:")
print("    For loop version took", t1-t0, "time.")
print("    Vectorized version took", t2-t1, "time.")

if fDiff == 0: print("    Loss functions are equal.")
elif fDiff < 1e-5: print("    Loss functions are very close to equal. Difference=", fDiff)
else: print("    Loss functions are NOT equal. Difference=", fDiff)
    
if gMaxDiff == 0: print("    Gradient functions are equal.")
elif gMaxDiff < 1e-5: print("    Gradient functions are very close to equal. Max difference=", gMaxDiff)
else: print("    Gradient functions are NOT equal. Max difference=", gMaxDiff)


# Numerically compute the gradient:
#    Define the function object (loss only, as a function of w):
funObj = lambda w: HingeLossFun(w, X, y, 1)[0]

#    Verify that numerical and calculated gradients are the same (or very close):
diff = abs(numericalGrad(funObj, w, 0.00001) - vect[1])
print("\n\nNumerical and calculated gradient comparisons:")
print("    Largest difference between gradients =", max(diff))
print("    Smallest difference between gradients =", min(diff))
print("    Average difference between gradients =", stats.mean(diff))
print("    Median difference between gradients =", stats.median(diff))

For loop and vectorized loss function comparisons:
    For loop version took 0.035523414611816406 time.
    Vectorized version took 0.02402472496032715 time.
    Loss functions are very close to equal. Difference= 1.4551915228366852e-11
    Gradient functions are equal.


Numerical and calculated gradient comparisons:
    Largest difference between gradients = 3.871345029438089e-06
    Smallest difference between gradients = 4.062201242049923e-09
    Average difference between gradients = 8.916976538790936e-07
    Median difference between gradients = 7.306956746333526e-07


In [4]:
# Simple Two Layer Function

import numpy as np
import statistics as stats
import time

# For loop based loss function:
def Simple2LayerLossFun_ForLoop(w, X, y, lam):
    """L2-regularized squared error through ``max(0, .)``, one sample at a time.

    Computes ``f = lam * sum_j w[j]**2 + sum_i (y[i] - max(0, w.X[i]))**2``.
    A sample contributes ``(2 * w.X[i] - 2 * y[i]) * X[i]`` to the gradient
    when ``w.X[i] > 0`` and nothing otherwise.

    Parameters
    ----------
    w : sequence of float
        Weight vector.
    X : sequence of sequences of float
        One row of ``len(w)`` features per sample.
    y : sequence of float
        One target per row of ``X``.
    lam : float
        Regularization strength.

    Returns
    -------
    list
        ``[f, g]`` with ``f`` the scalar loss and ``g`` a numpy array of
        length ``len(w)``.
    """
    # Sizes come from the arguments rather than from notebook-level globals.
    num_features = len(w)
    num_samples = len(X)

    reg_loss = lam * sum(w[j] ** 2 for j in range(num_features))
    reg_grad = np.array([2 * lam * w[j] for j in range(num_features)])

    data_loss = 0
    data_grad = np.zeros(num_features)
    for i in range(num_samples):
        # One dot product per sample, shared by the loss and the gradient.
        activation = np.dot(w, X[i])
        data_loss += (y[i] - max(0, activation)) ** 2
        if activation > 0:
            data_grad = data_grad + (-2 * y[i] + 2 * activation) * np.array(X[i])

    return [reg_loss + data_loss, reg_grad + data_grad]

# Vectorized version of loss function:
def Simple2LayerLossFun(w, X, y, lam):
    """Vectorized L2-regularized squared error through ``max(0, .)``.

    Computes ``f = lam * sum(w**2) + sum_i (y[i] - max(0, w.X[i]))**2``.
    Samples with ``w.X[i] <= 0`` contribute nothing to the gradient; every
    other sample contributes ``(2 * w.X[i] - 2 * y[i]) * X[i]``.

    Parameters
    ----------
    w : array-like of float, length m
        Weight vector.
    X : array-like of float, n rows of m features
        Data matrix.
    y : array-like of float, length n
        One target per row of ``X``.
    lam : float
        Regularization strength.

    Returns
    -------
    list
        ``[f, g]`` with ``f`` the scalar loss and ``g`` a numpy array of
        length m.
    """
    w_arr = np.asarray(w)
    y_arr = np.asarray(y)  # converted once and reused for loss and gradient
    activations = np.dot(X, w_arr)

    # Per-sample gradient factor, zeroed wherever max(0, .) clamps to zero.
    factor = np.where(activations <= 0, 0, (-2 * y_arr) + (2 * activations))

    # np.sum reduces inside numpy instead of looping over elements in Python.
    f = lam * np.sum(np.square(w_arr)) + np.sum(np.square(y_arr - np.maximum(0, activations)))
    g = 2 * lam * w_arr + np.dot(factor, X)

    return [f, g]

# Made sure it is numerically stable and scalable -> Change m and n above


# Compare the for loop based loss function with the vectorized version:
# Time both implementations on the same inputs. perf_counter() is a monotonic,
# high-resolution clock intended for measuring short durations; time() reports
# wall-clock time and can be too coarse for sections this short.
t0 = time.perf_counter()
forLoop = Simple2LayerLossFun_ForLoop(w, X, y, 1)
t1 = time.perf_counter()
vect = Simple2LayerLossFun(w, X, y, 1)
t2 = time.perf_counter()

# Absolute difference of the losses and largest absolute difference of any
# gradient component between the two implementations.
fDiff = abs(forLoop[0] - vect[0])
gMaxDiff = max(abs(forLoop[1] - vect[1]))

#    Verify that the for loop based and vectorized loss functions produce the same results:
print("For loop and vectorized loss function comparisons:")
print("    For loop version took", t1-t0, "time.")
print("    Vectorized version took", t2-t1, "time.")

if fDiff == 0: print("    Loss functions are equal.")
elif fDiff < 1e-5: print("    Loss functions are very close to equal. Difference=", fDiff)
else: print("    Loss functions are NOT equal. Difference=", fDiff)
    
if gMaxDiff == 0: print("    Gradient functions are equal.")
elif gMaxDiff < 1e-5: print("    Gradient functions are very close to equal. Max difference=", gMaxDiff)
else: print("    Gradient functions are NOT equal. Max difference=", gMaxDiff)


# Numerically compute the gradient:
#    Define the function object (loss only, as a function of w):
funObj = lambda w: Simple2LayerLossFun(w, X, y, 1)[0]

#    Verify that numerical and calculated gradients are the same (or very close):
diff = abs(numericalGrad(funObj, w, 0.0001) - vect[1])
print("\n\nNumerical and calculated gradient comparisons:")
print("    Largest difference between gradients =", max(diff))
print("    Smallest difference between gradients =", min(diff))
print("    Average difference between gradients =", stats.mean(diff))
print("    Median difference between gradients =", stats.median(diff))

For loop and vectorized loss function comparisons:
    For loop version took 0.047086477279663086 time.
    Vectorized version took 0.024002790451049805 time.
    Loss functions are very close to equal. Difference= 5.960464477539063e-08
    Gradient functions are very close to equal. Max difference= 4.423782229423523e-09


Numerical and calculated gradient comparisons:
    Largest difference between gradients = 0.0015259151114150882
    Smallest difference between gradients = 4.791654646396637e-07
    Average difference between gradients = 0.00036161213599888244
    Median difference between gradients = 0.00030101288575679064


In [5]:
# Least Squares Loss

import numpy as np
import statistics as stats
import time

# For loop based loss function:
def LeastSquaresLossFun_ForLoop(w, X, y, lam):
    """L2-regularized least-squares loss and gradient, one sample at a time.

    Computes ``f = lam * sum_j w[j]**2 + sum_i (y[i] - w.X[i])**2`` and its
    gradient ``2 * lam * w + sum_i (2 * w.X[i] - 2 * y[i]) * X[i]``.

    Parameters
    ----------
    w : sequence of float
        Weight vector.
    X : sequence of sequences of float
        One row of ``len(w)`` features per sample.
    y : sequence of float
        One target per row of ``X``.
    lam : float
        Regularization strength.

    Returns
    -------
    list
        ``[f, g]`` with ``f`` the scalar loss and ``g`` a numpy array of
        length ``len(w)``.
    """
    # Sizes come from the arguments rather than from notebook-level globals.
    num_features = len(w)
    num_samples = len(X)

    reg_loss = lam * sum(w[j] ** 2 for j in range(num_features))
    reg_grad = np.array([2 * lam * w[j] for j in range(num_features)])

    data_loss = 0
    data_grad = np.zeros(num_features)
    for i in range(num_samples):
        # One dot product per sample, shared by the loss and the gradient.
        prediction = np.dot(w, X[i])
        data_loss += (y[i] - prediction) ** 2
        data_grad = data_grad + (-2 * y[i] + 2 * prediction) * np.array(X[i])

    return [reg_loss + data_loss, reg_grad + data_grad]

# Vectorized version of loss function:
def LeastSquaresLossFun(w, X, y, lam):
    """Vectorized L2-regularized least-squares loss and gradient.

    Computes ``f = lam * sum(w**2) + sum_i (y[i] - w.X[i])**2`` and its
    gradient ``2 * lam * w + sum_i (2 * w.X[i] - 2 * y[i]) * X[i]``.

    Parameters
    ----------
    w : array-like of float, length m
        Weight vector.
    X : array-like of float, n rows of m features
        Data matrix.
    y : array-like of float, length n
        One target per row of ``X``.
    lam : float
        Regularization strength.

    Returns
    -------
    list
        ``[f, g]`` with ``f`` the scalar loss and ``g`` a numpy array of
        length m.
    """
    w_arr = np.asarray(w)
    y_arr = np.asarray(y)  # converted once and reused for loss and gradient
    predictions = np.dot(X, w_arr)

    # Per-sample gradient factor: derivative of (y - p)**2 with respect to p.
    factor = (-2 * y_arr) + (2 * predictions)

    # np.sum reduces inside numpy instead of looping over elements in Python.
    f = lam * np.sum(np.square(w_arr)) + np.sum(np.square(y_arr - predictions))
    g = 2 * lam * w_arr + np.dot(factor, X)

    return [f, g]

# Made sure it is numerically stable and scalable -> Change m and n above


# Compare the for loop based loss function with the vectorized version:
# Time both implementations on the same inputs. perf_counter() is a monotonic,
# high-resolution clock intended for measuring short durations; time() reports
# wall-clock time and can be too coarse for sections this short.
t0 = time.perf_counter()
forLoop = LeastSquaresLossFun_ForLoop(w, X, y_cont, 1)
t1 = time.perf_counter()
vect = LeastSquaresLossFun(w, X, y_cont, 1)
t2 = time.perf_counter()

# Absolute difference of the losses and largest absolute difference of any
# gradient component between the two implementations.
fDiff = abs(forLoop[0] - vect[0])
gMaxDiff = max(abs(forLoop[1] - vect[1]))

#    Verify that the for loop based and vectorized loss functions produce the same results:
print("For loop and vectorized loss function comparisons:")
print("    For loop version took", t1-t0, "time.")
print("    Vectorized version took", t2-t1, "time.")

if fDiff == 0: print("    Loss functions are equal.")
elif fDiff < 1e-5: print("    Loss functions are very close to equal. Difference=", fDiff)
else: print("    Loss functions are NOT equal. Difference=", fDiff)
    
if gMaxDiff == 0: print("    Gradient functions are equal.")
elif gMaxDiff < 1e-5: print("    Gradient functions are very close to equal. Max difference=", gMaxDiff)
else: print("    Gradient functions are NOT equal. Max difference=", gMaxDiff)


# Numerically compute the gradient:
#    Define the function object (loss only, as a function of w):
funObj = lambda w: LeastSquaresLossFun(w, X, y_cont, 1)[0]

#    Verify that numerical and calculated gradients are the same (or very close):
diff = abs(numericalGrad(funObj, w, 0.1) - vect[1])
print("\n\nNumerical and calculated gradient comparisons:")
print("    Largest difference between gradients =", max(diff))
print("    Smallest difference between gradients =", min(diff))
print("    Average difference between gradients =", stats.mean(diff))
print("    Median difference between gradients =", stats.median(diff))

For loop and vectorized loss function comparisons:
    For loop version took 0.03876328468322754 time.
    Vectorized version took 0.016489267349243164 time.
    Loss functions are very close to equal. Difference= 2.384185791015625e-07
    Gradient functions are very close to equal. Max difference= 7.450580596923828e-09


Numerical and calculated gradient comparisons:
    Largest difference between gradients = 2.982094883918762e-06
    Smallest difference between gradients = 1.1641532182693481e-09
    Average difference between gradients = 6.7574422701e-07
    Median difference between gradients = 5.860347300767899e-07
