In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import gridspec
from scipy.stats  import norm, multivariate_normal
from scipy.optimize import minimize

## Cross entropy

The Negative Logarithmic Likelihood loss

$$-\sum_{i}\sum_{j} l_{ij} \log \tilde{P}(C_j|x_i) $$

is, up to a normalisation factor $1/N$ (with $N$ the number of samples), a Monte Carlo approximation of the quantity

$$ -\int \text{d}X  P(X) \sum_j P(C_j|X)\log \tilde{P}(C_j|X) $$ 

The expression

$$-\sum_j P(C_j|X)\log \tilde{P}(C_j|X)$$

is the [_cross entropy_](https://en.wikipedia.org/wiki/Cross_entropy) between two distributions: $P(C_j|X)$, the true distribution of categories $C_j$ given features $X$, and the approximating distribution $\tilde{P}(C_j|X)$. 

Cross entropy between two discrete distributions $p$ and $q$ is defined as minus the expectation value of $\log q$ with respect to $p$:

$$H(p,q)\equiv -E[\log q]_p =  -\sum p_i \log q_i$$ 

One can show that, for a fixed $p$, this quantity as a function of $q$ is minimised if and only if $q_i=p_i$. Cross entropy has several properties that make it a suitable loss function for working with neural networks. We will explore this below. 

## Toy example

Let's consider a simple binary case with two categories 0 and 1. Let the true probability of 1 be $p$ and approximated probability be denoted $\tilde{p}$. We can look for $\tilde{p}$ by minimizing the _binary cross entropy_: 

$$ \operatorname{BCE}(\tilde p, p) = - p\log \tilde{p} - (1-p) \log (1-\tilde{p})$$

__Problem__ 

Show that this function as a function of $\tilde{p}$ does have a minimum when $\tilde{p}=p$. 

We will consider a simple case when the probability $\tilde p$ is given by a simple logistic function of some parameter $x$:

$$\tilde{p} = \frac{1}{1+e^{-x}}, \quad 1-\tilde{p} = \frac{1}{1+e^x}$$

Then cross entropy is given by:

$$p \log (1+e^{-x}) +  (1-p) \log (1+e^x)$$

and its  derivative by:

$$\frac{\text{d}}{\text{d}x}BCE(\tilde{p}(x),p)=-\frac{p}{1+e^{x}}+\frac{1-p}{1+e^{-x}} = \frac{1}{1+e^{-x}}-p$$

For the MSE error this is respectively

$$MSE(x,p) = \frac{1}{2}\left(\tilde{p}-p\right)^2 =  \frac{1}{2}\left(\frac{1}{1+e^{-x}}-p\right)^2$$

$$\frac{\text{d}}{\text{d}x}MSE(\tilde{p}(x),p)=\left(\frac{1}{1+e^{-x}}-p\right) \frac{e^{-x}}{\left(1+e^{-x}\right)^2}$$

Let's plot the error functions

In [None]:
def bce(pt,p):
    """Binary cross entropy of the estimate ``pt`` against the true probability ``p``.

    Evaluates ``-p*log(pt) - (1-p)*log(1-pt)`` with NumPy, so both arguments
    may be scalars or arrays that broadcast against each other.
    """
    log_hit = np.log(pt)
    log_miss = np.log(1 - pt)
    return -p * log_hit + -(1 - p) * log_miss

def mse(pt,p):
    """Half of the squared difference between ``p`` and its estimate ``pt``.

    Works elementwise on scalars or NumPy arrays.
    """
    residual = p - pt
    return 0.5 * residual * residual

def logistic(x):
    """Logistic function ``1 / (1 + exp(-x))``, evaluated elementwise."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def logit(p):
    """Log-odds ``log(p / (1 - p))``, evaluated elementwise."""
    odds = p / (1 - p)
    return np.log(odds)

In [None]:
# Plot both losses as functions of the logistic parameter x for a fixed
# true probability p.
xs = np.linspace(-10, 10, 400)
p = 0.9

fig, ax = plt.subplots(1, 2, figsize=(16, 8))

# Both panels get the same decorations: a black zero line and a red vertical
# line at x = logit(p); only the title and the loss function differ.
for panel, (panel_title, loss_fn) in zip(ax, (('Cross entropy', bce), ('MSE', mse))):
    panel.set_title(panel_title)
    panel.plot(xs, loss_fn(logistic(xs), p))
    panel.axhline(0, c='black')
    panel.axvline(logit(p), c='red')

What we can see from these plots is that the cross entropy function, contrary to MSE, does not saturate for large positive and negative values of the parameter $x$. Actually, its behaviour is asymptotically linear. That means that it will have non-zero gradients there, while the MSE gradients will tend to zero. This is verified by the derivative plots below.

In [None]:
# Plot the negative derivatives (i.e. the gradient-descent directions) of
# both losses with respect to x, using the same xs and p as the loss plots.
fig, ax = plt.subplots(1,2,figsize=(16,8))

# Logistic estimate of the probability at every x; shared by both curves.
pt = logistic(xs)

# Cross entropy: d/dx BCE = pt - p, so the plotted curve is -(pt - p).
ys = -(pt - p)
ax[0].set_title('Cross entropy')
ax[0].plot(xs, ys)
ax[0].axvline(logit(p), c='red');
ax[0].axhline(0, c='black');

# MSE: d/dx MSE = (pt - p) * pt * (1 - pt), where
# pt * (1 - pt) == exp(-x) / (1 + exp(-x))**2 is the slope of the logistic.
# Writing it through pt avoids evaluating exp(-x) a second time.
ax[1].set_title('MSE')
ax[1].plot(xs, ys * pt * (1 - pt))
ax[1].axvline(logit(p), c='red');
ax[1].axhline(0, c='black');


## Logistic regression

While this toy example is not really an example of machine learning, similar behaviour persists in more realistic scenarios. Consider the problem of separating two samples: 

In [None]:
# Draw 100 points for each class from two 2-D Gaussians centred at (7, 7)
# and (-7, -7); the 1-D `cov` gives the diagonal of the covariance matrix.
x1 = multivariate_normal(mean=(7, 7), cov=(1, 1)).rvs(size=100)
x2 = multivariate_normal(mean=(-7, -7), cov=(1, 1)).rvs(size=100)

In [None]:
# Stack both samples row-wise into one feature matrix (x1 rows first).
X = np.concatenate([x1, x2])

In [None]:
# Labels: the first 100 entries are 1, the remaining 100 are 0.
Y = np.zeros(200)
Y[:100] = 1.0

In [None]:
# Scatter plot of all points coloured by label: Y == 0 -> red, Y == 1 -> blue.
cols = np.array(['red', 'blue'])
fig, ax = plt.subplots(figsize=(8, 8))
# The `np.int` alias was removed in NumPy 1.24; the builtin `int` is the
# dtype it used to stand for.
ax.scatter(X[:, 0], X[:, 1], c=cols[Y.astype(int)]);

We will use logistic regression for this task:

$$\tilde{y}_i = \beta_0x_{i0} +\beta_1x_{i1}$$

$$ \tilde{p}_i = \frac{1}{1+e^{-\tilde{y}_i}}$$ 

In [None]:
def lin(x,b1,b2):
    """Linear score ``b1*x[:, 0] + b2*x[:, 1]`` for every sample and coefficient.

    ``b1`` and ``b2`` may be scalars or arrays of a common shape. The outer
    products put the sample axis first; it is then moved to the end, so the
    result has shape ``b1.shape + (len(x),)`` (just ``(len(x),)`` for scalars).
    """
    first_term = np.multiply.outer(x[:, 0], b1)
    second_term = np.multiply.outer(x[:, 1], b2)
    scores = first_term + second_term
    # Sample axis goes last so callers can reduce over it with axis=-1.
    return np.moveaxis(scores, 0, -1)

In [None]:
def logistic(x,b1,b2):
    """Logistic-regression probability for the samples ``x`` and weights ``b1``, ``b2``.

    Applies ``1 / (1 + exp(-z))`` to the linear scores ``z = lin(x, b1, b2)``,
    so the result has the same shape as those scores.
    """
    z = lin(x, b1, b2)
    return 1 / (1 + np.exp(-z))

#### Mean squared error

First we will plot the loss function with MSE error

$$\frac{1}{2N}\sum_{i=1}^{N} (\tilde{p}_i-l_i)^2$$

In [None]:
def mse(x, y, b1, b2):
    """Half of the mean squared error of the logistic model on ``(x, y)``.

    The squared residuals ``logistic(x, b1, b2) - y`` are summed over the
    last axis, halved and divided by ``len(x)``.
    """
    residual = logistic(x, b1, b2) - y
    n_samples = len(x)
    return 0.5 * np.sum(residual * residual, axis=-1) / n_samples

In [None]:
# Evaluate the MSE loss on a 500 x 500 grid of (b1, b2) values in [-2, 2]^2.
b1s = np.linspace(-2, 2, num=500)
b2s = np.linspace(-2, 2, num=500)
grid = np.meshgrid(b1s, b2s)
zs = mse(X, Y, *grid)

In [None]:
# Filled contour map of the MSE loss over the (b1, b2) grid; the narrow
# second grid column holds the colour bar.
fig = plt.figure(figsize=(9, 8))
gs = gridspec.GridSpec(nrows=1, ncols=2, width_ratios=[4, 0.2])
ax1, ax2 = plt.subplot(gs[0]), plt.subplot(gs[1])
cs = ax1.contourf(*grid, zs, levels=40)
# Dashed red line: the diagonal b1 == b2.
ax1.plot([-2, 2], [-2, 2], c='red', linewidth=1, linestyle='--')
fig.colorbar(cs, cax=ax2);

And here are the values along the diagonal (red) line. 

In [None]:
# MSE loss along the diagonal b1 = b2 = phi.
phis = np.linspace(-2, 2, num=500)
es = mse(X, Y, b1=phis, b2=phis)

In [None]:
# Minimise the MSE loss along the diagonal, starting from b1 = b2 = -2,
# and display the point the optimiser ends at.
minimize(lambda b: mse(X, Y, b, b), x0=[-2]).x

In [None]:
# MSE loss along the diagonal, with a green marker at phi = 1.
plt.plot(phis, es)
plt.axvline(x=1, color='green');

We can clearly see a plateau on both sides.

#### Cross entropy

And now we will plot the binary cross entropy loss

$$-\sum_i \left[ l_i \log \tilde{p}_i + (1-l_i)\log (1-\tilde{p}_i) \right]$$

In [None]:
def ce(x, y, b1, b2):
    """Binary cross entropy of the logistic model on ``(x, y)``, summed over samples.

    With linear scores ``z = lin(x, b1, b2)`` and ``p = 1 / (1 + exp(-z))``,
    the loss ``-sum(y*log(p) + (1-y)*log(1-p))`` equals
    ``sum(log(1 + exp(z)) - y*z)``. The second form is evaluated here because
    the first one yields ``0 * log(0) = nan`` as soon as ``p`` rounds to
    exactly 0 or 1 for a strongly classified sample.

    Parameters
    ----------
    x : array indexed as ``x[:, 0]`` and ``x[:, 1]`` (one row per sample).
    y : labels, one per sample, broadcast against the last axis of ``z``;
        the formula assumes values in ``[0, 1]``.
    b1, b2 : scalars or arrays of a common shape with the model weights.

    Returns
    -------
    The loss summed over the last (sample) axis: a scalar for scalar
    weights, otherwise an array shaped like ``b1``.
    """
    z = lin(x, b1, b2)
    # logaddexp(0, z) == log(1 + exp(z)), computed without overflow.
    return np.sum(np.logaddexp(0, z) - y * z, axis=-1)

In [None]:
# Cross-entropy loss at the single point b1 = b2 = 1.
ce(X, Y, b1=1, b2=1)

In [None]:
# Cross-entropy loss on the same (b1, b2) grid that was used for the MSE.
ces = ce(X, Y, *grid)

In [None]:
# Filled contour map of the cross-entropy loss over the (b1, b2) grid; the
# narrow second grid column holds the colour bar.
fig = plt.figure(figsize=(9, 8))
gs = gridspec.GridSpec(nrows=1, ncols=2, width_ratios=[4, 0.2])
ax1, ax2 = plt.subplot(gs[0]), plt.subplot(gs[1])
cs = ax1.contourf(*grid, ces, levels=40)
# Dashed red line: the diagonal b1 == b2.
ax1.plot([-2, 2], [-2, 2], c='red', linewidth=1, linestyle='--')
fig.colorbar(cs, cax=ax2);

In [None]:
# Cross-entropy loss along the diagonal b1 = b2 = phi.
phis = np.linspace(-2, 2, num=500)
es = ce(X, Y, b1=phis, b2=phis)

In [None]:
# Cross-entropy loss along the diagonal b1 = b2 = phi.
plt.plot(phis, es)
# NOTE(review): this marker sits at pi/4, whereas the corresponding MSE plot
# marks phi = 1 -- confirm which value is intended.
plt.axvline(x=np.pi / 4, color='green')

Now we can see that the plateau is only on one side. But this is the "right" side: when we are there, the value of the error is already low. Conversely, when the loss is big, we are on the slope with a non-zero gradient. 