# [Entropy](https://en.wikipedia.org/wiki/Entropy_(information_theory))

In [42]:
import numpy as np
from pyitlib import discrete_random_variable as drv
from scipy.stats import entropy as sci_entropy
from sklearn.metrics import mutual_info_score
from sklearn.metrics import log_loss

## Examples

#### Artifitial Sample

In [43]:
def artifitial_sample(values, prob,n_samples=100):
    sample=[]
    for val,p in zip(values, prob):
        sample.extend(int(n_samples * p)* [val] )  
    return sample

### Arbitrary example



#### Define discrete probability vector

Consider the alphabets
$$
\begin{align*}
\mathcal{X}&=\{0,1,2\},\\
\mathcal{Y}&=\{0,1\}.
\end{align*}
$$

Define the joint probability mass function $p:\mathcal{X}\times\mathcal{Y}\to[0,1]$ by
$$
\begin{aligned}
p(0,0)&=0.05, & p(0,1)&=0.15,\\
p(1,0)&=0.20, & p(1,1)&=0.10,\\
p(2,0)&=0.30, & p(2,1)&=0.20.
\end{aligned}
$$




In [44]:
# Joint probability matrix p(x,y)
#.   Y=0    Y=1
P_XY = np.array([
    [0.05, 0.15],   # X = 0
    [0.20, 0.10],   # X = 1
    [0.30, 0.20]    # X = 2
])

P_XY_flat = P_XY.flatten()
P_X = P_XY.sum(axis=1) 
P_Y = P_XY.sum(axis=0) 

In [45]:
P_XY_flat

array([0.05, 0.15, 0.2 , 0.1 , 0.3 , 0.2 ])

In [46]:
P_X

array([0.2, 0.3, 0.5])

In [47]:
P_Y

array([0.55, 0.45])

In [48]:
values = [(i, j) for i in range(P_XY.shape[0])
                          for j in range(P_XY.shape[1])]
sample_XY = artifitial_sample(values, P_XY_flat)
sample_X = [x for x,_ in sample_XY]
sample_Y = [y for _, y in sample_XY]


### Uniform example



#### Define discrete probability vector

Consider the alphabets
$$
\begin{align*}
\mathcal{X}&=\{0,1,2\},\\
\mathcal{Y}&=\{0,1\}.
\end{align*}
$$

Define the joint probability mass function $p:\mathcal{X}\times\mathcal{Y}\to[0,1]$ by
$$
\begin{aligned}
p(0,0)&=0.05, & p(0,1)&=0.15,\\
p(1,0)&=0.20, & p(1,1)&=0.10,\\
p(2,0)&=0.30, & p(2,1)&=0.20.
\end{aligned}
$$





#### Define discrete probability vector

Consider the alphabets
$$
\begin{align*}
\mathcal{X}&=\{0,1,2\},\\
\end{align*}
$$

Define the joint probability mass function $p:\mathcal{X}\to[0,1]$ by 
$$p(x)=\frac{1}{3},\quad x \in \mathcal{X}$$

In [49]:
P_unif = (1/3)*np.ones(3)  # Uniform distribution for X

### Funtional dependency $y=f(x)=(x-1)^2$ Example

Consider the random variable $X$ with probability mass function
$$
\begin{aligned}
p_X:\mathcal{X}&\to[0,1]\\
p_X(0)&=0.2\\
p_X(1)&=0.3\\
p_X(2)&=0.5,
\end{aligned}
$$
and $Y=(X-1)^2$. Then 
$$
\begin{aligned}
p_Y:\mathcal{Y}&\to[0,1]\\
p_Y(0)&=0.3\\
p_Y(1)&=0.7,\\
\end{aligned}
$$

and the joint probability mass function $p:\mathcal{X}\times\mathcal{Y}\to[0,1]$ is given by
$$
\begin{aligned}
p(0,0)&=0, & p(0,1)&=0.2,\\
p(1,0)&=0.3, & p(1,1)&=0,\\
p(2,0)&=0, & p(2,1)&=0.5.
\end{aligned}
$$


In [50]:
Pf_XY = np.array([
    [0.00, 0.20],   # X = 0  -> Y = 1
    [0.30, 0.00],   # X = 1  -> Y = 0
    [0.00, 0.50]    # X = 2  -> Y = 1
])
Pf_X=Pf_XY.sum(axis=1) 
Pf_Y=Pf_XY.sum(axis=0)
print(f"{Pf_XY=}")
print(f"{Pf_X=}")
print(f"{Pf_Y=}")

Pf_XY=array([[0. , 0.2],
       [0.3, 0. ],
       [0. , 0.5]])
Pf_X=array([0.2, 0.3, 0.5])
Pf_Y=array([0.3, 0.7])


In [51]:
valuesf = [(i, j) for i in range(P_XY.shape[0])
                          for j in range(P_XY.shape[1])]
samplef_XY = artifitial_sample(valuesf, Pf_XY.flatten())
samplef_X = [x for x,_ in samplef_XY]
samplef_Y = [y for _, y in samplef_XY]

### Independent example

#### Define discrete probability vector

Consider the alphabets
$$
\begin{align*}
\mathcal{X}&=\{0,1,2\},\\
\mathcal{Y}&=\{0,1\}.
\end{align*}
$$

Define the joint probability mass function $p:\mathcal{X}\times\mathcal{Y}\to[0,1]$ by
$$
\begin{aligned}
p(0,0)&=0.08, & p(0,1)&=0.12,\\
p(1,0)&=0.20, & p(1,1)&=0.30,\\
p(2,0)&=0.12, & p(2,1)&=0.18.
\end{aligned}
$$

In [52]:
# Joint probability matrix p(x,y)
#.   Y=0    Y=1
PInd_XY = np.array([
    [0.08, 0.12],   # X = 0
    [0.20, 0.30],   # X = 1
    [0.12, 0.18]    # X = 2
])
PInd_X = PInd_XY.sum(axis=1) 
PInd_Y = PInd_XY.sum(axis=0)    
print(f"{PInd_XY=}")
print(f"{PInd_X=}")
print(f"{PInd_Y=}")


PInd_XY=array([[0.08, 0.12],
       [0.2 , 0.3 ],
       [0.12, 0.18]])
PInd_X=array([0.2, 0.5, 0.3])
PInd_Y=array([0.4, 0.6])


In [53]:
PInd_XPInd_Y = np.outer(PInd_X, PInd_Y)  # Product of marginals
print(f"P_(X)P(Y)(x,y)=\n{PInd_XPInd_Y}")
print(f"P_(X,Y)(x,y)=\n{PInd_XY}")

P_(X)P(Y)(x,y)=
[[0.08 0.12]
 [0.2  0.3 ]
 [0.12 0.18]]
P_(X,Y)(x,y)=
[[0.08 0.12]
 [0.2  0.3 ]
 [0.12 0.18]]


## Definition: Entropy (Average Uncertainty)

Entropy measures how uncertain we are about the outcome of a random variable before we observe it. 

Let $(\Omega,\mathcal{F},\mathbb{P})$ be a probability space and $X:\Omega \to \mathcal{X} \subset \mathbb{R}^n$ a discrete random vector with alphabet $\mathcal{X}$. Let $\mathbb{P}_X$ be the probability measure (law) induced by $X$.

Define the **(joint) entropy of $X$** as the expectation of its information content:

$$
\begin{aligned}
H_{\mathbb{P}}(X)
&:= \mathbb{E}\!\left[ I_{\mathbb{P}_X}(X) \right] \\
&= \sum_{x \in \mathcal{X}} p_X(x)\, I_{\mathbb{P}_X}(x) \\
&= \sum_{x \in \mathcal{X}} p_X(x)\,(-\log p_X(x)) \\
&= -\sum_{x \in \mathcal{X}} p_X(x)\log p_X(x).
\end{aligned}
$$

**Note:**  
* When no ambiguity arises, the dependence on the probability measure is omitted, and we write  
  $$ H(X) \quad \text{instead of} \quad H_{\mathbb{P}}(X). $$

**Interpretation:**
* High entropy means the outcome of the random variable is (on average) highly unpredictable.
* Low entropy means the outcome of the random variable is (on average) highly predictable.

## Properties: Entropy

1. $H(X) \ge 0$
2. Let $X$ be a discrete random variable with alphabet of length $n$. Then
$
H(X) \le \log n
$
with equality if and only if $X$ is uniform.

3. $X$ is constant $\Longleftrightarrow$ $H(X)=0$

4. The entropy is concave as a function of the induced probability $\mathbb{P}_X$.

    More precisely, let $\mathcal{M}_1(\mathcal{X})$ denote the set of all pmf of measueres on the measurable space $(\mathcal{X}, \mathcal{P}(\mathcal{X}))$, with $\mathcal{X}$ finite.
    Define
    $$
    \begin{aligned}
    H : \mathcal{M}_1(\mathcal{X}) &\longrightarrow [0,\infty) \\
    H(p) &:= -\sum_{x \in \mathcal{X}} p(x)\,\log p(x),
    \end{aligned}
    $$
    with the convention $0 \log 0 := 0$.

    Then $H$ is a strictly concave functional on $\mathcal{M}_1(\mathcal{X})$, i.e., for all $p_1,p_2 \in \mathcal{M}_1(\mathcal{X})$ with $p_1 \neq p_2$ and all $\lambda \in (0,1)$,
    $$
    H\big( \lambda p_1 + (1-\lambda)p_2 \big)
    \;>\;
    \lambda H(p_1) + (1-\lambda)H(p_2).
    $$

    Notice that for a discrete random variable $X$ with pmf $p_X$,
    $$
    H_{\mathbb{P}}(X)
    = -\sum_{x \in \mathcal{X}} p_X(x)\log p_X(x)
    = H(p_X).
    $$

    $H(p_X)$ is called the entropy of the pmf $p_X$. The nature of the argument (a pmf of a probability measure) avoids ambiguity with the entropy of random variables or random vectors.

5. $H$ is permutation inavaraiant. Let $X=(X_1,X_2,\dots,X_n)$ be a random vector and let $\pi_n$ a permutation of the varaiables in $X$, denote 
    the corresponding permuted vector as $\pi_n(X)=(X_{\pi(1)},X_{\pi(2)},\dots,X_{\pi(n)})$. Then we have 
    $$H(\pi_n(X))=H(X).$$

#### **Proof (1.):**

It is straight forward from the positivity of $I(x)$

#### **Proof (2.):**

Let $\varphi(t)=-\log t$, which is convex on $(0,\infty)$, and take a convex combination
$\sum_{i=1}^n \alpha_i v_i$ with $v_i,\alpha_i>0$ and $\sum_{i=1}^n \alpha_i=1$.

By [Jensen’s inequality](https://en.wikipedia.org/wiki/Jensen%27s_inequality) ,
$$
\varphi\left(\sum_{i=1}^n \alpha_i v_i\right)
\le
\sum_{i=1}^n \alpha_i \varphi\left(v_i\right).
$$

If we denote the alphabet of $X$ by $\mathcal{X}=\{x_1,x_2,\dots,x_n\}$ and use the previous inequality for
$\alpha_i=p(x_i)$ and $v_i=\frac{1}{p(x_i)}$, for $i=1,2,\dots,n$, we obtain

$$
\begin{aligned}
-\log\left(\sum_{i=1}^n p(x_i)\frac{1}{p(x_i)}\right)
&\le
-\sum_{i=1}^n p(x_i) \log\left(\frac{1}{p(x_i)}\right),\\
-\log(n)
&\le
\sum_{i=1}^n p(x_i)\log(p(x_i)),\\
-\sum_{i=1}^n p(x_i)\log(p(x_i))
&\le
\log(n),\\
H(X)
&\le
\log(n).
\end{aligned}
$$

Equality holds in [Jensen’s inequality](https://en.wikipedia.org/wiki/Jensen%27s_inequality)  if and only if
$$
v_1 = v_2 = \cdots = v_n
\quad\text{for all $i$ with } \alpha_i > 0,
$$
that is,
$$
\frac{1}{p(x_1)} = \frac{1}{p(x_2)} = \cdots = \frac{1}{p(x_n)}.
$$

Hence all positive probabilities are equal:
$$
p(x_1) = p(x_2) = \cdots = p(x_n) = \frac{1}{n},
$$
so $X$ is uniform on its (nonzero) alphabet.

Therefore,
$$
H(X) \le \log n,
$$
with equality if and only if $X$ is uniformly distributed on $\{x_1,\dots,x_n\}$.

#### **Proof (3.):**

It $X=c$ cte, then 
$$
H(X) = \sum_{x \in \mathcal{X}} p(x) (-\log p(x)) = 1(-\log 1) = 0.
$$
On the other hand, if $H(X)=0$ we get
$$
\sum_{x \in \mathcal{X}} p(x) (-\log p(x)) = 0.
$$
the for every $x$ we get $p(x)\in \{0,1\}$, otherwise one of the terms in the series is positve (and all are no negative). Tnen $X$ is constant.


#### **Proof (4.):**

Let's start by prohing that the function $h(t)= -t\log(t)$ is strictly concave in its domain $t>0$. Derivating twice we get
$$
\begin{align*}
h(t) &= -t\log(t)\\
h'(t)&= -\log(t)-1\\
h''(t)&= -\frac{1}{t} < 0, \quad  t>0.
\end{align*}
$$
Then for any $\lambda\in (0,1)$, and $s,t>0$ we have the inequality
$$h(\lambda t + (1-\lambda) s)> \lambda h(t) + (1-\lambda) h(s).$$

Using the concavity of $h$ we can prove the concavity of $H$. Let $,s,t\in (0,1)$ and $p_1,p_2 \in \mathcal{M}_1(\mathcal{X})$, then we have
$$
\begin{align*}
H(\lambda p_1+(1-\lambda) p_2)&=\sum_{x\in\mathcal{X}}h(\lambda p_1(x)+(1-\lambda) p_2(x)),\\
&> \sum_{x\in\mathcal{X}}\lambda h(p_1(x))+(1-\lambda) h(p_2(x)),\\
&=\lambda \sum_{x\in\mathcal{X}}h(p_1(x))+(1-\lambda)  \sum_{x\in\mathcal{X}}h(p_2(x)),\\
&=\lambda H(p_1)+(1-\lambda) H(p_2).
\end{align*}
$$
So $H$ is strictly concave.

#### **Proof (5.):**

Notice that the alphabet of $\pi_n(X)$ is $\pi_n(\mathcal{X})$, where $\mathcal{X}$ is the alphabet of $X$ and the pmf of $\pi_n(X)$ satisfies
$$
\begin{align*}
p_{\pi_n(X)}(y)&=\mathbb{P}(\pi_n(X)=y),\\
&=\mathbb{P}\left(\cap_{i=1}^nX^{-1}_{\pi_n(i)}(y_i)\right),\\
&=\mathbb{P}\left(\cap_{j=1}^nX^{-1}_{j}(y_{\pi_n^{-1}(j)})\right),\\
&=\mathbb{P}(X=\pi^{-1}_n(y)),\\
&=p_{X}\left(\pi^{-1}_n(y)\right).
\end{align*}
$$
Then we have
$$
\begin{align*}
H(\pi_n(X))&=- \sum_{y\in \pi_n(\mathcal{X})} p_{\pi_n(X)}(y)\log\left( p_{\pi_n(X)}(y)\right),\\
&=- \sum_{y\in \pi_n(\mathcal{X})} p_{X}\left(\pi^{-1}_n(y)\right)\log\left(p_{X}\left(\pi^{-1}_n(y)\right)\right),\\
&=- \sum_{x\in \mathcal{X}} p_{X}(x)\log\left( p_{X}(x)\right),\\
&= H(X).
\end{align*}
$$

## Code: Entropy

In [54]:
def entropy(P,base=2):
    P_pos = P[P > 0]
    return -((P_pos*np.log(P_pos)).sum()/np.log(base)).item()

#### Entropy

$$H_{\mathbb{P}}(X)= -\sum_{x \in \mathcal{X}} p_X(x)\log p_X(x).$$

In [55]:
print(f"H(x,y)={sci_entropy(P_XY_flat,base=2)}")
print(f"H(x)={sci_entropy(P_X,base=2)}")
print(f"H(y)={sci_entropy(P_Y,base=2)}")

H(x,y)=2.408694969562842
H(x)=1.4854752972273346
H(y)=0.9927744539878083


In [56]:
print(f"H(x,y)={entropy(P_XY_flat)}")
print(f"H(x)={entropy(P_X)}")
print(f"H(y)={entropy(P_Y)}")

H(x,y)=2.408694969562842
H(x)=1.4854752972273346
H(y)=0.9927744539878083


In [57]:
print(f"H(x,y)={entropy(P_XY_flat):.2} < {np.log2(len(P_XY_flat)):.2}")
print(f"H(x)={entropy(P_X):.2} < {np.log2(len(P_X)):.2}")
print(f"H(y)={entropy(P_Y):.2} < {np.log2(len(P_Y)):.2}")

H(x,y)=2.4 < 2.6
H(x)=1.5 < 1.6
H(y)=0.99 < 1.0


notice like "less unifom" variables have smaller entropy

In [58]:
print(f"H(x)={entropy(P_unif)} = {np.log2(len(P_unif))}= log(|X_cal|)")

H(x)=1.584962500721156 = 1.584962500721156= log(|X_cal|)


## Definition: Conditional Entropy

Let $(\Omega,\mathcal{F},\mathbb{P})$ be a probability space and let 
$X:\Omega\to\mathcal{X}\subset\mathbb{R}^n$ and $Y:\Omega\to\mathcal{Y}\subset\mathbb{R}^m$ be discrete random vectors

The **conditional entropy of $Y$ given $X$** is defined as the average of the
entropy of $Y$ conditioned on each value of $X$:
$$
\begin{aligned}
H_{\mathbb{P}}(Y \mid X=x)&:= -\sum_{y \in \mathcal{Y}} \mathbb{P}(Y=y \mid X=x)\log\mathbb{P}(Y=y \mid X=x),\\
H_{\mathbb{P}}(Y \mid X)
&:= \sum_{x \in \mathcal{X}} \mathbb{P}(X=x)\, H_{\mathbb{P}}(Y \mid X=x) ,\\
&= - \sum_{x \in \mathcal{X}} \sum_{y \in \mathcal{Y}}
 \mathbb{P}(X=x,\,Y=y) \, \log  \mathbb{P}(Y=y\mid X=x).
\end{aligned}
$$

**Note:**
* When no ambiguity arises, the dependence on the probability measure is omitted and we simply write $H(Y \mid X)$
  instead of $H_{\mathbb{P}}(Y \mid X)$.
* When no ambiguity arises, we also write $p(x,y)$, $p(x)$ and $p(y \mid x)$ instead of
  $\mathbb{P}(X=x,Y=y)$, $\mathbb{P}(X=x)$ and $\mathbb{P}(Y=y \mid X=x)$ respectively. 
  So usually we write
  $$
  \begin{aligned}
  H(Y \mid X=x)&:= -\sum_{y \in \mathcal{Y}} p(y \mid x)\log p(y \mid x),\\
  H(Y \mid X)
  &:= \sum_{x \in \mathcal{X}} p(x)\, H(Y \mid X=x) ,\\
  &= - \sum_{x \in \mathcal{X}} \sum_{y \in \mathcal{Y}}
  p(x,\,y) \, \log  p(y\mid x).
  \end{aligned}
  $$


**Interpretation:**
* $H_{\mathbb{P}}(Y \mid X)$ is the average remaining uncertainty in $Y$ after observing $X$.
* $0\le H_{\mathbb{P}}(Y \mid X) \le H_{\mathbb{P}}(Y)$.
* $H_{\mathbb{P}}(Y \mid X) = 0$ if and only if $Y$ is completely determined by $X$ (i.e. $Y = f(X)$ almost surely).
* $H_{\mathbb{P}}(Y \mid X) = H_{\mathbb{P}}(Y)$ if and only if $X$ and $Y$ are independent.


## Properties: Conditional Entropy

### Conditional entropy as subspace entropy

Let $(\Omega,\mathcal{F},\mathbb{P})$ be a probability space and let 
$X:\Omega\to\mathcal{X}\subset\mathbb{R}^n$ and
$Y:\Omega\to\mathcal{Y}\subset\mathbb{R}^m$ be discrete random vectors.
Fix $x \in \mathcal{X}$ with $p(x)>0$ and define
$$
A := \{\omega \in \Omega : X(\omega)=x\}.
$$
Then
$$
H_{\mathbb{P}}(Y \mid X = x)
= H_{\mathbb{P}(\cdot \mid A)}(Y).
$$

That is, the conditional entropy of $Y$ given $X=x$ is the entropy of $Y$
with respect to the conditional probability space
$$
\big( A,\ \mathcal{F}\!\mid_A,\ \mathbb{P}(\,\cdot \mid A) \big).
$$

#### **Proof:**

By definition,
$$
\begin{aligned}
H_{\mathbb{P}}(Y \mid X = x)
&:= -\sum_{y \in \mathcal{Y}}
\mathbb{P}(Y=y \mid X=x)\,\log \mathbb{P}(Y=y \mid X=x).
\end{aligned}
$$

But for every $y \in \mathcal{Y}$,
$$
\mathbb{P}(Y=y \mid X=x)
= \mathbb{P}(Y=y \mid A)
= \mathbb{P}(\cdot \mid A)\big(Y^{-1}(\{y\})\big).
$$

Therefore,
$$
\begin{aligned}
H_{\mathbb{P}}(Y \mid X = x)
&= -\sum_{y \in \mathcal{Y}}
\mathbb{P}(\cdot \mid A)\big(Y=y\big)\,
\log \mathbb{P}(\cdot \mid A)\big(Y=y\big) \\
&= H_{\mathbb{P}(\cdot \mid A)}(Y),
\end{aligned}
$$
which is exactly the entropy of $Y$ in the probability space
$\big( A,\ \mathcal{F}\!\mid_A,\ \mathbb{P}(\,\cdot \mid A) \big)$.

### Positivity


For all $x \in \mathcal{X}$ with $\mathbb{P}(X=x)>0$,
$$
H_{\mathbb{P}}(Y \mid X = x) \ge 0,
\qquad
H_{\mathbb{P}}(Y \mid X) \ge 0.
$$

**Proof (1.).**
Using the ositivity of the entropy and the previous result, we have
$$ 
\begin{aligned}
H_{\mathbb{P}}(Y \mid X = x)&= H_{\mathbb{P}(\cdot \mid X=x)}(Y)\geq 0, \quad x \in \mathcal{X},\\
H_{\mathbb{P}}(Y \mid X)&:= \sum_{x \in \mathcal{X}} \mathbb{P}(X=x)\, H_{\mathbb{P}}(Y \mid X=x)\geq 0.
\end{aligned}
$$

**Proof (2.):** Notice that $H(Y|X=x)$ is the entropy of the random vector $Y|X=x$ in the space $(\{X=x\},\mathcal{F}\mid\{X=x\},\mathbb{P}\mid \{X=x\})$
$$
H_{\mathbb{P}}(Y \mid X=x) = -\sum_{y \in \mathcal{Y}} p(y \mid x)\log p(y \mid x)=H_{\mathbb{P}\mid \{X=x\}}(Y)
$$

so it is non-negative. $H(Y|X) \ge 0$ is a direct consequence of $H(Y|X=x) \ge 0$.

### Functional dependency
Let $X,Y$ be discrete random vectors with finite alphabets. Then
$$
H(Y \mid X)=0
\quad\Longleftrightarrow\quad
\exists \, f \text{ such that } Y = f(X) \text{ almost surely.}
$$

#### **Proof:** 

Since
$$
H(Y\mid X=x) \ge 0 \quad \text{for all } x.
$$

If $H(Y\mid X)=0$, then
$$
0 = \sum_{x}p(x)\,H(Y\mid X=x)
$$
is a convex combination of nonnegative numbers. Therefore every term with positive weight must be zero:
$$
p(x)>0 \;\Longrightarrow\; H_{\mathbb{P}(\cdot\mid X=x)}(Y)= H(Y\mid X=x)=0.
$$

Then from property 3. of entropy, for every $x$ such that $p(x)>0$, the random variable $Y$ is constant over the set $\{X=x\}$ ($Y|X=x$ is constant), so there exist $f(x)$ such that 
$$
\mathbb{P}(Y = f(x) \mid X=x) = 1.
$$

Then
$$
\mathbb{P}\big(Y = f(X)\big)
= \sum_{x} \mathbb{P}\big(X=x,\,Y=f(x)\big)
= \sum_{x} \mathbb{P}(X=x)\,\mathbb{P}\big(Y=f(x)\mid X=x\big)
= \sum_{x} \mathbb{P}(X=x)\cdot 1
= 1.
$$

So $Y = f(X)$ almost surely.

Conversely, suppose $Y=f(X)$ a.s. Then for any $x$ with $\mathbb{P}(X=x)>0$,
$$
\mathbb{P}(Y=f(x)\mid X=x) = 1,
$$
so the $Y$ is constant over $\{X=x\}$ (conditional distribution of $Y$ given $X=x$ is a point mass), hence
$$
H_{\mathbb{P}(\cdot\mid X=x)}(Y) = H(Y\mid X=x)=0 \quad \text{for all } x \text{ with } p(x)>0.
$$
Therefore
$$
H(Y\mid X) = \sum_{x} \mathbb{P}(X=x)\,H(Y\mid X=x) = 0.
$$


### Conditioning cannot increase entropy
For discrete random vectors $X$ and $Y$ with finite alphabets,
$$
H(Y | X) \le H(Y),
$$
with equality if and only if $Y$ is independent of $X$.

**Proof:**

Notice that $H(Y)=H(\mathbb{P}_Y)$ and $H(Y|X=x)=H(\mathbb{P}_{Y|X=x})$ so using the concavity of $H$ we obtain

$$
\begin{align*}
H_{\mathbb{P}}(Y)&=H(\mathbb{P}_Y)=H\left(\sum_{x}\mathbb{P}_X(x)\mathbb{P}_{Y|X=x}\right)\ge \sum_{x}\mathbb{P}_X(x)H\left(\mathbb{P}_{Y|X=x}\right)= \sum_{x}\mathbb{P}_X(x)H(Y|X=x)=H(Y|X).
\end{align*}
$$
Since the concavity is strict, the equality only happens when $\mathbb{P}_{Y|X=x*}=\mathbb{P}_{Y|X=x}$ for certain $x^*$ in $\mathcal{X}$ and any $x\in\mathcal{X}$. Then
$$\mathbb{P}(Y=y)=\sum_{x}\mathbb{P}_X(x)\mathbb{P}_{Y|X=x}=\mathbb{P}_{Y|X=x*}\sum_{x}\mathbb{P}_X(x)=\mathbb{P}_{Y|X=x*}=\mathbb{P}_{Y|X=x}= \mathbb{P}(Y=y|X=x),\quad x\in\mathcal{X},$$
and $Y$ and $X$ are independant.
On the other hand if $Y$ and $X$ are independant, we have:

$$
\begin{align*}
H(Y|X)&=\sum_{x}p(x)H(Y|X=x),\\
&=- \sum_{x \in \mathcal{X}} \sum_{y \in \mathcal{Y}}
  p(x,\,y) \, \log  p(y\mid x),\\
  &=- \sum_{x \in \mathcal{X}} \sum_{y \in \mathcal{Y}}
  p(x)p(y) \, \log  p(y),\\
    &=- \sum_{x \in \mathcal{X}}
  p(x) \sum_{y \in \mathcal{Y}}p(y) \, \log  p(y),\\
  &=-\sum_{y \in \mathcal{Y}}p(y) \, \log  p(y),\\
  &=H(Y).
\end{align*}
$$

### Alternaty Form:

$$
H(Y|X) = H(X,Y) - H(X)
$$

**Proof:**

$$
\begin{aligned}
H(Y| X) 
&= \sum_{x} p(x)\left( - \sum_{y} p(y| x)\log p(y| x)\right)\\
&= -\sum_{x,y} p(y| x)p(x) \log p(y| x)\\
&= -\sum_{x,y} p(x,y)\log p(y| x)\\
&= -\sum_{x,y} p(x,y)\log \left(\frac{p(x,y)}{p(x)}\right)\\
&= -\sum_{x,y} p(x,y)\left(\log p(x,y) - \log p(x)\right)\\
&= -\sum_{x,y} p(x,y)\log p(x,y) +\sum_{x,y} p(x,y)\log p(x)\\
&= -\sum_{x,y} p(x,y)\log p(x,y) +\sum_{x} p(x)\log p(x)\\
&= H(X,Y) - H(X).
\end{aligned}
$$

### Basic inequality:
For two discrete random vectors $X$ and $Y$,
$$
H(X,Y) \le H(X) + H(Y)
$$
with equality if and only if $X$ and $Y$ are independent.

**Proof:** 

From the previous identity we have
$$
H(X,Y) = H(X) + H(Y | X).
$$

Thus,
$$
H(X,Y) \le H(X) + H(Y)
\quad \Longleftrightarrow \quad
H(Y | X) \le H(Y).
$$

Since conditioning cannot increase entropy, and the equality is hold if and only if $X$ and $Y$ are independant, we get the result.

## Code: Conditional Entropy

In [59]:
def entropy_conditional(P_XY, conditioned_idx):
    # Marginal distribution of the conditioning variable
    P_cond = P_XY.sum(axis=conditioned_idx, keepdims=True)

    # Mask of valid entries
    mask = P_XY > 0
    
    # H(A|B) = - sum p(a,b) log p(a|b)
    return -np.sum(P_XY[mask] * np.log2((P_XY / P_cond)[mask]))

In [60]:
drv_ent_cond=drv.entropy_conditional(sample_Y, sample_X, base=2).item()
drv_ent_cond_fY=drv.entropy_conditional(samplef_Y, samplef_X, base=2).item()
drv_ent_cond_fX=drv.entropy_conditional(samplef_X, samplef_Y, base=2).item()
print(f"H(Y|X)={drv_ent_cond} < ")
print(f"H(Y|X)={drv_ent_cond_fY}")
print(f"H(X|Y)={drv_ent_cond_fX}")

H(Y|X)=0.9232196723355077 < 
H(Y|X)=0.0
H(X|Y)=0.6041843979966417


In [61]:
ent_cond=entropy_conditional(P_XY,conditioned_idx=1).item()
ent_cond_fY=entropy_conditional(Pf_XY,conditioned_idx=1).item()
ent_cond_fX=entropy_conditional(Pf_XY,conditioned_idx=0).item()
print(f"H(Y|X)={ent_cond}")
print(f"H(Y|X)={ent_cond_fY}")
print(f"H(X|Y)={ent_cond_fX}")


H(Y|X)=0.923219672335508
H(Y|X)=-0.0
H(X|Y)=0.6041843979966417


In [62]:
print(f"H(Y|X)={ent_cond} <= {entropy(P_Y)}=H(Y)")
print(f"H(Y|X)={ent_cond_fY} <= {entropy(Pf_Y)}=H(Y)")
print(f"H(X|Y)={ent_cond_fX} <= {entropy(Pf_X)}=H(X)")

H(Y|X)=0.923219672335508 <= 0.9927744539878083=H(Y)
H(Y|X)=-0.0 <= 0.8812908992306927=H(Y)
H(X|Y)=0.6041843979966417 <= 1.4854752972273346=H(X)


In [63]:
print(f"H(X,Y)={entropy(P_XY)} = {ent_cond+ entropy(P_X)} = H(Y|X)+H(X)")
print(f"H(X,Y)={entropy(Pf_XY)} = {ent_cond_fY+ entropy(Pf_X)} = H(Y|X)+H(X)")
print(f"H(X,Y)={entropy(Pf_XY)} = {ent_cond_fX+ entropy(Pf_Y)} = H(X|Y)+H(Y)")

H(X,Y)=2.408694969562842 = 2.4086949695628426 = H(Y|X)+H(X)
H(X,Y)=1.4854752972273346 = 1.4854752972273346 = H(Y|X)+H(X)
H(X,Y)=1.4854752972273346 = 1.4854752972273344 = H(X|Y)+H(Y)


## Definition: Mutual Information

Mutual Information measures how much two random vectors depend on each other.

$$
H_{\mathbb{P}}(X\Cap Y) := H_{\mathbb{P}}(Y) - H_{\mathbb{P}}(Y \mid X)
$$

### Interpretation:
* $ H(X\Cap Y)$ is the **amount of information shared by $X$ and $Y$**
* Although it is not a set operation, it is often visualized with a Venn-style diagram where

    $$
     H(X\Cap Y) \;\; \text{behaves like} \;\;
    \text{“overlap” between } H(X) \text{ and } H(Y),
    $$

    and the total entropy decomposes as

    $$
    H(Y) = H(Y \mid X) +  H(X\Cap Y).
    $$

**Note:** Although mutual information is typically written $I(X;Y)$, we prefer $H(X \Cap Y)$ because it treats MI as a measure of the size of the shared uncertainty between $X$ and $Y$, keeping it compatible with the interpretation of entropy as a size measure of uncertainty.

## Properties: Mutual Information 
1. $0\leq H(X\Cap Y)\leq \min\{H(X),H(Y)\}$


2. $X$ and $Y$ are independent if and only if
    $$
     H(X\Cap Y) = 0.
    $$
3. Let $X,Y$ be discrete random vectors with finite alphabets. Then
$$
 H(X\Cap Y) = H(Y)
\quad\Longleftrightarrow\quad
\exists \, f \text{ such that } Y = f(X) \text{ almost surely.}
$$
4. Equivalent form and symetry
$$ H(X\Cap Y) = H(X) + H(Y) - H(X,Y)= H(Y\Cap X)$$
5. KL-divergence form  
$$ H(X\Cap Y) = \sum_{x,y} p(x,y)\log \frac{p(x,y)}{p(x)p(y)}$$

#### **Proof (1.) and (2.)**

This is strightforward from, definition, the "conditionning cannot increase entropy" propert and the fact the simetry of $\text{MI}$. Simetry will be proved on 4.

#### **Proof (3.)**

This is strightforward from the entropy "functional dependency" property and the definition of mutual information.

#### **Proof (4.)**

This is strightforward from the alternative from property.

#### **Proof (5.)**

Developing the KL divergence form we have
$$
\begin{align*}
\sum_{x,y} p(x,y)\log \frac{p(x,y)}{p(x)p(y)}&= \sum_{x,y} p(x,y)\left(\log p(x,y) -\log p(x) - \log p(y) \right),\\
&= \sum_{x,y} p(x,y)\log p(x,y) - \sum_{x,y} p(x,y)\log p(x)-  \sum_{x,y} p(x,y)\log p(y),\\
&= - \sum_{x} p(x)\log p(x)-  \sum_{y} p(y)\log p(y) + \sum_{x,y} p(x,y)\log p(x,y) ,\\
&=H(X) + H(Y) - H(X,Y),\\
& = H(Y;X)
\end{align*}
$$

## Code: Mutual information

In [64]:
def mutual_info(P_XY, base=2):
    P_X = P_XY.sum(axis=1)
    P_Y = P_XY.sum(axis=0)
    P_XP_Y = np.outer(P_X, P_Y)

    # Mask of valid entries
    mask = P_XY > 0

    # I(X;Y) = sum p(x,y) log( p(x,y) / (p(x)p(y)) )
    return (np.sum(P_XY[mask] * np.log((P_XY / P_XP_Y)[mask]))/np.log(base)).item()

In [65]:
mutual_info(P_XY,base=np.e)

0.04821170079675466

In [66]:
mutual_info_score(sample_X,sample_Y)

0.04821170079675517

In [67]:
print(f"MI(X,Y)={mutual_info(P_XY)} = {entropy(P_X)-entropy_conditional(P_XY,0)} = H(X)-H(X|Y)")
print(f"MI(X,Y)={mutual_info(P_XY)} = {entropy(P_Y)-entropy_conditional(P_XY,1)} = H(Y)-H(Y|X)")
print(f"MI(X,Y)={mutual_info(P_XY)} = {entropy(P_X)+entropy(P_Y)-entropy(P_XY)} = H(X)+H(Y)-H(X,Y)")

MI(X,Y)=0.06955478165230043 = 0.06955478165230078 = H(X)-H(X|Y)
MI(X,Y)=0.06955478165230043 = 0.06955478165230033 = H(Y)-H(Y|X)
MI(X,Y)=0.06955478165230043 = 0.06955478165230078 = H(X)+H(Y)-H(X,Y)


#### Bounds

In [68]:
print(f"MI(X,Y)={mutual_info(P_XY)} <= min{entropy(P_X),entropy(P_Y)} = min(H(X),H(Y))")

MI(X,Y)=0.06955478165230043 <= min(1.4854752972273346, 0.9927744539878083) = min(H(X),H(Y))


#### Independence

In [69]:
print(f"MI(X,Y)={mutual_info(PInd_XY)}")

MI(X,Y)=-2.5627412030519346e-17


#### Functional dependance $Y=f(X)$

In [70]:
print(f"MI(X,Y)={mutual_info(Pf_XY)} < {entropy(Pf_X)} = H(X)")
print(f"MI(X,Y)={mutual_info(Pf_XY)} = {entropy(Pf_Y)} = H(Y)")


MI(X,Y)=0.8812908992306927 < 1.4854752972273346 = H(X)
MI(X,Y)=0.8812908992306927 = 0.8812908992306927 = H(Y)


## Definition: Continuous case

If $X,Y$ are continuous, sums become integrals:

$$
 H(Y\Cap X) = \int\int p(x,y)\log\left(\frac{p(x,y)}{p(x)p(y)}\right)dxdy
$$

Now it is not bounded above anymore. Still non-negative.

## Definition: Cross-Entropy (Average Surprise Under a Model)

Let $\mathcal{M}_1(\mathcal{X})$ denote the set of all pmf of probability measures on the measurable space $(\mathcal{X},\,\mathcal{P}(\mathcal{X}))$, with $\mathcal{X}$ finite.

We define the **cross-entropy** as the functional

$$
\begin{aligned}
H : \mathcal{M}_1(\mathcal{X})^2 &\longrightarrow [0,\infty] \\
H(p \Vert q) 
&:= \sum_{x \in \mathcal{X}} p(x)\,\big(-\log q(x)\big),
\end{aligned}
$$

with the convention $0\log 0 := 0$.

**Remark:**
If there exists $x \in \mathcal{X}$ such that
$$
p(x) > 0 \quad \text{and} \quad q(x) = 0,
$$
then
$$
H(p \Vert q) = \infty.
$$
This reflects the fact that outcomes that actually occur under $p$ are assigned zero probability under the model $q$.

**Interpretation:**

* High cross-entropy $H(p\Vert q)$ means that outcomes generated by the true distribution $p$
  are, on average, very hard to predict when we use the model $q$
  (i.e. the model $q$ assigns low probability to typical events of $p$).

* Low cross-entropy $H(p\Vert q)$ means that outcomes generated by $p$
  are, on average, easy to predict using the model $q$, indicating that
  $q$ is close to the true distribution $p$.

* In particular, the cross-entropy can be seen as the entropy of $p$ plus a
  **penalty term** called Kullback–Leibler (KL) Divergence that measures how poorly $q$ approximates $p$. In the KL-divergence section, we prove
  $$H(p \Vert q) =  D_{KL}(p \,\|\, q)+ H(p)\geq H(p), $$
    with equality if and only if $p = q$.




## Code: Cross-Entropy

In [75]:
def cross_entropy(P, Q,base=2):
    P_pos = P[P > 0]
    Q_pos = Q[P > 0]
    return -((P_pos * np.log(Q_pos)).sum()/np.log(base)).item()   

In [72]:
# All possible (x,y) in the same order as flatten
values = [(i, j) for i in range(P_XY.shape[0])
                  for j in range(P_XY.shape[1])]

P_flat = P_XY.flatten()
n_classes = len(values)
label_map = {pair: i for i, pair in enumerate(values)}
y_true = np.array([label_map[pair] for pair in sample_XY])
y_pred = np.tile(P_flat, (len(sample_XY), 1)) 

print(f"H(p_(X,Y)||p_(X,Y)) = {log_loss(y_true, y_pred, labels=np.arange(n_classes))}")

H(p_(X,Y)||p_(X,Y)) = 1.6695801269814066


In [74]:
print(f"H(p_(X,Y)||p_(X,Y)) = {cross_entropy(P_XY.flatten(), P_XY.flatten(),base=np.e)}")

H(p_(X,Y)||p_(X,Y)) = 1.6695801269814072


In [39]:
print(f"H(p_(X,Y)) = {entropy(P_XY,base=np.e)}") 

H(p_(X,Y)) = 1.6695801269814072
