In [149]:
#entropy, cross-entropy, K-L divergence, Huffman Coding

In [1]:
import numpy as np
import pandas as pd

In [2]:
#p represents a distribution of possible weather states for a given day
p = np.array([0.9, 0.1]) #probability(sun, rain)

In [3]:
#"entropy"
#a measure of uncertainty over possible events (in this case, the weather for the day)
#consider transmitting a message about the weather over a communication wire through a binary Morse code-like system
#an encoding with distribution p will take at least the following amounts of bits to transmit on average
info_content = -sum(p*np.log2(p))
print('The entropy or information content measured in bits is: ', info_content)

The entropy or information content measured in bits is:  0.4689955935892812


In [4]:
#decomposing the entropy formula implies an unlikely event transmits more information than a likely event

#sunny
print('A notice of Sunny weather transmits this much info: ', -np.log2(p[0]))

#rainy
print('A notice of Rainy weather transmits this many bits of information: ', -np.log2(p[1]))

A notice of Sunny weather transmits this much info:  0.15200309344504995
A notice of Rainy weather transmits this many bits of information:  3.321928094887362


In [5]:
#suppose we came up with the following encoding for sending information about the weather
#sun is encoded as a binary string of length one, namely '0' 
#and rain is encoded as a binary string of length one, namely '1'

sun = '0'
rain = '1'

#to transmit the weather we are sending 1 bit of information in both situations
encoding_length = np.array([len(str(sun)), len(str(rain))]) 

print('The lengths of our encoded messages about the weather are: ', encoding_length)

#so the average length of the binary message we need to send
avg_message_size = sum(p*encoding_length)

print('The average message length under this encoding: ', avg_message_size)

The lengths of our encoded messages about the weather are:  [1 1]
The average message length under this encoding:  1.0


In [6]:
#Cross-Entropy

#the encoding above can handle transmitting information on a higher entropy system than p = (.9,.1) 
#Entropy(.5,.5) > Entropy(.9,.1) since there is more uncertainty in a coin flip than the our weather
#The encoding above is optimally efficient under the distribution q below (since cross-entropy = entropy)
#There is no two-state system with more uncertainty
#The likelihoods are equally balanced across all states
q = np.array([0.5, 0.5])

#Imagine you are sending an encoded message regarding the weather where...
#The underlying data is drawn from a your weather distribution p, 
#But the encoding scheme is optimized for a different distribution q 
#Then the cross-entropy is the expected length of a message encoded according to q but sampled according to P
print('The Cross-Entropy, or average message length calculated differently, is: ',-sum(p*np.log2(q)))

The Cross-Entropy, or average message length calculated differently, is:  1.0


In [7]:
#the K-L Divergence 

#The extra bits we send (on average) above the theoretical minimum 
#There may not be an encoding that allows us to send the theoretical minimum
print('The average extra number of bits we are sending: ', avg_message_size - info_content)

#In this context, it is a measure of inefficiency of the way we have encoded our message regarding the weather
print('The K-L Divergence:', -sum(p*np.log2(q/p)))

The average extra number of bits we are sending:  0.5310044064107188
The K-L Divergence: 0.5310044064107188


In [8]:
#alternate formula for the K-L Divergece
#a weighted average where our weights are the frequency of the weather
#and the thing we are averaging is the encoding ratio(i.e. log base 2(x))... 
#between the actual information in p and the info under the encoding implied by q
sum(p*np.log2(p/q))

0.5310044064107189

In [9]:
#KL is not symmetric between the sampling distribution and the encoding distribution
#but it is still positive
print('If you switch the sampling and encoding distribution you get a K-L Divergence of: ', -sum(q*np.log2(p/q)))

If you switch the sampling and encoding distribution you get a K-L Divergence of:  0.7369655941662061


In [10]:
#K-L divergence doesn't pass the triangle inequality for being a true distance metric
#The K-L Divergence "distance" from p-to-q + q-to-r can be less than p-to-r !

p = np.array([0.9, 0.1])
q = np.array([0.5, 0.5])
r = np.array([0.4, 0.6])

print(sum(p*np.log2(p/q)))
print(sum(q*np.log2(q/r)))
print(sum(p*np.log2(p/r)))

0.5310044064107189
0.029446844526784283
0.7944362512259656


In [11]:
#Example #2
#This one is more interesting since we can come up with a better encoding
#Define the probability of (sun, rain, clouds, tornado)
p = np.array([0.9, 0.05, 0.03, 0.02]) 

In [12]:
#a signal encoding one of four possibilities, each with probability specified in p...
#should take at least the following amounts of bits to transmit, on average
info_content = -sum(p*np.log2(p))
print(info_content)

0.6175431233120147


In [13]:
#suppose we use the following binary encodings

sun = '00'
rain = '01'
clouds = '10'
tornado = '11'

#to transmit the weather we are sending 2 bits of information in all situations
encoding_length = np.array([len(str(sun)), len(str(rain)), len(str(clouds)), len(str(tornado))]) 

print('The lengths of our encoded messages about the weather are: ', encoding_length)

#so the average length of the binary message we need to send
avg_message_size = sum(p*encoding_length)

print('The average message length under this encoding: ', avg_message_size)

The lengths of our encoded messages about the weather are:  [2 2 2 2]
The average message length under this encoding:  2.0


In [14]:
#The Cross-Entropy
#The average enoding length under q, drawn from distribution p
q = np.array([0.25, 0.25, 0.25, 0.25])

print('The average message size or cross-entropy:', -sum(p*np.log2(q)))

The average message size or cross-entropy: 2.0


In [15]:
#another way to see where how the a specific encoding implies this q distribution

#"implied probabilities" assuming the encoding was efficient
#it is not a perferctly efficient encoding so the probabilites won't necessarily sum to 1, although they do here
q = 1/np.power(2, encoding_length)

print('The Encoding Lengths: ', encoding_length)
print("The implied likelihoods under this encoding: ", q)

print('The Cross-Entropy, or average message size in bits: ', -sum(p*np.log2(q)))
print('The Entropy, or average information content in a message: ', -sum(p*np.log2(p)))
print('The K-L Divergence, or extra bits of inefficiency our encoding adds:', -sum(p*np.log2(q/p)))

The Encoding Lengths:  [2 2 2 2]
The implied likelihoods under this encoding:  [0.25 0.25 0.25 0.25]
The Cross-Entropy, or average message size in bits:  2.0
The Entropy, or average information content in a message:  0.6175431233120147
The K-L Divergence, or extra bits of inefficiency our encoding adds: 1.3824568766879857


In [16]:
#The extra bits we needed to send (on average) above the theoretical minimum (which may not be achievable)
print('A measure of our inefficiency, or average extra bits transmitted: ', avg_message_size - info_content)

A measure of our inefficiency, or average extra bits transmitted:  1.3824568766879852


In [17]:
#suppose we use the following binary encodings
#notice that even though we have varying encoding lengths, we can string them together and the message is still unambiguous
#010 implies sun, rain
#111000 implies wind, sun, sun, sun
#1001 is invalid, I guess

sun = '0'
rain = '10'
clouds = '110'
tornado = '111'

#to transmit the weather we are sending strings of length 1, 2, or 3
encoding_length = np.array([len(str(sun)), len(str(rain)), len(str(clouds)), len(str(tornado))]) 

print('The Encoding Lengths: ', encoding_length)

#the average length of the binary message we need to send
avg_message_size = sum(p*encoding_length)

print('The Entropy of the correct weather distribution: ', -sum(p*np.log2(p)))
print('The Cross-Entorpy or average message lenght :', avg_message_size)
print('The K-L Divergence between these two distributions: ', avg_message_size -(-sum(p*np.log2(p))))

The Encoding Lengths:  [1 2 3 3]
The Entropy of the correct weather distribution:  0.6175431233120147
The Cross-Entorpy or average message lenght : 1.1500000000000001
The K-L Divergence between these two distributions:  0.5324568766879855


In [18]:
#We did the K-L Divergence calculation above without calculating q
#Just out of curiousity, what is the implied probabilities under the encoding above

q = 1/np.power(2, encoding_length)

print('The Encoding Lengths: ', encoding_length)
print("The implied likelihoods assuming this is an optimal encoding: ", q)
print('The K-L Divergence: ', -sum(p*np.log2(q/p)))

The Encoding Lengths:  [1 2 3 3]
The implied likelihoods assuming this is an optimal encoding:  [0.5   0.25  0.125 0.125]
The K-L Divergence:  0.5324568766879854


In [19]:
#what if we tweaked our encoding to make it less optimal
#notice how our sun encoding is now two bits instead of one

sun = '00'
rain = '10'
clouds = '110'
tornado = '111'

#to transmit the weather we are sending strings of length 1, 2, or 3
encoding_length = np.array([len(str(sun)), len(str(rain)), len(str(clouds)), len(str(tornado))]) 

q = 1/np.power(2, encoding_length)

print('The Encoding Lengths: ', encoding_length)
print("The implied likelihoods assuming this is an optimal encoding: ", q)
print('The K-L Divergence, or measure of inefficiency, has gone up to: ', -sum(p*np.log2(q/p)))

The Encoding Lengths:  [2 2 3 3]
The implied likelihoods assuming this is an optimal encoding:  [0.25  0.25  0.125 0.125]
The K-L Divergence, or measure of inefficiency, has gone up to:  1.4324568766879853


In [20]:
#The amount by which our K-L divergence went up (our coding inefficiency) =
#The amount by which our Cross-Entropy went up (our message length)
#The entropy of our weather system didn't change

#original encoding
sun = '0'
rain = '10'
clouds = '110'
tornado = '111'

encoding_length = np.array([len(str(sun)), len(str(rain)), len(str(clouds)), len(str(tornado))]) 

q = 1/np.power(2, encoding_length)

print('The Cross-Entropy: ', -sum(p*np.log2(q)))
print('The K-L Divergence', -sum(p*np.log2(q/p)))
print('The Entropy:', -sum(p*np.log2(p)))

#subsequent encoding
sun = '00'
rain = '10'
clouds = '110'
tornado = '111'

encoding_length = np.array([len(str(sun)), len(str(rain)), len(str(clouds)), len(str(tornado))]) 

q = 1/np.power(2, encoding_length)

print('Original Cross-Entropy: ', -sum(p*np.log2(q)))
print('Original K-L Divergence', -sum(p*np.log2(q/p)))
print('The Entropy:', -sum(p*np.log2(p)))

The Cross-Entropy:  1.1500000000000001
The K-L Divergence 0.5324568766879854
The Entropy: 0.6175431233120147
Original Cross-Entropy:  2.0500000000000003
Original K-L Divergence 1.4324568766879853
The Entropy: 0.6175431233120147


In [21]:
#Example #3
#Define the probability of (sun, rain, clouds, tornado)
p = np.array([0.5, 0.25, 0.125, 0.125]) 

#Use this encoding to send your message about the weather
sun = '0'
rain = '10'
clouds = '110'
tornado = '111'

encoding_length = np.array([len(str(sun)), len(str(rain)), len(str(clouds)), len(str(tornado))]) 

q = 1/np.power(2, encoding_length)

#notice how we have encoded our weather data perfectly efficiently
#the cross-entropy = entropy
#the average message length = information content
#no wasted bits transmitted
#we're lucky the weather's probabilites allowed for a perfectly efficient encoding to exist
#On average we'll send 1.75 bits, or characters in a binary language
#On average the receiver will be getting 1.75 bits of information content, or surprise, on the other end
print('The Cross-Entropy: ', -sum(p*np.log2(q)))
print('The K-L Divergence', sum(p*np.log2(p/q)))
print('The Entropy:', -sum(p*np.log2(p)))

The Cross-Entropy:  1.75
The K-L Divergence 0.0
The Entropy: 1.75


In [22]:
#All encodings thus far has been given to you, whether they were relatively efficient encodings for transmitting weather info or not
#How do we come up with relatively efficient codings for arbitrary distributions?
#One method is known as Huffman Coding...

In [230]:
d = {'label': ['rain', 'sun', 'cloud', 'tornado'], 'probability': [0.25, 0.5, 0.125, 0.125]}
df = pd.DataFrame(data=d)
print(df)

     label  probability
0     rain        0.250
1      sun        0.500
2    cloud        0.125
3  tornado        0.125


In [240]:
#Huffman coding
def huffman_encoding(dataframe):
    
    #add additional columns, space for the algorithm to work
    dataframe['encoding'] = '' #a placeholder for us to write the huffman code strings
    dataframe['tree_node_children'] = dataframe['label'] #the original tree nodes that will be ranked by probility are the weather states
    dataframe['tree_node_children'] = dataframe.tree_node_children.apply(lambda x: [x]) #put the name into a list
    dataframe['node_prob'] = dataframe['probability'] #the original probabilities for the first huffman code ranking
    dataframe['done'] = 0 #a field that will be referenced to determine when which probabilies have been put in the tree
    
    
    print('Dataframe at start:')
    print(dataframe)
    
    completed = 0
    
    while completed < 1: #need to change this to reference 'done' field
        #sort the data to see which states have the lowest probabilities
        dataframe = dataframe.sort_values(by='node_prob')
        
        #add 0 to the front of the encoding associated with all states/nodes below the node or state with lowest probability 
        for sub_node in dataframe.iloc[0]['tree_node_children']:
            dataframe.loc[dataframe.label == sub_node,'encoding'] = '0' + dataframe.loc[dataframe.label == sub_node,'encoding']
        
        #add 1 to the front of the encoding associated with all states/nodes below the node or state with second lowest probability 
        for sub_node in dataframe.iloc[1]['tree_node_children']:
            dataframe.loc[dataframe.label == sub_node,'encoding'] = '1' + dataframe.loc[dataframe.label == sub_node,'encoding']
            
        #update probabilites and nodes for next iteration (ranking the two lowest probability nodes/states)
        dataframe.iloc[0, dataframe.columns.get_loc('node_prob')] = dataframe.iloc[0]['node_prob'] + dataframe.iloc[1]['node_prob'] #add two numbers
        dataframe.iloc[1, dataframe.columns.get_loc('node_prob')] = 99 #need to find better way that doesn't mess with sort
        dataframe.iat[0, dataframe.columns.get_loc('tree_node_children')] = [*dataframe.iloc[0]['tree_node_children'], *dataframe.iloc[1]['tree_node_children']] #combine two lists
        dataframe.iat[1, dataframe.columns.get_loc('tree_node_children')] = ''
        dataframe.iloc[0, dataframe.columns.get_loc('done')] = 1
        dataframe.iloc[1, dataframe.columns.get_loc('done')] = 1
        print('Dataframe after another step:')
        print(dataframe)
        
        completed = dataframe['done'].min()
             
    return dataframe.loc[:,'label':'encoding']

In [241]:
huffman_encoding(df)

Dataframe at start:
     label  probability encoding tree_node_children  node_prob  done
0     rain        0.250                      [rain]      0.250     0
1      sun        0.500                       [sun]      0.500     0
2    cloud        0.125                     [cloud]      0.125     0
3  tornado        0.125                   [tornado]      0.125     0
Dataframe after another step:
     label  probability encoding tree_node_children  node_prob  done
2    cloud        0.125        0   [cloud, tornado]       0.25     1
3  tornado        0.125        1                         99.00     1
0     rain        0.250                      [rain]       0.25     0
1      sun        0.500                       [sun]       0.50     0
Dataframe after another step:
     label  probability encoding      tree_node_children  node_prob  done
2    cloud        0.125       00  [cloud, tornado, rain]        0.5     1
0     rain        0.250        1                               99.0     1
1      s

Unnamed: 0,label,probability,encoding
2,cloud,0.125,0
1,sun,0.5,1
0,rain,0.25,1
3,tornado,0.125,1


In [242]:
d2 = {'label': ['rain', 'sun', 'cloud', 'tornado', 'fire'], 'probability': [0.25, 0.4, 0.125, 0.125, 0.1]}
df2 = pd.DataFrame(data=d2)
print(df2)

     label  probability
0     rain        0.250
1      sun        0.400
2    cloud        0.125
3  tornado        0.125
4     fire        0.100


In [243]:
huffman_encoding(df2)

Dataframe at start:
     label  probability encoding tree_node_children  node_prob  done
0     rain        0.250                      [rain]      0.250     0
1      sun        0.400                       [sun]      0.400     0
2    cloud        0.125                     [cloud]      0.125     0
3  tornado        0.125                   [tornado]      0.125     0
4     fire        0.100                      [fire]      0.100     0
Dataframe after another step:
     label  probability encoding tree_node_children  node_prob  done
4     fire        0.100        0      [fire, cloud]      0.225     1
2    cloud        0.125        1                        99.000     1
3  tornado        0.125                   [tornado]      0.125     0
0     rain        0.250                      [rain]      0.250     0
1      sun        0.400                       [sun]      0.400     0
Dataframe after another step:
     label  probability encoding      tree_node_children  node_prob  done
3  tornado        

Unnamed: 0,label,probability,encoding
1,sun,0.4,0
0,rain,0.25,10
3,tornado,0.125,110
4,fire,0.1,1110
2,cloud,0.125,1111


In [246]:
d3 = {'label': ['rain', 'sun', 'cloud', 'tornado', 'fire', 'hail', 'wind', 'smog'], 
      'probability': [0.25, 0.4, 0.125, 0.125, 0.05, 0.03, 0.02, 0.01]}
df3 = pd.DataFrame(data=d3)
print(df3)

     label  probability
0     rain        0.250
1      sun        0.400
2    cloud        0.125
3  tornado        0.125
4     fire        0.050
5     hail        0.030
6     wind        0.020
7     smog        0.010


In [247]:
huffman_encoding(df3)

Dataframe at start:
     label  probability encoding tree_node_children  node_prob  done
0     rain        0.250                      [rain]      0.250     0
1      sun        0.400                       [sun]      0.400     0
2    cloud        0.125                     [cloud]      0.125     0
3  tornado        0.125                   [tornado]      0.125     0
4     fire        0.050                      [fire]      0.050     0
5     hail        0.030                      [hail]      0.030     0
6     wind        0.020                      [wind]      0.020     0
7     smog        0.010                      [smog]      0.010     0
Dataframe after another step:
     label  probability encoding tree_node_children  node_prob  done
7     smog        0.010        0       [smog, wind]      0.030     1
6     wind        0.020        1                        99.000     1
5     hail        0.030                      [hail]      0.030     0
4     fire        0.050                      [fire]  

Unnamed: 0,label,probability,encoding
1,sun,0.4,0
0,rain,0.25,10
3,tornado,0.125,110
4,fire,0.05,11100
2,cloud,0.125,1111
7,smog,0.01,1110100
5,hail,0.03,111011
6,wind,0.02,1110101


In [None]:
#Next in this exploration of information and complexity:
#Kolmogorov-Chaitin Complexity
#check out "Digital Physics" the movie for more fun:)