In [6]:
import numpy as np
from collections import Counter
from math import log2
import pandas as pd

In [35]:
def entropy(labels):
    """Return the Shannon entropy, in bits, of the empirical distribution of `labels`.

    Parameters
    ----------
    labels : sized collection of hashable values
        Observed discrete labels; must support ``len()`` and iteration.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of every
        distinct label. An empty input yields ``0``.
    """
    total_count = len(labels)
    label_counts = Counter(labels)
    # Use the built-in sum(): handing a generator to np.sum() is deprecated by
    # NumPy and emits a DeprecationWarning on every call.
    # Counter only stores labels that occurred, so every count is >= 1 and the
    # probability is strictly positive -- log2 never sees 0.
    return sum(
        -(count / total_count) * log2(count / total_count)
        for count in label_counts.values()
    )

# Function to compute joint entropy
# def joint_entropy(X, Y):
#     joint_labels = list(zip(X, Y))
#     joint_labels
#     return entropy(joint_labels)

# def joint_entropy(X, Y):
#     mp={}
#     length= len(X)*len(Y)
#     for x in X:
#         for y in Y:
#             if (x,y) not in mp:
#                 mp[(x,y)]=0
#             mp[(x,y)]+=1
    
#     joint_entropy_res=0
    
#     for k,v in mp.items():
#         probab= v/length
#         probab=probab*log2(probab)
#         joint_entropy_res+= probab
    
#     return joint_entropy_res

def joint_entropy(X, Y):
    """Return the joint Shannon entropy H(X, Y), in bits, of paired observations.

    Parameters
    ----------
    X, Y : iterables of hashable values
        Observations paired position by position with ``zip``; if one input is
        longer, its surplus items are ignored.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of every
        distinct ``(x, y)`` pair. Empty input yields ``0``.
    """
    # Count occurrences of each (x, y) pair.
    pair_counts = Counter(zip(X, Y))

    # Normalise by the number of pairs that were actually counted. Dividing by
    # len(X) would give probabilities that do not sum to 1 whenever Y is
    # shorter than X, because zip() stops at the shorter input.
    total_pairs = sum(pair_counts.values())

    return sum(
        -(count / total_pairs) * log2(count / total_pairs)
        for count in pair_counts.values()
    )
            
            
# def joint_entropy(x, y):
#     joint_probs = np.histogram2d(x, y, bins=[len(np.unique(x)), len(np.unique(y))], density=True)[0]
#     joint_probs = joint_probs[joint_probs > 0]
#     return entropy(joint_probs)

# Function to compute conditional entropy H(Y|X)
def conditional_entropy(X, Y):
    """Return the conditional entropy H(Y|X) of paired observations.

    For every distinct value in `X`, the entries of `Y` at the positions where
    `X` holds that value are collected; their entropy is weighted by the share
    of positions that value occupies, and the weighted entropies are summed.

    `X` must support ``len()`` and integer indexing with hashable items; `Y`
    must support integer indexing for every position of `X`.
    """
    n_samples = len(X)
    weighted_sum = 0
    for level in set(X):
        # Y values observed together with this particular X value.
        y_in_group = [Y[pos] for pos in range(n_samples) if X[pos] == level]
        weighted_sum += (len(y_in_group) / n_samples) * entropy(y_in_group)
    return weighted_sum

# Function to compute information gain
def information_gain(X, Y):
    """Return the information gain about `Y` from knowing `X`: H(Y) - H(Y|X)."""
    prior_uncertainty = entropy(Y)
    remaining_uncertainty = conditional_entropy(X, Y)
    return prior_uncertainty - remaining_uncertainty

# Function to compute mutual information I(X;Y)
def mutual_information(X, Y):
    """Return the mutual information I(X;Y) computed as H(X) + H(Y) - H(X, Y)."""
    marginal_x = entropy(X)
    marginal_y = entropy(Y)
    joint = joint_entropy(X, Y)
    return marginal_x + marginal_y - joint

# Function to compute Gini index
def gini_index(labels):
    """Return the Gini index of `labels`: 1 minus the sum of squared label shares.

    `labels` must support ``len()`` and iteration over hashable items. With no
    labels at all the sum is empty, so the result is ``1``.
    """
    n_labels = len(labels)
    # Relative frequency of each distinct label.
    shares = [occurrences / n_labels for occurrences in Counter(labels).values()]
    squared_total = sum(share ** 2 for share in shares)
    return 1 - squared_total

# Example usage
if __name__ == "__main__":
    # Load the height/weight samples; the path is relative to the working directory.
    data = pd.read_csv('SOCR-HeightWeight (1).csv')
    height, weight = (
        data[column].tolist() for column in ('Height(Inches)', 'Weight(Pounds)')
    )

    # The entropy-based measures need discrete symbols, so cut each continuous
    # measurement into three quantile-based bins labelled 0, 1 and 2.
    height_discretized, weight_discretized = (
        pd.qcut(values, q=3, labels=[0, 1, 2]).tolist() for values in (height, weight)
    )

    # (caption, function, arguments) -- evaluated and printed one row at a time.
    metrics = [
        ("Entropy of Height:", entropy, (height_discretized,)),
        ("Entropy of Weight:", entropy, (weight_discretized,)),
        ("Joint Entropy of Height and Weight:", joint_entropy,
         (height_discretized, weight_discretized)),
        ("Conditional Entropy H(Weight|Height):", conditional_entropy,
         (height_discretized, weight_discretized)),
        ("Information Gain (Height -> Weight):", information_gain,
         (height_discretized, weight_discretized)),
        ("Mutual Information between Height and Weight:", mutual_information,
         (height_discretized, weight_discretized)),
        ("Gini Index of Weight:", gini_index, (weight_discretized,)),
    ]
    for caption, measure, args in metrics:
        print(caption, measure(*args))


Entropy of Height: 1.584962498412875
Entropy of Weight: 1.584962498412875
Joint Entropy of Height and Weight: 3.0361108064663624
Conditional Entropy H(Weight|Height): 1.451148308053487
Information Gain (Height -> Weight): 0.1338141903593879
Mutual Information between Height and Weight: 0.13381419035938746
Gini Index of Weight: 0.6666666656


  return -np.sum(p * np.log2(p) if p > 0 else 0
