In [2]:
from detectFD import calculate_conditional_entropy_matrix, discover_functional_dependencies
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from scipy.stats import entropy
import seaborn as sns
import matplotlib.pyplot as plt


Let's first create a synthetic dataset with clear functional dependencies to test out the implementation.

A->B: where B is x2 the value of A

F->G: G is F+10

C,D -> E: E is the concatenation of C and D

In [3]:
n_rows = 100
df = pd.DataFrame({'A': np.random.randint(1, 4, n_rows)})

df['B'] = df['A'] * 2

df['C'] = np.random.randint(1, 4, n_rows)
df['D'] = np.random.randint(1, 4, n_rows)

df['E'] = df['C'] * 2 * df['D']

df['F'] = np.random.randint(1, 4, n_rows)
df['G'] = df['F'] + 10

In [4]:
# Calculate entropy matrix and find dependencies
feature_names = list(df.columns)
entropy_matrix = calculate_conditional_entropy_matrix(df)
dependencies = discover_functional_dependencies(entropy_matrix, feature_names)

In [5]:
print("Conditional Entropy Matrix:")
print(pd.DataFrame(entropy_matrix, columns=feature_names, index=feature_names))
print("\nFunctional Dependencies:")
for key, value in dependencies.items():
    print(f"{key} <- {value}")

Conditional Entropy Matrix:
          A         B         C         D         E         F         G
A  0.000000  0.000000  1.564619  1.534156  1.489918  1.500608  1.500608
B  0.000000  0.000000  1.564619  1.534156  1.489918  1.500608  1.500608
C  1.568349  1.568349  0.000000  1.553303  0.624044  1.565811  1.565811
D  1.534990  1.534990  1.550408  0.000000  0.624044  1.568817  1.568817
E  2.420012  2.420012  1.550408  1.553303  0.000000  2.441244  2.441244
F  1.495786  1.495786  1.557260  1.563161  1.506329  0.000000  0.000000
G  1.495786  1.495786  1.557260  1.563161  1.506329  0.000000  0.000000

Functional Dependencies:
A <- ['B']
B <- ['A']
C <- []
D <- []
E <- []
F <- ['G']
G <- ['F']


Now let's try on a familiarly used dataset: the iris dataset that contains 4 functional dependencies

In [6]:
data = pd.read_csv('data/iris.csv')
feature_names = ['sepal length', 'sepal width', 'petal length', 'petal width', 'species']
data.columns = feature_names

In [7]:
cond_entropy_matrix = calculate_conditional_entropy_matrix(data)
feature_names = data.columns
dependencies = discover_functional_dependencies(cond_entropy_matrix, feature_names)


In [8]:
print("\nOptimized Conditional Entropy Matrix:")
print(pd.DataFrame(cond_entropy_matrix, columns=feature_names, index=feature_names))

print("\nFunctional Dependencies:")
for key, value in dependencies.items():
    print(f"{key} <- {value}")


Optimized Conditional Entropy Matrix:
              sepal length  sepal width  petal length  petal width   species
sepal length      0.000000     2.730817      1.807135     2.554367  3.953279
sepal width       1.910848     0.000000      1.780289     2.310310  3.503047
petal length      2.023297     2.816421      0.000000     2.327848  3.597454
petal width       1.804309     2.380221      1.361628     0.000000  2.641723
species           0.711563     1.081301      0.139576     0.150065  0.000000

Functional Dependencies:
sepal length <- []
sepal width <- []
petal length <- []
petal width <- []
species <- ['petal length', 'petal width']


test for the dataset ncvoter

In [14]:
df = pd.read_csv('data/ncvoter_1001r_19c.csv', header=None)


In [15]:
cond = calculate_conditional_entropy_matrix(df)

feature_names = df.columns

print(feature_names)
dependencies = discover_functional_dependencies(cond,feature_names )

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18], dtype='int64')


In [16]:
print("\nOptimized Conditional Entropy Matrix:")
print(pd.DataFrame(cond, columns=feature_names, index=feature_names))

print("\nFunctional Dependencies:")
for key, value in dependencies.items():
    print(f"{key} <- {value}")


Optimized Conditional Entropy Matrix:
          0         1         2         3         4         5         6   \
0   0.000000  0.000000  9.774893  1.722083  2.777436  0.913465  9.649120   
1   0.000000  0.000000  9.774893  1.722083  2.777436  0.913465  9.649120   
2   0.000000  0.000000  0.000000  0.062718  0.108874  0.025136  0.177120   
3   0.000000  0.000000  8.115528  0.000000  2.679270  0.895483  8.043456   
4   0.001998  0.001998  7.108329  1.625914  0.000000  0.882741  7.008321   
5   0.000000  0.000000  8.886564  1.704101  2.744714  0.000000  8.797908   
6   0.000000  0.000000  0.302894  0.116420  0.134640  0.062253  0.000000   
7   0.000000  0.000000  4.843713  1.465655  2.016943  0.824045  4.839787   
8   0.000000  0.000000  0.973042  0.079906  0.492086  0.385563  0.930060   
9   0.000000  0.000000  0.432665  0.146268  0.205901  0.066102  0.433970   
10  0.000000  0.000000  0.310813  0.130003  0.103942  0.048690  0.309712   
11  0.001998  0.001998  9.694219  1.724081  2.773

Let's study the case of the uce dataset

In [9]:
df = pd.read_csv('data/uce-results-by-school-2011-2015.csv')

#select only div columns
div_columns = [col for col in df.columns if 'DIV' in col and '%' in col]
div_df = df[div_columns].copy()

#remove % and convert to numeric
for col in div_df.columns:
    div_df[col] = pd.to_numeric(div_df[col].astype(str).str.replace('%', ''), errors='coerce')

In [10]:
entropy_matrix = calculate_conditional_entropy_matrix(div_df)
dependencies = discover_functional_dependencies(entropy_matrix, div_df.columns)

In [11]:
print("Conditional Entropy Matrix:")
print(pd.DataFrame(entropy_matrix))
print("\nFunctional Dependencies:")
for key, value in dependencies.items():
    print(f"{key} <- {value}")


Conditional Entropy Matrix:
          0         1         2         3         4         5         6   \
0   0.000000  3.171485  3.642773  3.131062  5.535559  4.564219  4.071671   
1   5.580969  0.000000  4.722729  4.163719  7.788163  5.940244  6.546642   
2   6.079255  4.749728  0.000000  4.156092  7.802845  5.920361  6.946451   
3   6.169171  4.792345  4.757719  0.000000  8.297856  6.083480  7.143760   
4   1.517110  1.360230  1.347913  1.241297  0.000000  1.364191  1.729315   
5   4.932307  3.898849  3.851967  3.413459  5.750728  0.000000  5.616745   
6   1.824201  1.889689  2.262499  1.858181  3.500294  3.001187  0.000000   
7   4.808357  3.910182  4.345844  3.785433  6.654284  5.435639  5.024878   
8   5.879133  4.813786  4.652625  4.216658  7.479441  5.888449  6.295653   
9   6.101975  4.917642  4.899014  4.033582  7.948546  6.016852  6.596235   
10  0.869049  0.712429  0.691673  0.630108  0.595522  0.739734  0.903020   
11  4.771026  3.817725  3.719148  3.310951  5.381423  3.1791