In [1]:
from detectFD import calculate_conditional_entropy_matrix, discover_functional_dependencies
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from scipy.stats import entropy
import seaborn as sns
import matplotlib.pyplot as plt



Conditional Entropy Matrix:
     A    B
A  0.0  0.0
B  0.0  0.0

Functional Dependencies:
A <- ['B']
B <- ['A']


Let's first create a synthetic dataset with clear functional dependencies to test out the implementation.

A->B: where B is x2 the value of A

F->G: G is F+10

C,D -> E: E is the concatenation of C and D

In [2]:
n_rows = 100
df = pd.DataFrame({'A': np.random.randint(1, 4, n_rows)})

df['B'] = df['A'] * 2

df['C'] = np.random.randint(1, 4, n_rows)
df['D'] = np.random.randint(1, 4, n_rows)

df['E'] = df['C'].astype(str) + df['D'].astype(str)

df['F'] = np.random.randint(1, 4, n_rows)
df['G'] = df['F'] + 10

In [3]:
# Calculate entropy matrix and find dependencies
feature_names = list(df.columns)
entropy_matrix = calculate_conditional_entropy_matrix(df)
dependencies = discover_functional_dependencies(entropy_matrix, feature_names)

In [4]:
print("Conditional Entropy Matrix:")
print(pd.DataFrame(entropy_matrix, columns=feature_names, index=feature_names))
print("\nFunctional Dependencies:")
for key, value in dependencies.items():
    print(f"{key} <- {value}")

Conditional Entropy Matrix:
          A         B         C         D         E         F         G
A  0.000000  0.000000  1.548084  1.537774  1.449676  1.534810  1.534810
B  0.000000  0.000000  1.548084  1.537774  1.449676  1.534810  1.534810
C  1.513458  1.513458  0.000000  1.493018  0.000000  1.449939  1.449939
D  1.527264  1.527264  1.517134  0.000000  0.000000  1.516136  1.516136
E  2.932184  2.932184  1.517134  1.493018  0.000000  2.836431  2.836431
F  1.552693  1.552693  1.502448  1.544530  1.371806  0.000000  0.000000
G  1.552693  1.552693  1.502448  1.544530  1.371806  0.000000  0.000000

Functional Dependencies:
A <- ['B']
B <- ['A']
C <- ['E']
D <- ['E']
E <- []
F <- ['G']
G <- ['F']


Now let's try on a familiarly used dataset: the iris dataset that contains 4 functional dependencies

In [5]:
#load and preprocess iris dataset
iris_df = pd.read_csv('data/iris.csv', header=None)
feature_names = ['sepal length', 'sepal width', 'petal length', 'petal width', 'species']
iris_df.columns = feature_names

In [6]:
entropy_matrix = calculate_conditional_entropy_matrix(iris_df)
dependencies = discover_functional_dependencies(entropy_matrix, feature_names)

In [7]:
print("Conditional Entropy Matrix:")
print(pd.DataFrame(entropy_matrix, columns=feature_names, index=feature_names))
print("\nFunctional Dependencies:")
for key, value in dependencies.items():
    print(f"{key} <- {value}")

Conditional Entropy Matrix:
              sepal length  sepal width  petal length  petal width   species
sepal length      0.000000     2.725279      1.814859     2.560465  3.945080
sepal width       1.914971     0.000000      1.788192     2.318035  3.500840
petal length      2.026670     2.810312      0.000000     2.324832  3.587513
petal width       1.804110     2.371988      1.356665     0.000000  2.629765
species           0.708025     1.074093      0.138646     0.149065  0.000000

Functional Dependencies:
sepal length <- []
sepal width <- []
petal length <- []
petal width <- []
species <- []


Let's study the case of the uce dataset

In [8]:
df = pd.read_csv('data/uce-results-by-school-2011-2015.csv')

#select only div columns
div_columns = [col for col in df.columns if 'DIV' in col and '%' in col]
div_df = df[div_columns].copy()

#remove % and convert to numeric
for col in div_df.columns:
    div_df[col] = pd.to_numeric(div_df[col].astype(str).str.replace('%', ''), errors='coerce')

In [12]:
entropy_matrix = calculate_conditional_entropy_matrix(div_df)
dependencies = discover_functional_dependencies(entropy_matrix, div_df.columns)

In [13]:
print("Conditional Entropy Matrix:")
print(pd.DataFrame(entropy_matrix))
print("\nFunctional Dependencies:")
for key, value in dependencies.items():
    print(f"{key} <- {value}")


Conditional Entropy Matrix:
          0         1         2         3         4         5         6   \
0   0.000000  3.171485  3.642773  3.131062  5.535559  4.564219  4.071671   
1   5.580969  0.000000  4.722729  4.163719  7.788163  5.940244  6.546642   
2   6.079255  4.749728  0.000000  4.156092  7.802845  5.920361  6.946451   
3   6.169171  4.792345  4.757719  0.000000  8.297856  6.083480  7.143760   
4   1.517110  1.360230  1.347913  1.241297  0.000000  1.364191  1.729315   
5   4.932307  3.898849  3.851967  3.413459  5.750728  0.000000  5.616745   
6   1.824201  1.889689  2.262499  1.858181  3.500294  3.001187  0.000000   
7   4.808357  3.910182  4.345844  3.785433  6.654284  5.435639  5.024878   
8   5.879133  4.813786  4.652625  4.216658  7.479441  5.888449  6.295653   
9   6.101975  4.917642  4.899014  4.033582  7.948546  6.016852  6.596235   
10  0.869049  0.712429  0.691673  0.630108  0.595522  0.739734  0.903020   
11  4.771026  3.817725  3.719148  3.310951  5.381423  3.1791